In [7]:
!pip install pandas
!pip install pathlib



In [8]:
import pandas as pd
from pathlib import Path

In [9]:
#expects data frame & file name as input
#appends the data from the data frame to the csv. if there are duplicates the new data will overwrite the old
#if no file in folder the new data is saved as csv
def append_df(df, file_name):
  my_file = Path(file_name)
  if my_file.exists():
    print("Appending to existing file named " + file_name)
    orig_df = pd.read_csv(file_name)
    print("Old Data Frame: ")
    print(orig_df)
    new_df = pd.concat([orig_df, df], ignore_index=True).drop_duplicates(subset=['news'], keep='last')
    print("New Data Frame: ")
    print(new_df)
    update_csv(new_df, file_name)
  else:
    print("Creating new file named" + file_name)
    df.to_csv(file_name, index=False, header = True, encoding='utf-8-sig')

In [10]:
#expects data frame and file name as input
#saves the data frame to the csv file
#expected to be used for overwriting a csv with updated data
def update_csv(df, file_name):
  print("Overwriting " + file_name)
  df.to_csv(file_name, index=False, header = True, encoding='utf-8-sig')

In [13]:
#removes duplicates preemptively to avoid labeling already labeled data
#duplicates may occur if already labeled sentences are added again to the unlabeled dataset during workflow
print("Deleting duplicates")

#unlabeled dataset
file_name = "news_headlines.csv"
#labeled dataset
new_file_name = "news_headlines_sentiment.csv"
news_sentiment_df = pd.DataFrame(columns=["news", "sentiment"])
orig_file = Path(file_name)
new_file = Path(new_file_name)
if orig_file.exists() and new_file.exists():
  #read unlabeled dataset. erroneous rows are skipped to increase robustness of the labeling process
  df = pd.read_csv(file_name, encoding='utf-8-sig', error_bad_lines=False)
  print("Loaded " + file_name)
  #go through the unlabeled dataset
  for index, row in df.iterrows():

    new_element = [row["news"], 3]
    #save labeled sentence to new file
    #create data frame only containing the current row
    if len(news_sentiment_df)!=0:
      news_sentiment_df.iloc[0] = new_element
    else:
      news_sentiment_df.loc[len(news_sentiment_df)] = new_element
    #read labeled dataset as data frame
    orig_df = pd.read_csv(new_file)
    #concat labeled dataset and current row of unlabeled dataset and drop duplicates
    new_df = pd.concat([orig_df, news_sentiment_df], ignore_index=True).drop_duplicates(subset=['news'], keep='last')

    #if sizes are same the sentence was already in the labeled dataset => delete from unlabeled dataset and update csv
    if (orig_df.size == new_df.size):
      index_name = df[df["news"] == row["news"]].index
      df.drop(index_name, inplace=True)
      update_csv(df, file_name)
      print("Duplicate removed")

else:
  print("File not Found")

Deleting duplicates
Loaded news_headlines.csv
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headlines.csv
Duplicate removed
Overwriting news_headli

In [14]:
#shows sentence by sentence from unlabeled dataset, show it and labels it according to input. update labeled dataset in the end
#warning about possible error
print("WARNING: EDITING CSV FILE WITH EXCEL MAY CORRUPT FILE\n")
#unlabeled dataset
file_name = "news_headlines.csv"
#labeled dataset
new_file = "news_headlines_sentiment.csv"
news_sentiment_df = pd.DataFrame(columns=["news", "sentiment"])
my_file = Path(file_name)
if my_file.exists():
  #load and go through unlabeled dataset
  df = pd.read_csv(file_name, encoding='utf-8-sig', error_bad_lines=False)
  print("Loaded " + file_name)
  for index, row in df.iterrows():

    user_input = -1
    range = [0, 1, 2]
    #ask for user input until acceptable number is entered
    while user_input not in range:
      print("####################################################################")
      print(row["news"])
      try:
        user_input = int(input("Positive: 0\nNegative: 1\nNeutral: 2\n"))
      except ValueError as err:
        print("\nPlease enter an Integer!\n")
        pass
    new_element = 0
    #prepare labeled row according to input
    if user_input == 0:
      new_element = [row["news"], 0]
    elif user_input == 1:
      new_element = [row["news"], 1]
    elif user_input == 2:
      new_element = [row["news"], 2]

    #save labeled sentence to labeled dataset
    news_sentiment_df.loc[len(news_sentiment_df)] = new_element
    append_df(news_sentiment_df, new_file)

    #delete sentence from unlabeled dataset
    index_name = df[df["news"] == row["news"]].index
    df.drop(index_name, inplace=True)
    update_csv(df, file_name)

else:
  print("File not Found")


Loaded news_headlines.csv
####################################################################
Comcast Stock Climbs As Activist Investor Trian Fund Maneuvers For Changes
Positive: 0
Negative: 1
Neutral: 2
0
Appending to existing file named news_headlines_sentiment.csv
Old Data Frame: 
                                                   news  sentiment
0     UPDATE 3-Brazil economy back to 2009 size afte...          0
1     GLOBAL MARKETS-Manufacturing data lifts stocks...          1
2     TREASURIES-Yields move higher after U.S. manuf...          2
3     UPDATE 2-Dollar weakness lifts pound to 8-mont...          2
4     UPDATE 1-U.S. House Oversight Committee to sub...          0
...                                                 ...        ...
7995  Trian Investment in Comcast Fuels Debate on Br...          0
7996                               Is Roku Stock a Buy?          1
7997                10 Most Profitable TV Shows in 2020          2
7998  Comcasts Amy Banse Transitions to Sen

KeyboardInterrupt: ignored