In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import spacy

In [8]:
#Data Loading
# Each frame is read and immediately stripped of rows whose tweet-text column is missing.
# The sentiment file is read with explicit column names; only the columns picked by usecols are kept.
col_names = ['Sentiment', 'ID', 'User', 'Text']
SentimentTwtData = pd.read_csv(
    'data/SentimentTwtTrain.csv',
    usecols=[0,1,4,5],
    encoding='latin1',
    names=col_names,
).dropna(subset=['Text'])
ElectionTwts2016 = pd.read_csv(
    'data/2016ElectionTwts.csv',
    usecols=[0,1,9,11],
).dropna(subset=['tweet_text'])
Biden2020Twts = pd.read_csv('data/2020BidenTwts.csv', usecols=[2,7]).dropna(subset=['tweet'])
Trump2020Twts = pd.read_csv('data/2020TrumpTwts.csv', usecols=[2,7]).dropna(subset=['tweet'])

# Preview the first three rows of every frame (2020 sets first, then 2016, then sentiment).
for dset in [Biden2020Twts, Trump2020Twts]:
    print(dset.head(3))
print(ElectionTwts2016.head(3))
print(SentimentTwtData.head(3))

  Biden2020Twts = pd.read_csv('data/2020BidenTwts.csv', usecols=[2,7])


                                               tweet           user_name
0  #Elecciones2020 | En #Florida: #JoeBiden dice ...  El Sol Latino News
1  #HunterBiden #HunterBidenEmails #JoeBiden #Joe...         Cheri A. 🇺🇸
2  @IslandGirlPRV @BradBeauregardJ @MeidasTouch T...          Flag Waver
                                               tweet           user_name
0  #Elecciones2020 | En #Florida: #JoeBiden dice ...  El Sol Latino News
1  Usa 2020, Trump contro Facebook e Twitter: cop...             Tgcom24
2  #Trump: As a student I used to hear for years,...              snarke
      id  candidate_id lang                                         tweet_text
0  57486             3   tr  Ne farkınız var DAIŞ, El-Kaide, El Nusra, YPG ...
1  57536             3   en  @BarackObama the way you showed up in Louisian...
2  57586             3   en  White house not taking tax avoid seriously! Ap...
   Sentiment          ID             User  \
0          0  1467810369  _TheSpecialOne_   
1         

In [9]:
#Build one frame holding both 2020 tweet sets, tagged with the candidate each row came from.
# 1. concat(keys=...) stacks the two frames and adds an outer index level holding 'Biden' or 'Trump'.
# 2. reset_index(level=0) moves that outer level into a column (auto-named 'level_0'),
#    which is then renamed to 'Candidate'.
# 3. drop_duplicates returns a NEW frame, so its result has to be kept. The earlier version
#    discarded the return value, which left duplicate tweets in the data. With the default
#    keep='first', a tweet present in both sets keeps its 'Biden' row.
# 4. user_name is not needed after this point and is dropped.
Combined2020Twts = (
    pd.concat([Biden2020Twts, Trump2020Twts], keys=['Biden', 'Trump'])
    .reset_index(level=0)
    .rename(columns={'level_0': 'Candidate'})
    .drop_duplicates(subset=['tweet'])
    .drop(columns=['user_name'])
)
print(Combined2020Twts.head())

  Candidate                                              tweet
0     Biden  #Elecciones2020 | En #Florida: #JoeBiden dice ...
1     Biden  #HunterBiden #HunterBidenEmails #JoeBiden #Joe...
2     Biden  @IslandGirlPRV @BradBeauregardJ @MeidasTouch T...
3     Biden  @chrislongview Watching and setting dvr. Let’s...
4     Biden  #censorship #HunterBiden #Biden #BidenEmails #...


In [10]:
#Trim the columns that are no longer needed and give every frame the same text column name, 'Text'
ElectionTwts2016 = (
    ElectionTwts2016
    .drop(columns=['id', 'lang'])
    .rename(columns={'tweet_text': 'Text'})
)
SentimentTwtData = SentimentTwtData.drop(columns=['ID', 'User'])
Combined2020Twts = Combined2020Twts.rename(columns={'tweet': 'Text'})

# Quick look at the two frames whose columns were dropped.
for preview_frame in (ElectionTwts2016, SentimentTwtData):
    print(preview_frame.head())

   candidate_id                                               Text
0             3  Ne farkınız var DAIŞ, El-Kaide, El Nusra, YPG ...
1             3  @BarackObama the way you showed up in Louisian...
2             3  White house not taking tax avoid seriously! Ap...
3             2  politico: .realDonaldTrump, HillaryClinton ple...
4             1  She's over due by 250 days. https://t.co/IfOO9...
   Sentiment                                               Text
0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          0  is upset that he can't update his Facebook by ...
2          0  @Kenichan I dived many times for the ball. Man...
3          0    my whole body feels itchy and like its on fire 
4          0  @nationwideclass no, it's not behaving at all....


In [18]:
#Processes the data
# spaCy pipeline used by preprocess_text for tokenisation, stop-word flags and lemmas.
nlp = spacy.load('en_core_web_sm')

def preprocess_text(dset):
    """Clean and lemmatise every entry of ``dset['Text']``.

    Each text is normalised in this order: ``@mentions`` become the literal
    word ``username``, HTML-style tags are removed, anything starting with
    ``http`` up to the next whitespace is removed, runs of whitespace collapse
    to a single space (with leading/trailing space stripped), and the result
    is lower-cased. The cleaned text is then run through the spaCy pipeline
    and only alphabetic, non-stop-word tokens are kept, as lemmas.

    Progress is printed every 5000 rows.

    Args:
        dset: Object whose ``'Text'`` entry exposes ``.values`` yielding
            strings (the notebook passes pandas DataFrames).

    Returns:
        list[str]: One space-joined string of lemmas per text whose parsed
        document reports language ``'en'``.
    """
    processed_text = []
    total = len(dset['Text'])
    for count, text in enumerate(dset['Text'].values, start=1): #loops through all values in 'Text' col
        #Replaces @usernames with "username", cleans text of HTML tags and URLs, removes extra spaces, and makes the text lowercase
        text = re.sub(r'@\w+', 'username', text)
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = text.lower()

        # Parse once and reuse the result: the pipeline call is the expensive step, and
        # running it a second time just to read lang_ doubled the cost per tweet.
        doc = nlp(text)

        # NOTE(review): per the spaCy docs, lang_ is the language of the document's
        # vocabulary (i.e. of the loaded pipeline), not a detected language, so with an
        # English pipeline this check presumably always passes -- confirm whether real
        # language filtering is wanted here.
        if doc.lang_ == 'en':
            #keeps the lemma of each alphabetic, non-stopword token, joined back together with spaces
            processed_text.append(
                ' '.join(token.lemma_ for token in doc if token.is_alpha and not token.is_stop)
            )
        if (count % 5000) == 0:
            print(f"{count} / {total}: {(count/total)*100}%")
    return processed_text

In [None]:
# Run the text clean-up over the labelled sentiment tweets; the result is a list of cleaned strings.
ProcessedSentimentTwtData = preprocess_text(dset=SentimentTwtData)
print('Finished')

In [None]:
# Run the text clean-up over the combined 2020 Biden/Trump tweets; the result is a list of cleaned strings.
ProcessedCombined2020Twts = preprocess_text(dset=Combined2020Twts)
print('Finished')

In [None]:
# Run the text clean-up over the 2016 election tweets; the result is a list of cleaned strings.
ProcessedElectionTwts2016 = preprocess_text(dset=ElectionTwts2016)
print('Finished')

In [None]:
#Split the data randomly, 80% train, 20% test
# The earlier version called random_split, which is not imported anywhere in this notebook.
# The split is done with pandas instead, so both halves stay DataFrames and can be
# written out with .to_csv further down.
train_size = int(.8 * len(SentimentTwtData))
test_size = len(SentimentTwtData) - train_size

# Fixed seed so a Restart & Run All reproduces the same split.
split_seed = 42
# Shuffle every row once, then cut by position; positional slicing is used so the
# split does not depend on the index labels being unique.
shuffledSentiment_dset = SentimentTwtData.sample(frac=1, random_state=split_seed)
trainSentiment_dset = shuffledSentiment_dset.iloc[:train_size]
testSentiment_dset = shuffledSentiment_dset.iloc[train_size:]

# Sanity check: the two parts have the intended sizes and cover every row.
assert len(trainSentiment_dset) == train_size
assert len(testSentiment_dset) == test_size
print(len(SentimentTwtData))

In [45]:
#Saving the files
# Every frame is written into the 'data' folder as CSV without its index column.
# The pairs are listed in the order the files are written.
for csv_name, frame in [
    ('Combined2020Twts.csv', Combined2020Twts),
    ('CleanElectionTwts2016.csv', ElectionTwts2016),
    ('TrainSentimentTwtData.csv', trainSentiment_dset),
    ('TestSentimentTwtData.csv', testSentiment_dset),
]:
    file_path = os.path.join('data', csv_name)
    frame.to_csv(file_path, index=False)