In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import re
import spacy
from sklearn.model_selection import train_test_split

In [21]:
#Data Loading 1
col_names = ['Sentiment', 'ID', 'User', 'Text']
SentimentTwtData = pd.read_csv('data/SentimentTwtTrain.csv', usecols=[0,1,4,5], encoding='latin1', names=col_names)
ElectionTwts2016 = pd.read_csv('data/2016ElectionTwts.csv', usecols=[0,1,9,11])

In [20]:
#Fixing bad data issues
chunksize = 10000
for chunk in pd.read_csv('data/2020TrumpTwts.csv', usecols=[2], dtype='str', chunksize=chunksize):
    continue
    #print(chunk.head(1))

In [22]:
#Data Loading 2
Biden2020Twts = pd.read_csv('data/2020BidenTwts.csv', usecols=[2], dtype='str')
Trump2020Twts = pd.read_csv('data/2020TrumpTwts.csv', usecols=[2], dtype='str')
for dset in [Biden2020Twts, Trump2020Twts]:
    dset.dropna(subset=['tweet'], inplace=True)
    print(dset.head(3))
ElectionTwts2016.dropna(subset=['tweet_text'], inplace=True)
print(ElectionTwts2016.head(3))
SentimentTwtData.dropna(subset=['Text'], inplace=True)
print(SentimentTwtData.head(3))

                                               tweet
0  #Elecciones2020 | En #Florida: #JoeBiden dice ...
1  #HunterBiden #HunterBidenEmails #JoeBiden #Joe...
2  @IslandGirlPRV @BradBeauregardJ @MeidasTouch T...
                                               tweet
0  #Elecciones2020 | En #Florida: #JoeBiden dice ...
1  Usa 2020, Trump contro Facebook e Twitter: cop...
2  #Trump: As a student I used to hear for years,...
      id  candidate_id lang                                         tweet_text
0  57486             3   tr  Ne farkınız var DAIŞ, El-Kaide, El Nusra, YPG ...
1  57536             3   en  @BarackObama the way you showed up in Louisian...
2  57586             3   en  White house not taking tax avoid seriously! Ap...
   Sentiment          ID             User  \
0          0  1467810369  _TheSpecialOne_   
1          0  1467810672    scotthamilton   
2          0  1467810917         mattycus   

                                                Text  
0  @switchfoot http://tw

In [23]:
#This line of code is combining the two datasets together and turning the row indicies into either Biden or Trump
Combined2020Twts = pd.concat([Biden2020Twts, Trump2020Twts], keys=['Biden', 'Trump'])
#This turns those indicies we created into a new column, the level=0 is for which set of indicies we are pulling out
Combined2020Twts.reset_index(level=0, inplace=True)
Combined2020Twts.rename(columns={'level_0': 'Candidate'}, inplace=True)
Combined2020Twts.drop_duplicates(subset=['tweet'])
print(Combined2020Twts.head())

  Candidate                                              tweet
0     Biden  #Elecciones2020 | En #Florida: #JoeBiden dice ...
1     Biden  #HunterBiden #HunterBidenEmails #JoeBiden #Joe...
2     Biden  @IslandGirlPRV @BradBeauregardJ @MeidasTouch T...
3     Biden  @chrislongview Watching and setting dvr. Let’s...
4     Biden  #censorship #HunterBiden #Biden #BidenEmails #...


In [24]:
#Drop a few more unnecessary data points and rename "text" cols
ElectionTwts2016.drop(['id','lang'], axis=1, inplace=True)
SentimentTwtData.drop(['ID','User'], axis=1, inplace=True)
ElectionTwts2016.rename(columns={'tweet_text': 'Text'}, inplace=True)
Combined2020Twts.rename(columns={'tweet': 'Text'}, inplace=True)
print(ElectionTwts2016.head(3))
print(SentimentTwtData.head(3))

   candidate_id                                               Text
0             3  Ne farkınız var DAIŞ, El-Kaide, El Nusra, YPG ...
1             3  @BarackObama the way you showed up in Louisian...
2             3  White house not taking tax avoid seriously! Ap...
   Sentiment                                               Text
0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1          0  is upset that he can't update his Facebook by ...
2          0  @Kenichan I dived many times for the ball. Man...


In [27]:
#Processes the data
nlp = spacy.load('en_core_web_sm')
def preprocess_text(dset):
    total = len(dset['Text'])
    count = 0
    processed_text = []
    for text in dset['Text'].values: #loops through all values in 'Text' col
        count+=1
        #Replaces @usernames with "name", Cleans text of HTMl tags, URLS, removes extra spaces, and makes the text lowercase
        text = re.sub(r'@\w+', 'username ', text)
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = text.lower()
        
        result = [token.lemma_ for token in nlp(text) if token.is_alpha and not token.is_stop]
        processed_text.append(' '.join(result))
        if (count % 10000) == 0:
            print(text)
            print(' '.join(result))
            print(f"{count} / {total}: {(count/total)*100}%\n")
    return processed_text

In [29]:
ProcessedSentimentTwtData = preprocess_text(SentimentTwtData)
print('----------------Finished----------------')

its another rainboot day 
rainboot day
10000 / 1600000: 0.625%

i threw my sign at donnie and he bent over to get it but it was under a thingee so he made a sad face at me 
throw sign donnie bend thingee sad face
20000 / 1600000: 1.25%

nooooooooooooooo!!!!!! school today. but the worst part is that i wont be able to tweet troughout the day 
nooooooooooooooo school today bad will not able tweet troughout day
30000 / 1600000: 1.875%

allergies or insomnia? doesn't matter the reason 
allergy insomnia matter reason
40000 / 1600000: 2.5%

i just jacked up this umbrella cake 
jack umbrella cake
50000 / 1600000: 3.125%

i'm breaking out. 
break
60000 / 1600000: 3.75%

username  buy it and bring it up here i would love a turtle! had one and had to give it away 
username buy bring love turtle away
70000 / 1600000: 4.375%

i hate car washing 
hate car washing
80000 / 1600000: 5.0%

its mothers day.  hopefully i wont remember later what today is.
mother day hopefully will not remember later toda

In [30]:
#Save Sentiment data
ProcessedSentimentTwtDataD = pd.DataFrame(ProcessedSentimentTwtData, columns=['Text'])
ComboProcessedSentimentTwtDataD = pd.concat([pd.DataFrame(SentimentTwtData['Sentiment'], columns=['Sentiment']), ProcessedSentimentTwtDataD], axis=1)
file_path = os.path.join('data', 'ProcessedSentimentTwtData.csv')
ComboProcessedSentimentTwtDataD.to_csv(file_path, index=False)

In [36]:
ProcessedCombined2020Twts = preprocess_text(Combined2020Twts)
print('Finished')

5000 / 1858241: 0.2690716650854222%
10000 / 1858241: 0.5381433301708444%
15000 / 1858241: 0.8072149952562665%
20000 / 1858241: 1.0762866603416887%
25000 / 1858241: 1.3453583254271109%
30000 / 1858241: 1.614429990512533%
35000 / 1858241: 1.8835016555979553%
40000 / 1858241: 2.1525733206833775%
45000 / 1858241: 2.4216449857687996%
50000 / 1858241: 2.6907166508542217%
55000 / 1858241: 2.959788315939644%
60000 / 1858241: 3.228859981025066%
65000 / 1858241: 3.4979316461104886%
70000 / 1858241: 3.7670033111959107%
75000 / 1858241: 4.036074976281332%
80000 / 1858241: 4.305146641366755%
85000 / 1858241: 4.574218306452177%
90000 / 1858241: 4.843289971537599%
95000 / 1858241: 5.112361636623022%
100000 / 1858241: 5.3814333017084435%
105000 / 1858241: 5.650504966793866%
110000 / 1858241: 5.919576631879288%
115000 / 1858241: 6.188648296964709%
120000 / 1858241: 6.457719962050132%
125000 / 1858241: 6.726791627135555%
130000 / 1858241: 6.995863292220977%
135000 / 1858241: 7.264934957306399%
140000 / 

In [39]:
ProcessedElectionTwts2016 = preprocess_text(ElectionTwts2016)
print('Finished')

5000 / 55393: 9.026411279403534%
10000 / 55393: 18.05282255880707%
15000 / 55393: 27.079233838210605%
20000 / 55393: 36.10564511761414%
25000 / 55393: 45.13205639701767%
30000 / 55393: 54.15846767642121%
35000 / 55393: 63.18487895582474%
40000 / 55393: 72.21129023522828%
45000 / 55393: 81.23770151463181%
50000 / 55393: 90.26411279403534%
55000 / 55393: 99.29052407343887%
Finished


In [37]:
#Save 2020 Election data
ProcessedCombined2020TwtsD = pd.DataFrame(ProcessedCombined2020Twts, columns=['Text'])
file_path = os.path.join('data', 'ProcessedCombined2020Twts.csv')
ProcessedCombined2020TwtsD.to_csv(file_path, index=False)

In [40]:
#Save 2016 Election data
ProcessedElectionTwts2016D = pd.DataFrame(ProcessedElectionTwts2016, columns=['Text'])
file_path = os.path.join('data', 'ProcessedElectionTwts2016.csv')
ProcessedElectionTwts2016D.to_csv(file_path, index=False)

In [2]:
#Re-import Sentiment data
SentimentData = pd.read_csv('data/ProcessedSentimentTwtData.csv')
SentimentData.dropna(subset=['Text'], inplace=True)
train_sentiment, test_sentiment = train_test_split(SentimentData, test_size=0.2)
file_path = os.path.join('data', 'FinalTrainSentimentData.csv')
train_sentiment.to_csv(file_path, index=False)
file_path = os.path.join('data', 'FinalTestSentimentData.csv')
test_sentiment.to_csv(file_path, index=False)