### TRAINING BAYES MODEL WITH CODE FROM THE CLASS

In [4]:
# This snippet downloads the most popular datasets for experimenting with NLTK functionalities.
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     C:\Users

True

In [80]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


# A function that extracts which words exist in a text based on a list of words to which we compare.
def word_feats(words):
        return dict([(word, True) for word in words])

# Get the negative reviews for movies    
negids = movie_reviews.fileids('neg')

# Get the positive reviews for movies
posids = movie_reviews.fileids('pos')
 
# Find the features that most correspond to negative reviews    
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]

# Find the features that most correspond to positive reviews
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# We would only use 1500 instances to train on. The quarter of the reviews left is for testing purposes.
negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)

In [85]:
negfeats[1]

({'the': True,
  'happy': True,
  'bastard': True,
  "'": True,
  's': True,
  'quick': True,
  'movie': True,
  'review': True,
  'damn': True,
  'that': True,
  'y2k': True,
  'bug': True,
  '.': True,
  'it': True,
  'got': True,
  'a': True,
  'head': True,
  'start': True,
  'in': True,
  'this': True,
  'starring': True,
  'jamie': True,
  'lee': True,
  'curtis': True,
  'and': True,
  'another': True,
  'baldwin': True,
  'brother': True,
  '(': True,
  'william': True,
  'time': True,
  ')': True,
  'story': True,
  'regarding': True,
  'crew': True,
  'of': True,
  'tugboat': True,
  'comes': True,
  'across': True,
  'deserted': True,
  'russian': True,
  'tech': True,
  'ship': True,
  'has': True,
  'strangeness': True,
  'to': True,
  'when': True,
  'they': True,
  'kick': True,
  'power': True,
  'back': True,
  'on': True,
  'little': True,
  'do': True,
  'know': True,
  'within': True,
  'going': True,
  'for': True,
  'gore': True,
  'bringing': True,
  'few': True,

In [6]:
# Construct the training dataset containing 50% positive reviews and 50% negative reviews
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Construct the negative dataset containing 50% positive reviews and 50% negative reviews
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

print ('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Train a NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(trainfeats)

# Test the trained classifier and display the most informative features.
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

train on 1500 instances, test on 500 instances
accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


In [7]:
print (classifier.show_most_informative_features(32))

Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
               affecting = True              pos : neg    =      9.7 : 1.0
                  symbol = True              pos : neg    =      9.7 : 1.0
                   mulan = True              pos : neg    =      9.0 : 1.0

In [8]:
trainfeats[1]

({'the': True,
  'happy': True,
  'bastard': True,
  "'": True,
  's': True,
  'quick': True,
  'movie': True,
  'review': True,
  'damn': True,
  'that': True,
  'y2k': True,
  'bug': True,
  '.': True,
  'it': True,
  'got': True,
  'a': True,
  'head': True,
  'start': True,
  'in': True,
  'this': True,
  'starring': True,
  'jamie': True,
  'lee': True,
  'curtis': True,
  'and': True,
  'another': True,
  'baldwin': True,
  'brother': True,
  '(': True,
  'william': True,
  'time': True,
  ')': True,
  'story': True,
  'regarding': True,
  'crew': True,
  'of': True,
  'tugboat': True,
  'comes': True,
  'across': True,
  'deserted': True,
  'russian': True,
  'tech': True,
  'ship': True,
  'has': True,
  'strangeness': True,
  'to': True,
  'when': True,
  'they': True,
  'kick': True,
  'power': True,
  'back': True,
  'on': True,
  'little': True,
  'do': True,
  'know': True,
  'within': True,
  'going': True,
  'for': True,
  'gore': True,
  'bringing': True,
  'few': True,

In [9]:
# Construct our features based on which tweets contain which word
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in movie_reviews.words():
        features['%s' % word] = (word in document_words)
    return features
 
    
def word_feats(words):
        return dict([(word, True) for word in words])
    

In [10]:
tweet = 'Larry is my friend'
word_feats(tweet.split())

{'Larry': True, 'is': True, 'my': True, 'friend': True}

In [11]:
# The tweet we are about to classify
tweet = 'Anna is one of my friends'
print (classifier.classify(word_feats(tweet.split())))

pos


### TEST THE MODEL WITH MANUALLY LABELLED DATA

Testing accuracy for hand labelled data from twitter elections, hand labelling was perform randomly on a selected data as taking them with a time order would be biased due to the events from the elections

In [19]:
import pandas as pd
from datetime import datetime
from tqdm import tqdm

test_politics_tweets = pd.read_excel("random_tweets_as_a_test_labeled.xlsx")
test_politics_tweets

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,184569,@johnand2015 @wme98 @TrumpDynastyUSA @realDona...,pos
1,602960,@realDonaldTrump AMERICANS WANT A CHANGE,pos
2,412384,"@realDonaldTrump is all in, I'm all in. Histor...",pos
3,423577,@JoeFreedomLove @FreeBeacon \nThat's Because T...,neg
4,647595,Not pneumonia symptoms: Med guideline requires...,neg
...,...,...,...
295,495873,@USA_FREEDOM_NOW @klmstlouis @realDonaldTrump ...,neg
296,615525,"#Hillary is corrupting, lying to our young peo...",neg
297,325430,@andylefko @KeithOlbermann @realDonaldTrump @S...,neg
298,460252,@amjoyshow #AMJoy #Trump is ripping off part o...,neg


Data cleaning

In [20]:
import numpy as np
import pandas as pd
import re
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import demoji
from bs4 import BeautifulSoup
demoji.download_codes()

# Initializes the spell checker, tokenizer and lammatizer.
check = SpellChecker()
tokenizer = RegexpTokenizer(r'\w+')
lemma = WordNetLemmatizer()

# Create a set of stopwords
stop_words = set(stopwords.words('english'))



# Cleaning functions

########################### FUNCTIONS ###################################
def correct_text(text):
# text needs to be a list of clean word tokens without other characters.
    misspelled = check.unknown(text)
    for word in misspelled:
        text[text.index(word)] = check.correction(word)
    return text

def lemmatize_text(text):
    return [lemma.lemmatize(word) for word in text]

# removing punctuation
def rm_punctuation(record: str) -> str:
    s = re.sub(r'[^\w\s]', '', record)
    return s


# removing stopwords - i.e. the, a, an, he\
def rm_stopwords(record: str) -> str:
    words = list(record.split(' '))
    filtered_sentence = ' '.join([w for w in words if not w in stop_words])
    return filtered_sentence

# removeing emoji
def rm_emoji(record: str) -> str:
    plain = demoji.replace(record, " ")
    return plain

# Removing html coding
def rm_html(record: str) -> str:
    soup = BeautifulSoup(record, 'html.parser')
    plain = soup.get_text()
    return plain


# A function that extracts which words exist in a text based on a list of words to which we compare.
def word_feats(words):
    return dict([(word, True) for word in words])

Downloading emoji data ...
... OK (Got response in 0.55 seconds)
Writing emoji data to C:\Users\PC\.demoji\codes.json ...
... OK


In [21]:
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['text'].apply(lambda x: re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "", x))
print("Regex done at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: re.sub(r'[0-9]+', ' ', x))
print("Numbers replaced at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: re.sub(r'\_', ' ', x))
print("Floors replaced at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: re.sub(r'@|#', ' ', x))
print("Hashtags done at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: x.lower())
print("Lowered at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: rm_punctuation(x))
print("Punctuation done at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: rm_stopwords(x))
print("Stop words bye at:", datetime.now())
test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: x.rstrip())
print("Whitespaces done at:", datetime.now())
#test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: tokenizer.tokenize(x))
#print("Tokenized at:", datetime.now())
#test_politics_tweets.loc[:, 'normalized'] = test_politics_tweets['normalized'].apply(lambda x: lemmatize_text(x))
#print("Lemmatized at:", datetime.now())

Regex done at: 2020-09-24 14:35:27.413998
Numbers replaced at: 2020-09-24 14:35:27.428987
Floors replaced at: 2020-09-24 14:35:27.441980
Hashtags done at: 2020-09-24 14:35:27.452977
Lowered at: 2020-09-24 14:35:27.466968
Punctuation done at: 2020-09-24 14:35:27.508943
Stop words bye at: 2020-09-24 14:35:27.535926
Whitespaces done at: 2020-09-24 14:35:27.546921


In [22]:
from tqdm import tqdm
tqdm.pandas()

test_politics_tweets.loc[:, 'features'] = test_politics_tweets.progress_apply(lambda x: (word_feats(x.normalized.split()), x. sentiment), axis = 1)
test_politics_tweets['features']

100%|██████████| 300/300 [00:00<00:00, 8828.50it/s]


0      ({'johnand': True, 'wme': True, 'trumpdynastyu...
1      ({'realdonaldtrump': True, 'americans': True, ...
2      ({'realdonaldtrump': True, 'im': True, 'histor...
3      ({'joefreedomlove': True, 'freebeacon': True, ...
4      ({'pneumonia': True, 'symptoms': True, 'med': ...
                             ...                        
295    ({'usa': True, 'freedom': True, 'klmstlouis': ...
296    ({'hillary': True, 'corrupting': True, 'lying'...
297    ({'andylefko': True, 'keitholbermann': True, '...
298    ({'amjoyshow': True, 'amjoy': True, 'trump': T...
299    ({'marxist': True, 'speak': True, 'village': T...
Name: features, Length: 300, dtype: object

In [29]:
test_politics_tweets['features'][1]

({'realdonaldtrump': True, 'americans': True, 'want': True, 'change': True},
 'pos')

In [30]:
# Select negative tweets marked by SentiStrenght  
negfeats = test_politics_tweets[test_politics_tweets['sentiment'] == 'neg']

# Select positive tweets marked by SentiStrenght
posfeats = test_politics_tweets[test_politics_tweets['sentiment'] == 'pos']

# We would only use 1500 instances to train on. The quarter of the reviews left is for testing purposes.
negcutoff = int(len(negfeats)*3/4)
poscutoff = int(len(posfeats)*3/4)

#negcutoff = 55
#poscutoff = 55

In [31]:
# Construct the negative dataset containing 50% positive reviews and 50% negative reviews
testfeats = pd.concat([negfeats[negcutoff:], posfeats[poscutoff:]])


In [55]:
# Test the trained classifier and display the most informative features.
print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats['features'].tolist()))

accuracy: 0.5352112676056338
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
               affecting = True              pos : neg    =      9.7 : 1.0
                  symbol = True              pos : neg    =      9.7 : 1.0
                   mulan = True              

Assigning results to the tweets:

In [56]:
# The tweet we are about to classify
tweet = 'I like peanut butter idiots'
print (classifier.classify(word_feats(tweet.split())))

neg


Not quite good classificator I would say

### ASSIGNING SENTIMENT TO TWEETS 

Now we will assign sentiment to the tweets

In [71]:
import pandas as pd
data = pd.read_csv("cleaned_twitter_data_50000_sample.csv")
data

Unnamed: 0.1,Unnamed: 0,initial_index,tweet_created_at,user_created_at,text,user_id,user_name,followers_count,friends_count,user_lang,place_type,place_full_name,place_bounding_box,country,tweet_lang,retweet_count,favorite_count,states,candidate
0,439298,560631,2016-09-08 13:56:16+00:00,Sat May 16 17:31:28 +0000 2009,"['hillaryclinton', 'hit', 'one', 'reporter']",40504632,joey gerdin,542,596,en,city,"Minneapolis, MN","{'type': 'Polygon', 'coordinates': [[[-93.3295...",United States,en,0,0,Minnesota,clinton
1,326531,417258,2016-09-01 09:31:56+00:00,Tue Oct 26 17:08:21 +0000 2010,"['hillaryclinton', 'false', 'said', 'first', '...",208110094,M_Mexico_Great_Again,643,523,en,city,"Ferry Pass, FL","{'type': 'Polygon', 'coordinates': [[[-87.2476...",United States,en,0,0,Florida,clinton
2,313954,400256,2016-08-31 23:16:48+00:00,Wed Dec 14 05:41:00 +0000 2011,"['hillaryclinton', 'blew', 'invitation', 'pres...",436442356,Chris Harms,522,492,en,admin,"Wisconsin, USA","{'type': 'Polygon', 'coordinates': [[[-92.8894...",United States,en,0,0,Wisconsin,clinton
3,450210,573828,2016-09-08 23:21:16+00:00,Mon Aug 08 21:35:38 +0000 2016,"['usaneedstrump', 'hillaryclinton', 'dailycall...",762764227052392449,FrankB,5,72,en,admin,"Pennsylvania, USA","{'type': 'Polygon', 'coordinates': [[[-80.5198...",United States,en,0,0,Pennsylvania,clinton
4,182803,230748,2016-08-23 17:40:01+00:00,Wed Feb 17 16:55:49 +0000 2010,"['trump', 'tempcrab', 'orchard', 'ky', 'f', 'w...",115110145,Carl King,84,18,en,admin,"Kentucky, USA","{'type': 'Polygon', 'coordinates': [[[-89.5715...",United States,en,0,0,Kentucky,trump
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,109611,139329,2016-08-19 00:02:43+00:00,Sat Feb 11 23:08:46 +0000 2012,"['hillaryiscoming', 'ivanroberson', 'kickstart...",489825874,Kal AKA,495,990,en,city,"Los Angeles, CA","{'type': 'Polygon', 'coordinates': [[[-118.668...",United States,en,0,0,California,clinton
49996,224364,282945,2016-08-26 00:43:52+00:00,Fri Mar 05 18:24:11 +0000 2010,"['id', 'like', 'ban', 'trump', 'word', 'harden...",120182490,jenny speier,254,193,en,admin,"Indiana, USA","{'type': 'Polygon', 'coordinates': [[[-88.0978...",United States,en,0,0,Indiana,trump
49997,402684,515797,2016-09-06 18:27:25+00:00,Sat Aug 06 22:45:14 +0000 2011,"['obama', 'clinton', 'created', 'world', 'powe...",349921132,Michael Wilner,10443,661,en,city,"Washington, DC","{'type': 'Polygon', 'coordinates': [[[-77.1194...",United States,en,0,0,Washington,trump
49998,234923,296196,2016-08-26 15:58:23+00:00,Mon Jul 06 02:10:25 +0000 2015,"['hillaryclinton', 'keep', 'america', 'sane', ...",3269554988,cjlamb,1817,1886,en,admin,"Indiana, USA","{'type': 'Polygon', 'coordinates': [[[-88.0978...",United States,en,0,0,Indiana,clinton


In [72]:
data_before_processing = pd.read_csv('tweets_only_eng_and_US_without_cleaning_2020-09-24.csv')
data_before_processing = data_before_processing.rename(columns = {"text":"initial_tweet_text", "Unnamed: 0":"initial_index"})
data_before_processing

Unnamed: 0,initial_index,tweet_created_at,user_created_at,initial_tweet_text,user_id,user_name,followers_count,friends_count,user_lang,place_type,place_full_name,place_bounding_box,country,tweet_lang,retweet_count,favorite_count
0,1,2016-08-12 10:04:02+00:00,Thu Oct 15 00:28:04 +0000 2009,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN CO...,82496193,Red Octopus,531,677,en,city,"Baton Rouge, LA","{'type': 'Polygon', 'coordinates': [[[-91.2189...",United States,en,0,0
1,4,2016-08-12 10:04:30+00:00,Mon Aug 20 09:43:48 +0000 2012,#CNN #newday clear #Trump deliberately throwin...,769208504,Beverly Spence,2652,2976,en,city,"Baltimore, MD","{'type': 'Polygon', 'coordinates': [[[-76.7115...",United States,en,0,0
2,5,2016-08-12 10:04:46+00:00,Tue May 19 03:18:19 +0000 2009,"@realDonaldTrump, you wouldn't recognize a lie...",41043316,"Asa DeMatteo, Ph.D.",183,98,en,city,"Palm Springs, CA","{'type': 'Polygon', 'coordinates': [[[-116.567...",United States,en,0,0
3,7,2016-08-12 10:04:48+00:00,Sun Aug 07 00:57:29 +0000 2016,"""Kid, you know, suing someone? Thats the most ...",762090248159371264,Rafael Alejandro,159,993,en,city,"Secaucus, NJ","{'type': 'Polygon', 'coordinates': [[[-74.0938...",United States,en,0,0
4,8,2016-08-12 10:04:48+00:00,Wed Oct 28 18:34:22 +0000 2009,@HillaryClinton you ARE the co-founder of ISIS...,85879639,tom b,68,268,en,city,"Irving, TX","{'type': 'Polygon', 'coordinates': [[[-97.0341...",United States,en,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517133,657300,2016-09-12 13:20:28+00:00,Sun Jul 10 16:13:13 +0000 2011,@CNBC @SquawkAlley @realDonaldTrump Kudlow is ...,332888709,Linda Mannes Wildes,2097,2716,en,admin,"Florida, USA","{'type': 'Polygon', 'coordinates': [[[-87.6346...",United States,en,0,0
517134,657302,2016-09-12 13:20:32+00:00,Wed May 02 22:48:50 +0000 2012,"TRUMP U, TAXES ,WEIRD MEDICAL REPORT WITH A WH...",569487350,Sheryl Berghoff,1997,2599,en,city,"San Diego, CA","{'type': 'Polygon', 'coordinates': [[[-117.282...",United States,en,0,0
517135,657303,2016-09-12 13:20:33+00:00,Fri Apr 01 21:42:36 +0000 2016,@CarolCNN if MSM were honest watch any utube v...,716017946166857728,Karen B,756,1229,en,city,"Coral Gables, FL","{'type': 'Polygon', 'coordinates': [[[-80.2971...",United States,en,0,0
517136,657305,2016-09-12 13:20:38+00:00,Wed Jun 10 08:43:55 +0000 2015,It's interesting that Hillary Clinton's crowds...,3241116564,Robert Chaffin,2827,3970,en,city,"Fairbanks, AK","{'type': 'Polygon', 'coordinates': [[[-147.813...",United States,en,0,0


In [73]:
data_before_processing = data_before_processing[['initial_index','initial_tweet_text']]
data_before_processing = data_before_processing.rename(columns = {"text":"initial_tweet_text"})
data = pd.merge(left=data, right=data_before_processing, how='left', left_on='initial_index', right_on='initial_index')
data.head()

Unnamed: 0.1,Unnamed: 0,initial_index,tweet_created_at,user_created_at,text,user_id,user_name,followers_count,friends_count,user_lang,place_type,place_full_name,place_bounding_box,country,tweet_lang,retweet_count,favorite_count,states,candidate,initial_tweet_text
0,439298,560631,2016-09-08 13:56:16+00:00,Sat May 16 17:31:28 +0000 2009,"['hillaryclinton', 'hit', 'one', 'reporter']",40504632,joey gerdin,542,596,en,city,"Minneapolis, MN","{'type': 'Polygon', 'coordinates': [[[-93.3295...",United States,en,0,0,Minnesota,clinton,Did @HillaryClinton just hit on one of the rep...
1,326531,417258,2016-09-01 09:31:56+00:00,Tue Oct 26 17:08:21 +0000 2010,"['hillaryclinton', 'false', 'said', 'first', '...",208110094,M_Mexico_Great_Again,643,523,en,city,"Ferry Pass, FL","{'type': 'Polygon', 'coordinates': [[[-87.2476...",United States,en,0,0,Florida,clinton,@HillaryClinton false. He said first 2 years o...
2,313954,400256,2016-08-31 23:16:48+00:00,Wed Dec 14 05:41:00 +0000 2011,"['hillaryclinton', 'blew', 'invitation', 'pres...",436442356,Chris Harms,522,492,en,admin,"Wisconsin, USA","{'type': 'Polygon', 'coordinates': [[[-92.8894...",United States,en,0,0,Wisconsin,clinton,.@HillaryClinton blew off an invitation by the...
3,450210,573828,2016-09-08 23:21:16+00:00,Mon Aug 08 21:35:38 +0000 2016,"['usaneedstrump', 'hillaryclinton', 'dailycall...",762764227052392449,FrankB,5,72,en,admin,"Pennsylvania, USA","{'type': 'Polygon', 'coordinates': [[[-80.5198...",United States,en,0,0,Pennsylvania,clinton,@USAneedsTRUMP @HillaryClinton @DailyCaller Ha...
4,182803,230748,2016-08-23 17:40:01+00:00,Wed Feb 17 16:55:49 +0000 2010,"['trump', 'tempcrab', 'orchard', 'ky', 'f', 'w...",115110145,Carl King,84,18,en,admin,"Kentucky, USA","{'type': 'Polygon', 'coordinates': [[[-89.5715...",United States,en,0,0,Kentucky,trump,"#Trump in 2016 Temp:Crab Orchard, Ky."":83.1°F ..."


In [74]:
data.to_csv("cleaned_twitter_data_50000_sample_with_original_text_20200924.csv")

In [75]:
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()

data.loc[:, 'cleaned_for_bayes'] = data['initial_tweet_text'].apply(lambda x: re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', "", x))
print("Regex done at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].progress_apply(lambda x: rm_html(x))
print("XTML decoding deleted at:", datetime.now())
#data.loc[:, 'normalized'] = data['normalized'].apply(lambda x: rm_newline(x))
#print("Newlines cleaned at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: re.sub(r'@|#', ' ', x))
print("Hashtags done at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: re.sub(r'[0-9]+', ' ', x))
print("Numbers replaced at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: re.sub(r'\_', ' ', x))
print("Numbers replaced at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: rm_punctuation(x))
print("Punctuation done at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: rm_stopwords(x))
print("Stop words bye at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: x.rstrip())
print("Whitespaces done at:", datetime.now())
data.loc[:, 'cleaned_for_bayes'] = data['cleaned_for_bayes'].apply(lambda x: x.lower())
print("Lowered at:", datetime.now())

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Regex done at: 2020-09-24 14:49:43.477986


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))



XTML decoding deleted at: 2020-09-24 14:49:57.394849
Hashtags done at: 2020-09-24 14:49:57.722339
Numbers replaced at: 2020-09-24 14:49:58.077139
Numbers replaced at: 2020-09-24 14:49:58.227051
Punctuation done at: 2020-09-24 14:49:58.605835
Stop words bye at: 2020-09-24 14:49:59.079551
Whitespaces done at: 2020-09-24 14:49:59.139518
Lowered at: 2020-09-24 14:49:59.204482


Now we perform all the same pipeline as for test data

In [76]:
tweet = data['cleaned_for_bayes'][1]
print (classifier.classify(word_feats(tweet.split())))
print(tweet)

pos
 hillaryclinton false he said first   years criminals then new flow stoppep drugs carteld evaluate


In [77]:
word_feats(tweet.split())

{'hillaryclinton': True,
 'false': True,
 'he': True,
 'said': True,
 'first': True,
 'years': True,
 'criminals': True,
 'then': True,
 'new': True,
 'flow': True,
 'stoppep': True,
 'drugs': True,
 'carteld': True,
 'evaluate': True}

In [88]:
tweet = data['text'][12]
tweet = '@oybay @BillKristol @Sarahlellison @Evan_McMullin @TheTakeaway   Dont give IDIOT any ideas!  MorallyUnfit MentallyUnstable. NeverTrump'
print (classifier.classify(word_feats(tweet.split())))
print(tweet)

neg
@oybay @BillKristol @Sarahlellison @Evan_McMullin @TheTakeaway   Dont give IDIOT any ideas!  MorallyUnfit MentallyUnstable. NeverTrump


In [89]:
data.loc[:, 'bayes_sentiment'] = data['cleaned_for_bayes'].progress_apply(lambda x: classifier.classify(word_feats(x.split())))
print("Adding sentiment done at:", datetime.now())
data

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))


Adding sentiment done at: 2020-09-24 15:03:43.093041


Unnamed: 0.1,Unnamed: 0,initial_index,tweet_created_at,user_created_at,text,user_id,user_name,followers_count,friends_count,user_lang,...,place_bounding_box,country,tweet_lang,retweet_count,favorite_count,states,candidate,initial_tweet_text,cleaned_for_bayes,bayes_sentiment
0,439298,560631,2016-09-08 13:56:16+00:00,Sat May 16 17:31:28 +0000 2009,"['hillaryclinton', 'hit', 'one', 'reporter']",40504632,joey gerdin,542,596,en,...,"{'type': 'Polygon', 'coordinates': [[[-93.3295...",United States,en,0,0,Minnesota,clinton,Did @HillaryClinton just hit on one of the rep...,did hillaryclinton hit one reporters,pos
1,326531,417258,2016-09-01 09:31:56+00:00,Tue Oct 26 17:08:21 +0000 2010,"['hillaryclinton', 'false', 'said', 'first', '...",208110094,M_Mexico_Great_Again,643,523,en,...,"{'type': 'Polygon', 'coordinates': [[[-87.2476...",United States,en,0,0,Florida,clinton,@HillaryClinton false. He said first 2 years o...,hillaryclinton false he said first years cr...,pos
2,313954,400256,2016-08-31 23:16:48+00:00,Wed Dec 14 05:41:00 +0000 2011,"['hillaryclinton', 'blew', 'invitation', 'pres...",436442356,Chris Harms,522,492,en,...,"{'type': 'Polygon', 'coordinates': [[[-92.8894...",United States,en,0,0,Wisconsin,clinton,.@HillaryClinton blew off an invitation by the...,hillaryclinton blew invitation president mexi...,pos
3,450210,573828,2016-09-08 23:21:16+00:00,Mon Aug 08 21:35:38 +0000 2016,"['usaneedstrump', 'hillaryclinton', 'dailycall...",762764227052392449,FrankB,5,72,en,...,"{'type': 'Polygon', 'coordinates': [[[-80.5198...",United States,en,0,0,Pennsylvania,clinton,@USAneedsTRUMP @HillaryClinton @DailyCaller Ha...,usaneedstrump hillaryclinton dailycaller ha...,pos
4,182803,230748,2016-08-23 17:40:01+00:00,Wed Feb 17 16:55:49 +0000 2010,"['trump', 'tempcrab', 'orchard', 'ky', 'f', 'w...",115110145,Carl King,84,18,en,...,"{'type': 'Polygon', 'coordinates': [[[-89.5715...",United States,en,0,0,Kentucky,trump,"#Trump in 2016 Temp:Crab Orchard, Ky."":83.1°F ...",trump tempcrab orchard ky f wind mph pres...,pos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,109611,139329,2016-08-19 00:02:43+00:00,Sat Feb 11 23:08:46 +0000 2012,"['hillaryiscoming', 'ivanroberson', 'kickstart...",489825874,Kal AKA,495,990,en,...,"{'type': 'Polygon', 'coordinates': [[[-118.668...",United States,en,0,0,California,clinton,@HillaryIsComing @Ivanroberson #Kickstarter ?#...,hillaryiscoming ivanroberson kickstarter t...,neg
49996,224364,282945,2016-08-26 00:43:52+00:00,Fri Mar 05 18:24:11 +0000 2010,"['id', 'like', 'ban', 'trump', 'word', 'harden...",120182490,jenny speier,254,193,en,...,"{'type': 'Polygon', 'coordinates': [[[-88.0978...",United States,en,0,0,Indiana,trump,I'd like to ban having #trump &amp; the word #...,id like ban trump word hardening sentence f...,neg
49997,402684,515797,2016-09-06 18:27:25+00:00,Sat Aug 06 22:45:14 +0000 2011,"['obama', 'clinton', 'created', 'world', 'powe...",349921132,Michael Wilner,10443,661,en,...,"{'type': 'Polygon', 'coordinates': [[[-77.1194...",United States,en,0,0,Washington,trump,"#Obama and #Clinton ""have created what will be...",obama clinton created world power trump say...,pos
49998,234923,296196,2016-08-26 15:58:23+00:00,Mon Jul 06 02:10:25 +0000 2015,"['hillaryclinton', 'keep', 'america', 'sane', ...",3269554988,cjlamb,1817,1886,en,...,"{'type': 'Polygon', 'coordinates': [[[-88.0978...",United States,en,0,0,Indiana,clinton,@HillaryClinton \nKeep America Sane:\nVote Cli...,hillaryclinton \nkeep america sane\nvote clin...,neg


In [90]:
data[data['bayes_sentiment']=='pos']

Unnamed: 0.1,Unnamed: 0,initial_index,tweet_created_at,user_created_at,text,user_id,user_name,followers_count,friends_count,user_lang,...,place_bounding_box,country,tweet_lang,retweet_count,favorite_count,states,candidate,initial_tweet_text,cleaned_for_bayes,bayes_sentiment
0,439298,560631,2016-09-08 13:56:16+00:00,Sat May 16 17:31:28 +0000 2009,"['hillaryclinton', 'hit', 'one', 'reporter']",40504632,joey gerdin,542,596,en,...,"{'type': 'Polygon', 'coordinates': [[[-93.3295...",United States,en,0,0,Minnesota,clinton,Did @HillaryClinton just hit on one of the rep...,did hillaryclinton hit one reporters,pos
1,326531,417258,2016-09-01 09:31:56+00:00,Tue Oct 26 17:08:21 +0000 2010,"['hillaryclinton', 'false', 'said', 'first', '...",208110094,M_Mexico_Great_Again,643,523,en,...,"{'type': 'Polygon', 'coordinates': [[[-87.2476...",United States,en,0,0,Florida,clinton,@HillaryClinton false. He said first 2 years o...,hillaryclinton false he said first years cr...,pos
2,313954,400256,2016-08-31 23:16:48+00:00,Wed Dec 14 05:41:00 +0000 2011,"['hillaryclinton', 'blew', 'invitation', 'pres...",436442356,Chris Harms,522,492,en,...,"{'type': 'Polygon', 'coordinates': [[[-92.8894...",United States,en,0,0,Wisconsin,clinton,.@HillaryClinton blew off an invitation by the...,hillaryclinton blew invitation president mexi...,pos
3,450210,573828,2016-09-08 23:21:16+00:00,Mon Aug 08 21:35:38 +0000 2016,"['usaneedstrump', 'hillaryclinton', 'dailycall...",762764227052392449,FrankB,5,72,en,...,"{'type': 'Polygon', 'coordinates': [[[-80.5198...",United States,en,0,0,Pennsylvania,clinton,@USAneedsTRUMP @HillaryClinton @DailyCaller Ha...,usaneedstrump hillaryclinton dailycaller ha...,pos
4,182803,230748,2016-08-23 17:40:01+00:00,Wed Feb 17 16:55:49 +0000 2010,"['trump', 'tempcrab', 'orchard', 'ky', 'f', 'w...",115110145,Carl King,84,18,en,...,"{'type': 'Polygon', 'coordinates': [[[-89.5715...",United States,en,0,0,Kentucky,trump,"#Trump in 2016 Temp:Crab Orchard, Ky."":83.1°F ...",trump tempcrab orchard ky f wind mph pres...,pos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49988,33069,42003,2016-08-14 10:19:53+00:00,Sun Apr 19 21:21:38 +0000 2009,"['az', 'valentine', 'bringbackjobs', 'savechri...",33315551,Robert Witcher,887,1100,en,...,"{'type': 'Polygon', 'coordinates': [[[-85.6051...",United States,en,0,0,Georgia,trump,@az_valentine @BringBackJobs @savechristians t...,az valentine bringbackjobs savechristians t...,pos
49990,204829,258459,2016-08-25 02:01:50+00:00,Thu Jul 07 00:11:14 +0000 2011,"['politico', 'realdonaldtrump', 'nigel', 'fara...",330676972,Greg Pittman,48,210,en,...,"{'type': 'Polygon', 'coordinates': [[[-80.2081...",United States,en,0,0,Florida,trump,@politico @realDonaldTrump @Nigel_Farage trump...,politico realdonaldtrump nigel farage trump...,pos
49991,412130,527541,2016-09-07 04:09:46+00:00,Sun Mar 10 01:15:32 +0000 2013,"['megynkelly', 'realdonaldtrump', 'ap', 'hilla...",1255784834,jojo capece,316,72,en,...,"{'type': 'Polygon', 'coordinates': [[[-122.514...",United States,en,0,0,California,trump,@megynkelly @realDonaldTrump @AP HILLARY HAS C...,megynkelly realdonaldtrump ap hillary has c...,pos
49994,403008,516195,2016-09-06 18:42:23+00:00,Sun Jan 17 00:26:30 +0000 2010,"['based', 'cnn', 'special', 'last', 'night', '...",105635023,Tabatha Presley,210,656,en,...,"{'type': 'Polygon', 'coordinates': [[[-84.2027...",United States,en,0,0,Georgia,trump,Based on the @CNN special last night on @realD...,based cnn special last night realdonaldtrump...,pos


Saving sentiment data from Naive

In [91]:
data.to_csv("tweets_50k_with_bayes_sentiment_20200924.csv")