In [1]:
# Import dependencies
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# Importing the test database
twitter_df = pd.read_csv("../../res/initial_dataset.csv")
twitter_df

Unnamed: 0.1,Unnamed: 0,tweet_id,full_text,sentiment
0,0,1.590000e+18,@twk_5 @davidhogg111 Good question. The guns a...,anti-gun
1,1,1.590000e+18,@NikaOneDay @thegreatunkn @obiwill_kenobi @Tul...,anti-gun
2,2,1.590000e+18,Just…read this. \nhttps://t.co/TfKqT2nNZI\n\n@...,anti-gun
3,3,1.590000e+18,@TomCottonAR Are you suggesting more guns like...,anti-gun
4,4,1.590000e+18,@GhostofTST Disagreed! You can have sensible g...,anti-gun
...,...,...,...,...
995,1395,1.590000e+18,"@cbssaturday I am Dr. Floyd Jones, https://t.c...",neutral
996,1396,1.590000e+18,Future artist Tray Tray video shoot shot up in...,neutral
997,1397,1.590000e+18,Manhunt suspect in quadruple Aurora shooting t...,neutral
998,1398,1.590000e+18,"@LogicIsLeaving @phike9391 @TMZ no, i don‚Äôt....",neutral


In [4]:
# Function to clean the database
def preprocess_tweet(tweet):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    tweet = tweet.lower()

    # Remove RT
    sentence = re.sub('RT @\w+: '," ", tweet)

    # Remove special characters
    tweet = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet)

    # Single character removal
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove URL's
    tweet = re.sub('((www.[^s]+)|(https?://[^s]+))',' ',tweet)
    
    #Replace 2a|2nd amendment to second amendment
    tweet = re.sub("2a|2nd\samendment|2nd|2ndamendment|secondamendment|2ndamendment", 'second amendment', tweet)
    
    # Remove numbers
    tweet = re.sub('[0-9]+', '', tweet)
    
    return tweet

In [5]:
# Save cleaned tweets in new cleaned column
cleaned_tweets = []

for tweet in twitter_df['full_text']:
  cleaned_tweet = preprocess_tweet(tweet)
  cleaned_tweets.append(cleaned_tweet)

twitter_df['cleaned'] = pd.DataFrame(cleaned_tweets)
twitter_df.head(10)

Unnamed: 0.1,Unnamed: 0,tweet_id,full_text,sentiment,cleaned
0,0,1.59e+18,@twk_5 @davidhogg111 Good question. The guns a...,anti-gun,good question the guns and rifles you used w...
1,1,1.59e+18,@NikaOneDay @thegreatunkn @obiwill_kenobi @Tul...,anti-gun,kenobi personally d rather have time machine ...
2,2,1.59e+18,Just…read this. \nhttps://t.co/TfKqT2nNZI\n\n@...,anti-gun,just read this on the murder of isabella thall...
3,3,1.59e+18,@TomCottonAR Are you suggesting more guns like...,anti-gun,are you suggesting more guns like your collea...
4,4,1.59e+18,@GhostofTST Disagreed! You can have sensible g...,anti-gun,disagreed you can have sensible gun laws or y...
5,5,1.59e+18,"Rest in Power, Takeoff. \n\nSo sad to see anot...",anti-gun,rest in power takeoff so sad to see another vi...
6,6,1.59e+18,@WisDems My roomate was going to date soneone ...,anti-gun,my roomate was going to date soneone until fo...
7,7,1.59e+18,@CARebelBase Your underling premise is wrong. ...,anti-gun,your underling premise is wrong democrats nev...
8,8,1.59e+18,@LiamMiller33 Politicians who prefer NRA paych...,anti-gun,politicians who prefer nra paychecks over pro...
9,9,1.59e+18,@SonsOFreshOil @JOSE97LUIS @TulsiGabbard Reall...,anti-gun,really bad form does the second amendment as ...


In [6]:
# Drop column text
twitter_df = twitter_df.drop(['Unnamed: 0','tweet_id', 'full_text'], axis=1)
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question the guns and rifles you used w...
1,anti-gun,kenobi personally d rather have time machine ...
2,anti-gun,just read this on the murder of isabella thall...
3,anti-gun,are you suggesting more guns like your collea...
4,anti-gun,disagreed you can have sensible gun laws or y...
...,...,...
995,neutral,am dr floyd jones saw chad lawson on the satu...
996,neutral,future artist tray tray video shoot shot up in...
997,neutral,manhunt suspect in quadruple aurora shooting t...
998,neutral,no don i wish could stop all gun violence tbh...


In [7]:
# Remove stopwords
import nltk
nltk.download('stopwords')
stopwordlist = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danaburton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Cleaning and removing the above stop words list from the tweet text
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda text: cleaning_stopwords(text))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question guns rifles used assault weapons...
1,anti-gun,kenobi personally rather time machine could go...
2,anti-gun,read murder isabella thallas denver mundanity ...
3,anti-gun,suggesting guns like colleagues gun violence e...
4,anti-gun,disagreed sensible gun laws second amendment r...


In [9]:
# Getting tokenization of tweet text
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
twitter_df['cleaned'] = twitter_df['cleaned'].apply(tokenizer.tokenize)
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[suggesting, guns, like, colleagues, gun, viol..."
4,anti-gun,"[disagreed, sensible, gun, laws, second, amend..."


In [10]:
# Applying Stemming
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: stemming_on_text(x))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[suggesting, guns, like, colleagues, gun, viol..."
4,anti-gun,"[disagreed, sensible, gun, laws, second, amend..."


In [11]:
# Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: lemmatizer_on_text(x))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[suggesting, guns, like, colleagues, gun, viol..."
4,anti-gun,"[disagreed, sensible, gun, laws, second, amend..."


In [12]:
# Removing words with less frequency
# filter function to select only the words with more than 10 counts and less than 800.
import itertools
flat_list = list(itertools.chain.from_iterable(twitter_df['cleaned']))

fd = nltk.FreqDist(flat_list)
word_to_keep = list(filter(lambda x: 800>x[1]>10, fd.items()))

word_list_to_keep= [item[0] for item in word_to_keep]

def remove_lessfreq(tokanized_tweets):
    text_out = [word for word in tokanized_tweets if word in word_list_to_keep]
    return text_out

In [13]:
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: remove_lessfreq(x))
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, guns, rifles, used, assault, weapons, p..."
1,anti-gun,"[time, could, go, back, prevent, guns, first, ..."
2,anti-gun,"[read, murder, gun, violence, us]"
3,anti-gun,"[guns, like, gun, violence, us, americans, gun..."
4,anti-gun,"[gun, laws, second, amendment, second, amendme..."
...,...,...
995,neutral,"[gun, violence, come]"
996,neutral,"[shot, chicago, shooting]"
997,neutral,"[shooting, say]"
998,neutral,"[could, stop, gun, violence, woman, think, kno..."


In [14]:
# Separating input feature and label
X=twitter_df.cleaned
y=twitter_df.sentiment

In [15]:
# Splitting our dataset into Train and Test Subset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Cleaning data in single line through passing clean_text in the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df = 5, ngram_range=(1,5)) 
countVector = vectorizer.fit_transform(X_train.apply(lambda x: ' '.join(x)))
print(countVector.shape)

(750, 413)


In [17]:
X_train = vectorizer.transform(X_train.apply(lambda x: ' '.join(x)))
X_test  = vectorizer.transform(X_test.apply(lambda x: ' '.join(x)))

## Balanced Random Forest Classifier

In [18]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators =130)
brf_model.fit(X_train, y_train)
y_pred = brf_model.predict(X_test)

In [19]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6051851851851852

In [20]:
from sklearn.metrics import balanced_accuracy_score, classification_report
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.72      0.56      0.63       100
     neutral       0.42      0.73      0.53        60
     pro-gun       0.70      0.52      0.60        90

    accuracy                           0.59       250
   macro avg       0.61      0.61      0.59       250
weighted avg       0.64      0.59      0.60       250



## Predicting Big Dataset

In [21]:
# Importing the test database
big_twitter_df= pd.read_csv("../../res/big_data_tweets.csv")
big_twitter_df

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,dummy_sentiment
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,...,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...,neutral
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,...,,United States,-98.50000,39.76000,0,0,0,0,https://twitter.com/twitter/status/15883200192...,neutral
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,...,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...,neutral
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,...,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...,pro-gun
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,...,,United States,-98.50000,39.76000,0,0,0,0,https://twitter.com/twitter/status/15883198663...,anti-gun
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72326,1591581664358141955,2022-11-13 00:00:13,"""New guns are on their way to a new home! Don'...",Original Tweet,"['#sigwo', '#ar15', '#cerakote', '#2A', '#fde'...",,1569488571597484033,Rogersville,Webster County,29225.0,...,29.0,United States,-93.05573,37.11700,1,1,2,1,https://twitter.com/twitter/status/15915816643...,anti-gun
72327,1591581657232216064,2022-11-13 00:00:11,"""Two Georgia teenage siblings are accused of g...",Original Tweet,"['#georgia', '#shooting', '#arrest']",,1359623829404942345,,,,...,36.0,United States,-75.49990,43.00035,0,0,0,0,https://twitter.com/twitter/status/15915816572...,neutral
72328,1591581621828083712,2022-11-13 00:00:03,"""⭐ ⭐ ⭐ ⭐ ⭐ \n""Comp looks great. Love the look ...",Original Tweet,"['#FaxonFirearms', '#Firearms', '#FamilyBusine...",,1630501897,Cincinnati,Hamilton County,39061.0,...,39.0,United States,-84.45689,39.16200,5,0,203,167,https://twitter.com/twitter/status/15915816218...,anti-gun
72329,1591581616132001794,2022-11-13 00:00:02,"""@adamdavidson @LivLuvLaf5 This GA Dem Voter i...",Reply Tweet,,"['@adamdavidson', '@LivLuvLaf5', '@BrianKempGA']",24174797,Atlanta,Fulton County,13121.0,...,13.0,United States,-84.38798,33.74900,0,0,0,1,https://twitter.com/twitter/status/15915816161...,pro-gun


In [22]:
# Drop column dummy sentiment
big_twitter_df = big_twitter_df.drop(['dummy_sentiment'], axis=1)
big_twitter_df

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,state,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,Arkansas,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,,,United States,-98.50000,39.76000,0,0,0,0,https://twitter.com/twitter/status/15883200192...
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,West Virginia,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,Michigan,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,,,United States,-98.50000,39.76000,0,0,0,0,https://twitter.com/twitter/status/15883198663...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72326,1591581664358141955,2022-11-13 00:00:13,"""New guns are on their way to a new home! Don'...",Original Tweet,"['#sigwo', '#ar15', '#cerakote', '#2A', '#fde'...",,1569488571597484033,Rogersville,Webster County,29225.0,Missouri,29.0,United States,-93.05573,37.11700,1,1,2,1,https://twitter.com/twitter/status/15915816643...
72327,1591581657232216064,2022-11-13 00:00:11,"""Two Georgia teenage siblings are accused of g...",Original Tweet,"['#georgia', '#shooting', '#arrest']",,1359623829404942345,,,,New York,36.0,United States,-75.49990,43.00035,0,0,0,0,https://twitter.com/twitter/status/15915816572...
72328,1591581621828083712,2022-11-13 00:00:03,"""⭐ ⭐ ⭐ ⭐ ⭐ \n""Comp looks great. Love the look ...",Original Tweet,"['#FaxonFirearms', '#Firearms', '#FamilyBusine...",,1630501897,Cincinnati,Hamilton County,39061.0,Ohio,39.0,United States,-84.45689,39.16200,5,0,203,167,https://twitter.com/twitter/status/15915816218...
72329,1591581616132001794,2022-11-13 00:00:02,"""@adamdavidson @LivLuvLaf5 This GA Dem Voter i...",Reply Tweet,,"['@adamdavidson', '@LivLuvLaf5', '@BrianKempGA']",24174797,Atlanta,Fulton County,13121.0,Georgia,13.0,United States,-84.38798,33.74900,0,0,0,1,https://twitter.com/twitter/status/15915816161...


In [23]:
big_twitter_df = big_twitter_df.dropna(subset=['full_text'])

In [24]:
# Save cleaned tweets in new cleaned column
cleaned_tweets = []

for tweet in big_twitter_df['full_text']:
    cleaned_tweet = preprocess_tweet(tweet)
    cleaned_tweets.append(cleaned_tweet)

big_twitter_df['cleaned'] = pd.DataFrame(cleaned_tweets)
big_twitter_df.tail(10)

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,cleaned
72321,1591581867630788608,2022-11-13 00:01:02,"""@demarreleeclair @mpowell53 @WhiteSoxic @JoeN...",Reply Tweet,,"['@demarreleeclair', '@mpowell53', '@WhiteSoxi...",1441180551172345866,Brooklyn,Kings County,36047.0,...,36.0,United States,-73.94958,40.6501,1,0,1,0,https://twitter.com/twitter/status/15915818676...,think you re confused as to who actually work...
72322,1591581833070006274,2022-11-13 00:00:53,"""@RandyRRQuaid Umm, Pointing guns at one's wif...",Reply Tweet,,['@RandyRRQuaid'],66270688,Boca Raton,Palm Beach County,12099.0,...,12.0,United States,-80.0831,26.35869,0,0,0,0,https://twitter.com/twitter/status/15915818330...,umm pointing guns at one wife might be good i...
72323,1591581809426714624,2022-11-13 00:00:48,"""@BuckSexton Libs are nuts as we fight for fai...",Reply Tweet,,['@BuckSexton'],1955856734,Malibu,Los Angeles County,6037.0,...,6.0,United States,-118.81009,34.00501,0,0,0,0,https://twitter.com/twitter/status/15915818094...,libs are nuts as we fight for fair wages wome...
72324,1591581768691650562,2022-11-13 00:00:38,"""@secretbnuy762 Nope, I admit. You got me on t...",Reply Tweet,,['@secretbnuy762'],1267856567417704448,"Washington, D. C.",,,...,11.0,United States,-77.03637,38.89511,1,1,0,0,https://twitter.com/twitter/status/15915817686...,nope admit you got me on that one ll take tha...
72325,1591581736705855490,2022-11-13 00:00:30,"""@BillyBaldwin @BillyBaldwin hey by gun contro...",Reply Tweet,,"['@BillyBaldwin', '@BillyBaldwin']",1891935396,,,,...,49.0,United States,-111.75103,39.25024,0,0,2,0,https://twitter.com/twitter/status/15915817367...,hey by gun control does that include people w...
72326,1591581664358141955,2022-11-13 00:00:13,"""New guns are on their way to a new home! Don'...",Original Tweet,"['#sigwo', '#ar15', '#cerakote', '#2A', '#fde'...",,1569488571597484033,Rogersville,Webster County,29225.0,...,29.0,United States,-93.05573,37.117,1,1,2,1,https://twitter.com/twitter/status/15915816643...,new guns are on their way to new home don for...
72327,1591581657232216064,2022-11-13 00:00:11,"""Two Georgia teenage siblings are accused of g...",Original Tweet,"['#georgia', '#shooting', '#arrest']",,1359623829404942345,,,,...,36.0,United States,-75.4999,43.00035,0,0,0,0,https://twitter.com/twitter/status/15915816572...,two georgia teenage siblings are accused of g...
72328,1591581621828083712,2022-11-13 00:00:03,"""⭐ ⭐ ⭐ ⭐ ⭐ \n""Comp looks great. Love the look ...",Original Tweet,"['#FaxonFirearms', '#Firearms', '#FamilyBusine...",,1630501897,Cincinnati,Hamilton County,39061.0,...,39.0,United States,-84.45689,39.162,5,0,203,167,https://twitter.com/twitter/status/15915816218...,comp looks great love the look of it on my gl...
72329,1591581616132001794,2022-11-13 00:00:02,"""@adamdavidson @LivLuvLaf5 This GA Dem Voter i...",Reply Tweet,,"['@adamdavidson', '@LivLuvLaf5', '@BrianKempGA']",24174797,Atlanta,Fulton County,13121.0,...,13.0,United States,-84.38798,33.749,0,0,0,1,https://twitter.com/twitter/status/15915816161...,this ga dem voter is mourning the losses whil...
72330,1591581615594950656,2022-11-13 00:00:01,"""Terrible news💔😔:\n\n13-year-old Jayz Agnew wh...",Original Tweet,,,14897840,,,,...,11.0,United States,-77.00025,38.91706,2,2,3,0,https://twitter.com/twitter/status/15915816155...,terrible news year old jayz agnew who was sh...


In [25]:
# Drop column text
#big_twitter_df = big_twitter_df.drop(['user_id','reply_count','quote_count','likes_count','retweet_counts','hyperlink'], axis=1)
#big_twitter_df

In [26]:
# Remove stopwords
import nltk
nltk.download('stopwords')
stopwordlist = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danaburton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Cleaning and removing the above stop words list from the tweet text
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
big_twitter_df['cleaned'] = big_twitter_df['cleaned'].apply(lambda text: cleaning_stopwords(text))
big_twitter_df.head()

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,cleaned
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,...,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...,levis kj jefferson people think levis nfl qb p...
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883200192...,republican candidate kids almost killed gunman...
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,...,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...,macdaddy senseless deaths tragic think one sec...
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,...,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...,get new talking points nra schtick tired old
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883198663...,watch gunman opens fire imran khan


In [28]:
# Getting tokenization of tweet text
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
big_twitter_df['cleaned'] = big_twitter_df['cleaned'].apply(tokenizer.tokenize)
big_twitter_df.head()

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,cleaned
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,...,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...,"[levis, kj, jefferson, people, think, levis, n..."
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883200192...,"[republican, candidate, kids, almost, killed, ..."
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,...,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...,"[macdaddy, senseless, deaths, tragic, think, o..."
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,...,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...,"[get, new, talking, points, nra, schtick, tire..."
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883198663...,"[watch, gunman, opens, fire, imran, khan]"


In [29]:
# Applying Stemming
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
big_twitter_df['cleaned'] = big_twitter_df['cleaned'].apply(lambda x: stemming_on_text(x))
big_twitter_df.head()

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,cleaned
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,...,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...,"[levis, kj, jefferson, people, think, levis, n..."
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883200192...,"[republican, candidate, kids, almost, killed, ..."
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,...,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...,"[macdaddy, senseless, deaths, tragic, think, o..."
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,...,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...,"[get, new, talking, points, nra, schtick, tire..."
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883198663...,"[watch, gunman, opens, fire, imran, khan]"


In [30]:
# Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
big_twitter_df['cleaned'] = big_twitter_df['cleaned'].apply(lambda x: lemmatizer_on_text(x))
big_twitter_df.head()

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,cleaned
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,...,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...,"[levis, kj, jefferson, people, think, levis, n..."
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883200192...,"[republican, candidate, kids, almost, killed, ..."
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,...,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...,"[macdaddy, senseless, deaths, tragic, think, o..."
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,...,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...,"[get, new, talking, points, nra, schtick, tire..."
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,...,,United States,-98.5,39.76,0,0,0,0,https://twitter.com/twitter/status/15883198663...,"[watch, gunman, opens, fire, imran, khan]"


In [39]:
# Removing words with less frequency
# filter function to select only the words with more than 10 counts and less than 800.
import itertools
flat_list = list(itertools.chain.from_iterable(big_twitter_df['cleaned']))

fd = nltk.FreqDist(flat_list)
word_to_keep = list(filter(lambda x: 800>x[1]>10, fd.items()))

word_list_to_keep = [item[0] for item in word_to_keep]

def remove_lessfreq(tokanized_tweets):
    text_out = [word for word in tokanized_tweets if word in word_list_to_keep]
    return text_out

SyntaxError: invalid syntax (976346009.py, line 13)

In [32]:
big_twitter_df['cleaned'] = big_twitter_df['cleaned'].apply(lambda x: remove_lessfreq(x))
big_twitter_df

In [33]:
predict=big_twitter_df["cleaned"]
X_new  = vectorizer.transform(predict.apply(lambda x: ' '.join(x)))

## Balanced Random Forest Classifier

In [34]:
new_data_pred = brf_model.predict(X_new)

In [35]:
new_data_pred

array(['neutral', 'neutral', 'neutral', ..., 'neutral', 'neutral',
       'neutral'], dtype=object)

In [36]:
big_twitter_df['sentiment']=new_data_pred
big_twitter_df.tail()

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,cleaned,sentiment
72326,1591581664358141955,2022-11-13 00:00:13,"""New guns are on their way to a new home! Don'...",Original Tweet,"['#sigwo', '#ar15', '#cerakote', '#2A', '#fde'...",,1569488571597484033,Rogersville,Webster County,29225.0,...,United States,-93.05573,37.117,1,1,2,1,https://twitter.com/twitter/status/15915816643...,"[new, guns, way, new, home, forget, pyro, sale...",neutral
72327,1591581657232216064,2022-11-13 00:00:11,"""Two Georgia teenage siblings are accused of g...",Original Tweet,"['#georgia', '#shooting', '#arrest']",,1359623829404942345,,,,...,United States,-75.4999,43.00035,0,0,0,0,https://twitter.com/twitter/status/15915816572...,"[two, georgia, teenage, siblings, accused, gun...",neutral
72328,1591581621828083712,2022-11-13 00:00:03,"""⭐ ⭐ ⭐ ⭐ ⭐ \n""Comp looks great. Love the look ...",Original Tweet,"['#FaxonFirearms', '#Firearms', '#FamilyBusine...",,1630501897,Cincinnati,Hamilton County,39061.0,...,United States,-84.45689,39.162,5,0,203,167,https://twitter.com/twitter/status/15915816218...,"[comp, looks, great, love, look, glock, easy, ...",neutral
72329,1591581616132001794,2022-11-13 00:00:02,"""@adamdavidson @LivLuvLaf5 This GA Dem Voter i...",Reply Tweet,,"['@adamdavidson', '@LivLuvLaf5', '@BrianKempGA']",24174797,Atlanta,Fulton County,13121.0,...,United States,-84.38798,33.749,0,0,0,1,https://twitter.com/twitter/status/15915816161...,"[ga, dem, voter, mourning, losses, appearing, ...",neutral
72330,1591581615594950656,2022-11-13 00:00:01,"""Terrible news💔😔:\n\n13-year-old Jayz Agnew wh...",Original Tweet,,,14897840,,,,...,United States,-77.00025,38.91706,2,2,3,0,https://twitter.com/twitter/status/15915816155...,"[terrible, news, year, old, jayz, agnew, shot,...",neutral


In [40]:
# Drop column cleaned
big_twitter_df = big_twitter_df.drop(['cleaned'], axis=1)
big_twitter_df

Unnamed: 0,tweet_id,date_created,full_text,tweet_type,hashtags,mentions,user_id,city,county,fips_county,...,fips_state,country,latitude,longitude,reply_count,quote_count,likes_count,retweet_counts,hyperlink,sentiment
0,1588320083335467009,2022-11-03 23:59:52,"""@UnnecRoughness A is Levis. \nB is KJ Jeffers...",Reply Tweet,,['@UnnecRoughness'],531999121,,,,...,5.0,United States,-92.50044,34.75037,0,0,6,0,https://twitter.com/twitter/status/15883200833...,neutral
1,1588320019259469824,2022-11-03 23:59:36,"""Republican candidate's kids are almost KILLED...",Original Tweet,,,1586027897579802624,,,,...,,United States,-98.50000,39.76000,0,0,0,0,https://twitter.com/twitter/status/15883200192...,neutral
2,1588320006840160256,2022-11-03 23:59:33,"""@Jupiter62214807 @cjstheman_611 @BMC_MacDaddy...",Reply Tweet,,"['@Jupiter62214807', '@cjstheman_611', '@BMC_M...",783106891274596352,,,,...,54.0,United States,-80.50009,38.50038,0,0,1,0,https://twitter.com/twitter/status/15883200068...,neutral
3,1588319891446272001,2022-11-03 23:59:06,"""@davidhogg111 It’s 2022, get new talking poin...",Reply Tweet,,['@davidhogg111'],1511111650199412739,,,,...,26.0,United States,-85.50033,44.25029,0,0,8,0,https://twitter.com/twitter/status/15883198914...,neutral
4,1588319866385477632,2022-11-03 23:59:00,"""Watch: Gunman opens fire on Imran Khan https:...",Original Tweet,,,287297482,,,,...,,United States,-98.50000,39.76000,0,0,0,0,https://twitter.com/twitter/status/15883198663...,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72326,1591581664358141955,2022-11-13 00:00:13,"""New guns are on their way to a new home! Don'...",Original Tweet,"['#sigwo', '#ar15', '#cerakote', '#2A', '#fde'...",,1569488571597484033,Rogersville,Webster County,29225.0,...,29.0,United States,-93.05573,37.11700,1,1,2,1,https://twitter.com/twitter/status/15915816643...,neutral
72327,1591581657232216064,2022-11-13 00:00:11,"""Two Georgia teenage siblings are accused of g...",Original Tweet,"['#georgia', '#shooting', '#arrest']",,1359623829404942345,,,,...,36.0,United States,-75.49990,43.00035,0,0,0,0,https://twitter.com/twitter/status/15915816572...,neutral
72328,1591581621828083712,2022-11-13 00:00:03,"""⭐ ⭐ ⭐ ⭐ ⭐ \n""Comp looks great. Love the look ...",Original Tweet,"['#FaxonFirearms', '#Firearms', '#FamilyBusine...",,1630501897,Cincinnati,Hamilton County,39061.0,...,39.0,United States,-84.45689,39.16200,5,0,203,167,https://twitter.com/twitter/status/15915816218...,neutral
72329,1591581616132001794,2022-11-13 00:00:02,"""@adamdavidson @LivLuvLaf5 This GA Dem Voter i...",Reply Tweet,,"['@adamdavidson', '@LivLuvLaf5', '@BrianKempGA']",24174797,Atlanta,Fulton County,13121.0,...,13.0,United States,-84.38798,33.74900,0,0,0,1,https://twitter.com/twitter/status/15915816161...,neutral


In [41]:
big_twitter_df.to_csv('big_data_prediction_ml_model.csv')

In [38]:
big_twitter_df['sentiment'].value_counts()

neutral     38310
anti-gun    19136
pro-gun     14885
Name: sentiment, dtype: int64