# Using topic modeling to look into frequent topics in negative and positive Tweets

In [1]:
import pandas as pd
import numpy as np
import string
import sys
import re

#text processing, NLP modules
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import gensim.downloader
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

import matplotlib.pyplot as plt

# Make every figure default to a 15 x 10 canvas unless a plot overrides it.
plt.rcParams.update({'figure.figsize': (15, 10)})

In [2]:
#!pip install python-Levenshtein

# Topics negative tweets

## Import negative tweets labeled by VADER

In [3]:
# Read the CSV of negative-labelled tweets once; `df` and `df_tweety` are two
# names bound to the same DataFrame object (no copy is made).
df_tweety = df = pd.read_csv('Data/tweety_topic.csv')

### Cleaning, feature engineering for negative

In [4]:
def clean_again(x):
    """Turn a value into a list of lowercase, whitespace-separated tokens.

    The input is coerced with ``str`` first, so non-string values are
    tokenised from their string form instead of raising.
    """
    return str(x).lower().split()

In [5]:
import random

# Fixed seed so the ~1% sample (and therefore the corpus and topics built
# from it) is the same on every Restart & Run All. A private generator is
# used so the global `random` state is left untouched.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

text_data = []
for tweet in df_tweety.text:
    tokens = clean_again(tweet)
    # Keep roughly one tweet in a hundred and echo it for inspection.
    # NOTE(review): only the echoed tweets are appended, so everything built
    # from text_data sees ~1% of the data. If the whole corpus was intended,
    # dedent the append so it runs for every tweet.
    if sample_rng.random() > .99:
        print(tokens)
        text_data.append(tokens)

['poison', 'enough', 'left', 'side', 'effect', 'like', 'flocking']
['alaskan', 'healthcare', 'worker', 'suffers', 'severe', 'allergic', 'reaction', 'covid', 'vaccine']
['people', 'required', 'get', 'unwanted', 'vaccine', 'people', 'also', 'required', 'maintain', 'unwanted', 'pregnancy']
['hearing', 'story', 'people', 'allergic', 'reaction', 'pfizer', 'biontech', 'covid']
['need', 'answer', 'serious', 'question', 'continuing']
['pharmaceutical', 'company', 'victim']
['heard', 'today', 'people', 'refusing', 'want', 'british', 'one']
['buckle', 'buttercup', 'second', 'dose', 'bear', 'misgiving', 'mild', 'side', 'effect']
['worrying', 'death', 'portuguese', 'nursedays', 'receiving', 'vaccine', 'also', 'went', 'mostly']
['sick', 'amp', 'tired', 'ot', 'virologist', 'amp', 'scientific', 'professional', 'going', 'broadcast', 'medium', 'say', 'nothi']
['2nd', 'ill', 'hour', 'like', 'bad', 'vaccin']
['first', 'vaccine', 'dosage', 'today', 'needle', 'say', 'someone', 'always', 'reacts', 'injectio

In [6]:
from gensim import corpora
import pickle

# Map each distinct token to an integer id, then encode every sampled tweet
# as a bag-of-words vector via the dictionary.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts so later cells can reload them without rebuilding.
# The context manager closes (and flushes) the pickle file deterministically;
# passing a bare open() handle to pickle.dump left the file unclosed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('Data/dictionary.gensim')

In [7]:
import gensim

NUM_TOPICS = 20
# LDA training is stochastic; pinning random_state makes the fitted topics
# repeatable from one run of the notebook to the next.
LDA_SEED = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_SEED,
)
# Save the trained model so it can be reloaded without retraining.
ldamodel.save('Data/model5.gensim')

In [8]:
# Summarise every topic by its four highest-weighted words and print one
# (topic_id, "weight*word + ...") tuple per line.
topics = ldamodel.print_topics(num_words=4)
for topic_id, top_words in topics:
    print((topic_id, top_words))

(0, '0.063*"china" + 0.032*"supply" + 0.032*"cold" + 0.032*"war"')
(1, '0.022*"tn" + 0.022*"handful" + 0.022*"gi" + 0.022*"fear"')
(2, '0.057*"dose" + 0.020*"buttercup" + 0.020*"misgiving" + 0.020*"mild"')
(3, '0.036*"vaccine" + 0.036*"fake" + 0.019*"cia" + 0.019*"anti"')
(4, '0.067*"ontario" + 0.067*"case" + 0.050*"toronto" + 0.050*"canada"')
(5, '0.045*"vaccine" + 0.023*"receiving" + 0.023*"also" + 0.023*"went"')
(6, '0.054*"vaccine" + 0.036*"unwanted" + 0.036*"required" + 0.036*"people"')
(7, '0.019*"motif" + 0.019*"suspicious" + 0.019*"greed" + 0.019*"earlier"')
(8, '0.041*"vaccine" + 0.021*"time" + 0.021*"rare" + 0.021*"covid"')
(9, '0.045*"second" + 0.023*"spentday" + 0.023*"worse" + 0.023*"long"')
(10, '0.027*"million" + 0.027*"said" + 0.027*"commission" + 0.027*"dos"')
(11, '0.002*"demand" + 0.002*"among" + 0.002*"grows" + 0.002*"result"')
(12, '0.040*"vaccine" + 0.020*"delhi" + 0.020*"authorization" + 0.020*"drive"')
(13, '0.023*"big" + 0.023*"think" + 0.023*"need" + 0.023*"la

In [9]:
# Reload the artefacts persisted by the earlier cells.
# The dictionary is saved under Data/, so it has to be loaded from there; the
# bare 'dictionary.gensim' path is not written by any cell visible in this
# notebook and fails on a fresh run.
dictionary = gensim.corpora.Dictionary.load('Data/dictionary.gensim')

# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load a corpus.pkl that this notebook itself produced.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

lda = gensim.models.ldamodel.LdaModel.load('Data/model5.gensim')

# Topics positive tweets

## Import positive tweets

In [21]:
# Read the CSV of positive-labelled tweets once; `df2` and `df_tweety2` are
# two names bound to the same DataFrame object (no copy is made).
df_tweety2 = df2 = pd.read_csv('Data/tweety_topic2.csv')

  and should_run_async(code)


In [22]:
# Show the positive-tweet frame as the cell output (the rendering below is
# abbreviated with "..." rows for a long frame).
df_tweety2

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,index,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,retweets,favorites,day,user_infl,sentiment
0,0,0,Rachel Roh,"La Crescenta-Montrose, CA",405,1692,3247,False,2020-12-20 06:06:44,folk said daikon paste could treat cytokine storm,['PfizerBioNTech'],0,0,2020-12-20,24.0,positive
1,2,2,eliðŸ‡±ðŸ‡¹ðŸ‡ªðŸ‡ºðŸ‘Œ,Your Bed,10,88,155,False,2020-12-12 20:33:45,russian vaccine created last 4 year,"['coronavirus', 'SputnikV', 'AstraZeneca', 'Pf...",0,0,2020-12-12,5.0,positive
2,5,5,Dee,"Birmingham, England",105,108,106,False,2020-12-12 20:11:42,doe anyone useful advice guidance whether covi...,[],0,0,2020-12-12,7.0,positive
3,6,6,Gunther Fehlinger,"Austria, Ukraine and Kosovo",2731,5001,69344,False,2020-12-12 20:06:00,bit sad claim fame success patriotic competiti...,['vaccination'],0,4,2020-12-12,88.0,positive
4,7,7,Dr.Krutika Kuppalli,,21924,593,7815,True,2020-12-12 20:04:29,many bright day best winning,"['BidenHarris', 'Election2020']",2,22,2020-12-12,76.0,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15137,38495,38522,Hilary,Wales,102,116,4177,False,2021-03-18 09:35:13,money getting vaccine true vaccine others mrna g,['SPUTNIKV'],1,5,2021-03-18,21.0,positive
15138,38498,38525,Bart Kulpa,"London, UK",3962,3942,40988,False,2021-03-18 09:29:21,vaccination speed along serbia country basks g...,[],2,3,2021-03-18,72.0,positive
15139,38500,38527,a street car named Isaiah,,8,35,43,False,2021-03-18 09:21:19,moscow city health department russian direct i...,[],0,0,2021-03-18,3.0,positive
15140,38511,38538,Sputnik V,"Moscow, Russia",223700,142,678,True,2021-03-18 06:47:46,frankfurter allgemeine leader saxony saxony an...,['SputnikV'],92,365,2021-03-18,224.0,positive


### Cleaning, feature engineering for positive

In [23]:
def clean_again(x):
    """Lower-case the string form of ``x`` and split it on whitespace.

    Returns the resulting tokens as a list; an empty or all-whitespace
    string yields an empty list.
    """
    lowered = str(x).lower()
    tokens = lowered.split()
    return tokens

  and should_run_async(code)


In [24]:
import random

# Fixed seed so the ~1% sample (and therefore the corpus and topics built
# from it) is the same on every Restart & Run All. A private generator is
# used so the global `random` state is left untouched.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

text_data = []
for tweet in df_tweety2.text:
    tokens = clean_again(tweet)
    # Keep roughly one tweet in a hundred and echo it for inspection.
    # NOTE(review): only the echoed tweets are appended, so everything built
    # from text_data sees ~1% of the data. If the whole corpus was intended,
    # dedent the append so it runs for every tweet.
    if sample_rng.random() > .99:
        print(tokens)
        text_data.append(tokens)

['doe', 'vaccine', 'mean', 'gonna', 'eventually', 'wear', 'proper', 'bra', 'agreement', 'un']
['determine', 'still', 'safe', 'effective', 'storage', 'temperature', 'dipped', 'c']
['ha', 'appreciated', 'immense', 'undithering']
['thanks', 'president', 'reagan']
['think', 'need', 'photo', 'booth', 'vaccination', 'site', 'public', 'interest']
['likejab', 'go', 'thanks', 'dr', 'ammara', 'hughes', 'vaccinating', 'helping', 'towards', 'target']
['give', 'nh', 'staff', 'resource', 'support', 'need', 'tackle', 'health', 'crisis', 'right', 'suppl']
['grandad', 'getting', 'first', 'part', 'vaccine', 'tomorrow', 'great', 'scientific', 'achiev']
['really', 'excited', 'get', 'today', 'delay', 'mine', 'week', 'cuz', 'service', 'couldnt', 'pr']
['covid', 'vaccine', 'thank', 'organised', 'professional', 'service', 'delivery', 'massive', 'cruc']
['begin', 'good', 'note', 'pfizer', 'become', 'first', 'covid', 'vaccine', 'approved', 'give', 'green']
['welcoming', 'exciting', 'manner', 'happy', 'vaccinati

  and should_run_async(code)


In [25]:
from gensim import corpora
import pickle

# Map each distinct token to an integer id, then encode every sampled tweet
# as a bag-of-words vector via the dictionary.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts. The context manager closes (and flushes) the pickle
# file deterministically; passing a bare open() handle to pickle.dump left the
# file unclosed.
# NOTE(review): these are the same paths the negative-tweet section writes
# to, so the files saved there are overwritten here — confirm that is
# intended or give the positive artefacts their own file names.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('Data/dictionary.gensim')

  and should_run_async(code)


In [26]:
import gensim

NUM_TOPICS = 20
# LDA training is stochastic; pinning random_state makes the fitted topics
# repeatable from one run of the notebook to the next.
LDA_SEED = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_SEED,
)
# NOTE(review): this is the same path the negative-tweet model is saved to,
# so that file is replaced by the positive-tweet model — confirm intended.
ldamodel.save('Data/model5.gensim')

  and should_run_async(code)


In [27]:
# List each topic with its four highest-weighted words, one
# (topic_id, "weight*word + ...") tuple per output line.
topics = ldamodel.print_topics(num_words=4)
for topic_id, top_words in topics:
    print((topic_id, top_words))

(0, '0.033*"vaccine" + 0.025*"thank" + 0.016*"jab" + 0.016*"one"')
(1, '0.015*"right" + 0.015*"tackle" + 0.015*"staff" + 0.015*"health"')
(2, '0.014*"vaccine" + 0.014*"relief" + 0.014*"hopef" + 0.014*"huge"')
(3, '0.025*"vaccine" + 0.025*"phasetrials" + 0.025*"covaxin" + 0.025*"better"')
(4, '0.053*"vaccine" + 0.048*"first" + 0.048*"covid" + 0.017*"effective"')
(5, '0.022*"today" + 0.022*"vaccine" + 0.022*"first" + 0.011*"got"')
(6, '0.019*"approval" + 0.010*"shot" + 0.010*"first" + 0.010*"ha"')
(7, '0.036*"good" + 0.024*"thanks" + 0.024*"shot" + 0.024*"first"')
(8, '0.070*"effective" + 0.028*"give" + 0.028*"efficient" + 0.028*"amp"')
(9, '0.052*"vaccine" + 0.043*"thanks" + 0.026*"score" + 0.017*"take"')
(10, '0.024*"vaccine" + 0.024*"thanks" + 0.016*"pharmaniaga" + 0.016*"well"')
(11, '0.022*"health" + 0.022*"vaccination" + 0.011*"dosepfizer" + 0.011*"courtesy"')
(12, '0.028*"million" + 0.028*"dos" + 0.028*"happy" + 0.014*"agreement"')
(13, '0.027*"approved" + 0.018*"see" + 0.018*"res

  and should_run_async(code)
