Twitter-Topic and Sentiment Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
from clean_tweets_dataframe import Clean_Tweets

In [30]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

Read and Load The Dataset

In [21]:
class DataLoader:
    def __init__(self,file_name):
        self.file_name = file_name
    def read_csv(self):
        df=pd.read_csv(self.file_name)
        return df
DataLoader_obj= DataLoader('processed_tweet_data.csv')
df = DataLoader_obj.read_csv()

In [24]:
df['lang'][:5]

0    en
1    en
2    en
3    en
4    en
Name: lang, dtype: object

In [40]:

df.shape

(16472, 16)

Prepare The Data

In [41]:
from string import punctuation
class PrepareData:
    def __init__(self,df):
        self.df=df
    def prepare_data(self):
        df['clean_text'] = df['clean_text'].dropna()
        df['clean_text']=df['clean_text'].astype(str)
        df['clean_text'] = df['clean_text'].apply(lambda x: x.translate(str.maketrans(' ', ' ', punctuation)))
        df['clean_text'] = df['clean_text'].apply(lambda x: x.lower()) 
        
        
        #clean and prepare for feature engineering
        sentence_list = [sentence for sentence in df['clean_text']]
        vocab_list = [vocab.split() for vocab in sentence_list]

        return vocab_list
    def remove_stopwords(self,vocab_list):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in vocab_list]
    def make_bigrams(self,non_stop_words):
        return [bigram_mod[doc] for doc in non_stop_words]

    def make_trigrams(self,non_stop_words):
        return [trigram_mod[bigram_mod[doc]] for doc in non_stop_words]

    def lemmatization(self,make_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        texts_out = []
        for sent in make_bigrams:
            doc = nlp(" ".join(sent)) 
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out
    

In [42]:
data  = PrepareData(df)
vocab_list = data.prepare_data()
vocab_list

[['rt',
  'northstarcharts',
  'the',
  '10year',
  'yield',
  'is',
  'telling',
  'us',
  'that',
  'theres',
  'a',
  'high',
  'risk',
  'of',
  'something',
  'breaking',
  'in',
  'the',
  'system',
  'gold',
  'silver',
  'crypto',
  '…'],
 ['rt',
  'michaelaarouet',
  'german',
  '10y',
  'mortgage',
  'rate',
  'went',
  'from',
  '08',
  'to',
  '250',
  '👇',
  'can',
  'you',
  'hear',
  'the',
  'sound',
  'of',
  'german',
  'real',
  'estate',
  'bubble',
  'bursting',
  'https…'],
 ['rt', 'goldseek', 'when', 'httpstcoko2ffhkazg'],
 ['rt',
  'charliebilello',
  'the',
  '30year',
  'mortgage',
  'rate',
  'in',
  'the',
  'us',
  'rises',
  'to',
  '511',
  'its',
  'highest',
  'level',
  'since',
  '2010',
  'last',
  'year',
  'it',
  'hit',
  'an',
  'alltime',
  'low',
  'of',
  '2…'],
 ['rt',
  'biancoresearch',
  'rates',
  'rise',
  'until',
  'something',
  'breaks',
  '…',
  'is',
  'anything',
  'broken',
  'yet',
  'httpstcobrnjek3wtb'],
 ['rt',
  'lancerobert

In [43]:
bigram = gensim.models.Phrases(vocab_list, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[vocab_list], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[vocab_list[0]]])

['rt', 'northstarcharts', 'the', '10year_yield', 'is', 'telling', 'us', 'that', 'theres', 'a', 'high', 'risk', 'of', 'something', 'breaking', 'in', 'the', 'system_gold_silver', 'crypto', '…']


In [28]:
python -m spacy download en

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.0
    Uninstalling en-core-web-sm-2.2.0:
      Successfully uninstalled en-core-web-sm-2.2.0
Successfully installed en-core-web-sm-3.1.0
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-04-27 22:05:21.313042: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll


In [45]:
nlp = spacy.load('en_core_web_sm')
non_stop_words = data.remove_stopwords(vocab_list)
make_bigrams = data.make_bigrams(non_stop_words)
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = data.lemmatization(make_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['yield', 'tell', 's', 'high', 'risk', 'break', 'system', 'gold_silver']]


In [46]:
#create a dictionary
vocab_to_int = corpora.Dictionary(vocab_list)
corpus= [vocab_to_int.doc2bow(vocab) for vocab in vocab_list]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1)]]


In [47]:
[[(vocab_to_int[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('10year', 1),
  ('a', 1),
  ('breaking', 1),
  ('crypto', 1),
  ('gold', 1),
  ('high', 1),
  ('in', 1),
  ('is', 1),
  ('northstarcharts', 1),
  ('of', 1),
  ('risk', 1),
  ('rt', 1),
  ('silver', 1),
  ('something', 1),
  ('system', 1),
  ('telling', 1),
  ('that', 1),
  ('the', 2),
  ('theres', 1),
  ('us', 1),
  ('yield', 1),
  ('…', 1)]]

In [48]:
#Building The Topic Model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=vocab_to_int,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [36]:
#let's View the Topics 
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.061*"your" + 0.043*"new" + 0.041*"details" + 0.029*"some" + 0.022*"best" '
  '+ 0.021*"first" + 0.020*"think" + 0.015*"better" + 0.013*"thanks" + '
  '0.012*"check"'),
 (1,
  '0.097*"from" + 0.031*"know" + 0.023*"pm" + 0.023*"2" + 0.019*"2022" + '
  '0.014*"ukraine" + 0.014*"https…" + 0.013*"free" + 0.012*"4" + 0.011*"hate"'),
 (2,
  '0.055*"be" + 0.050*"will" + 0.043*"was" + 0.039*"our" + 0.026*"10" + '
  '0.026*"today" + 0.021*"online" + 0.015*"says" + 0.013*"she" + '
  '0.012*"covid19"'),
 (3,
  '0.084*"amp" + 0.035*"so" + 0.020*"may" + 0.019*"did" + 0.015*"covid" + '
  '0.015*"love" + 0.014*"thank" + 0.014*"then" + 0.014*"well" + 0.014*"media"'),
 (4,
  '0.070*"her" + 0.040*"make" + 0.024*"5" + 0.021*"week" + 0.014*"dr" + '
  '0.014*"long" + 0.009*"hours" + 0.008*"own" + 0.008*"green" + 0.008*"24"'),
 (5,
  '0.063*"the" + 0.054*"rt" + 0.042*"to" + 0.034*"of" + 0.030*"in" + 0.027*"a" '
  '+ 0.023*"and" + 0.023*"is" + 0.020*"for" + 0.017*"on"'),
 (6,
  '0.055*"who" + 0.035*

In [37]:
#Evaluation of the Model 
# Perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=vocab_to_int, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -10.736484368668055


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Coherence Score:  nan


In [59]:
vocab_to_int

<gensim.corpora.dictionary.Dictionary at 0x19204569c10>

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis