# Topic Modeling

### Imports

In [32]:
!pip install keras



In [33]:
!pip install tensorflow



## Data preprocessing

### Import data from csv

In [34]:
import pandas as pd 

In [35]:
tweets=pd.read_csv("covid19_tweets.csv",delimiter=",")

In [36]:
tweets=tweets[["text"]]

In [37]:
tweets.head(5)

Unnamed: 0,text
0,If I smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...
3,@brookbanktv The one gift #COVID19 has give me...
4,25 July : Media Bulletin on Novel #CoronaVirus...


### To lower case

In [38]:
lowered = tweets['text'].str.lower()
tweets['lowered'] = lowered

In [39]:
tweets.head(5)

Unnamed: 0,text,lowered
0,If I smelled the scent of hand sanitizers toda...,if i smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,hey @yankees @yankeespr and @mlb - wouldn't it...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,@diane3443 @wdunlap @realdonaldtrump trump nev...
3,@brookbanktv The one gift #COVID19 has give me...,@brookbanktv the one gift #covid19 has give me...
4,25 July : Media Bulletin on Novel #CoronaVirus...,25 july : media bulletin on novel #coronavirus...


### Clean text punctuation marks

In [40]:
import re

In [41]:
tweets['lowered']=[re.sub(r'[^\w\s]','',text) for text in tweets['lowered']]

In [42]:
tweets.head(5)

Unnamed: 0,text,lowered
0,If I smelled the scent of hand sanitizers toda...,if i smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,hey yankees yankeespr and mlb wouldnt it have...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane3443 wdunlap realdonaldtrump trump never ...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv the one gift covid19 has give me i...
4,25 July : Media Bulletin on Novel #CoronaVirus...,25 july media bulletin on novel coronavirusup...


### Tokenization
Breaking down text into smaller pieces

In [43]:
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:
tokened = tweets.apply(lambda row: nltk.word_tokenize(row['lowered']), axis=1)
tweets['tokenized'] = tokened

In [45]:
tweets.head(5)

Unnamed: 0,text,lowered,tokenized
0,If I smelled the scent of hand sanitizers toda...,if i smelled the scent of hand sanitizers toda...,"[if, i, smelled, the, scent, of, hand, sanitiz..."
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,hey yankees yankeespr and mlb wouldnt it have...,"[hey, yankees, yankeespr, and, mlb, wouldnt, i..."
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane3443 wdunlap realdonaldtrump trump never ...,"[diane3443, wdunlap, realdonaldtrump, trump, n..."
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv the one gift covid19 has give me i...,"[brookbanktv, the, one, gift, covid19, has, gi..."
4,25 July : Media Bulletin on Novel #CoronaVirus...,25 july media bulletin on novel coronavirusup...,"[25, july, media, bulletin, on, novel, coronav..."


### Cleaning text from stop words

In [46]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [47]:
noise = stopwords.words('english')
withoutstop = tweets['tokenized'].apply(lambda x: [item for item in x if item not in noise])
without_stop = []
for a in withoutstop:    
    without_stop.append(", ".join(a))
tweets['without_stop'] = without_stop

In [48]:
tweets.head(5)

Unnamed: 0,text,lowered,tokenized,without_stop
0,If I smelled the scent of hand sanitizers toda...,if i smelled the scent of hand sanitizers toda...,"[if, i, smelled, the, scent, of, hand, sanitiz...","smelled, scent, hand, sanitizers, today, someo..."
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,hey yankees yankeespr and mlb wouldnt it have...,"[hey, yankees, yankeespr, and, mlb, wouldnt, i...","hey, yankees, yankeespr, mlb, wouldnt, made, s..."
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane3443 wdunlap realdonaldtrump trump never ...,"[diane3443, wdunlap, realdonaldtrump, trump, n...","diane3443, wdunlap, realdonaldtrump, trump, ne..."
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv the one gift covid19 has give me i...,"[brookbanktv, the, one, gift, covid19, has, gi...","brookbanktv, one, gift, covid19, give, appreci..."
4,25 July : Media Bulletin on Novel #CoronaVirus...,25 july media bulletin on novel coronavirusup...,"[25, july, media, bulletin, on, novel, coronav...","25, july, media, bulletin, novel, coronavirusu..."


### Lemmatization 
Reducing words to its base

In [55]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


True

In [56]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

In [57]:
tweets['lemmatized'] = tweets['without_stop'].apply(lemmatize_text)

In [58]:
tweets['text']=tweets['lemmatized']
tweets=tweets[['text']]

In [59]:
tweets.head(5)

Unnamed: 0,text
0,"smelled , scent , hand , sanitizers , today , ..."
1,"hey , yankee , yankeespr , mlb , wouldnt , mad..."
2,"diane3443 , wdunlap , realdonaldtrump , trump ..."
3,"brookbanktv , one , gift , covid19 , give , ap..."
4,"25 , july , medium , bulletin , novel , corona..."


### Create a dictionary of the words and frequencies

In [61]:
import gensim
from gensim import corpora

In [62]:
tokenized_text = [word_tokenize(text) for text in tweets['text']]

In [63]:
dictionary = corpora.Dictionary(tokenized_text)

In [64]:
print (dictionary)

Dictionary(323408 unique tokens: [',', 'hand', 'httpstcoqzvybrogb0', 'intoxicated', 'past']...)


In [65]:
corpus = [dictionary.doc2bow(doc) for doc in tokenized_text]

### Explore hidden toppics using LDA model
Choosing the optimal number of topics

In [66]:
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

Model №1 \
10 topics

In [67]:
model1=LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=int(10), 
random_state=200, workers=4)

In [68]:
coherence_model_1 = CoherenceModel(model=model1, texts=tokenized_text, 
dictionary=dictionary, coherence='c_v')

In [69]:
coherence_score = coherence_model_1.get_coherence()
print('Coherence Score:', coherence_score)

Coherence Score: 0.31507275776015925


Model №2 \
5 topics

In [70]:
model2=LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=5, 
random_state=200, workers=4)

In [71]:
coherence_model_2 = CoherenceModel(model=model2, texts=tokenized_text, 
dictionary=dictionary, coherence='c_v')

In [72]:
coherence_score = coherence_model_2.get_coherence()
print('Coherence Score:', coherence_score)

Coherence Score: 0.3211961949955794


Model №3 \
20 topics

In [73]:
model3=LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=20, 
random_state=200, workers=4)

In [74]:
coherence_model_3 = CoherenceModel(model=model3, texts=tokenized_text, 
dictionary=dictionary, coherence='c_v')

In [75]:
coherence_score = coherence_model_3.get_coherence()
print('Coherence Score:', coherence_score)

Coherence Score: 0.3792560123830891


Model 3 with 10 topics has the highest score = 0,38

### Result's visualization

In [90]:
!pip install pyLDAvis==2.1.2

Collecting pyLDAvis==2.1.2
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 1.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py): started
  Building wheel for pyLDAvis (setup.py): finished with status 'done'
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97721 sha256=8eabd640eeb3cd0bb75c58eded2ff9614e2a8f5073ad38fff0c28f619c87ce3d
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\59\70\ee\99a0df99d8b4a7b87c79640ebee0927c0f6ccff046e9cc2471
Successfully built pyLDAvis
Installing collected packages: pyLDAvis
  Attempting uninstall: pyLDAvis
    Found existing installation: pyLDAvis 3.4.1
    Uninstalling pyLDAvis-3.4.1:
      Successfully uninstalled pyLDAvis-3.4.1
Successfully installed pyLDAvis-2.1.2
Collecting pyLDAvis==2.1.2




Collecting pyLDAvis==2.1.2
  Using cached pyLDAvis-2.1.2.tar.gz (1.6 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py): started
  Building wheel for pyLDAvis (setup.py): finished with status 'done'
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97721 sha256=4dbcfd59ac66cf240ea2da33107a44dbea6408a23c7be183c1645606fd3b8d1c
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\59\70\ee\99a0df99d8b4a7b87c79640ebee0927c0f6ccff046e9cc2471
Successfully built pyLDAvis
Installing collected packages: pyLDAvis
  Attempting uninstall: pyLDAvis
    Found existing installation: pyldavis 3.4.1
    Can't uninstall 'pyldavis'. No files were found to uninstall.
Successfully installed pyLDAvis-2.1.2




In [97]:
!pip install pandas==1.5.3

Collecting pandas==1.5.3
  Downloading pandas-1.5.3-cp39-cp39-win_amd64.whl (10.9 MB)
     ---------------------------------------- 10.9/10.9 MB 3.8 MB/s eta 0:00:00
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.0.1
    Uninstalling pandas-2.0.1:
      Successfully uninstalled pandas-2.0.1
Successfully installed pandas-1.5.3


In [98]:
import pyLDAvis
import pyLDAvis.gensim

In [99]:
pyLDAvis.enable_notebook()

In [100]:
vis = pyLDAvis.gensim.prepare(model3, corpus, dictionary)

In [101]:
pyLDAvis.display(vis)