In [1]:
import pandas as pd
import numpy as np
import re

#Sentiment analysis
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

#emotion
from scipy.special import softmax
import csv
import urllib.request

# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# spacy for lemmatization
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from wordcloud import WordCloud

import chart_studio
import chart_studio.plotly as py

import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/armelleleguelte/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
username = 'leguela' # your username
api_key = 'dUoZ8GPRarmVDovaJE1F' # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 300
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## 1- Sentiment analysis of french and US tweets per vaccine:

In [3]:
# open files
tweets_fr = pd.read_csv('../data/tweets_fr_translated.csv')
tweets_us_north = pd.read_csv('../data/tweets_us_north-clean.csv')
tweets_us_south = pd.read_csv('../data/tweets_us_south_clean.csv')

### 1- Perform sentiment analysis classification by using pretrained model “distilbert-base-uncased-finetuned-sst-2-english” (default classifier from 🤗):

In [37]:
classifier = pipeline('sentiment-analysis')

In [None]:
tweets_fr["sen_ana"] = tweets_fr["translated_text"].apply(classifier)

In [None]:
tweets_us_north["sen_ana"] = tweets_us_north["text"].apply(classifier)
tweets_us_south["sen_ana"] = tweets_us_south["text"].apply(classifier)

In [None]:
# second choice:

In [69]:
from pysentimiento import SentimentAnalyzer

In [71]:
analyzer = SentimentAnalyzer(lang="en")

Downloading:   0%|          | 0.00/890 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/295 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [74]:
test2 = tweets_fr.head(4)

In [107]:
## Code not working need to fix later
#sen_ana=[]
#for index, row in test2["translated_text"]:
#    text = row["translated_text"]
#    id = row["id"]
#    result = analyzer.predict(text)
#    x = [id, result['output']]
#    sen_ana.append(x)

In [104]:
sen_ana=[]
for text in test2["translated_text"]:
    result = analyzer.predict(text)
    sen_ana.append(result)

In [101]:
sen_ana

[SentimentOutput(output=NEU, probas={NEU: 0.996, POS: 0.003, NEG: 0.002}),
 SentimentOutput(output=NEU, probas={NEU: 0.995, POS: 0.004, NEG: 0.000}),
 SentimentOutput(output=POS, probas={POS: 0.979, NEU: 0.021, NEG: 0.000}),
 SentimentOutput(output=NEU, probas={NEU: 0.990, POS: 0.006, NEG: 0.004})]

In [92]:
scores_df = pd.DataFrame(sen_ana)

In [93]:
scores_df

Unnamed: 0,0
0,"SentimentOutput(output=NEU, probas={NEU: 0.996, POS: 0.003, NEG: 0.002})"
1,"SentimentOutput(output=NEU, probas={NEU: 0.995, POS: 0.004, NEG: 0.000})"
2,"SentimentOutput(output=POS, probas={POS: 0.979, NEU: 0.021, NEG: 0.000})"
3,"SentimentOutput(output=NEU, probas={NEU: 0.990, POS: 0.006, NEG: 0.004})"


In [None]:
## ROBERTA:

In [4]:
task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [6]:
# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

In [15]:
test = tweets_fr.head(3)

In [32]:
scores=[]
for index, row in test.iterrows():
    text = row["translated_text"]
    id = row["id"]
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    score = output[0][0].detach().numpy()
    score = softmax(score)

    ranking = np.argsort(score)
    ranking = ranking[::-1]
    for i in range(score.shape[0]):
        l = labels[ranking[i]]
        s = score[ranking[i]]
        x = [id,l,np.round(float(s), 4)]

        scores.append(x)

In [33]:
scores

[[1393495540940300288, 'optimism', 0.4005],
 [1393495540940300288, 'sadness', 0.3053],
 [1393495540940300288, 'anger', 0.2501],
 [1393495540940300288, 'joy', 0.0441],
 [1393468639328747520, 'optimism', 0.5666],
 [1393468639328747520, 'sadness', 0.1878],
 [1393468639328747520, 'anger', 0.169],
 [1393468639328747520, 'joy', 0.0766],
 [1392756156649689088, 'optimism', 0.4757],
 [1392756156649689088, 'sadness', 0.3659],
 [1392756156649689088, 'joy', 0.1148],
 [1392756156649689088, 'anger', 0.0436]]

In [34]:
scores_df = pd.DataFrame(scores, columns=["id", "emotion", "score"])

In [35]:
scores_df

Unnamed: 0,id,emotion,score
0,1393495540940300288,optimism,0.4005
1,1393495540940300288,sadness,0.3053
2,1393495540940300288,anger,0.2501
3,1393495540940300288,joy,0.0441
4,1393468639328747520,optimism,0.5666
5,1393468639328747520,sadness,0.1878
6,1393468639328747520,anger,0.169
7,1393468639328747520,joy,0.0766
8,1392756156649689088,optimism,0.4757
9,1392756156649689088,sadness,0.3659


In [50]:
text = "I can't get more Chinese muzzle, inject me 3kg of AztraZeneca"

In [51]:
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

In [52]:
scores

array([0.75012344, 0.07152066, 0.08470895, 0.0936469 ], dtype=float32)

In [53]:
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) anger 0.7501
2) sadness 0.0936
3) optimism 0.0847
4) joy 0.0715


In [None]:
df = pd.DataFrame(dict(
    r=[1, 5, 2, 2],
    theta=['joy','optimism','sadness',
           'anger']))

In [56]:
fig = px.line_polar(scores_df, r='score', theta='emotion', line_close=True)
fig.update_traces(fill='toself')
fig.show()

### 2- Split result into two columns:

In [None]:
def split_label(lb):
    label = re.findall(r"[A-Z]+", lb)[0]
    return label

In [None]:
def split_score(sc):
    score = re.findall(r"\d+\.\d+", sc)[0]
    return score   

In [None]:
tweets_fr["score"] = tweets_fr["sen_ana"].apply(str).apply(split_score)
tweets_fr["label"] = tweets_fr["sen_ana"].apply(str).apply(split_label)

In [None]:
tweets_us_north["score"] = tweets_us_north["sen_ana"].apply(str).apply(split_score)
tweets_us_north["label"] = tweets_us_north["sen_ana"].apply(str).apply(split_label)

In [None]:
tweets_us_south["score"] = tweets_us_south["sen_ana"].apply(str).apply(split_score)
tweets_us_south["label"] = tweets_us_south["sen_ana"].apply(str).apply(split_label)

In [None]:
# Change column score from object to float
tweets_fr['score'] = tweets_fr['score'].astype('float64')
tweets_us_north['score'] = tweets_us_north['score'].astype('float64')
tweets_us_south['score'] = tweets_us_south['score'].astype('float64')

**Save files:**

In [None]:
tweets_fr.to_csv('../data/tweet_fr_sa.csv', index=False)
tweets_us_north.to_csv('../data/tweets_us_north_sa.csv', index=False)
tweets_us_south.to_csv('../data/tweets_us_south_sa.csv', index=False)

### 3- Bar plots:

In [None]:
tweets_fr_gp = tweets_fr.groupby(["vaccine", "label"])["score"].mean().reset_index()

In [None]:
fig1 = px.bar(tweets_fr_gp, x='vaccine', y='score', color ='label', barmode='group',title= 'Sentiments analysis of French tweets toward COVID-19 vaccines',
            labels={
                 "score": "Score",
                 "vaccine": "",
                 "label": "Sentiment"
             },
            color_discrete_map={ # replaces default color mapping by value
                "NEGATIVE": "#9467bd", "POSITIVE": "lightsalmon"
            },
                        category_orders={"vaccine": ["pfizer", "moderna","astrazeneca", "johnson"]
                        },
             template="simple_white"
            )
fig1.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig1.update_xaxes(showline=True, zeroline=True)
            
fig1.show()

In [None]:
py.plot(fig1, filename = 'sentiment_analysis_france', auto_open=False)

In [None]:
tweets_us_north_gp = tweets_us_north.groupby(["vaccine", "label"])["score"].mean().reset_index()

In [None]:
fig2 = px.bar(tweets_us_north_gp, x='vaccine', y='score', color ='label', barmode='group',title= 'Sentiments analysis of Northern US tweets toward COVID-19 vaccines',
            labels={
                 "score": "Score",
                 "vaccine": "",
                 "label": "Sentiment"
             },
            color_discrete_map={ # replaces default color mapping by value
                "NEGATIVE": "#9467bd", "POSITIVE": "lightsalmon"
            },
                        category_orders={"vaccine": ["pfizer", "moderna","astrazeneca", "johnson"]
                        },
             template="simple_white"
            )
fig2.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig2.update_xaxes(showline=True, zeroline=True)
            
fig2.show()

In [None]:
py.plot(fig2, filename = 'sentiment_analysis_us_north', auto_open=False)

In [None]:
tweets_us_south_gp = tweets_us_south.groupby(["vaccine", "label"])["score"].mean().reset_index()

In [None]:
fig3 = px.bar(tweets_us_south_gp, x='vaccine', y='score', color ='label', barmode='group',title= 'Sentiments analysis of Southern US tweets toward COVID-19 vaccines',
            labels={
                 "score": "Score",
                 "vaccine": "",
                 "label": "Sentiment"
             },
            color_discrete_map={ # replaces default color mapping by value
                "NEGATIVE": "#9467bd", "POSITIVE": "lightsalmon"
            },
                        category_orders={"vaccine": ["pfizer", "moderna","astrazeneca", "johnson"]
                        },
             template="simple_white"
            )
fig3.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig3.update_xaxes(showline=True, zeroline=True)
            
fig3.show()

In [None]:
py.plot(fig3, filename = 'sentiment_analysis_us_south', auto_open=False)

## 2- Word frequency:

In [None]:
custom_stopwords = STOPWORDS.union(set(['vaccine', 'covid','pfizer', 'astrazeneca', 'moderna', 'johnson', 'janssen']))

In [None]:
def word_frequency(df):
    word_vectorizer = TfidfVectorizer(
        ngram_range=(1,1),
        analyzer='word',
        stop_words=custom_stopwords
    )
    
    # Remove short words, pumctuation, numbers and special characters
    sparse_matrix = word_vectorizer.fit_transform(
        df["translated_text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3])
            )
        )
        
    
    # Create frequency matrix
    frequencies = sparse_matrix.sum(axis=0)
    
    # Create DF from frequency matrix
    result_df = pd.DataFrame(frequencies.reshape(-1,1), index=word_vectorizer.get_feature_names(), columns=['frequency'])
    
    # Return sorted DF
    return result_df    

In [None]:
#tweets_fr_janssen = tweets_fr[tweets_fr["vaccine"] == 'johnson']

In [None]:
freq_tweets_fr = word_frequency(tweets_fr)

In [None]:
freq_tweets_fr = freq_tweets_fr.reset_index()

In [None]:
freq_tweets_fr = freq_tweets_fr.rename(columns={'index':'words'})

In [None]:
tuples_fr = [tuple(x) for x in freq_tweets_fr.values]

In [None]:
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='black', colormap='Set2',collocations=False).generate_from_frequencies(dict(tuples_fr))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
wordcloud.to_file('../wordcloud_france.png')

In [None]:
def word_frequency_us(df):
    word_vectorizer = TfidfVectorizer(
        ngram_range=(1,1),
        analyzer='word',
        stop_words=custom_stopwords
    )
    
    # Remove short words, pumctuation, numbers and special characters
    sparse_matrix = word_vectorizer.fit_transform(
        df["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3])
            )
        )
        
    
    # Create frequency matrix
    frequencies = sparse_matrix.sum(axis=0)
    
    # Create DF from frequency matrix
    result_df = pd.DataFrame(frequencies.reshape(-1,1), index=word_vectorizer.get_feature_names(), columns=['frequency'])
    
    # Return sorted DF
    return result_df    

In [None]:
freq_tweets_us_north = word_frequency_us(tweets_us_north)
freq_tweets_us_south = word_frequency_us(tweets_us_south)

In [None]:
freq_tweets_us_north = freq_tweets_us_north.reset_index()
freq_tweets_us_south = freq_tweets_us_south.reset_index()

In [None]:
freq_tweets_us_north = freq_tweets_us_north.rename(columns={'index':'words'})
freq_tweets_us_south = freq_tweets_us_south.rename(columns={'index':'words'})

In [None]:
tuples_us_n = [tuple(x) for x in freq_tweets_us_north.values]
tuples_us_s = [tuple(x) for x in freq_tweets_us_south.values]

In [None]:
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='lightgray', colormap='Set1',collocations=False).generate_from_frequencies(dict(tuples_us_n))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
wordcloud.to_file('../wordcloud_us_north.png')

In [None]:
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='white', colormap='Set3',collocations=False).generate_from_frequencies(dict(tuples_us_s))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
wordcloud.to_file('../wordcloud_us_south.png')

## 3- Topic modeling - Latent Dirichlet Allocation (LDA):

In [None]:
stop_words = stopwords.words('english')
stop_words.extend(['vaccine', 'covid', 'coronavirus'])

**FRANCE:**

In [None]:
tweets_fr_lda = tweets_fr.drop(columns=["id", "date", "location", "follower_count", "retweets", "text", "sen_ana", "score", "vaccine"])
tweets_fr_lda = tweets_fr_lda.rename(columns ={'translated_text': 'text'})

In [None]:
tweets_fr_lda["text"] = tweets_fr_lda["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3]))

In [None]:
# Convert to list
data = tweets_fr_lda.text.values.tolist()

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '../data/lda_france.html')

**USA NORTH:**

In [None]:
tweets_us_north_lda = tweets_us_north

In [None]:
tweets_us_north_lda["text"] = tweets_us_north_lda["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3]))

In [None]:
# Convert to list
data = tweets_us_north_lda.text.values.tolist()

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '../data/lda_us_north.html')

**USA SOUTH:**

In [None]:
tweets_us_south_lda = tweets_us_south

In [None]:
tweets_us_south_lda["text"] = tweets_us_south_lda["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3]))

In [None]:
# Convert to list
data = tweets_us_south_lda.text.values.tolist()

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '../data/lda_us_south.html')