In [None]:
import pandas as pd
import numpy as np
import re

#Sentiment analysis
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

#emotion
from scipy.special import softmax
import csv
import urllib.request

# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# spacy for lemmatization
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from wordcloud import WordCloud

import chart_studio
import chart_studio.plotly as py

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
username = 'xxx' # your username
api_key = 'xxx' # your api key - go to profile > settings > regenerate key
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [None]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 300
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## A- Sentiment analysis of french and US tweets per vaccine:

In [None]:
# open files
tweets_fr = pd.read_csv('../data/tweets_fr_translated_clean.csv')
tweets_us_north = pd.read_csv('../data/tweets_us_north-clean.csv')
tweets_us_south = pd.read_csv('../data/tweets_us_south_clean.csv')

### 1- Perform sentiment analysis classification by using pretrained model “distilbert-base-uncased-finetuned-sst-2-english” (🤗) -- Classify the Tweets as positive or negative::

In [None]:
classifier = pipeline('sentiment-analysis')

In [None]:
# apply classifier to the french tweets translated in english
tweets_fr["sen_ana"] = tweets_fr["translated_text"].apply(classifier)

In [None]:
tweets_us_north["sen_ana"] = tweets_us_north["text"].apply(classifier)
tweets_us_south["sen_ana"] = tweets_us_south["text"].apply(classifier)

**Make functions to split result into two columns:**

In [None]:
def split_label(lb):
    label = re.findall(r"[A-Z]+", lb)[0]
    return label

In [None]:
def split_score(sc):
    score = re.findall(r"\d+\.\d+", sc)[0]
    return score   

In [None]:
tweets_fr["score"] = tweets_fr["sen_ana"].apply(str).apply(split_score)
tweets_fr["label"] = tweets_fr["sen_ana"].apply(str).apply(split_label)

In [None]:
tweets_us_north["score"] = tweets_us_north["sen_ana"].apply(str).apply(split_score)
tweets_us_north["label"] = tweets_us_north["sen_ana"].apply(str).apply(split_label)

In [None]:
tweets_us_south["score"] = tweets_us_south["sen_ana"].apply(str).apply(split_score)
tweets_us_south["label"] = tweets_us_south["sen_ana"].apply(str).apply(split_label)

In [None]:
# Change column score from object to float
tweets_fr['score'] = tweets_fr['score'].astype('float64')
tweets_us_north['score'] = tweets_us_north['score'].astype('float64')
tweets_us_south['score'] = tweets_us_south['score'].astype('float64')

**Save files:**

In [None]:
# replace names to be the same across notebooks
tweets_fr["vaccine"] = tweets_fr["vaccine"].replace(["pfizer", "moderna", "astrazeneca", "johnson"], ["Pfizer", "Moderna", "AstraZenca", "Johnson&Johnson"])
tweets_fr["label"] = tweets_fr["label"].replace(["NEGATIVE", "POSITIVE"], ["Negative", "Positive"])

tweets_us_north["vaccine"] = tweets_us_north["vaccine"].replace(["pfizer", "moderna", "astrazeneca", "johnson"], ["Pfizer", "Moderna", "AstraZenca", "Johnson&Johnson"])
tweets_us_north["label"] = tweets_us_north["label"].replace(["NEGATIVE", "POSITIVE"], ["Negative", "Positive"])

tweets_us_south["vaccine"] = tweets_us_south["vaccine"].replace(["pfizer", "moderna", "astrazeneca", "johnson"], ["Pfizer", "Moderna", "AstraZenca", "Johnson&Johnson"])
tweets_us_south["label"] = tweets_us_south["label"].replace(["NEGATIVE", "POSITIVE"], ["Negative", "Positive"])

In [None]:
#tweets_fr.to_csv('../data/tweet_fr_sa.csv', index=False)
#tweets_us_north.to_csv('../data/tweets_us_north_sa.csv', index=False)
#tweets_us_south.to_csv('../data/tweets_us_south_sa.csv', index=False)

**Make violin plots:**

In [None]:
# Open files with sentiment analysis possitive or negative
tweets_fr = pd.read_csv('../data/tweet_fr_sa.csv')
tweets_us_ne = pd.read_csv('../data/tweets_us_north_sa.csv')
tweets_us_se = pd.read_csv('../data/tweets_us_south_sa.csv')

In [None]:
# France
fig1 = px.violin(tweets_fr, x='vaccine', y='score', color ='label', title= 'Sentiments analysis of French tweets toward COVID-19 vaccines',
            labels={
                 "score": "Score",
                 "vaccine": "",
                 "label": "Sentiment"
             },
            color_discrete_map={ # replaces default color mapping by value
                "Negative": "steelblue", "Positive": "lightsteelblue"
            },
                        category_orders={"vaccine": ["Pfizer", "Moderna", "AstraZenca", "Johnson&Johnson"]
                        },
             template="simple_white"
            )
fig1.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig1.update_xaxes(showline=True, zeroline=True)
            
fig1.show()

In [None]:
#py.plot(fig1, filename = 'sentiment_analysis_france', auto_open=False)

In [None]:
# US NE
fig2 = px.violin(tweets_us_ne, x='vaccine', y='score', color ='label',title= 'Sentiments analysis of US Northeast Tweets toward COVID-19 vaccines',
            labels={
                 "score": "Score",
                 "vaccine": "",
                 "label": "Sentiment"
             },
            color_discrete_map={ # replaces default color mapping by value
                "Negative": "steelblue", "Positive": "lightsteelblue"
            },
                        category_orders={"vaccine": ["Pfizer", "Moderna", "AstraZenca", "Johnson&Johnson"]
                        },
             template="simple_white"
            )
fig1.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig1.update_xaxes(showline=True, zeroline=True)
            
fig1.show()

In [None]:
#py.plot(fig2, filename = 'sentiment_analysis_us_north', auto_open=False)

In [None]:
# US SE
fig3 = px.violin(tweets_us_se, x='vaccine', y='score', color ='label',title= 'Sentiments analysis of US Southeast Tweets toward COVID-19 vaccines',
            labels={
                 "score": "Score",
                 "vaccine": "",
                 "label": "Sentiment"
             },
            color_discrete_map={ # replaces default color mapping by value
                "Negative": "steelblue", "Positive": "lightsteelblue"
            },
                        category_orders={"vaccine": ["Pfizer", "Moderna", "AstraZenca", "Johnson&Johnson"]
                        },
             template="simple_white"
            )
fig1.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig1.update_xaxes(showline=True, zeroline=True)
            
fig1.show()

In [None]:
#py.plot(fig3, filename = 'sentiment_analysis_us_south', auto_open=False)

### 2- Perform sentiment analysis classification by using pretrained model “cardifnlp/bertweet-base-sentiment ” (🤗) -- Classify the Tweets as positive, negative or neutral:

**FRANCE:**

In [None]:
from pysentimiento import SentimentAnalyzer

In [None]:
# Do the analysis on english Tweets
analyzer = SentimentAnalyzer(lang="en")

In [None]:
## define function to get sentiments
def sentiment_analysis(df):
    sen_ana=[]
    for index, row in df.iterrows():
        #doesn't work without the try (potential problems with few tweets)
        try:
            texts = row["translated_text"]
            tweet_id = row["id"]
            result = analyzer.predict(texts)
            x = [tweet_id, result]
            sen_ana.append(x)
        except:
            pass
    result_df = pd.DataFrame(sen_ana, columns=["id", "sent_score"])
    return result_df

In [None]:
# apply the function 
tweets_fr_bert = sentiment_analysis(tweets_fr)

**Clean results using regex:**

In [None]:
def find_output(op):
    output = re.findall(r"\=\w+", op)[0]
    return output

In [None]:
def find_neutral(ne):
    neutral = re.findall(r"NEU:\W\d\.\d+", ne)[0]
    return neutral

In [None]:
def find_positive(po):
    positive = re.findall(r"POS:\W\d\.\d+", po)[0]
    return positive

In [None]:
def find_negative(ng):
    negative = re.findall(r"NEG:\W\d\.\d+", ng)[0]
    return negative

In [None]:
# apply functions
tweets_fr_bert["output"] = tweets_fr_bert["sent_score"].apply(str).apply(find_output)
tweets_fr_bert["neutral"] = tweets_fr_bert["sent_score"].apply(str).apply(find_neutral)
tweets_fr_bert["positive"] = tweets_fr_bert["sent_score"].apply(str).apply(find_positive)
tweets_fr_bert["negative"] = tweets_fr_bert["sent_score"].apply(str).apply(find_negative)

In [None]:
tweets_fr_bert["output"] = tweets_fr_bert["output"].str.strip("=")
tweets_fr_bert["neutral"] = tweets_fr_bert["neutral"].str.strip("NEU: ")
tweets_fr_bert["positive"] = tweets_fr_bert["positive"].str.strip("POS: ")
tweets_fr_bert["negative"] = tweets_fr_bert["negative"].str.strip("NEG: ")

In [None]:
tweets_fr_bert["neutral"] = tweets_fr_bert["neutral"].astype('float64')
tweets_fr_bert["positive"] = tweets_fr_bert["positive"].astype('float64')
tweets_fr_bert["negative"] = tweets_fr_bert["negative"].astype('float64')

In [None]:
# clean text
tweets_fr_bert["output"] = tweets_fr_bert["output"].replace(["NEU", "POS", "NEG"], ["neutral", "positive", "negative"])

In [None]:
tweets_fr_bert = tweets_fr_bert.drop(columns=["sent_score"])

In [None]:
# merge dataframes together
tweets_fr_bert_all = tweets_fr.merge(tweets_fr_bert, on="id", how="inner")

In [None]:
#save file
#tweets_fr_bert_all.to_csv('../data/tweets_fr_sa_bert.csv', index=False)

In [None]:
# reopen file
tweets_fr_sa_bert = pd.read_csv('../data/tweets_fr_sa_bert.csv')

In [None]:
# get the mean of sentiments
tweets_fr_bert_gp = tweets_fr_sa_bert.groupby("vaccine")["neutral", "positive", "negative"].mean().reset_index()

In [None]:
# melt the dataframe for plot
tweets_fr_bert_melt = pd.melt(tweets_fr_bert_gp, id_vars=['vaccine'], value_vars=['negative','neutral', 'positive'], var_name='sentiment', value_name='mean')

In [None]:
# clean values
tweets_fr_bert_melt["vaccine"] = tweets_fr_bert_melt["vaccine"].replace(["pfizer", "moderna", "astrazeneca", "johnson"], ["Pfizer", "Moderna", "AstraZeneca", "Johnson&Johnson"])
tweets_fr_bert_melt["sentiment"] = tweets_fr_bert_melt["sentiment"].replace(["neutral", "positive", "negative"], ["Neutral", "Negative", "Positive"])

In [None]:
# France
fig = px.bar(tweets_fr_bert_melt, x='vaccine', y='mean', color='sentiment', barmode='stack', title='Sentiments on COVID-19 vaccination in France',
             labels={
                 "mean": "",
                 "vaccine": ""
                 
             },
            color_discrete_map={ # replaces default color mapping by value
                "Negative": "midnightblue", "Neutral": "steelblue","Positive": "lightsteelblue"
            },
            category_orders={"vaccine": ["Pfizer", "Moderna","AstraZeneca", "Johnson&Johnson"],
                            "sentiment": ["Negative", "Neutral", "Positive"]
                        },
             template="simple_white"
            )
fig.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig.update_xaxes(showline=True, zeroline=True)
fig.update_layout(legend_traceorder="reversed")

fig.show()

In [None]:
#py.plot(fig, filename = 'sentiments_vaccines_france', auto_open=False)

**USA NORTHEAST:**

In [None]:
## define function to get sentiments
def sentiment_analysis_us(df):
    sen_ana=[]
    for index, row in df.iterrows():
        try:
            texts = row["text"]
            tweet_id = row["id"]
            result = analyzer.predict(texts)
            x = [tweet_id, result]
            sen_ana.append(x)
        except:
            pass
    result_df = pd.DataFrame(sen_ana, columns=["id", "sent_score"])
    return result_df

In [None]:
tweets_ne_bert = sentiment_analysis_us(tweets_us_north)

In [None]:
tweets_ne_bert["output"] = tweets_ne_bert["sent_score"].apply(str).apply(find_output)
tweets_ne_bert["neutral"] = tweets_ne_bert["sent_score"].apply(str).apply(find_neutral)
tweets_ne_bert["positive"] = tweets_ne_bert["sent_score"].apply(str).apply(find_positive)
tweets_ne_bert["negative"] = tweets_ne_bert["sent_score"].apply(str).apply(find_negative)

In [None]:
tweets_ne_bert["output"] = tweets_ne_bert["output"].str.strip("=")
tweets_ne_bert["neutral"] = tweets_ne_bert["neutral"].str.strip("NEU: ")
tweets_ne_bert["positive"] = tweets_ne_bert["positive"].str.strip("POS: ")
tweets_ne_bert["negative"] = tweets_ne_bert["negative"].str.strip("NEG: ")

In [None]:
tweets_ne_bert["neutral"] = tweets_ne_bert["neutral"].astype('float64')
tweets_ne_bert["positive"] = tweets_ne_bert["positive"].astype('float64')
tweets_ne_bert["negative"] = tweets_ne_bert["negative"].astype('float64')

In [None]:
tweets_ne_bert["output"] = tweets_ne_bert["output"].replace(["NEU", "POS", "NEG"], ["Neutral", "Positive", "Negative"])

In [None]:
tweets_ne_bert_all = tweets_us_north.merge(tweets_ne_bert, on="id", how="inner")

In [None]:
#save file
#tweets_ne_bert_all.to_csv('../data/tweets_ne_sa_bert.csv', index=False)

In [None]:
tweets_ne_bert_all = pd.read_csv('../data/tweets_ne_sa_bert.csv')

In [None]:
tweets_ne_bert_all_gp = tweets_ne_bert_all.groupby("vaccine")["neutral", "positive", "negative"].mean().reset_index()

In [None]:
tweets_ne_bert_melt = pd.melt(tweets_ne_bert_all_gp, id_vars=['vaccine'], value_vars=['negative','neutral', 'positive'], var_name='sentiment', value_name='mean')

In [None]:
tweets_ne_bert_melt["vaccine"] = tweets_ne_bert_melt["vaccine"].replace(["pfizer", "moderna", "astrazeneca", "johnson"], ["Pfizer", "Moderna", "AstraZeneca", "Johnson&Johnson"])
tweets_ne_bert_melt["sentiment"] = tweets_ne_bert_melt["sentiment"].replace(["neutral", "positive", "negative"], ["Neutral", "Negative", "Positive"])

In [None]:
# USA NE
fig = px.bar(tweets_ne_bert_melt, x='vaccine', y='mean', color='sentiment', barmode='stack', title='Sentiments on COVID-19 Vaccination in Northeast States',
             labels={
                 "mean": "",
                 "vaccine": ""
                 
             },
            color_discrete_map={ # replaces default color mapping by value
                "Negative": "darkgreen", "Neutral": "forestgreen","Positive": "yellowgreen"
            },
            category_orders={"vaccine": ["Pfizer", "Moderna","AstraZeneca", "Johnson&Johnson"],
                            "sentiment": ["Negative", "Neutral", "Positive"]
                        },
             template="simple_white"
            )
fig.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig.update_xaxes(showline=True, zeroline=True)
fig.update_layout(legend_traceorder="reversed")

fig.show()

In [None]:
#py.plot(fig, filename = 'sentiments_vaccines_usa_ne', auto_open=False)

**USA SOUTHEAST:**

In [None]:
tweets_se_bert = sentiment_analysis_us(tweets_us_south)

In [None]:
tweets_se_bert["output"] = tweets_se_bert["sent_score"].apply(str).apply(find_output)
tweets_se_bert["neutral"] = tweets_se_bert["sent_score"].apply(str).apply(find_neutral)
tweets_se_bert["positive"] = tweets_se_bert["sent_score"].apply(str).apply(find_positive)
tweets_se_bert["negative"] = tweets_se_bert["sent_score"].apply(str).apply(find_negative)

In [None]:
tweets_se_bert["output"] = tweets_se_bert["output"].str.strip("=")
tweets_se_bert["neutral"] = tweets_se_bert["neutral"].str.strip("NEU: ")
tweets_se_bert["positive"] = tweets_se_bert["positive"].str.strip("POS: ")
tweets_se_bert["negative"] = tweets_se_bert["negative"].str.strip("NEG: ")

In [None]:
tweets_se_bert["neutral"] = tweets_se_bert["neutral"].astype('float64')
tweets_se_bert["positive"] = tweets_se_bert["positive"].astype('float64')
tweets_se_bert["negative"] = tweets_se_bert["negative"].astype('float64')

In [None]:
tweets_se_bert["output"] = tweets_se_bert["output"].replace(["NEU", "POS", "NEG"], ["Neutral", "Positive", "Negative"])

In [None]:
tweets_se_bert_all = tweets_us_south.merge(tweets_ne_bert, on="id", how="inner")

In [None]:
#save file
#tweets_se_bert_all.to_csv('../data/tweets_se_sa_bert.csv', index=False)

In [None]:
tweets_se_bert_all = pd.read_csv('../data/tweets_se_sa_bert.csv')

In [None]:
tweets_se_bert_all_gp = tweets_se_bert_all.groupby("vaccine")["neutral", "positive", "negative"].mean().reset_index()

In [None]:
tweets_se_bert_melt = pd.melt(tweets_se_bert_all_gp, id_vars=['vaccine'], value_vars=['negative','neutral', 'positive'], var_name='sentiment', value_name='mean')

In [None]:
tweets_se_bert_melt["vaccine"] = tweets_se_bert_melt["vaccine"].replace(["pfizer", "moderna", "astrazeneca", "johnson"], ["Pfizer", "Moderna", "AstraZeneca", "Johnson&Johnson"])
tweets_se_bert_melt["sentiment"] = tweets_se_bert_melt["sentiment"].replace(["neutral", "positive", "negative"], ["Neutral", "Negative", "Positive"])

In [None]:
# USA SE
fig = px.bar(tweets_se_bert_melt, x='vaccine', y='mean', color='sentiment', barmode='stack', title='Sentiments on COVID-19 Vaccination in Northeast States',
             labels={
                 "mean": "",
                 "vaccine": ""
                 
             },
            color_discrete_map={ # replaces default color mapping by value
                "Negative": "maroon", "Neutral": "indianred","Positive": "navajowhite"
            },
            category_orders={"vaccine": ["Pfizer", "Moderna","AstraZeneca", "Johnson&Johnson"],
                            "sentiment": ["Negative", "Neutral", "Positive"]
                        },
             template="simple_white"
            )
fig.update_yaxes(showgrid=True, showline=False, tickwidth=0, tickcolor='white')
fig.update_xaxes(showline=True, zeroline=True)
fig.update_layout(legend_traceorder="reversed")

fig.show()

In [None]:
#py.plot(fig, filename = 'sentiments_vaccines_usa_se', auto_open=False)

### 3- Perform sentiment analysis classification by using pretrained model “cardifnlp/twitter-roberta-base-emotion ”:

In [None]:
# Tweet emotion detection classifier
task='emotion'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

In [None]:
# model using pytorch
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

**FRANCE:**

In [None]:
# Get emotion for french tweets
scores=[]
for index, row in tweets_fr.iterrows():
    #
    try:
        text = row["translated_text"]
        tweet_id = row["id"]
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        score = output[0][0].detach().numpy()
        score = softmax(score)
    except:
        pass    
    ranking = np.argsort(score)
    ranking = ranking[::-1]
    for i in range(score.shape[0]):
        l = labels[ranking[i]]
        s = score[ranking[i]]
        x = [tweet_id,l,np.round(float(s), 4)]
        scores.append(x)

In [None]:
# add the result to a dataframe
tweets_fr_emotions = pd.DataFrame(scores, columns=["id", "emotion", "score"])

In [None]:
# pivot the table
tweets_fr_emotions_pv = pd.pivot_table(tweets_fr_emotions, values='score', columns='emotion', index="id").reset_index()

**Get spider plots for emotion by vaccine:**

In [None]:
# merge the table with table 
tweets_fr_emotions_all = tweets_fr.merge(tweets_fr_emotions_pv, on="id", how="inner")

In [None]:
#save file
#tweets_fr_emotions_all.to_csv('../data/tweets_fr_emotions.csv', index=False)

In [None]:
tweets_fr_emotions = pd.read_csv('../data/tweets_fr_emotions.csv')

In [None]:
# Get the mean of each emotion
tweets_fr_emotions_gp = tweets_fr_emotions.groupby("vaccine")["anger", "joy", "optimism", "sadness"].mean().reset_index()

In [None]:
tweets_fr_emotions_gp

In [None]:
#Pfizer
fig = go.Figure(data=go.Scatterpolar(
  r=tweets_fr_emotions_gp.iloc[3,1:5],
  theta=['anger', 'joy','optimism','sadness'],
  fill='toself'))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
        range=[0,0.5]
    ),
  ),
  showlegend=False
)

fig.show()

In [None]:
#Moderna
fig = go.Figure(data=go.Scatterpolar(
  r=tweets_fr_emotions_gp.iloc[2,1:5],
  theta=['anger', 'joy','optimism','sadness'],
  fill='toself'))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
        range=[0,0.5]
    ),
  ),
  showlegend=False
)

fig.show()

In [None]:
#J&J
fig = go.Figure(data=go.Scatterpolar(
  r=tweets_fr_emotions_gp.iloc[1,1:5],
  theta=['anger', 'joy','optimism','sadness'],
  fill='toself'))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
        range=[0,0.5]
    ),
  ),
  showlegend=False
)

fig.show()

In [None]:
#Astra
fig = go.Figure(data=go.Scatterpolar(
  r= tweets_fr_emotions_gp.iloc[0,1:5],
  theta=['anger', 'joy','optimism','sadness'],
  fill='toself'))

fig.update_layout(
  polar=dict(
    radialaxis=dict(
      visible=True,
        range=[0,0.5]
    ),
  ),
  showlegend=False
)

fig.show()

**USA NORTHEAST:**

In [None]:
# Define a function to get emotions in US NE
def get_emotion(df):
    scores=[]
    for index, row in df.iterrows():
        text = row["text"]
        tweet_id = row["id"]
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
        output = model(**encoded_input)
        score = output[0][0].detach().numpy()
        score = softmax(score)
        
        ranking = np.argsort(score)
        ranking = ranking[::-1]
        for i in range(score.shape[0]):
            l = labels[ranking[i]]
            s = score[ranking[i]]
            x = [tweet_id,l,np.round(float(s), 4)]
            scores.append(x)
    result_df = pd.DataFrame(scores, columns=["id", "emotion", "score"])
    return result_df

In [None]:
# run the function on US NE Tweets
tweets_ne_emotion = get_emotion(tweets_us_north)

In [None]:
# pivot the result
tweets_ne_emotions_pv = pd.pivot_table(tweets_ne_emotion, values='score', columns='emotion', index="id").reset_index()

In [None]:
# merge emotion results with US NE Tweets
tweets_ne_emotions_all = tweets_us_north.merge(tweets_ne_emotions_pv, on="id", how="inner")

In [None]:
# get the mean of each emotion per vaccine
tweets_ne_emotions_gp = tweets_ne_emotions_all.groupby("vaccine")["anger", "joy", "optimism", "sadness"].mean().reset_index()

In [None]:
#Pfizer
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_ne_emotions_gp.iloc[3,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="yellowgreen", opacity=0.6, line=dict(color="green")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

In [None]:
#Moderna
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_ne_emotions_gp.iloc[2,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="yellowgreen", opacity=0.6, line=dict(color="green")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

In [None]:
#Johnson&Johnson
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_ne_emotions_gp.iloc[1,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="yellowgreen", opacity=0.6, line=dict(color="green")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

In [None]:
#Astra
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_ne_emotions_gp.iloc[0,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="yellowgreen", opacity=0.6, line=dict(color="green")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

**USA SOUTHEAST:**

In [None]:
# run the function for US SE Tweets
tweets_se_emotion = get_emotion(tweets_us_south)

In [None]:
tweets_se_emotions_pv = pd.pivot_table(tweets_se_emotion, values='score', columns='emotion', index="id").reset_index()

In [None]:
tweets_se_emotions_all = tweets_us_south.merge(tweets_se_emotions_pv, on="id", how="inner")

In [None]:
tweets_se_emotions_gp = tweets_se_emotions_all.groupby("vaccine")["anger", "joy", "optimism", "sadness"].mean().reset_index()

In [None]:
#Pfizer
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_se_emotions_gp.iloc[3,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="indianred", opacity=0.6, line=dict(color="maroon")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

In [None]:
#Moderna
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_se_emotions_gp.iloc[2,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="indianred", opacity=0.6, line=dict(color="maroon")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

In [None]:
#Johnson
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_se_emotions_gp.iloc[1,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="indianred", opacity=0.6, line=dict(color="maroon")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

In [None]:
#Astra
fig = go.Figure()
fig.add_trace(
                go.Scatterpolar(
                                r=tweets_se_emotions_gp.iloc[0,1:5],
                                theta=['anger', 'joy','optimism','sadness'],
                                fill='toself',
                                
                                fillcolor="indianred", opacity=0.6, line=dict(color="maroon")
                                )
                )

fig.update_layout(
    polar=dict(
        radialaxis=dict(
          visible=True,
            range=[0,0.5]
        ),
      ),
    showlegend=False,
    
)

fig.show()

## 2- Word frequency:

In [None]:
# customize the stopwords
custom_stopwords = STOPWORDS.union(set(['doses','dose','vaccination','vaccinated','vaccine', 'vaccines','coronavirus', 'covid','pfizer', 'astrazeneca', 'moderna', 'johnson', 'janssen']))

In [None]:
# make a function to get word frequency
def word_frequency(df):
    word_vectorizer = TfidfVectorizer(
        ngram_range=(1,1),
        analyzer='word',
        stop_words=custom_stopwords
    )
    
    # Remove short words, pumctuation, numbers and special characters
    sparse_matrix = word_vectorizer.fit_transform(
        df["translated_text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3])
            )
        )
        
    
    # Create frequency matrix
    frequencies = sparse_matrix.sum(axis=0)
    
    # Create DF from frequency matrix
    result_df = pd.DataFrame(frequencies.reshape(-1,1), index=word_vectorizer.get_feature_names(), columns=['frequency'])
    
    # Return sorted DF
    return result_df    

In [None]:
# separate Tweets if either positive or negative
tweets_fr_pos = tweets_fr[tweets_fr.label == 'POSITIVE']
tweets_fr_neg = tweets_fr[tweets_fr.label == 'NEGATIVE']

In [None]:
# run the function to get word frequency
freq_tweets_fr_pos = word_frequency(tweets_fr_pos)
freq_tweets_fr_neg = word_frequency(tweets_fr_neg)

In [None]:
# clean the result
freq_tweets_fr_pos = freq_tweets_fr_pos.reset_index()
freq_tweets_fr_neg = freq_tweets_fr_neg.reset_index()

freq_tweets_fr_pos = freq_tweets_fr_pos.rename(columns={'index':'words'})
freq_tweets_fr_neg = freq_tweets_fr_neg.rename(columns={'index':'words'})

In [None]:
# Get the top 5 frequent words
display(freq_tweets_fr_pos.sort_values("frequency", ascending=False).head(5))
display(freq_tweets_fr_neg.sort_values("frequency", ascending=False).head(5))

In [None]:
# make tuples for wordclouds
tuples_fr_pos = [tuple(x) for x in freq_tweets_fr_pos.values]
tuples_fr_neg = [tuple(x) for x in freq_tweets_fr_neg.values]

In [None]:
# Wordcloud positive french tweets
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='lightgray', colormap='Set1',collocations=False).generate_from_frequencies(dict(tuples_fr_pos))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
#wordcloud.to_file('../wordcloud/wordcloud_france_pos.png')

In [None]:
# Wordcloud negative french tweets
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='black', colormap='Set2',collocations=False).generate_from_frequencies(dict(tuples_fr_neg))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
#wordcloud.to_file('../wordcloud/wordcloud_france_neg.png')

**USA:**

In [None]:
# same as french Tweets function with few modifications
def word_frequency_us(df):
    word_vectorizer = TfidfVectorizer(
        ngram_range=(1,1),
        analyzer='word',
        stop_words=custom_stopwords
    )
    
    # Remove short words, pumctuation, numbers and special characters
    sparse_matrix = word_vectorizer.fit_transform(
        df["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3])
            )
        )
        
    
    # Create frequency matrix
    frequencies = sparse_matrix.sum(axis=0)
    
    # Create DF from frequency matrix
    result_df = pd.DataFrame(frequencies.reshape(-1,1), index=word_vectorizer.get_feature_names(), columns=['frequency'])
    
    # Return sorted DF
    return result_df    

**US NE:**

In [None]:
tweets_us_north_pos = tweets_us_north[tweets_us_north.label == 'POSITIVE']
tweets_us_north_neg = tweets_us_north[tweets_us_north.label == 'NEGATIVE']

tweets_us_south_pos = tweets_us_south[tweets_us_south.label == 'POSITIVE']
tweets_us_south_neg = tweets_us_south[tweets_us_south.label == 'NEGATIVE']

In [None]:
freq_tweets_us_north_pos = word_frequency_us(tweets_us_north_pos)
freq_tweets_us_north_neg = word_frequency_us(tweets_us_north_neg)

freq_tweets_us_south_pos = word_frequency_us(tweets_us_south_pos)
freq_tweets_us_south_neg = word_frequency_us(tweets_us_south_neg)

In [None]:
freq_tweets_us_north_pos = freq_tweets_us_north_pos.reset_index()
freq_tweets_us_north_neg = freq_tweets_us_north_neg.reset_index()

freq_tweets_us_south_pos = freq_tweets_us_south_pos.reset_index()
freq_tweets_us_south_neg = freq_tweets_us_south_neg.reset_index()

In [None]:
freq_tweets_us_north_pos = freq_tweets_us_north_pos.rename(columns={'index':'words'})
freq_tweets_us_north_neg = freq_tweets_us_north_neg.rename(columns={'index':'words'})

freq_tweets_us_south_pos = freq_tweets_us_south_pos.rename(columns={'index':'words'})
freq_tweets_us_south_neg = freq_tweets_us_south_neg.rename(columns={'index':'words'})

In [None]:
tuples_us_n_pos = [tuple(x) for x in freq_tweets_us_north_pos.values]
tuples_us_n_neg = [tuple(x) for x in freq_tweets_us_north_neg.values]

tuples_us_s_pos = [tuple(x) for x in freq_tweets_us_south_pos.values]
tuples_us_s_neg = [tuple(x) for x in freq_tweets_us_south_neg.values]

In [None]:
# Top 5 frequent words
display(freq_tweets_us_north_pos.sort_values("frequency", ascending=False).head(5))
display(freq_tweets_us_north_neg.sort_values("frequency", ascending=False).head(5))

In [None]:
# Wordcloud positive US NE tweets
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='lightgray', colormap='Set1',collocations=False).generate_from_frequencies(dict(tuples_us_n_pos))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
#wordcloud.to_file('../wordcloud/wordcloud_us_north_pos.png')

In [None]:
# Wordcloud negative US NE tweets
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='black', colormap='Set2',collocations=False).generate_from_frequencies(dict(tuples_us_n_neg))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
#wordcloud.to_file('../wordcloud/wordcloud_us_north_neg.png')

**US SE:**

In [None]:
# Top 5 frequent words
display(freq_tweets_us_south_pos.sort_values("frequency", ascending=False).head(5))
display(freq_tweets_us_south_neg.sort_values("frequency", ascending=False).head(5))

In [None]:
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='lightgray', colormap='Set1',collocations=False).generate_from_frequencies(dict(tuples_us_s_pos))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
wordcloud.to_file('../wordcloud/wordcloud_us_south_pos.png')

In [None]:
plt.figure(figsize= (12, 8))
wordcloud = WordCloud(width = 1500, height = 1000,
                      random_state=1, background_color='black', colormap='Set2',collocations=False).generate_from_frequencies(dict(tuples_us_s_neg))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#save image
wordcloud.to_file('../wordcloud/wordcloud_us_south_neg.png')

## 3- Topic modeling - Latent Dirichlet Allocation (LDA):

In [None]:
# customize stop words from NLTK 
stop_words = stopwords.words('english')
stop_words.extend(['vaccine', 'covid', 'coronavirus'])

In [None]:
# run files
tweets_fr_sa = pd.read_csv("../data/tweet_fr_sa.csv")
tweets_us_ne_sa = pd.read_csv("../data/tweets_us_north_sa.csv")
tweets_us_se_sa = pd.read_csv("../data/tweets_us_south_sa.csv")

**FRANCE:**

In [None]:
# clean dataframe
tweets_fr_lda = tweets_fr_sa.drop(columns=["id", "date", "location", "follower_count", "retweets", "text", "sen_ana", "score", "vaccine"])
tweets_fr_lda = tweets_fr_lda.rename(columns ={'translated_text': 'text'})

In [None]:
# remove words smaller than 3 characters
tweets_fr_lda["text"] = tweets_fr_lda["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3]))

In [None]:
# Convert to list
data = tweets_fr_lda.text.values.tolist()

In [None]:
# make a function using simple process from gensim
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram models for lda
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '../html_links/lda_france.html')

**USA NORTH:**

In [None]:
tweets_us_north_lda = tweets_us_ne_sa

In [None]:
tweets_us_north_lda["text"] = tweets_us_north_lda["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3]))

In [None]:
# Convert to list
data = tweets_us_north_lda.text.values.tolist()

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '../html_links/lda_us_north.html')

**USA SOUTH:**

In [None]:
tweets_us_south_lda = tweets_us_se_sa

In [None]:
tweets_us_south_lda["text"] = tweets_us_south_lda["text"].apply(lambda x: " ".join([x for x in simple_preprocess(x) if len(x)>3]))

In [None]:
# Convert to list
data = tweets_us_south_lda.text.values.tolist()

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm")

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, '../html_links/lda_us_south.html')