# Experiment with NLP Techniques for Political texts

In [None]:
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords') 
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk import ngrams
import matplotlib.pyplot as plt 
import seaborn as sns
import spacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel

In [None]:
# Speaker whose transcripts are analysed; the name is interpolated into the
# folder and file name of the data file that is loaded.
person = 'DonaldTrump'

In [None]:
# Path layout: <root>/<person>/cleantext_<person>.tsv (tab-separated).
# NOTE(review): the root is an absolute, machine-specific path — adjust it when
# running the notebook elsewhere.
cspan_root = '/Users/derekwu/Desktop/datadescriptor_uselections2020/us2020data/data_clean/cspan'
transcript_path = f'{cspan_root}/{person}/cleantext_{person}.tsv'
data = pd.read_csv(transcript_path, sep='\t')

## Data Preprocessing

In [None]:
# TODO: POS tagging and removal; see https://towardsdatascience.com/topic-modelling-in-python-with-spacy-and-gensim-dc8f7748bdbf/

In [None]:
# Small English spaCy pipeline, used for lemmas, POS tags and stop-word flags.
nlp = spacy.load('en_core_web_sm')

# Extra words to treat as stop words in addition to spaCy's built-in list.
my_stop_words = ['movie']
for extra_word in my_stop_words:
    # Flag the vocabulary entry itself so token.is_stop reports True for it.
    nlp.vocab[extra_word].is_stop = True

In [None]:
# Part-of-speech tags whose tokens are dropped from every document.
removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM', "VERB"]

# One list of lower-cased lemmas per document. A token is kept only when it is
# purely alphabetic, is not a stop word and its POS tag is not in `removal`.
tokens = [
    [
        tok.lemma_.lower()
        for tok in doc
        if tok.is_alpha and not tok.is_stop and tok.pos_ not in removal
    ]
    for doc in nlp.pipe(data['CleanText'])
]

In [None]:
# Store each document's filtered lemma list as a new column; `tokens` was built
# by iterating data['CleanText'] in order, so entries line up with the rows.
data['tokens'] = tokens

In [None]:
# Flatten every document's tokens into one whitespace-separated string, used by
# the word cloud and the n-gram counts. Documents are joined with a space, not a
# comma: a comma would glue the last token of one document to the first token
# of the next (e.g. "great,america") once the string is split on whitespace,
# producing bogus tokens and corrupting the n-grams at every document boundary.
corpus = ' '.join(data["tokens"].str.join(" "))

## Word Cloud

In [None]:
# Configure the word cloud, then fill it from the flattened corpus string.
wordcloud = WordCloud(
    max_words=5000,
    background_color="white",
    contour_color='steelblue',
    contour_width=3,
)
wordcloud.generate(corpus)
# Keep this as the final expression so the notebook displays the image.
wordcloud.to_image()

## Word/Ngram Frequencies

In [None]:
# visualize common words
def visualize_common(corpus, n: int, records):
    """Return the most frequent n-grams of a whitespace-tokenised text.

    Despite its name, this function does not draw anything; it only computes
    the labels and counts that the caller plots.

    Args:
        corpus: Text whose whitespace-separated tokens are counted.
        n: Size of the n-grams (1 = unigrams, 2 = bigrams, ...).
        records: Maximum number of n-grams to return; ``None`` returns all.

    Returns:
        A ``(words, freq)`` pair of equal-length lists ordered from most to
        least frequent: each n-gram joined into one space-separated string,
        and its count.
    """
    distribution = nltk.FreqDist(ngrams(corpus.split(), n))

    # Ask for just the top `records` entries instead of building labels and
    # counts for every distinct n-gram and slicing the lists afterwards.
    top = distribution.most_common(records)

    words = [" ".join(gram) for gram, _ in top]
    freq = [count for _, count in top]
    return words, freq

def _plot_top_ngrams(labels, counts, title):
    """Draw a black bar chart with `labels` on the y-axis and `counts` on the x-axis."""
    sns.barplot(x=counts, y=labels, color='black')
    plt.title(title)
    plt.show()


# Figure defaults; set once because all three charts use the same values.
plt.rcParams["figure.figsize"] = [8, 6]
plt.rcParams["figure.autolayout"] = True

# common words unigram
uni_word, uni_freq = visualize_common(corpus, 1, 25)
_plot_top_ngrams(uni_word, uni_freq, 'Top 25 Most Frequently Occurring Words')

# common words bigram
bi_word, bi_freq = visualize_common(corpus, 2, 25)
# Show any underscores inside a label as spaces.
bi_word = [label.replace('_', ' ') for label in bi_word]
# Title names the n-gram size; it previously repeated the unigram title.
_plot_top_ngrams(bi_word, bi_freq, 'Top 25 Most Frequently Occurring Bigrams')

# common words trigram
tri_word, tri_freq = visualize_common(corpus, 3, 25)
tri_word = [label.replace('_', ' ') for label in tri_word]
_plot_top_ngrams(tri_word, tri_freq, 'Top 25 Most Frequently Occurring Trigrams')


## Topic Modeling

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
pd.set_option("display.max_rows", 600)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# One space-joined string per row of data["tokens"]; this is the input the
# TF-IDF vectorizer is fitted on.
documents = data["tokens"].str.join(" ").copy()

In [None]:
# TF-IDF weighting; scikit-learn's built-in English stop-word list is applied
# in addition to the spaCy stop-word filtering done during preprocessing.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary from `documents` and return their document-term matrix.
tfidf_matrix = vectorizer.fit_transform(documents)

In [None]:
# Dense documents x terms table whose columns are named after the vocabulary.
# NOTE(review): .toarray() materialises the whole matrix, so memory grows with
# (number of documents) x (vocabulary size).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Reshape wide -> long: one row per (document, term) pair with its score.
# NOTE: this rebinds tfidf_df, so the cell is not meant to be re-run on its own output.
long_form = tfidf_df.stack().reset_index()
column_names = {0: 'tfidf', 'level_0': 'document', 'level_1': 'term', 'level_2': 'term'}
tfidf_df = long_form.rename(columns=column_names)

In [None]:
# Preview: the ten highest-scoring terms of each document.
(
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

In [None]:
# Order rows by document, then by descending score, and keep the first ten
# rows of every document.
ranked_tfidf = tfidf_df.sort_values(by=['document', 'tfidf'], ascending=[True, False])
top_tfidf = ranked_tfidf.groupby(['document']).head(10)

In [None]:
# Zoom in on a specific word: keep only the rows whose term contains it.
has_term = top_tfidf['term'].str.contains('missile')
top_tfidf[has_term]

In [None]:
#top_tfidf[top_tfidf['document'].str.contains('kennedy')] # can zoom in on a specific speech if I record the speech name

In [None]:
import altair as alt
import numpy as np

# Terms in this list will get a red dot in the visualization.
# ('missile' was previously misspelled 'missle', so it could never match a term.)
term_list = ['missile', 'election']

# Add a little randomness to break ties in term ranking.
top_tfidf_plusRand = top_tfidf.copy()
tie_breaker = np.random.rand(len(top_tfidf_plusRand)) * 0.0001
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + tie_breaker

# Base for all layers: documents on the y-axis, per-document term rank on the
# x-axis. The rank is computed within each document by descending score.
base = alt.Chart(top_tfidf_plusRand).encode(
    x='rank:O',
    y='document:N',
).transform_window(
    rank="rank()",
    sort=[alt.SortField("tfidf", order="descending")],
    groupby=["document"],
)

# Heatmap layer: cell colour encodes the score.
heatmap = base.mark_rect().encode(
    color='tfidf:Q',
)

# Red circle over terms in the list above; fully transparent for all others.
circle = base.mark_circle(size=100).encode(
    color=alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00'),
    )
)

# Text labels: white when the score is at least 0.23, black otherwise.
text = base.mark_text(baseline='middle').encode(
    text='term:N',
    color=alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black')),
)

# Display the three superimposed layers (kept as the cell's final expression).
(heatmap + circle + text).properties(width=600)

#### LDA

In [None]:
# Train a phrase model on the token lists, then pass every document through it
# to obtain the token lists used for topic modelling.
bigram = gensim.models.phrases.Phrases(data.tokens)
texts = [bigram[doc_tokens] for doc_tokens in data.tokens]

In [None]:
# Vocabulary built from all documents in `texts`.
dictionary = Dictionary(texts)
# Bag-of-words representation of every document.
# NOTE(review): this rebinds `corpus`, which earlier held the joined text string
# used by the word-cloud and n-gram cells; re-running those cells after this one
# will fail. Consider a distinct name (and update the LDA cells that read it).
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus. A fixed random_state
# makes the topics reproducible; unseeded, each run can give different topics.
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary, random_state=42)
lda_model.show_topics()

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook display mode, then prepare the visualisation
# from the fitted model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# Final expression of the cell, so the notebook displays the prepared object.
lda_vis

#### BERTopic

In [None]:
## Correlation of words (be able to search for specific words)
## Word connection graph?

In [None]:
# Event extraction? Stanza/Spacy package 

In [None]:
# Sentiment analysis/text classification (60% authoritarianism, 40% fascist)

In [None]:
# Tracking topic modeling?
# Matching text -> similarity to previous presidents (clustering)