# Exploratory Data Analysis Notebook
This notebook will preprocess and leverage NLP models on the unstructured data to turn it into a usable feature space for modeling Tucker Carlson's body of work

In [59]:
#Imports cell

#Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import nltk
from nltk import word_tokenize
from nltk import FreqDist
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [18]:
# import the tucker document data either as a CSV or a pickle
#Read out from CSV
tucker_docs = pd.read_csv('data/tucker_docs.csv', encoding='UTF8', header = None).T
tucker_docs.head()

Unnamed: 0,0
0,Fox News host gives his take on pro-abortion ...
1,Fox News host reflects on the left's respons...
2,Fox News host gives his take on how Americans...
3,Fox News host gives his take on the Supreme C...
4,Fox News host gives his take on the real moti...


## preprocess function

In [19]:
#to stem or to lem? We will lem

wnl = WordNetLemmatizer()

def preprocessing(text):
    #step 1: delete all caps words
    t_d = re.sub(r'\b[A-Z]+\b', '', text)
    
    #step 2: tokenize
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokenized_doc = nltk.regexp_tokenize(t_d, pattern)
    
    #step 3: lower all cases
    low_tokenized_doc = [word.lower() for word in tokenized_doc]
    
    #step 4: stop words
    stopwords_list = stopwords.words('english')
    stop_tokenized_doc = [word for word in low_tokenized_doc if word not in stopwords_list]
    
    #step 5: lem
    tokens = [wnl.lemmatize(word) for word in stop_tokenized_doc]
    
    return tokens

In [20]:
#Generate list of preprocessed Tucker Carlson episodes
tucker_list = tucker_docs[0].tolist()
new_list = []
for each_doc in tucker_list:
    new_list.append(preprocessing(each_doc))

In [21]:
#Instantiate IDF vectorizer to create vectorized array

vectorizer = TfidfVectorizer()
vect = vectorizer.fit_transform([' '.join(new_list[n]) for n in range(len(new_list))])
td_idf_df = pd.DataFrame(vect.toarray(), columns = vectorizer.get_feature_names())
td_idf_df.head()

Unnamed: 0,aaron,ab,abaca,aback,abandon,abandoned,abandoning,abandonment,abasement,abbott,...,zoomcall,zoomed,zoonotic,zot,zucker,zuckerberg,zuckerbucks,zvfcgesbfiy,zweig,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#Fit the NMF topic generation model
from sklearn.decomposition import NMF
model = NMF(n_components = 10)
model.fit(td_idf_df)

# to get H
H = model.transform(td_idf_df) # transform document into topic vector representation

# to get W 
W = model.components_ # word component weights for each topic

In [23]:
#List out the top 10 words for each topic

for index,topic in enumerate(W):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-25:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['thing', 'year', 'see', 'new', 'state', 'would', 'yes', 'say', 'mean', 'want', 'get', 'thank', 'country', 'one', 'like', 'well', 'right', 'they', 'think', 'going', 'know', 'we', 'that', 're', 'people']


THE TOP 10 WORDS FOR TOPIC #1
['that', 'they', 'american', 'military', 'going', 'biden', 'sanction', 'would', 'weapon', 'nuclear', 'know', 'united', 'invasion', 'energy', 'president', 'state', 'we', 're', 'vladimir', 'ukrainian', 'war', 'putin', 'russian', 'russia', 'ukraine']


THE TOP 10 WORDS FOR TOPIC #2
['function', 'gain', 'boogeyman', 'really', 'lab', 'said', 'look', 'wuhan', 'people', 'celebrate', 'public', 'infection', 'know', 'science', 'virus', 'scientist', 'pandemic', 'immunity', 'research', 'anthony', 'email', 'dr', 'christmas', 'tony', 'fauci']


THE TOP 10 WORDS FOR TOPIC #3
['mandate', 'crime', 'terrorism', 'biden', 'justice', 'violence', 'department', 'teach', 'education', 'terrorist', 'people', 'like', 'meeting', 're', 'they', 'domestic'

In [24]:
#Visualize the 10 topics

%%capture topic_word_plot
def plot_top_words(W, feature_names, n_top_words, title, n_topics):
    fig, axes = plt.subplots(1, n_topics, figsize=(15, 12), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(W):
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 20})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=15)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=25)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

n_top_words = 20
tfidf_feature_names = vectorizer.get_feature_names()
plot_top_words(W, tfidf_feature_names, n_top_words, "Topics in NMF model", 10)

UsageError: Line magic function `%%capture` not found.


In [25]:
topic_word_plot()

NameError: name 'topic_word_plot' is not defined

In [26]:
#Manually re-assign topics
topic = {0: 'American Politics', 1: 'War in Ukraine', 2: 'Covid-19', 3: 'School', 4: 'Biden Administration', 
         5: 'War in Afghanistan', 6: 'Trump', 7: 'Abortion', 8: 'Kyle Rittenhouse Shooting', 
         9: 'Covid-19 Vaccine' }

## Preprocess the twitter data

In [27]:
tweet_df = pd.read_csv('data/unique_tweets_list.csv').drop(columns = ['Unnamed: 0'])

In [28]:
#Function library
import ast

def string_to_list(s):
    try:
        l = ast.literal_eval(s)
    except:
        l = None
    return l

In [29]:
def prepare_tweets(tweet_row):
    tweet_list = []
    for n in range(len(tweet_row)):
        tweet_list.append(preprocessing(tweet_row[n]))

    cleaned_tweets = []
    for tweet in tweet_list:
        if len(tweet) > 20:
            cleaned_tweets.append(tweet)
    return cleaned_tweets

In [30]:
#Preprocess each user's tweet history
tweet_df['tweet_history'] = tweet_df['tweet_history'].apply(lambda x: string_to_list(x))
tweet_df = tweet_df.dropna()
tweet_df['cleaned_tweets'] = tweet_df['tweet_history'].apply(lambda x: prepare_tweets(x))

In [31]:
tweet_df

Unnamed: 0,tweet_history,cleaned_tweets
0,"[""Legislative genius"" Nancy @TeamPelosi needs ...","[[imagine, female, journos, stay, twitter, mus..."
1,[withoutrunes: wack-ashimself: withoutrunes: w...,"[[withoutrunes, wack, ashimself, withoutrunes,..."
2,[Was just sent this after complaining about de...,"[[hahahahaha, omg, first, javascript, course, ..."
3,[Over turn Roe V Wade fine but be prepared to ...,"[[forget, roe, wade, law, need, made, amendmen..."
4,[@KealanBurke Some books I love so much I keep...,"[[kealanburke, book, love, much, keep, forever..."
...,...,...
3655,"[If anyone wants to try it for themselves, her...","[[tried, hand, traditional, venezuelan, staple..."
3656,[China‘s Uncontested Candidate ‘Wins‘ Hong Kon...,"[[reminder, abandoned, american, citizen, fami..."
3657,[RT @JRMajewski: How is he supposed to fight i...,"[[april, tucson, sector, border, patrol, agent..."
3658,[@kylejluebke Why haven’t you commented on the...,"[[starmedcare, actually, two, quick, question,..."


exact copy of twitter processing

In [32]:
def twitter_preprocessing(text):

    #Step 0
    #This is Praveen's code
    text = text.lower()
    text = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+@[A-Z0-9_]+|#[A-Z0-9_]+|http\S+", "", text).strip().replace("\r", "").replace("\n", "").replace("\t", "")
   
    #step 1: delete all caps words
    t_d = re.sub(r'\b[A-Z]+\b', '', text)

    #step 2: tokenize
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokenized_doc = nltk.regexp_tokenize(t_d, pattern)

    #step 3: stop words
    stopwords_list = stopwords.words('english')
    stop_tokenized_doc = [word for word in tokenized_doc if word not in stopwords_list]

    #step 4: lem
    tokens = [wnl.lemmatize(word) for word in stop_tokenized_doc]
    return ' '.join(tokens)

In [33]:
tweet_df = pd.read_csv('data/unique_tweets_list.csv').drop(columns = ['Unnamed: 0'])

In [34]:
import ast

len(ast.literal_eval(tweet_df.iloc[750,:][0]))

100

In [35]:
def string_to_list(s):
    try:
        l = ast.literal_eval(s)
    except:
        l = None
    return l

In [36]:
tweet_df['tweet_history'] = tweet_df['tweet_history'].apply(lambda x: string_to_list(x))

In [37]:
tweet_df = tweet_df.dropna()

In [38]:
def prepare_tweets(tweet_row):
    tweet_list = []
    for n in range(len(tweet_row)):
        tweet_list.append(twitter_preprocessing(tweet_row[n]))

    cleaned_tweets = []
    for tweet in tweet_list:
        if len(tweet) > 20:
            cleaned_tweets.append(tweet)
    return cleaned_tweets

In [39]:
tweet_df['cleaned_tweets'] = tweet_df['tweet_history'].apply(lambda x: prepare_tweets(x))

In [40]:
tweet_df['vectorized'] = tweet_df['cleaned_tweets'].apply(lambda x: vectorizer.transform(x))

In [41]:

model.transform()

TypeError: transform() missing 1 required positional argument: 'X'

In [42]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3658 entries, 0 to 3659
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_history   3658 non-null   object
 1   cleaned_tweets  3658 non-null   object
 2   vectorized      3658 non-null   object
dtypes: object(3)
memory usage: 114.3+ KB


In [43]:
#Fit the NMF topic generation model

# to get H
H_list = []

for n in range(len(tweet_df)):
    try:
        H_list.append(model.transform(tweet_df['vectorized'][n])) # transform document into topic vector representation
    except:
        continue;

# to get W 
#W = model.components_ # word component weights for each topic

In [44]:
#List out the top 10 words for each topic
topic_weights = []

for n in range(0, len(tweet_df) - 2):
    topic_weights.append(list(pd.DataFrame(H_list[n], columns = list(topic.values())).sum()))

In [45]:
relative_weights_df = pd.DataFrame(topic_weights,  columns = list(topic.values()))

In [46]:
relative_weights_df

Unnamed: 0,American Politics,War in Ukraine,Covid-19,School,Biden Administration,War in Afghanistan,Trump,Abortion,Kyle Rittenhouse Shooting,Covid-19 Vaccine
0,0.680626,0.244300,0.119168,0.731577,0.341945,0.240299,0.988894,0.486750,0.254866,0.224953
1,0.923170,0.111925,0.087826,0.217727,0.209149,0.239076,0.324541,1.091020,0.531066,0.424512
2,0.931812,0.098028,0.093008,0.240249,0.092589,0.135742,0.268368,1.123769,0.165017,0.161810
3,0.070784,0.041267,0.005452,0.044828,0.108038,0.047018,0.110939,0.272246,0.104751,0.063113
4,0.416697,0.222473,0.085506,0.197879,0.080960,0.106119,0.242276,1.331165,0.254181,0.133890
...,...,...,...,...,...,...,...,...,...,...
3651,1.005561,0.230707,0.144323,0.395115,0.240086,0.243627,0.294857,0.704700,0.286908,0.194316
3652,0.609854,1.204737,0.167427,0.091959,0.275848,0.073650,0.365628,0.635599,0.435957,0.143159
3653,0.536499,0.041255,0.045280,0.181352,0.072991,0.113244,0.177772,0.430502,0.161185,0.143768
3654,0.231382,0.393095,0.195363,0.235601,0.249610,0.250492,0.399096,0.752527,0.214983,0.289047


In [47]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2)
kmeans.fit(relative_weights_df)

KMeans(n_clusters=2)

In [48]:
tucker_not_tucker_maybe_we_will_see_soon_enough = pd.Series(kmeans.predict(relative_weights_df))

In [49]:
yay_we_did_a_model_df = pd.concat([relative_weights_df, tucker_not_tucker_maybe_we_will_see_soon_enough], axis = 1)

In [50]:
yay_we_did_a_model_df.iloc[:740,:][0].sort_values(ascending = False)

369    1
289    1
282    1
281    1
280    1
      ..
594    0
597    0
600    0
178    0
739    0
Name: 0, Length: 740, dtype: int32

In [51]:
#Generate tucker carlson distribution
model.transform(td_idf_df)[0]

array([0.05540121, 0.00428809, 0.        , 0.03378609, 0.01889137,
       0.        , 0.00330919, 0.10101941, 0.08129136, 0.        ])

In [57]:
tweet_df['cleaned_tweets']

0       [legislative genius nancy need go away take di...
1       [withoutrunes wack ashimself withoutrunes wack...
2       [sent complaining dealing nan omg i'm dying, h...
3       [turn roe v wade fine prepared care child feed...
4       [book love much keep forever including autogra...
                              ...                        
3655    [anyone want try recipe thing beyond recipe ad...
3656    [china uncontested candidate win hong kong chi...
3657    [rt supposed fight inflation he's busy saying ...
3658    [commented focus conservative overturn roe v w...
3659    [rt biden say he's prepared accept forthcoming...
Name: cleaned_tweets, Length: 3658, dtype: object

In [60]:
wv = Word2Vec.load("word2vec")