# Exploratory Data Analysis Notebook
This notebook will preprocess and leverage NLP models on the unstructured data to turn it into a usable feature space for modeling Tucker Carlson's body of work

In [115]:
#Imports cell

#Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import nltk
from nltk import word_tokenize
from nltk import FreqDist
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# import the tucker document data either as a CSV or a pickle

#Read out from CSV
tucker_docs = pd.read_csv('data/tucker_docs.csv', encoding='UTF8', header = None).T

#Read from pickle
#tucker_docs = pd.read_pickle('data/tucker_pickle')

In [3]:
tucker_docs.head()

Unnamed: 0,0
0,Fox News host gives his take on pro-abortion ...
1,Fox News host reflects on the left's respons...
2,Fox News host gives his take on how Americans...
3,Fox News host gives his take on the Supreme C...
4,Fox News host gives his take on the real moti...


## Implementing the Bradley-Haderthauer Test
Compare two topic distributions: IF BH-score is < 0.2, then a Twitterer can be confidently classified as a Tuckerbot. This person is a lower life form and unable to contribute, in good faith, to the deep state media platform of choice, Twitter.

In [4]:
#make custom stops words to remove first 100 words? remove intro to episode 
#remove words in all caps 

## remove words in all caps

In [5]:


tucker_doc = tucker_docs.iloc[0,0]
#pattern to delete words in all caps
#pattern = "(([a-zA-Z]+(?:'[a-z]+)?))"
t_d = re.sub(r'\b[A-Z]+\b', '', tucker_doc)


In [6]:
type(t_d)

str

In [7]:

pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenized_doc = nltk.regexp_tokenize(t_d, pattern)

In [8]:
#tucker_docs.iloc[0,0]
tokenized_doc

['Fox',
 'News',
 'host',
 'gives',
 'his',
 'take',
 'on',
 'pro',
 'abortion',
 'protesters',
 'targeting',
 'Supreme',
 'Court',
 'justices',
 'over',
 'the',
 'possible',
 'overturn',
 'of',
 'Roe',
 'v',
 'Wade',
 'on',
 'Tucker',
 'Carlson',
 'Tonight',
 "It's",
 'pretty',
 'hard',
 'to',
 'argue',
 'with',
 'people',
 'who',
 'are',
 'passive',
 'aggressive',
 'You',
 'may',
 'have',
 'tried',
 'it',
 'before',
 'Why',
 'are',
 'you',
 'so',
 'angry',
 'they',
 'scream',
 'Stop',
 'being',
 'violent',
 'they',
 'snarl',
 'as',
 'they',
 'punch',
 'you',
 'in',
 'the',
 'face',
 'Passive',
 'aggressive',
 'people',
 'are',
 'intent',
 'on',
 'dominating',
 'you',
 'but',
 "they're",
 'too',
 'dishonest',
 'to',
 'admit',
 'it',
 'Now',
 "it's",
 'not',
 'an',
 'honorable',
 'style',
 'of',
 'attack',
 'but',
 "it's",
 'very',
 'effective',
 'mostly',
 'because',
 "it's",
 'so',
 'bewildering',
 'The',
 'Democratic',
 'Party',
 'practices',
 'this',
 'Democrats',
 'will',
 'never'

In [9]:
#lowercase all words
tokenized_doc = [word.lower() for word in tokenized_doc]
tokenized_doc

['fox',
 'news',
 'host',
 'gives',
 'his',
 'take',
 'on',
 'pro',
 'abortion',
 'protesters',
 'targeting',
 'supreme',
 'court',
 'justices',
 'over',
 'the',
 'possible',
 'overturn',
 'of',
 'roe',
 'v',
 'wade',
 'on',
 'tucker',
 'carlson',
 'tonight',
 "it's",
 'pretty',
 'hard',
 'to',
 'argue',
 'with',
 'people',
 'who',
 'are',
 'passive',
 'aggressive',
 'you',
 'may',
 'have',
 'tried',
 'it',
 'before',
 'why',
 'are',
 'you',
 'so',
 'angry',
 'they',
 'scream',
 'stop',
 'being',
 'violent',
 'they',
 'snarl',
 'as',
 'they',
 'punch',
 'you',
 'in',
 'the',
 'face',
 'passive',
 'aggressive',
 'people',
 'are',
 'intent',
 'on',
 'dominating',
 'you',
 'but',
 "they're",
 'too',
 'dishonest',
 'to',
 'admit',
 'it',
 'now',
 "it's",
 'not',
 'an',
 'honorable',
 'style',
 'of',
 'attack',
 'but',
 "it's",
 'very',
 'effective',
 'mostly',
 'because',
 "it's",
 'so',
 'bewildering',
 'the',
 'democratic',
 'party',
 'practices',
 'this',
 'democrats',
 'will',
 'never'

In [10]:
td_freqdist = FreqDist(tokenized_doc)
td_freqdist.most_common(75)

[('the', 120),
 ('of', 58),
 ('and', 58),
 ('to', 50),
 ('they', 45),
 ('in', 40),
 ('are', 35),
 ('a', 34),
 ('that', 30),
 ('you', 29),
 ('it', 28),
 ('people', 22),
 ('we', 21),
 ('or', 21),
 ('so', 20),
 ('what', 20),
 ('on', 19),
 ('is', 19),
 ('court', 17),
 ('have', 17),
 ('this', 17),
 ("it's", 16),
 ('not', 16),
 ('their', 16),
 ('for', 16),
 ('with', 15),
 ('as', 15),
 ('but', 15),
 ("they're", 15),
 ('supreme', 14),
 ('like', 14),
 ('who', 12),
 ("that's", 12),
 ('be', 12),
 ('us', 12),
 ('right', 12),
 ('because', 11),
 ('why', 10),
 ('now', 9),
 ('know', 9),
 ('threat', 9),
 ('these', 9),
 ('justices', 8),
 ('about', 8),
 ('law', 8),
 ("don't", 8),
 ('angry', 7),
 ('up', 7),
 ('them', 7),
 ('at', 7),
 ('far', 7),
 ('from', 7),
 ('can', 7),
 ('do', 7),
 ('just', 7),
 ('an', 6),
 ('very', 6),
 ('if', 6),
 ('one', 6),
 ('jen', 6),
 ('no', 6),
 ('by', 6),
 ('did', 6),
 ('sam', 6),
 ('position', 6),
 ('would', 6),
 ('then', 6),
 ('violence', 6),
 ('white', 6),
 ('m', 6),
 ('his

In [11]:
stopwords_list = stopwords.words('english')
stop_tokenized_doc = [word for word in tokenized_doc if word not in stopwords_list]

In [12]:
stop_tokenized_doc

['fox',
 'news',
 'host',
 'gives',
 'take',
 'pro',
 'abortion',
 'protesters',
 'targeting',
 'supreme',
 'court',
 'justices',
 'possible',
 'overturn',
 'roe',
 'v',
 'wade',
 'tucker',
 'carlson',
 'tonight',
 'pretty',
 'hard',
 'argue',
 'people',
 'passive',
 'aggressive',
 'may',
 'tried',
 'angry',
 'scream',
 'stop',
 'violent',
 'snarl',
 'punch',
 'face',
 'passive',
 'aggressive',
 'people',
 'intent',
 'dominating',
 "they're",
 'dishonest',
 'admit',
 'honorable',
 'style',
 'attack',
 'effective',
 'mostly',
 'bewildering',
 'democratic',
 'party',
 'practices',
 'democrats',
 'never',
 'meet',
 'open',
 'field',
 'battle',
 'instead',
 'sneak',
 'behind',
 'knock',
 'unconscious',
 'bag',
 'sanctimony',
 'party',
 'weak',
 'men',
 'angry',
 'women',
 'passive',
 'aggression',
 'mode',
 'communication',
 'ever',
 'seen',
 'one',
 'jen',
 "psaki's",
 'press',
 'conferences',
 'know',
 'exactly',
 "we're",
 'talking',
 'watched',
 'one',
 'yesterday',
 'fact',
 'last',
 

In [13]:
stop_td_freqdist = FreqDist(stop_tokenized_doc)
stop_td_freqdist.most_common(75)

[('people', 22),
 ('court', 17),
 ("they're", 15),
 ('supreme', 14),
 ('like', 14),
 ("that's", 12),
 ('us', 12),
 ('right', 12),
 ('know', 9),
 ('threat', 9),
 ('justices', 8),
 ('law', 8),
 ('angry', 7),
 ('far', 7),
 ('one', 6),
 ('jen', 6),
 ('sam', 6),
 ('position', 6),
 ('would', 6),
 ('violence', 6),
 ('white', 6),
 ('abortion', 5),
 ('roe', 5),
 ('v', 5),
 ('wade', 5),
 ('passive', 5),
 ('psaki', 5),
 ('want', 5),
 ('going', 5),
 ('aggressive', 4),
 ('may', 4),
 ('intent', 4),
 ('fact', 4),
 ('well', 4),
 ('justice', 4),
 ('alito', 4),
 ("alito's", 4),
 ('government', 4),
 ('say', 4),
 ('near', 4),
 ('building', 4),
 ('watch', 4),
 ('think', 4),
 ('real', 4),
 ('ones', 4),
 ('enforcement', 4),
 ('get', 4),
 ('look', 4),
 ('kids', 4),
 ('face', 3),
 ('party', 3),
 ('never', 3),
 ('instead', 3),
 ('administration', 3),
 ('conservative', 3),
 ('order', 3),
 ('federal', 3),
 ("can't", 3),
 ('make', 3),
 ('family', 3),
 ('official', 3),
 ('outside', 3),
 ("we've", 3),
 ('according',

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/monicahaderthauer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
#to lem 
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [16]:
wnl.lemmatize('kids')

'kid'

In [17]:
#wnl.lemmatize(wn1.lemmatize(word) for word in stop_tokenized_doc)
tokens = [wnl.lemmatize(word) for word in stop_tokenized_doc]
tokens

#lemmatizer = WordNetLemmatizer()
#def lemmatize_words(text):
   # return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
#stop_tokenized_doc = stop_tokenized_doc.apply(lambda text: lemmatize_words(stop_tokenized_doc))

['fox',
 'news',
 'host',
 'give',
 'take',
 'pro',
 'abortion',
 'protester',
 'targeting',
 'supreme',
 'court',
 'justice',
 'possible',
 'overturn',
 'roe',
 'v',
 'wade',
 'tucker',
 'carlson',
 'tonight',
 'pretty',
 'hard',
 'argue',
 'people',
 'passive',
 'aggressive',
 'may',
 'tried',
 'angry',
 'scream',
 'stop',
 'violent',
 'snarl',
 'punch',
 'face',
 'passive',
 'aggressive',
 'people',
 'intent',
 'dominating',
 "they're",
 'dishonest',
 'admit',
 'honorable',
 'style',
 'attack',
 'effective',
 'mostly',
 'bewildering',
 'democratic',
 'party',
 'practice',
 'democrat',
 'never',
 'meet',
 'open',
 'field',
 'battle',
 'instead',
 'sneak',
 'behind',
 'knock',
 'unconscious',
 'bag',
 'sanctimony',
 'party',
 'weak',
 'men',
 'angry',
 'woman',
 'passive',
 'aggression',
 'mode',
 'communication',
 'ever',
 'seen',
 'one',
 'jen',
 "psaki's",
 'press',
 'conference',
 'know',
 'exactly',
 "we're",
 'talking',
 'watched',
 'one',
 'yesterday',
 'fact',
 'last',
 'peter

In [18]:
stop_tokenized_doc

['fox',
 'news',
 'host',
 'gives',
 'take',
 'pro',
 'abortion',
 'protesters',
 'targeting',
 'supreme',
 'court',
 'justices',
 'possible',
 'overturn',
 'roe',
 'v',
 'wade',
 'tucker',
 'carlson',
 'tonight',
 'pretty',
 'hard',
 'argue',
 'people',
 'passive',
 'aggressive',
 'may',
 'tried',
 'angry',
 'scream',
 'stop',
 'violent',
 'snarl',
 'punch',
 'face',
 'passive',
 'aggressive',
 'people',
 'intent',
 'dominating',
 "they're",
 'dishonest',
 'admit',
 'honorable',
 'style',
 'attack',
 'effective',
 'mostly',
 'bewildering',
 'democratic',
 'party',
 'practices',
 'democrats',
 'never',
 'meet',
 'open',
 'field',
 'battle',
 'instead',
 'sneak',
 'behind',
 'knock',
 'unconscious',
 'bag',
 'sanctimony',
 'party',
 'weak',
 'men',
 'angry',
 'women',
 'passive',
 'aggression',
 'mode',
 'communication',
 'ever',
 'seen',
 'one',
 'jen',
 "psaki's",
 'press',
 'conferences',
 'know',
 'exactly',
 "we're",
 'talking',
 'watched',
 'one',
 'yesterday',
 'fact',
 'last',
 

In [101]:
def preprocessing(text):
    #step 1: delete all caps words
    t_d = re.sub(r'\b[A-Z]+\b', '', text)
    #step 2: tokenize
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokenized_doc = nltk.regexp_tokenize(t_d, pattern)
    #step 3: lower all cases
    low_tokenized_doc = [word.lower() for word in tokenized_doc]
    #step 4: stop words
    stopwords_list = stopwords.words('english')
    stop_tokenized_doc = [word for word in low_tokenized_doc if word not in stopwords_list]
    #step 5: lem
    tokens = [wnl.lemmatize(word) for word in stop_tokenized_doc]
    return tokens

In [102]:
tucker_list = tucker_docs[0].tolist()
new_list = []
for each_doc in tucker_list:
    new_list.append(preprocessing(each_doc))

In [103]:
type(new_list[0][1])

str

In [104]:
new_list[0]

['fox',
 'news',
 'host',
 'give',
 'take',
 'pro',
 'abortion',
 'protester',
 'targeting',
 'supreme',
 'court',
 'justice',
 'possible',
 'overturn',
 'roe',
 'v',
 'wade',
 'tucker',
 'carlson',
 'tonight',
 'pretty',
 'hard',
 'argue',
 'people',
 'passive',
 'aggressive',
 'may',
 'tried',
 'angry',
 'scream',
 'stop',
 'violent',
 'snarl',
 'punch',
 'face',
 'passive',
 'aggressive',
 'people',
 'intent',
 'dominating',
 "they're",
 'dishonest',
 'admit',
 'honorable',
 'style',
 'attack',
 'effective',
 'mostly',
 'bewildering',
 'democratic',
 'party',
 'practice',
 'democrat',
 'never',
 'meet',
 'open',
 'field',
 'battle',
 'instead',
 'sneak',
 'behind',
 'knock',
 'unconscious',
 'bag',
 'sanctimony',
 'party',
 'weak',
 'men',
 'angry',
 'woman',
 'passive',
 'aggression',
 'mode',
 'communication',
 'ever',
 'seen',
 'one',
 'jen',
 "psaki's",
 'press',
 'conference',
 'know',
 'exactly',
 "we're",
 'talking',
 'watched',
 'one',
 'yesterday',
 'fact',
 'last',
 'peter

In [112]:
FD = FreqDist(new_list[0])
FD

FreqDist({'people': 22, 'court': 18, "they're": 15, 'supreme': 14, 'like': 14, 'right': 13, 'justice': 12, "that's": 12, 'u': 12, 'one': 10, ...})

In [113]:
new_list[1]
#.get_feature_names()
new_list[0][1].getrow(1)

AttributeError: 'str' object has no attribute 'getrow'

In [81]:
feature_names = new_list[0][0].get_feature_names()
not_so_sparse_not_so_spicy = pd.DataFrame(new_list[0][1].toarray(), columns = feature_names)

In [82]:
not_so_sparse_not_so_spicy.sort_values(by = ['address'])

Unnamed: 0,able,abortion,abortionist,absurd,according,across,activist,actual,actually,address,...,worse,would,write,wrote,ya,yeah,year,yesterday,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
779,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
384,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [83]:
NLfreqdist = FreqDist(feature_names)
NLfreqdist

FreqDist({'able': 1, 'abortion': 1, 'abortionist': 1, 'absurd': 1, 'according': 1, 'across': 1, 'activist': 1, 'actual': 1, 'actually': 1, 'address': 1, ...})

In [118]:
vectorize= CountVectorizer()

In [120]:
vect = vectorize.fit_transform(new_list[0])

In [121]:
vect

<1169x660 sparse matrix of type '<class 'numpy.int64'>'
	with 1174 stored elements in Compressed Sparse Row format>

In [123]:
from sklearn.cluster import KMeans
kmeans = KMeans()