# Exploratory Data Analysis Notebook
This notebook will preprocess and leverage NLP models on the unstructured data to turn it into a usable feature space for modeling Tucker Carlson's body of work

In [6]:
#Imports cell

#Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import nltk
from nltk import word_tokenize
from nltk import FreqDist
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# import the tucker document data either as a CSV or a pickle

#Read out from CSV
tucker_docs = pd.read_csv('data/tucker_docs.csv', encoding='UTF8', header = None).T

#Read from pickle
#tucker_docs = pd.read_pickle('data/tucker_pickle')

In [8]:
tucker_docs.head()

Unnamed: 0,0
0,Fox News host gives his take on pro-abortion ...
1,Fox News host reflects on the left's respons...
2,Fox News host gives his take on how Americans...
3,Fox News host gives his take on the Supreme C...
4,Fox News host gives his take on the real moti...


## Implementing the Bradley-Haderthauer Test
Compare two topic distributions: IF BH-score is < 0.2, then a Twitterer can be confidently classified as a Tuckerbot. This person is a lower life form and unable to contribute, in good faith, to the deep state media platform of choice, Twitter.

In [9]:
#make custom stops words to remove first 100 words? remove intro to episode 
#remove words in all caps 

## remove words in all caps

In [10]:


tucker_doc = tucker_docs.iloc[0,0]
#pattern to delete words in all caps
#pattern = "(([a-zA-Z]+(?:'[a-z]+)?))"
t_d = re.sub(r'\b[A-Z]+\b', '', tucker_doc)
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenized_doc = nltk.regexp_tokenize(t_d, pattern)

#lowercase all words
tokenized_doc = [word.lower() for word in tokenized_doc]
tokenized_doc

#Freqdist
td_freqdist = FreqDist(tokenized_doc)
td_freqdist.most_common(75)

#Stop words
stopwords_list = stopwords.words('english')
stop_tokenized_doc = [word for word in tokenized_doc if word not in stopwords_list]

#Stopped freqdist 
stop_td_freqdist = FreqDist(stop_tokenized_doc)
stop_td_freqdist.most_common(75)

#lemmatize
#to lem 
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

#more lemmatizin
#wnl.lemmatize(wn1.lemmatize(word) for word in stop_tokenized_doc)
tokens = [wnl.lemmatize(word) for word in stop_tokenized_doc]
tokens

#lemmatizer = WordNetLemmatizer()
#def lemmatize_words(text):
   # return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
#stop_tokenized_doc = stop_tokenized_doc.apply(lambda text: lemmatize_words(stop_tokenized_doc))

['fox',
 'news',
 'host',
 'give',
 'take',
 'pro',
 'abortion',
 'protester',
 'targeting',
 'supreme',
 'court',
 'justice',
 'possible',
 'overturn',
 'roe',
 'v',
 'wade',
 'tucker',
 'carlson',
 'tonight',
 'pretty',
 'hard',
 'argue',
 'people',
 'passive',
 'aggressive',
 'may',
 'tried',
 'angry',
 'scream',
 'stop',
 'violent',
 'snarl',
 'punch',
 'face',
 'passive',
 'aggressive',
 'people',
 'intent',
 'dominating',
 "they're",
 'dishonest',
 'admit',
 'honorable',
 'style',
 'attack',
 'effective',
 'mostly',
 'bewildering',
 'democratic',
 'party',
 'practice',
 'democrat',
 'never',
 'meet',
 'open',
 'field',
 'battle',
 'instead',
 'sneak',
 'behind',
 'knock',
 'unconscious',
 'bag',
 'sanctimony',
 'party',
 'weak',
 'men',
 'angry',
 'woman',
 'passive',
 'aggression',
 'mode',
 'communication',
 'ever',
 'seen',
 'one',
 'jen',
 "psaki's",
 'press',
 'conference',
 'know',
 'exactly',
 "we're",
 'talking',
 'watched',
 'one',
 'yesterday',
 'fact',
 'last',
 'peter

### Preprocess function to do all steps above at once

In [11]:
def preprocessing(text):

    #Step 0
    #This is Praveen's code but it's not PEP-8 friendly so you should fix that for next cohort ty
    text = text.lower()
    text = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+@[A-Z0-9_]+|#[A-Z0-9_]+|http\S+", "", text).strip().replace("\r", "").replace("\n", "").replace("\t", "")
   
    #step 1: delete all caps words
    t_d = re.sub(r'\b[A-Z]+\b', '', text)

    #step 2: tokenize
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    tokenized_doc = nltk.regexp_tokenize(t_d, pattern)

    #step 3: stop words
    stopwords_list = stopwords.words('english')
    stop_tokenized_doc = [word for word in tokenized_doc if word not in stopwords_list]

    #step 4: lem
    tokens = [wnl.lemmatize(word) for word in stop_tokenized_doc]
    return ' '.join(tokens)

In [12]:
tucker_list = tucker_docs[0].tolist()
new_list = []
for each_doc in tucker_list:
    new_list.append(preprocessing(each_doc))

In [13]:
feature_names = new_list[0][0].get_feature_names()
not_so_sparse_not_so_spicy = pd.DataFrame(new_list[0][1].toarray(), columns = feature_names)

AttributeError: 'str' object has no attribute 'get_feature_names'

In [14]:
not_so_sparse_not_so_spicy.sort_values(by = ['address'])

NameError: name 'not_so_sparse_not_so_spicy' is not defined

In [15]:
vectorize= CountVectorizer()

In [16]:
vect = vectorize.fit_transform(new_list[0])

ValueError: Iterable over raw text documents expected, string object received.

In [17]:
vect

NameError: name 'vect' is not defined

In [18]:
from sklearn.cluster import KMeans
kmeans = KMeans()

## Preprocess the twitter data

In [19]:
tweet_df = pd.read_csv('data/unique_tweets_list.csv').drop(columns = ['Unnamed: 0'])

In [20]:
import ast

len(ast.literal_eval(tweet_df.iloc[750,:][0]))

100

In [21]:
def string_to_list(s):
    try:
        l = ast.literal_eval(s)
    except:
        l = None
    return l

In [22]:
tweet_df['tweet_history'] = tweet_df['tweet_history'].apply(lambda x: string_to_list(x))

In [23]:
tweet_df = tweet_df.dropna()

In [24]:
def prepare_tweets(tweet_row):
    tweet_list = []
    for n in range(len(tweet_row)):
        tweet_list.append(preprocessing(tweet_row[n]))

    cleaned_tweets = []
    for tweet in tweet_list:
        if len(tweet) > 20:
            cleaned_tweets.append(tweet)
    return cleaned_tweets

In [25]:
tweet_df['cleaned_tweets'] = tweet_df['tweet_history'].apply(lambda x: prepare_tweets(x))