# Document Preprocessing and Modeling

## Package Import and Mongo Client

In [3]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import credentials
import time
import string
import config as cfg

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
#!python -m spacy download en_core_web_lg

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

#!pip install pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis

## Dataset Pulldown

In [4]:
# Open the MongoDB connection; the keyword arguments come from the local
# `config` module (imported as `cfg`), so no connection details live in the notebook.
client = MongoClient(**cfg.config)

# Handle to the `youtube` database; later cells use its `captions` and
# `recommendations` collections.
db = client.youtube

In [5]:
def get_caption_block(videoID):
    """Return the stored caption text for one video, or None when unavailable.

    Looks up a single document in ``db.captions`` whose ``videoID`` field
    equals ``videoID`` and returns its ``caption_block`` field.

    Parameters
    ----------
    videoID
        Value matched against the ``videoID`` field of the captions collection.

    Returns
    -------
    The ``caption_block`` value, or ``None`` if no document matches or the
    matching document has no ``caption_block`` field.
    """
    doc = db.captions.find_one({'videoID' : videoID},{'_id' : 0, 'caption_block' : 1})
    # Handle the two expected "no caption" cases explicitly.  The previous bare
    # `except:` also swallowed connection failures and KeyboardInterrupt, which
    # silently turned every caption into None.
    if doc is None:
        return None
    return doc.get('caption_block')
    

In [6]:
def get_rec_vids(collection, filter_dict=None):
    """Flatten recommendation documents into one row per video and attach captions.

    Each document's ``videos`` array is unwound, optionally filtered, and
    projected to ``query``, ``order`` and ``videoID``.  The caption of every
    video is then looked up with ``get_caption_block``.

    Parameters
    ----------
    collection
        Object exposing ``aggregate(pipeline)`` (e.g. ``db.recommendations``).
    filter_dict : dict, optional
        ``$match`` expression applied after the ``$unwind`` stage.  ``None``
        (the default) matches everything.

    Returns
    -------
    (vid_df, caption_series)
        ``vid_df`` is a DataFrame with columns ``query``, ``order``,
        ``videoID`` and ``caption``.  ``caption_series`` holds the captions
        indexed by ``videoID`` with missing and duplicated captions removed.
        Both are empty when the aggregation returns no rows.
    """
    # Use None as the default instead of a shared mutable `{}`.
    match_stage = {} if filter_dict is None else filter_dict
    pipeline = [
        {'$unwind': '$videos'},
        {'$match': match_stage},
        {'$project': {'_id': 0, 'query' : 1, 'order' : '$videos.order', 'videoID' : '$videos.videoID'}}
    ]

    records = list(collection.aggregate(pipeline))
    if not records:
        # An empty result would otherwise fail with KeyError on 'videoID'.
        empty_df = pd.DataFrame(columns=['query', 'order', 'videoID', 'caption'])
        return empty_df, pd.Series(name='caption', dtype=object)

    vid_df = pd.DataFrame(records)
    vid_df['caption'] = vid_df['videoID'].apply(get_caption_block)

    # Build an independent Series rather than re-indexing the object returned
    # by vid_df['caption'], so the returned DataFrame is left untouched.
    caption_series = pd.Series(
        vid_df['caption'].values,
        index=vid_df['videoID'].tolist(),
        name='caption',
    )
    caption_series = caption_series.dropna().drop_duplicates()

    return vid_df, caption_series

In [7]:
# Pull every recommended video (no filter) together with its caption.
# `test` is the (vid_df, caption_series) tuple returned by get_rec_vids.
test = get_rec_vids(db.recommendations)

In [8]:
# Second tuple element: captions indexed by videoID, with missing and
# duplicated captions already dropped.
test[1]

vGPU5SWV1DE    half of Britain said that breaks it's made it ...
6afu04-KO90    Greta turn Berg the teenage climate change act...
FcFUa9SC8JA    you've er decided to join the Liberal Democrat...
Z612WQhdOQ8    it was like an invitation to a dinner party no...
q3W678l8bok    hello there so the Prime Minister Boris Johnso...
xnEr_sNghTg    first of all I have to tell to mr. Hank oh tha...
l0H-BykoktI    I think that if this is part of the conservati...
JD7Ol0gz11k    so where will you be marching this in the next...
O8ClR86bSMg    I absolutely condemn any attempt to blow up th...
JISpI1h8xys    why is it remotely patronizing for a man to op...
-iKcR4tb62Y    and I think women we've always been under trem...
PludY6bjSIU    after recent holiday the journalist Ansel Epst...
uHGPVsQlsFI    how good a friend was your your bridesmaid my ...
RTXqaLEdK5k    [Music] my name is Tam Thompson I'm here today...
VM05BvXgXmM    it is do-or-die in the sense that is my one bi...
AKYvMAVguAM    now Banksy

In [9]:
# spaCy English language object; spacy_tokenizer() calls it to split captions into tokens.
parser = English()
# NOTE: this is a plain str, so `token in punctuations` is a substring test
# (it is also True for the empty string), not a per-character set lookup.
punctuations = string.punctuation
# spaCy's English stop words, materialised as a list.
stopwords = list(STOP_WORDS)

def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` and return the surviving tokens joined by spaces.

    Every token produced by the module-level ``parser`` is lower-cased.
    Tokens whose lemma is not ``"-PRON-"`` are additionally stripped of
    surrounding whitespace.  Tokens found in ``stopwords`` or in
    ``punctuations`` are discarded.
    """
    kept = []
    for token in parser(sentence):
        text = str(token).lower()
        if token.lemma_ != "-PRON-":
            text = text.strip()
        # `punctuations` is a str, so this is a substring check.
        if text in stopwords or text in punctuations:
            continue
        kept.append(text)
    return " ".join(kept)

In [10]:
def update_tokens(videoID,collection):
    """Tokenize and store the caption of one video if it has not been done yet.

    Finds a document with the given ``videoID`` that has no ``tokenized``
    field, runs its ``caption_block`` through ``spacy_tokenizer`` and writes
    the result to ``tokenized``.

    Returns True when a document was found and an update was issued, and
    False otherwise.
    """
    pending = {"tokenized": {'$exists': False,}, 'videoID' : videoID}

    doc = collection.find_one(pending, {'_id' : 0, 'caption_block' : 1})
    if not doc:
        # Nothing matched (or the projection came back empty): nothing to do.
        return False

    tokens = spacy_tokenizer(doc['caption_block'])
    collection.update_one(pending, {'$set' : {"tokenized" : tokens}})
    return True

In [199]:
# Spot check on a single video.  update_tokens returns False when no caption
# document with this videoID is still missing its `tokenized` field.
update_tokens('McRYTC56DC4',db.captions)

False

In [11]:
# Tokenize every caption that has not been processed yet.  Only documents
# lacking a `tokenized` field are fetched, because update_tokens() does nothing
# for the others.  The IDs are collected first so the cursor is exhausted
# before any document is modified.
pending_ids = [
    elem['videoID']
    for elem in db.captions.find({'tokenized': {'$exists': False}}, {'_id' : 0, 'videoID' : 1})
]
for video_id in pending_ids:
    update_tokens(video_id, db.captions)

In [12]:
# Collect the tokenized caption strings to vectorize.  Restrict the query to
# documents that actually have a `tokenized` field: update_tokens() skips
# documents without a caption, and indexing those would raise KeyError.
to_vector_list = [
    elem['tokenized']
    for elem in db.captions.find({'tokenized': {'$exists': True}}, {'_id' : 0, 'tokenized' : 1})
]

In [13]:
# Bag-of-words counts over the tokenized captions.  Terms must appear in at
# least 5 documents and in at most 90% of them.  The token pattern keeps runs
# of three or more letters/hyphens.  It is written as a raw string so that `\-`
# reaches the regex engine without an invalid-escape warning; the pattern
# itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')


data_vectorized = vectorizer.fit_transform(to_vector_list)

  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [14]:
# Show the summary of the fitted sparse count matrix (shape and number of stored entries).
data_vectorized

<1650x20107 sparse matrix of type '<class 'numpy.int64'>'
	with 916623 stored elements in Compressed Sparse Row format>

In [8]:
# Number of topics requested from the models fitted below.
# NOTE(review): NUM_TOPICS is reassigned to 8 in a later cell, so the value the
# models actually see depends on execution order — confirm which one is intended.
NUM_TOPICS = 4
# Display the request log (request ID, query, max results, status, timestamp).
pd.read_csv('request_log.csv')

Unnamed: 0,requestID,query,max_results,request_status,datetime
0,da23078c-bf88-4024-91e9-d6cc6df4353a,global warming,50,202,"Sat, 17 Aug 2019 18:25:00 GMT"
1,f211c3a5-9ffc-4cd1-a700-6448aae3f589,gun rights,50,202,"Sat, 17 Aug 2019 18:25:03 GMT"
2,fd4189b0-8c3b-4821-a5cc-59fee424c73e,donald trump rally,50,202,"Sat, 17 Aug 2019 18:25:49 GMT"
3,f3af736f-91ea-46a6-88ff-7e44f206e5d2,abortion pro life,50,202,"Sat, 17 Aug 2019 18:26:16 GMT"


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

class nlp_preprocessor:
    """Tokenize, clean, optionally stem, and vectorize a collection of documents."""

    @staticmethod
    def dummy(doc):
        """
        Identity function handed to the vectorizer as both its tokenizer and
        its preprocessor, so the vectorizer consumes the already-tokenized
        documents produced by ``preprocess`` unchanged.

        Declared as a static method: it is referenced as ``self.dummy``, and a
        plain ``def dummy(doc)`` would receive the instance as ``doc`` and fail
        with a TypeError as soon as the vectorizer called it.
        """
        return doc

    def __init__(self, vectorizer=None, tokenizer=None, cleaning_function=None,
                 stemmer=None, model=None):
        """
        A class for pipelining our data in NLP problems. The user provides a series of
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data. If None, a
            CountVectorizer is created. A user-supplied vectorizer is modified in
            place: its `tokenizer` and `preprocessor` attributes are replaced by the
            identity function, because tokenizing and cleaning happen in this class.
        tokenizer: callable mapping a document to a sequence of tokens; if None,
            defaults to splitting on single spaces
        cleaning_function: callable applied to every word; if None, defaults to
            lowercasing
        stemmer: optional object exposing `stem(word)`, applied after cleaning
        model: stored on the instance as `self.model`; not used by this class
        """
        if not tokenizer:
            tokenizer = self.splitter
        if not cleaning_function:
            cleaning_function = self.default_clean
        if not vectorizer:
            vectorizer = CountVectorizer(tokenizer=self.dummy,preprocessor=self.dummy)
        else:
            vectorizer.tokenizer = self.dummy
            vectorizer.preprocessor = self.dummy
        self.stemmer = stemmer
        self.tokenizer = tokenizer
        self.model = model
        self.cleaning_function = cleaning_function
        self.vectorizer = vectorizer
        self._is_fit = False

    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

    def preprocess(self, docs, tokenizer, stemmer, cleaning_function):
        """
        Tokenize every document and clean (and optionally stem) every word.

        A token may be a single string or a sequence of words (e.g. an n-gram
        tuple). String tokens are cleaned as one word and stay strings.
        Sequence tokens are cleaned word by word and returned as tuples.

        Returns a list with one entry per document, each entry being the list
        of cleaned tokens of that document.
        """
        def clean(word):
            # Cleaning first, stemming second.
            word = cleaning_function(word)
            return stemmer.stem(word) if stemmer else word

        cleaned_docs = []
        for doc in docs:
            cleaned_tokens = []
            for token in tokenizer(doc):
                if isinstance(token, str):
                    # Iterating a str would yield single characters, so the
                    # cleaning function and stemmer would run per character.
                    cleaned_tokens.append(clean(token))
                else:
                    cleaned_tokens.append(tuple(clean(word) for word in token))
            cleaned_docs.append(cleaned_tokens)
        return cleaned_docs

    def default_clean(self, word):
        """Default cleaning function: lowercase the word."""
        return word.lower()

    def clean_text_old(self, text, tokenizer, stemmer):
        """
        Older cleaning routine, not called by `fit` or `transform`.
        Lowercases (and optionally stems) every token of every document and
        returns each document as a single space-joined string.
        """
        cleaned_text = []
        for post in text:
            cleaned_words = []
            for word in tokenizer(post):
                low_word = word.lower()
                if stemmer:
                    low_word = stemmer.stem(low_word)
                cleaned_words.append(low_word)
            cleaned_text.append(' '.join(cleaned_words))
        return cleaned_text

    def fit(self, docs):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text. Returns self so calls can be chained.
        """
        clean_text = self.preprocess(docs, self.tokenizer, self.stemmer,self.cleaning_function)
        self.vectorizer.fit(clean_text)
        self._is_fit = True
        return self

    def transform(self, docs):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function. Returns the
        vectorized form of the data.

        Raises ValueError if `fit` has not been called first.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.preprocess(docs, self.tokenizer, self.stemmer,self.cleaning_function)
        return self.vectorizer.transform(clean_text)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer


def lower_clean(word):
    """Cleaning function for nlp_preprocessor: return ``word`` lower-cased."""
    lowered = word.lower()
    return lowered


# `test_captions` and `nlp_dict` were used here without ever being defined in
# the notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): the captions are assumed to be the Series returned as the
# second element of `test` (get_rec_vids) — confirm this is the intended corpus.
test_captions = test[1]
nlp_dict = {}

# Treebank tokenization, lowercasing and Porter stemming, then counts for terms
# present in between 10% and 90% of the documents.
nlp = nlp_preprocessor(vectorizer=CountVectorizer(min_df=0.1,max_df=0.9), tokenizer=TreebankWordTokenizer().tokenize,
                       cleaning_function=lower_clean,stemmer=PorterStemmer())

nlp.fit(test_captions)

nlp_dict['untrained'] = nlp.transform(test_captions)

NUM_TOPICS = 8
# random_state makes the stochastic online fit repeatable across kernel restarts.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=100, learning_method='online',
                                verbose=True, random_state=0)
data_lda = lda.fit_transform(data_vectorized)

In [9]:
# Latent Dirichlet Allocation Model
# random_state fixes the random initialisation and mini-batch order, so that
# re-running the notebook reproduces the same topics.
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                verbose=True, random_state=0)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [10]:
# Non-Negative Matrix Factorization Model
# random_state makes the factorization repeatable across runs.
nmf = NMF(n_components=NUM_TOPICS, random_state=0)
data_nmf = nmf.fit_transform(data_vectorized)

In [11]:
# Latent Semantic Indexing Model using Truncated SVD
# random_state makes the randomized SVD solver repeatable across runs.
lsi = TruncatedSVD(n_components=NUM_TOPICS, random_state=0)
data_lsi = lsi.fit_transform(data_vectorized)

In [12]:
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model
        Fitted decomposition exposing ``components_`` (one row of term weights
        per topic).
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    top_n : int
        Number of terms printed per topic, in descending weight order.

    Prints ``Topic <idx>:`` followed by a list of ``(term, weight)`` tuples for
    each topic. Returns None.
    """
    # Newer scikit-learn only provides get_feature_names_out(); older releases
    # only provide get_feature_names().  Use whichever exists.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed term.
    feature_names = get_names()
    if hasattr(feature_names, 'tolist'):
        # Convert an array to a plain list so terms print as ordinary Python objects.
        feature_names = feature_names.tolist()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [13]:
# Keywords for topics clustered by Latent Dirichlet Allocation
# (top 10 terms per topic, the selected_topics default)
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('said', 2578.3349613407854), ('president', 1829.2725649900908), ('trump', 1668.255733195031), ('man', 1406.2749027625916), ('police', 1310.512568545276), ('time', 1211.620649788353), ('know', 1204.2478343235764), ('investigation', 1157.7639157382434), ('told', 1058.5315236249278), ('narrator', 1054.3975877744986)]
Topic 1:
[('like', 3360.10924096578), ('time', 2847.8738477943502), ('years', 2617.6421670461955), ('world', 2251.2347494744545), ('actually', 2249.933691764769), ('going', 2180.2481229553687), ('universe', 1963.384951700923), ('know', 1959.9447214779436), ('way', 1898.0842075820867), ('earth', 1873.796735026446)]
Topic 2:
[('know', 23309.36600420221), ('like', 20074.56493911058), ('people', 16491.143995979863), ('think', 15678.916932181362), ('right', 11834.655172505105), ('going', 10273.167131650069), ('want', 9659.48656141703), ('said', 7840.977369803221), ('got', 7388.494272398069), ('yeah', 7370.634176962241)]
Topic 3:
[('music', 3464.755551638535),

In [14]:
# Keywords for topics clustered by Non-Negative Matrix Factorization
print("NMF Model:")
selected_topics(nmf, vectorizer)

NMF Model:
Topic 0:
[('know', 24.480051410555156), ('like', 20.745264214677352), ('think', 17.36288756168784), ('people', 15.251921734391466), ('right', 11.615074980067009), ('going', 9.04977696754732), ('time', 7.43873519068487), ('said', 7.138734844930159), ('want', 7.098055782909104), ('got', 6.753350003973424)]
Topic 1:
[('jesus', 18.769902864378846), ('let', 17.650392746490855), ('lord', 10.945765255874706), ('life', 10.742220082627039), ('god', 7.433188805700665), ('bind', 5.676348454463307), ('spirits', 5.664028252713735), ('rebuke', 5.170496173956417), ('break', 4.714986927322923), ('spirit', 4.655893311322159)]
Topic 2:
[('president', 16.228213831888713), ('trump', 10.379482562695602), ('said', 7.539526174382537), ('report', 7.526470945694014), ('investigation', 6.112563269978902), ('correct', 5.192586843476637), ('campaign', 4.290874036893819), ('thank', 4.043887113616778), ('going', 3.8877712850612185), ('time', 3.482087263819365)]
Topic 3:
[('yeah', 12.172360817797033), ('l

In [15]:
# Keywords for topics clustered by Latent Semantic Indexing
print("LSI Model:")
selected_topics(lsi, vectorizer)

LSI Model:
Topic 0:
[('know', 0.40317086896863435), ('like', 0.34618583522321994), ('think', 0.29120266806416806), ('people', 0.25109131595978224), ('right', 0.1954859987900009), ('going', 0.1842328623973778), ('said', 0.1542009751415395), ('yeah', 0.13905865289808), ('want', 0.13788717589118574), ('time', 0.13731336689264173)]
Topic 1:
[('jesus', 0.4937346631289153), ('let', 0.4443042590974779), ('lord', 0.2879261167444852), ('life', 0.27140106176918766), ('god', 0.19022663423800842), ('bind', 0.15000471468584173), ('spirits', 0.149553165558465), ('rebuke', 0.1367159378133451), ('break', 0.12192753637972531), ('spirit', 0.12139475748774703)]
Topic 2:
[('like', 0.29852414480405187), ('yeah', 0.26954569907590226), ('love', 0.18293434474216894), ('wow', 0.1596601815582499), ('know', 0.15633516573345912), ('sounds', 0.1214458476527412), ('theater', 0.075606963837201), ('idea', 0.07028613575188701), ('mean', 0.06987826536085501), ('movie', 0.06971449128868645)]
Topic 3:
[('love', 0.2958580

In [24]:
# Prepare the pyLDAvis view of the fitted LDA model from the count matrix and
# its vectorizer; `mds='tsne'` selects t-SNE for the topic layout.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.lda_model` — confirm against the installed version.
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [25]:
# Show the count-matrix summary again.
# NOTE(review): the shape recorded here differs from the one displayed earlier
# in the notebook, which suggests the matrix was rebuilt between runs — confirm
# the models were fitted on this version.
data_vectorized

<1703x20421 sparse matrix of type '<class 'numpy.int64'>'
	with 901265 stored elements in Compressed Sparse Row format>

In [23]:
# Write the prepared visualization to `lda.html` in the working directory.
# Requires `dash` from the pyLDAvis prepare cell to exist first.
pyLDAvis.save_html(dash,'lda.html')