In [179]:
import numpy as np
import pandas as pd
import re
from Porter_Stemmer_Python import PorterStemmer
from collections import Counter

# Step 1 - Creating the Feature Vectors

In [68]:
# Load the corpus: the file is split on "\n" and every non-empty line becomes
# one document in `text`.
# NOTE(review): no encoding is passed to open(), so the platform default is used.
with open("./Project4_paragraphs.txt") as f:
    text = list(filter(None, f.read().split("\n")))

In [185]:
def filter_doc(doc: str, stop_words: set[str]) -> list[str]:
    """Turn one raw document into a list of stemmed tokens.

    Applies steps A to F from the instructions to a whole document: strip
    markup/punctuation (B) and digits (C), lower-case (D), tokenize on
    whitespace (A), drop stop words (E) and Porter-stem the rest (F).

    Args:
        doc: Raw text of a single document.
        stop_words: Lower-case words to discard before stemming.

    Returns:
        The stemmed tokens in document order. Empty strings are never
        returned, regardless of whether ``''`` is in ``stop_words``.
    """
    p = PorterStemmer()
    # Step B (step A is done later so that regular expressions can be used).
    # Remove HTML line breaks as whole tags (<br>, <br/>, <br />, any case) so
    # that words which merely start with "<br" are left intact.
    doc = re.sub(r'<br\s*/?>', ' ', doc, flags=re.IGNORECASE)
    # Sentence punctuation separates words, so it becomes a space ...
    doc = re.sub('[.!?;:,()-]', ' ', doc)
    # ... while quotes are deleted outright so contractions stay one token.
    doc = re.sub('[\'\"]', '', doc)
    doc = re.sub('[<|/>@#$%^&*]', ' ', doc)
    # Step C
    doc = re.sub('[0-9]', ' ', doc)
    # Step D
    doc = doc.lower()
    # Steps A & E: split() with no argument splits on any run of whitespace
    # (spaces, tabs, ...) and never yields empty strings, so no separate
    # space-collapsing pass or '' stop word is needed.
    tokens = [word for word in doc.split() if word not in stop_words]
    # Step F: the stemmer takes the word plus inclusive start/end indices.
    return [p.stem(token, 0, len(token) - 1) for token in tokens]

In [167]:
# Load the stop-word list (one word per line) into a set for membership tests.
with open('./Project4_stop_words.txt') as f:
    stop_words = set(f.read().split('\n'))
# Also treat the empty string as a stop word so that empty tokens are never kept.
stop_words.add('')
stop_words

{'',
 'a',
 'able',
 'about',
 'across',
 'after',
 'all',
 'almost',
 'also',
 'am',
 'among',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'but',
 'by',
 'can',
 'cannot',
 'could',
 'dear',
 'did',
 'do',
 'does',
 'either',
 'else',
 'ever',
 'every',
 'for',
 'from',
 'get',
 'got',
 'had',
 'has',
 'have',
 'he',
 'her',
 'hers',
 'him',
 'his',
 'how',
 'however',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 'its',
 'just',
 'least',
 'let',
 'like',
 'likely',
 'may',
 'me',
 'might',
 'most',
 'must',
 'my',
 'neither',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'often',
 'on',
 'only',
 'or',
 'other',
 'our',
 'own',
 'rather',
 'said',
 'say',
 'says',
 'she',
 'should',
 'since',
 'so',
 'some',
 'than',
 'that',
 'the',
 'their',
 'them',
 'then',
 'there',
 'these',
 'they',
 'this',
 'tis',
 'to',
 'too',
 'twas',
 'us',
 'wants',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 '

In [177]:
# Run every document through filter_doc, then show the first ten tokens of the
# first document as a quick sanity check.
tokens = [
    filter_doc(paragraph, stop_words)
    for paragraph in text
]
tokens[0][:10]

['on',
 'review',
 'mention',
 'watch',
 'oz',
 'episod',
 'youll',
 'hook',
 'right',
 'exactli']

## Term Document Matrix
*Note: only as many columns are displayed as Jupyter will show, due to the sheer number of unique words.*

In [186]:
# One row per document, one column per stemmed term; each cell is that term's
# count in that document. Terms absent from a document come out of the
# DataFrame constructor as NaN, hence the fillna(0).
TDM = pd.DataFrame(list(map(Counter, tokens))).fillna(0)
TDM

Unnamed: 0,on,review,mention,watch,oz,episod,youll,hook,right,exactli,...,priestess,timothi,carei,obtrus,visitor,gruesom,grand,guignol,fanat,breathtak
0,1.0,1.0,1.0,3.0,6.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
