The goals of this code are to keep every cell and function pure, to document everything well, and to keep functions small. I would greatly appreciate any contributions that follow the same guidelines.

In [77]:
## in case any packages aren't installed, uncomment and run the following line.
# !pip3 install nltk pandas

In [67]:
from nltk import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import pandas as pd

# Phase One: loading and cleaning text

In [55]:
def load_into_reviews(path='./data/small_train.txt', encoding=None):
    """Read a reviews file and return its lines as a list of strings.

    Every line of the file is treated as one review and is returned
    unchanged, including its trailing newline character when present.

    Args:
        path: location of the text file to read. Defaults to the small
            training file this notebook was written against.
        encoding: text encoding handed to ``open``. ``None`` keeps the
            platform default, matching a plain ``open(path)`` call.

    Returns:
        list of str with one entry per line, in file order. An empty file
        yields an empty list.

    Raises:
        OSError: if the file cannot be opened (propagated from ``open``).
    """
    with open(path, encoding=encoding) as text:
        # Iterating a text file yields one line at a time until end of file,
        # which is exactly what the manual readline() loop did.
        return list(text)
reviews_list = load_into_reviews()
## check if loaded correctly
# print('reviews number:', len(reviews_list),'\nfirst review:', reviews_list[0])

In [56]:
## check if correctly loaded
def is_one_review_per_entry(reviews_list):
    """Find entries that do not contain exactly one '#EOF' end tag.

    Args:
        reviews_list: list of review strings.

    Returns:
        dict mapping the position of each offending entry in
        ``reviews_list`` to the number of end tags found in it. The dict is
        empty when every entry holds exactly one end tag.
    """
    end_tag = '#EOF'
    ## count the end tag once per entry, lazily and in list order
    tag_counts = (entry.count(end_tag) for entry in reviews_list)
    ## keep only the positions whose count differs from the expected single tag
    return {position: found for position, found in enumerate(tag_counts) if found != 1}
incorrect_readins = is_one_review_per_entry(reviews_list)
## check results:
# print('length of incorrect_readins:', len(incorrect_readins)) ## should equal zero

In [57]:
## tokenizing reviews
def to_tokenized_list(reviews_list):
    """Tokenize every review with ``word_tokenize``.

    Args:
        reviews_list: list of review strings.

    Returns:
        list with one entry per review, in the same order, where each entry
        is whatever ``word_tokenize`` returns for that review.
    """
    return [word_tokenize(raw_review) for raw_review in reviews_list]
tokenized_reviews_list = to_tokenized_list(reviews_list)
## check if loaded correctly
# print('reviews number:', len(tokenized_reviews_list), '\nfirst tokenized review:', tokenized_reviews_list[0])

In [58]:
## removing extra tokens. 
def remove_extra_tokens(tokenized_reviews_list):
    """Drop punctuation tokens and English stopwords from every review.

    Args:
        tokenized_reviews_list: list of reviews, each a list of string tokens.

    Returns:
        A new list of reviews (lists of tokens) in the same order as the
        input. The input lists are not modified.
    """
    ## Fetch the stopword list once and keep it in a set. Looking it up again
    ## for every token repeats identical work, and a set makes each membership
    ## test a hash lookup instead of a scan over the whole list.
    english_stopwords = set(stopwords.words('english'))

    def is_kept(token):
        ## `punctuation` is a string, so `in` is a substring test: a token is
        ## dropped when it appears verbatim inside the punctuation string.
        ## The stopword comparison is an exact, case-sensitive match.
        return token not in punctuation and token not in english_stopwords

    return [
        [token for token in review if is_kept(token)]
        for review in tokenized_reviews_list
    ]
clean_reviews_list = remove_extra_tokens(tokenized_reviews_list)
# print('first cleaned review', clean_reviews_list[0])

first cleaned review ['+1', 'One', 'all-time', 'favorite', "so-laughably-lousy-that-it's-totally-lovable", 'el', 'cheapo', 'stinko', 'nickel', "n'dime", 'independent', 'horror', 'creature', 'features', 'enjoyably', 'dreadful', 'marvel', 'released', 'formidably', 'fecund', 'exploitation', 'outfit', 'Crown', 'International', 'Pictures', 'could', 'play', 'numerous', 'crappy', 'double', 'bills', 'countless', 'drive-ins', 'back', '70', "'s", 'eventually', 'wound', 'rerun', 'like', 'crazy', 'several', 'small-time', 'secondary', 'cable', 'stations', 'throughout', '80', "'s", 'I', 'naturally', 'first', 'saw', 'gloriously', 'ghastly', 'abomination', 'late-night', 'television', 'one', 'fateful', 'Saturday', 'evening', 'early', 'teens', 'deep-seated', 'albeit', 'completely', 'irrational', 'abiding', 'fondness', 'ever', 'since.', 'br', 'br', 'A', 'meteorite', 'falls', 'sky', 'crashes', 'still', 'waters', 'tranquil', 'country', 'lake', 'thereby', 'causing', 'heretofore', 'dormant', 'dinosaur', 'egg

In [59]:
## more cleaning could be done at this point to improve quality. 

In [60]:
def get_ratings(clean_reviews_list):
    """Read the rating stored as the first token of each review.

    Args:
        clean_reviews_list: list of reviews, each a list of tokens whose
            first token can be converted with ``int`` (e.g. '+1' or '-1').

    Returns:
        list of int ratings, one per review, in the same order. The reviews
        themselves are neither returned nor modified.
    """
    return [int(tokens[0]) for tokens in clean_reviews_list]
ratings_y = get_ratings(clean_reviews_list)
## check to see if works
# print('ratings', ratings_y)

ratings [1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, -1]


In [61]:
def remove_ratings_from_reviews_list(clean_reviews_list):
    """Strip the first and the last token from every review.

    The first token is the rating and the last one is the trailing 'EOF'
    marker, so what remains is the review text only.

    Args:
        clean_reviews_list: list of reviews, each a list of tokens.

    Returns:
        A new list holding a trimmed copy of each review, in the same order.
    """
    trimmed_reviews = []
    for tokens in clean_reviews_list:
        ## slice off position 0 (rating) and the final position (end marker)
        trimmed_reviews.append(tokens[1:-1])
    return trimmed_reviews
clean_reviews_short_list = remove_ratings_from_reviews_list(clean_reviews_list)
## check to see if it works
# print(clean_reviews_short_list[0])

['One', 'all-time', 'favorite', "so-laughably-lousy-that-it's-totally-lovable", 'el', 'cheapo', 'stinko', 'nickel', "n'dime", 'independent', 'horror', 'creature', 'features', 'enjoyably', 'dreadful', 'marvel', 'released', 'formidably', 'fecund', 'exploitation', 'outfit', 'Crown', 'International', 'Pictures', 'could', 'play', 'numerous', 'crappy', 'double', 'bills', 'countless', 'drive-ins', 'back', '70', "'s", 'eventually', 'wound', 'rerun', 'like', 'crazy', 'several', 'small-time', 'secondary', 'cable', 'stations', 'throughout', '80', "'s", 'I', 'naturally', 'first', 'saw', 'gloriously', 'ghastly', 'abomination', 'late-night', 'television', 'one', 'fateful', 'Saturday', 'evening', 'early', 'teens', 'deep-seated', 'albeit', 'completely', 'irrational', 'abiding', 'fondness', 'ever', 'since.', 'br', 'br', 'A', 'meteorite', 'falls', 'sky', 'crashes', 'still', 'waters', 'tranquil', 'country', 'lake', 'thereby', 'causing', 'heretofore', 'dormant', 'dinosaur', 'egg', 'hatch', 'Of', 'course',

In [62]:
## final preprocessing for tokenizer use. 
def stringify_reviews_tokens(clean_reviews_short_list):
    """Join the tokens of each review into one space-separated string.

    Args:
        clean_reviews_short_list: list of reviews, each a list of string tokens.

    Returns:
        list of str, one per review, in the same order. A review with no
        tokens becomes an empty string.
    """
    return [' '.join(tokens) for tokens in clean_reviews_short_list]
clean_reviews = stringify_reviews_tokens(clean_reviews_short_list)
# print(clean_reviews[1])

I high hopes film I thought CLEAN SHAVEN Kerrigan 's first feature absolutely terrific assuredly cinematic low budget film I 'd ever seen. br br But much CLAIRE DOLAN utterly pointless flat Scene scene seems randomly tossed mix without much thought narrative character. br br Is Claire trying escape prostitute Hard tell Why pick trick airport wants escape life Why pick tricks needs money Seattle Why see dye hair virtually exact color Why Claire accept johns others The filmmaker n't seem know. br br It feels everything improvised though I understand n't case filmmakers held camera making verite documentary. br br After screening I saw Kerrigan defended lack narrative choices condemning film narrative politically conservative It sounded like learned rhetoric I think cop-out. br br I saddened maker film exciting CLEAN SHAVEN would go make lame film one defend tired old `` political '' cliches


## Saving for future use

In [79]:
## saves cleaned tokens and strings for use in next steps
def reviews_to_df(ratings=None, tokens=None, reviews=None):
    """Bundle ratings, token lists and cleaned review strings into a DataFrame.

    The three inputs can be passed explicitly, which keeps the function pure.
    Any argument left as ``None`` falls back to the notebook-level variable
    of the earlier cells, so a bare ``reviews_to_df()`` call behaves as before.

    Args:
        ratings: sequence of ratings; falls back to ``ratings_y``.
        tokens: sequence of token lists; falls back to
            ``clean_reviews_short_list``.
        reviews: sequence of cleaned review strings; falls back to
            ``clean_reviews``.

    Returns:
        pandas.DataFrame with the columns 'ratings', 'tokens' and
        'clean_reviews', in that order, one row per review.
    """
    ## Fall back to the notebook globals only for arguments the caller omitted.
    if ratings is None:
        ratings = ratings_y
    if tokens is None:
        tokens = clean_reviews_short_list
    if reviews is None:
        reviews = clean_reviews

    ## Build the frame in one step; dict order fixes the column order.
    return pd.DataFrame({
        'ratings': ratings,
        'tokens': tokens,
        'clean_reviews': reviews,
    })
reviews_df = reviews_to_df()
# reviews_df.head()

In [87]:
def save_df(reviews_df, path='./data/reviews_df.pickle'):
    """Pickle the reviews DataFrame to disk for use in later steps.

    Args:
        reviews_df: the DataFrame to store.
        path: destination file. Defaults to the location this notebook has
            always written to, so existing ``save_df(reviews_df)`` calls are
            unaffected.

    Returns:
        None. The only effect is the file written at ``path``.
    """
    ## NOTE: pickle files should only be read back from trusted sources,
    ## because unpickling can execute arbitrary code.
    reviews_df.to_pickle(path)
save_df(reviews_df)
## below is to check whether the save worked. 
# df = pd.read_pickle('./data/reviews_df.pickle')
# df.head()