The goals of this code are to keep every cell and function in the form of pure functions, to document well, and to keep the functions small. I would greatly appreciate any contributions following the same guidelines.

In [1]:
## In case any packages aren't installed, you can uncomment and run the following line.
# !pip3 install nltk scikit-learn

In [2]:
from nltk import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Phase One: loading and cleaning text

In [3]:
def load_into_reviews(path='./data/small_train.txt', encoding=None):
    """returns list of each individual review

    Every line of the file is treated as one review and is kept verbatim,
    including its trailing newline (blank lines are kept as well).

    path: file to read; defaults to the small training set used so far.
    encoding: text encoding handed to open(); None keeps the platform default.
    """
    with open(path, encoding=encoding) as text:
        ## iterating the file object yields the same lines as repeated readline() calls
        return list(text)
## read the reviews once; the cells below take this list as their input
reviews_list = load_into_reviews()
## check if loaded correctly
# print('reviews number:', len(reviews_list),'\nfirst review:', reviews_list[0])

In [4]:
## check if correctly loaded
def is_one_review_per_entry(reviews_list):
    """returns dict of incorrect reviews

    keys are positions in reviews_list, values are how many '#EOF' tags that
    entry holds; only entries whose count is not exactly one are reported,
    so an empty dict means every entry looks like a single review.
    """
    end_tag = '#EOF'
    ## one tag count per review, in the same order as reviews_list
    tag_counts = (entry.count(end_tag) for entry in reviews_list)
    return {position: found for position, found in enumerate(tag_counts) if found != 1}
## maps index -> '#EOF' count for every entry that does not hold exactly one end tag
incorrect_readins = is_one_review_per_entry(reviews_list)
## check results:
# print('length of incorrect_readins:', len(incorrect_readins)) ## should equal zero

In [5]:
## tokenizing reviews
def to_tokenized_list(reviews_list):
    """returns list of reviews where reviews are lists of tokens

    each raw review string is passed through nltk's word_tokenize; the order
    of the reviews is unchanged.
    """
    return [word_tokenize(raw_review) for raw_review in reviews_list]
## one list of tokens per raw review, same order as reviews_list
tokenized_reviews_list = to_tokenized_list(reviews_list)
## check if loaded correctly
# print('reviews number:', len(tokenized_reviews_list), '\nfirst tokenized review:', tokenized_reviews_list[0])

In [34]:
## removing extra tokens. 
def remove_extra_tokens(tokenized_reviews_list):
    """returns list of reviews where reviews are lists of tokens and some extraneous tokens are removed

    a token is dropped when it is found in string.punctuation or in nltk's
    english stopword list; the remaining tokens keep their original order.
    matching is case-sensitive because the tokens are compared as they are.
    """
    if not tokenized_reviews_list:
        ## nothing to clean, so the stopwords corpus is never touched
        return list()
    ## the stopword list used to be re-read for every single token;
    ## build it once and use a frozenset for constant-time membership tests
    english_stopwords = frozenset(stopwords.words('english'))
    clean_reviews_list = list()
    for review in tokenized_reviews_list:
        ## only tokens neither in punctuation or stopwords
        ## (punctuation is a str, so this stays a substring test as before)
        clean_review = [
            token for token in review
            if token not in punctuation and token not in english_stopwords
        ]
        clean_reviews_list.append(clean_review)
    return clean_reviews_list
## tokenized reviews with punctuation and stopword tokens filtered out
clean_reviews_list = remove_extra_tokens(tokenized_reviews_list)
## check the first cleaned review
# print('first cleaned review', clean_reviews_list[0])

In [16]:
## more cleaning could be done at this point to improve quality. 

In [26]:
def get_ratings(clean_reviews_list):
    """returns list of ratings, one int per review

    the rating is read from the first token of each review and converted
    with int(); the reviews themselves are left untouched.
    """
    return [int(tokens[0]) for tokens in clean_reviews_list]
## integer rating of every review, in the same order as clean_reviews_list
ratings_y = get_ratings(clean_reviews_list)
## check to see if works
# print('ratings', ratings_y)

In [37]:
def remove_ratings_from_reviews_list(clean_reviews_list):
    """returns new list where every review has lost its first and its last token

    the first token is the one get_ratings reads as the rating; the last
    token is presumably what is left of the '#EOF' end tag -- TODO confirm.
    the input list and its reviews are not modified.
    """
    inner_tokens = slice(1, -1)
    shortened_list = list()
    for review in clean_reviews_list:
        shortened_list.append(review[inner_tokens])
    return shortened_list
## cleaned reviews with the first and last token of each review sliced off
clean_shortened_reviews_list = remove_ratings_from_reviews_list(clean_reviews_list)
## check to see if it works
# print(clean_shortened_reviews_list[0])

# Phase Two: vectorizing reviews