Some goals of this code are to keep every cell and function in the form of pure functions, to document well, and to keep the functions small. I would greatly appreciate any contributions following the same guidelines.

In [7]:
## In case any packages aren't installed, you can uncomment and run the following line.
# !pip3 install nltk pandas

In [8]:
from nltk import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import pandas as pd

# Phase One: loading and cleaning text

In [9]:
def load_into_reviews(path='./data/small_train.txt'):
    """Read the file at `path` and return its lines as a list.

    Every line of the file is treated as one review; lines are returned
    unmodified, so each keeps its trailing newline when it has one.
    """
    with open(path) as review_file:
        return review_file.readlines()

In [10]:
def get_ratings(reviews_list):
    """Extract the rating of every review.

    The rating is the text that precedes the first tab of each review
    string (the whole string when it contains no tab at all).
    """
    return [entry.partition("\t")[0] for entry in reviews_list]
def remove_ratings(reviews_list):
    """Return every review with its leading rating field removed.

    Each review is split once on the first tab and only the part after
    it is kept. A review that contains no tab raises IndexError.
    """
    stripped_reviews = []
    for entry in reviews_list:
        fields = entry.split('\t', 1)
        stripped_reviews.append(fields[1])
    return stripped_reviews

In [11]:
## check if correctly loaded
def is_one_review_per_entry(reviews_list):
    """Map the index of each malformed review to its '#EOF' tag count.

    An entry is considered well formed when the end tag occurs in it
    exactly once, so an empty dict means every entry passed the check.
    """
    tag_counts = (entry.count('#EOF') for entry in reviews_list)
    return {
        position: count
        for position, count in enumerate(tag_counts)
        if count != 1
    }

In [12]:
## tokenizing reviews
def to_tokenized_list(reviews_list):
    """Tokenize every review with NLTK's `word_tokenize`.

    Returns a list holding one token list per review, in input order.
    """
    return [word_tokenize(review_text) for review_text in reviews_list]

In [13]:
## removing extra tokens. 
def remove_extra_tokens(token_reviews_list):
    """Drop punctuation and English stopword tokens from tokenized reviews.

    Args:
        token_reviews_list: list of reviews, each a list of string tokens.

    Returns:
        A new list of token lists, in input order, keeping only the tokens
        that are neither found in `string.punctuation` nor present in
        NLTK's English stopword list.
    """
    if not token_reviews_list:
        # Nothing to filter, so do not touch the stopword corpus at all.
        return []
    # The stopword list does not change between tokens: fetch it once and
    # keep it in a frozenset so every membership test is a hash lookup.
    english_stopwords = frozenset(stopwords.words('english'))
    clean_reviews_list = list()
    for review in token_reviews_list:
        # `punctuation` is a str, so `in` is a substring test here: besides
        # single punctuation marks it also rejects '' and runs such as '()'.
        clean_review = [
            token for token in review
            if token not in punctuation and token not in english_stopwords
        ]
        clean_reviews_list.append(clean_review)
    return clean_reviews_list

In [14]:
## more cleaning could be done at this point to improve quality. 

In [15]:
## final preprocessing for tokenizer use. 
def stringify_reviews_tokens(token_reviews_list):
    """Join the tokens of each review into one space-separated string."""
    return [' '.join(tokens) for tokens in token_reviews_list]

## Saving for future use

In [16]:
## saves cleaned tokens and strings for use in next steps
def reviews_to_df(ratings_y, clean_token_reviews, clean_reviews, short_reviews_list):
    """Collect the processed review data into a single DataFrame.

    Columns are added to an initially empty frame in this order:
    'ratings', 'tokens', 'clean_reviews', 'raw_reviews'.
    """
    named_columns = (
        ('ratings', ratings_y),
        ('tokens', clean_token_reviews),
        ('clean_reviews', clean_reviews),
        ('raw_reviews', short_reviews_list),
    )
    frame = pd.DataFrame()
    for column_name, values in named_columns:
        frame[column_name] = values
    return frame

In [17]:
def save_df(reviews_df, path='./data/reviews_df.pickle'):
    """Pickle `reviews_df` to disk.

    Args:
        reviews_df: the DataFrame to persist.
        path: destination file; defaults to './data/reviews_df.pickle',
            the location used when no path is given.
    """
    reviews_df.to_pickle(path)

In [20]:
def run(load_path, save_path):
    """Run the whole cleaning pipeline on one file and persist the result.

    Args:
        load_path: path of the raw review file; every line is one review
            whose rating and text are separated by a tab.
        save_path: path the resulting DataFrame is pickled to.

    Returns:
        The first rows (`DataFrame.head()`) of the frame read back from
        `save_path`.

    Raises:
        AssertionError: if any entry does not contain exactly one '#EOF'
            end tag.
    """
    reviews_list = load_into_reviews(load_path)
    ratings_y = get_ratings(reviews_list)
    incorrect_readins = is_one_review_per_entry(reviews_list)
    # Explicit raise rather than `assert`, so the check still runs under -O.
    if incorrect_readins:
        raise AssertionError('incorrect readins')
    short_reviews_list = remove_ratings(reviews_list)
    token_reviews_list = to_tokenized_list(short_reviews_list)
    clean_token_reviews = remove_extra_tokens(token_reviews_list)
    clean_reviews = stringify_reviews_tokens(clean_token_reviews)
    reviews_df = reviews_to_df(ratings_y, clean_token_reviews, clean_reviews, short_reviews_list)
    # Write to `save_path` itself so the read-back below loads exactly the
    # frame that was just produced, whatever path the caller asked for.
    reviews_df.to_pickle(save_path)
    df = pd.read_pickle(save_path)
    return df.head()
def main():
    """Clean the small training set and print a preview of the saved frame."""
    load_path = './data/small_train.txt'
    save_path = './data/reviews_df.pickle'
    df = run(load_path, save_path)
    print(df)
    # To process the test split as well, repeat the run/print pair with:
    #     load_path = './data/small_test.txt'
    #     save_path = './data/reviews_df_test.pickle'

In [21]:
# Run the pipeline only when executed directly (a notebook cell or script),
# not when this code is imported as a module.
if __name__ == "__main__":
    main()

  ratings                                             tokens  \
0      +1  [One, all-time, favorite, so-laughably-lousy-t...   
1      -1  [I, high, hopes, film, I, thought, CLEAN, SHAV...   
2      -1  [When, released, I, thought, one, profane, fil...   
3      -1  [I, watched, movie, Starz, Let, go, things, th...   
4      +1  [I, loved, much, I, bought, DVD, novel, time, ...   

                                       clean_reviews  \
0  One all-time favorite so-laughably-lousy-that-...   
1  I high hopes film I thought CLEAN SHAVEN Kerri...   
2  When released I thought one profane films ever...   
3  I watched movie Starz Let go things thought co...   
4  I loved much I bought DVD novel time The chemi...   

                                         raw_reviews  
0  One of my all-time favorite so-laughably-lousy...  
1  I had high hopes for this film, because I thou...  
2  When this was released, I thought this was one...  
3  I just watched this movie on Starz. Let me go ...  
4  