## Dependencies

In [16]:
import pandas as pd
import re
from stop_words import stop_words
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize



# Helper Functions


### To Lowercase

In [17]:
def lower_case(line):
    """Return ``line`` in lowercase with leading/trailing whitespace removed."""
    lowered = line.lower()
    return lowered.strip()


### Stem Words

In [18]:
def stem_words(line):
    """Stem each whitespace-separated token of ``line`` with nltk's PorterStemmer.

    Args:
        line: Text whose tokens should be stemmed.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()

    stemmed_tokens = []
    for token in line.split():
        stemmed_tokens.append(stemmer.stem(token.strip()))

    return ' '.join(stemmed_tokens)


### Remove Stop Words

In [19]:
def remove_stop_words(line):
    """Drop every token of ``line`` that is a member of ``stop_words``.

    Tokens are obtained by splitting on whitespace and are compared to the
    ``stop_words`` collection exactly as written (no case or punctuation
    normalization happens here).

    Args:
        line: Text to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    surviving = filter(lambda token: token not in stop_words, line.split())
    return ' '.join(surviving)


### Remove Special Characters and Numbers

In [20]:
def remove_special_characters_and_numbers(line):
    """Delete every character of ``line`` that is not an ASCII letter or whitespace.

    Digits, punctuation and other symbols are removed without inserting a
    replacement, so the characters around them are joined together.

    Args:
        line: Text to strip.

    Returns:
        ``line`` with only ``a-z``, ``A-Z`` and whitespace characters left.
    """
    unwanted = re.compile(r'([^a-zA-Z\s]+?)')
    return unwanted.sub('', line)




### Vectorize

In [21]:

def get_words_set(df):
    """Collect the set of distinct whitespace-separated words in ``df['Review']``.

    Non-string entries are skipped. An empty review written to CSV is read
    back by ``pd.read_csv`` as NaN (a float), and calling ``.split()`` on it
    would raise AttributeError.

    Args:
        df: DataFrame with a 'Review' column.

    Returns:
        A set containing every distinct word found in the string reviews.
    """
    words = set()
    # Iterating the column directly avoids building a Series per row via iterrows().
    for review in df['Review']:
        if isinstance(review, str):
            words.update(review.split())
    return words


def create_row_dict(index, row, word_set):
    """Build the binary bag-of-words record for one review row.

    Args:
        index: Row index; a progress line is printed whenever it is a
            multiple of 10000.
        row: Mapping-like row providing 'Review' and 'Freshness'.
        word_set: Iterable of column headers (the vocabulary).

    Returns:
        A dict mapping each header in ``word_set`` to 1 if it occurs in the
        review and 0 otherwise, plus '_Freshness' set to ``row['Freshness']``.
    """
    if index % 10000 == 0:
        print('processing index ', index, '.')

    review = row['Review']
    # A review that is empty in the CSV is read back as NaN (a float), which has
    # no .split(); treat any non-string review as containing no words.
    row_words = set(review.split()) if isinstance(review, str) else set()

    return_dict = {header: int(header in row_words) for header in word_set}
    # Assigned last so the label wins even if '_Freshness' is itself in word_set.
    return_dict['_Freshness'] = row['Freshness']
    return return_dict


def vectorize(df):
    """Turn the reviews in ``df`` into a binary bag-of-words DataFrame.

    The vocabulary comes from ``get_words_set``; each input row becomes one
    record built by ``create_row_dict``. The vocabulary size and the head of
    the resulting frame are printed along the way.

    Args:
        df: DataFrame with 'Review' and 'Freshness' columns.

    Returns:
        A DataFrame with one column per vocabulary word plus '_Freshness'.
    """
    vocabulary = get_words_set(df)

    # The underscore prefix keeps the label column distinct from a review
    # word that happens to be 'Freshness'.
    vocabulary.add('_Freshness')

    print('word_set size: ', len(vocabulary))

    records = []
    for index, row in df.iterrows():
        records.append(create_row_dict(index, row, vocabulary))

    return_df = pd.DataFrame(records)

    print(return_df.head())
    return return_df



# Main
### Clean

In [23]:

import time

# time.clock() was removed in Python 3.8 and raises AttributeError there;
# time.perf_counter() is the supported high-resolution timer. It measures
# elapsed wall-clock time.
start = time.perf_counter()


def get_time():
    """Print the number of seconds elapsed since ``start`` was set."""
    print('TIME: ', time.perf_counter() - start)

def clean_data(df, output_path='cleaned.csv'):
    """Normalize the 'Review' column of ``df`` and write the frame to CSV.

    The cleaning steps run in this order: ``lower_case``,
    ``remove_stop_words``, ``remove_special_characters_and_numbers``,
    ``stem_words``. After each step a progress message and the elapsed time
    are printed.

    Args:
        df: DataFrame with a 'Review' column. The column is reassigned on
            the passed-in frame, so the caller's DataFrame is modified.
        output_path: Destination of the cleaned CSV. Defaults to
            'cleaned.csv'.

    Returns:
        None.
    """
    # NOTE(review): stop words are removed before punctuation is stripped, so a
    # token such as "the," is not matched by the exact-membership test in
    # remove_stop_words -- confirm this ordering is intended.
    steps = (
        lower_case,
        remove_stop_words,
        remove_special_characters_and_numbers,
        stem_words,
    )
    for step in steps:
        df['Review'] = df['Review'].apply(step)
        print('Finished, ' + step.__name__ + ': ')
        get_time()

    df.to_csv(output_path, index=False)
    print('Finished, cleaned to csv: ')
    get_time()
    
# Read the input reviews from 'truncated.csv'. The alternative input
# 'rotten_tomatoes_reviews.csv' (presumably the full dataset) can be read
# here instead.
raw_df = pd.read_csv(filepath_or_buffer='truncated.csv')

clean_data(df=raw_df)


### Vectorize

In [None]:

# Load the cleaned reviews under their own name: binding the DataFrame to
# `clean_data` would shadow the clean_data() function defined earlier.
cleaned_df = pd.read_csv('cleaned.csv')
vectorized = vectorize(cleaned_df)
print('Finished, vectorize: ')
get_time()
vectorized.to_csv('vectorized.csv', index=False)
print('Finished! ')
get_time()
