## Dependencies

In [16]:
import pandas as pd
import re
from stop_words import stop_words
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize



# Helper Functions


### To Lowercase

In [17]:
def lower_case(line):
    """Return ``line`` with every cased character converted to lowercase."""
    lowered = line.lower()
    return lowered


### Stem Words

In [18]:
def stem_words(line):
    """Stem each whitespace-separated token of ``line`` with a Porter stemmer.

    The stemmed tokens are returned re-joined by single spaces, so any run
    of whitespace in the input collapses to one space in the output.
    """
    stemmer = PorterStemmer()
    tokens = line.strip().split()
    stemmed = (stemmer.stem(token.strip()) for token in tokens)
    return ' '.join(stemmed)


### Remove Stop Words

In [19]:
def remove_stop_words(line):
    """Drop every token of ``line`` that is listed in ``stop_words``.

    Tokens are obtained by splitting on whitespace; the surviving tokens
    are returned joined by single spaces, in their original order.
    """
    tokens = (token.strip() for token in line.strip().split())
    survivors = [token for token in tokens if not (token in stop_words)]
    return ' '.join(survivors)


### Remove Special Characters and Numbers

In [20]:
def remove_special_characters_and_numbers(line):
    """Delete every character that is neither an ASCII letter nor whitespace.

    Digits, punctuation and non-ASCII letters are removed outright (not
    replaced by a space), so ``"don't"`` becomes ``"dont"``.
    """
    unwanted = re.compile(r'([^a-zA-Z\s]+?)')
    return unwanted.sub('', line)




### Vectorize

In [21]:

def get_words_set(df):
    """Collect the distinct words found in the ``'Review'`` column of ``df``.

    Args:
        df: DataFrame with a ``'Review'`` column of whitespace-separated text.

    Returns:
        A list of the unique words, ordered by first appearance.
    """
    # A dict keeps insertion order while giving O(1) membership checks,
    # unlike scanning a growing list for every word.
    seen = {}

    for review in df['Review']:
        # A review that was cleaned down to an empty string is read back
        # from CSV as NaN (a float), which has no .split(); skip those.
        if not isinstance(review, str):
            continue
        for word in review.split():
            seen.setdefault(word, None)

    return list(seen)


def create_row_dict(row, headers):
    """Build a binary bag-of-words record for a single review.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'Review'``
            entry holding whitespace-separated text.
        headers: Iterable of vocabulary words, one per output key.

    Returns:
        A dict mapping each header to 1 if that word occurs in the review
        and 0 otherwise, in the order the headers were given.
    """
    review = row['Review']
    # Tokenise once into a set instead of re-splitting the review for every
    # header. A missing review (NaN after a CSV round trip) has no words.
    present = set(review.split()) if isinstance(review, str) else set()

    return {header: int(header in present) for header in headers}
            
    

def vectorize(df):
    """Turn the reviews in ``df`` into a binary bag-of-words DataFrame.

    Each output row corresponds to one input row. There is one 0/1 column
    per distinct word in the ``'Review'`` column. When ``df`` has a
    ``'Freshness'`` column, its value is carried over as the first output
    column, named ``'_Freshness'``.

    Args:
        df: DataFrame with a ``'Review'`` column and, optionally, a
            ``'Freshness'`` column.

    Returns:
        The vectorized DataFrame. Its head and the header count are also
        printed.
    """
    word_set = get_words_set(df)

    # Leading underscore keeps the label column from colliding with a
    # review word 'Freshness' should one show up in the data.
    headers = ['_Freshness']
    headers.extend(word_set)

    print('headers size: ', len(headers))

    # Previously the '_Freshness' header was declared but never filled in,
    # so the label was silently dropped from the output.
    has_label = 'Freshness' in df.columns

    dict_list = []
    for _, row in df.iterrows():
        record = {}
        if has_label:
            record['_Freshness'] = row['Freshness']
        record.update(create_row_dict(row, word_set))
        dict_list.append(record)

    return_df = pd.DataFrame(dict_list)

    print(return_df.head())
    return return_df





# Main
### Clean

In [23]:

def clean_data(df, output_path='cleaned.csv'):
    """Normalise the ``'Review'`` column of ``df`` in place and save it as CSV.

    The steps run in this order:
      1. lowercase
      2. remove stop words
      3. remove special characters and numbers
      4. remove stop words again (catches words that were attached to
         punctuation, e.g. ``"the,"``)
      5. stem

    Stemming runs last on purpose. The Porter stemmer rewrites common stop
    words (e.g. "was" -> "wa", "this" -> "thi"), so stemming first lets
    them slip past the stop-word filter. It also cannot stem tokens that
    still carry punctuation.

    Args:
        df: DataFrame with a ``'Review'`` column of text; modified in place.
        output_path: Destination of the cleaned CSV (written without the
            index). Defaults to ``'cleaned.csv'``.
    """
    pipeline = (
        lower_case,
        remove_stop_words,
        remove_special_characters_and_numbers,
        remove_stop_words,
        stem_words,
    )

    for step in pipeline:
        df['Review'] = df['Review'].apply(step)

    df.to_csv(output_path, index=False)
    
# Load the raw reviews and run the cleaning pipeline; clean_data() rewrites
# the 'Review' column in place and saves the result to 'cleaned.csv'.
# Alternative input (presumably a smaller sample for quick runs - confirm):
# raw_df = pd.read_csv('truncated.csv')
raw_df = pd.read_csv('rotten_tomatoes_reviews.csv')

clean_data(raw_df)


### Vectorize

In [None]:

# Use a distinct name for the loaded frame: binding it to `clean_data`
# would shadow the clean_data() function and break re-running the
# cleaning cell.
cleaned_df = pd.read_csv('cleaned.csv')
# Only the first 20 cleaned reviews are vectorized here.
vectorized = vectorize(cleaned_df.head(20))
vectorized.to_csv('vectorized.csv')
