In [1]:
import numpy as np
import pandas as pd
import glob
import string
import re
import nltk
import time

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textwrap import wrap

# constants
# English stopword list obtained from NLTK's stopwords corpus when this cell
# runs; remove_stopwords() filters tokens against it.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
stopwords = nltk.corpus.stopwords.words('english')

In [2]:
def retrieve_df(path):
    """Merge every CSV file found directly inside a folder into one dataframe.

    Args:
        path: Folder to search; files matching ``{path}/*.csv`` are loaded.

    Returns:
        A single dataframe with the rows of all files stacked in file-name
        order and a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no CSV files.
    """
    # glob yields files in arbitrary, OS-dependent order; sorting makes the
    # row order of the merged frame reproducible across runs and machines.
    all_files = sorted(glob.glob(f"{path}/*.csv"))

    if not all_files:
        # Fail with the offending path instead of pd.concat's generic
        # "No objects to concatenate" error (same exception type as before).
        raise ValueError(f"No CSV files found in '{path}'")

    df_list = []
    for filename in all_files:
        print(f"Concatenating {filename}")
        df_list.append(pd.read_csv(filename, index_col=None, header=0))

    return pd.concat(df_list, axis=0, ignore_index=True)

In [3]:
def retrieve_reviews_df():
    """Load the raw review datasets.

    Merges the CSV files under ``../data`` with ``retrieve_df`` and returns
    the resulting dataframe of all the data.
    """
    raw_data_dir = "../data"
    return retrieve_df(raw_data_dir)

In [4]:
def retrieve_processed_reviews_df():
    """Load the already-processed review datasets.

    Merges the CSV files under ``../processed_data`` with ``retrieve_df`` and
    returns the resulting dataframe of all the data.
    """
    processed_data_dir = "../processed_data"
    return retrieve_df(processed_data_dir)

In [5]:
# preprocessing for the dataframe 
def score(x):
    """Map a review score to a sentiment label.

    Returns -1 for scores below 3, 0 for scores equal to 3 or 4, and 1 for
    every other value.
    """
    if x < 3:
        return -1
    if x == 3 or x == 4:
        return 0
    return 1

def preprocessing(df):
    """Trim the raw reviews dataframe and attach a sentiment label.

    Args:
        df: Dataframe containing the metadata columns dropped below and a
            ``Score`` column.

    Returns:
        A new dataframe without the metadata columns, without rows that
        contain missing values, and with an added ``Sentiment`` column
        holding the value ``score`` returns for each row's ``Score``.
    """
    # drop columns
    new_df = df.drop(columns=["ProductId", "UserId", "ProfileName", "HelpfulnessNumerator", "HelpfulnessDenominator", "Time"])

    # drop na values
    new_df = new_df.dropna(axis=0)

    # Sentiment label from Score: score() yields -1 for scores below 3,
    # 0 for 3 or 4, and 1 otherwise.
    # Map over the Score column directly; a row-wise DataFrame.apply would
    # build a Series for every row just to read one value from it.
    new_df['Sentiment'] = new_df['Score'].map(score)

    return new_df

def retrieve_and_preprocess():
    """Load the raw reviews and return them after ``preprocessing``.

    The result has the metadata columns dropped and the score converted to a
    1/0/-1 sentiment label.
    """
    raw_reviews = retrieve_reviews_df()
    prepared_reviews = preprocessing(raw_reviews)

    # TODO: decide whether the result should also be saved to csv

    return prepared_reviews

In [6]:
# helper functions for text_preprocess
def remove_html_tags(text):
    """Strip HTML tags from a string.

    Every ``<...>`` span (matched non-greedily, within a single line) is
    replaced by one space rather than deleted, so words separated only by a
    tag such as ``<br />`` are not fused into a single token.

    Args:
        text: Raw text that may contain HTML markup.

    Returns:
        The text with each tag replaced by a space.
    """
    return re.sub(r'<.*?>', ' ', text)

def remove_stopwords(text):
    """Drop stopwords and product-specific words from a string.

    The text is split on whitespace; tokens that exactly match (case
    sensitive) an entry of the module-level ``stopwords`` list or of the
    product word list below are discarded.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    products = {'dog', 'food', 'soup', 'chai', 'tea', 'ordered', 'order', 'coconut', 'taffy', 'product'}
    # Merge both exclusion lists into one set so each token costs a single
    # hash lookup instead of a linear scan through two lists.
    excluded = products.union(stopwords)
    return " ".join(word for word in text.split() if word not in excluded)

def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [7]:
def text_preprocess(df, options, verbose=False):
    """Build a cleaned copy of the ``Text`` column in ``Clean_text``.

    The dataframe is modified in place: ``Clean_text`` is (re)created from
    ``Text`` and then passed through each selected step. Steps always run in
    the order html -> stop -> punc -> lower, independent of the order in
    ``options``.

    Options:
    - html -- remove html headers and syntax
    - stop -- remove stopwords
    - punc -- remove punctuation
    - lower -- change entire string to lowercase

    Args:
        df: Dataframe with a ``Text`` column of strings.
        options: Container of option keys; a step runs when its key is
            ``in options``.
        verbose: When true, print a message before and after each step,
            including the elapsed time in seconds.

    Returns:
        None; the result is written to ``df["Clean_text"]``.
    """
    # (option key, progress message, completion message, per-string transform)
    # Helpers are wrapped in lambdas so each one is only looked up when its
    # option is actually selected.
    # NOTE(review): 'stop' runs before 'lower', so stopword matching sees the
    # original casing — confirm this ordering is intended.
    steps = (
        ("html", "Removing HTML tags", "Removed HTML tags", lambda text: remove_html_tags(text)),
        ("stop", "Removing stopwords", "Removed stopwords", lambda text: remove_stopwords(text)),
        ("punc", "Removing punctuation", "Removed punctuation", lambda text: remove_punctuation(text)),
        ("lower", "Lowercasing words", "Lowercased words", lambda text: text.lower()),
    )

    df["Clean_text"] = df["Text"]

    for key, start_msg, done_msg, transform in steps:
        if key not in options:
            continue

        if verbose:
            print(start_msg)
            # perf_counter is monotonic, so the elapsed time can never be negative.
            started = time.perf_counter()

        df["Clean_text"] = df["Clean_text"].apply(transform)

        if verbose:
            # elapsed = now - start (the reverse order would print a negative duration)
            print(f"{done_msg}, took {time.perf_counter() - started} seconds")
In [8]:
def vectorize(X):
    """Turn raw text documents into count and normalized-frequency matrices.

    Args:
        X: The text documents; should be a series such as ``df['Text']``
            (according to the EDA notebook).

    Returns:
        A tuple ``(X_counts, X_tfidf)``: the output of a freshly fitted
        ``CountVectorizer`` and the result of passing those counts through a
        freshly fitted ``TfidfTransformer(use_idf=False)``.

    NOTE(review): the transformer is built with ``use_idf=False``, so the
    second matrix presumably carries no IDF weighting despite its name —
    confirm this is intended. The fitted vectorizer and transformer are not
    returned.
    """
    # tokenize and count terms
    term_counts = CountVectorizer().fit_transform(X)

    # weight the counts (IDF disabled)
    weighted_counts = TfidfTransformer(use_idf=False).fit_transform(term_counts)

    return term_counts, weighted_counts