# Initial Exploration

### Merge CSVs, Preprocessing, EDA

In [10]:
import numpy as np
import pandas as pd
import glob
import string
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer 

# constants
stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords[:10]

In [11]:
'''
Merge datasets in the data folder.
Returns a dataframe of all the data.
'''
def retrieve_reviews_df():
    all_files = glob.glob("./data/*.csv")

    df_list = []

    for filename in all_files:
        print(f"Concatenating {filename}")
        df = pd.read_csv(filename, index_col=None, header=0)
        df_list.append(df)
    
    return pd.concat(df_list, axis=0, ignore_index=True)

In [12]:
reviews_df = retrieve_reviews_df()
print(reviews_df.head())

Concatenating ./data/Reviews-4.csv
Concatenating ./data/Reviews-1.csv
Concatenating ./data/Reviews-2.csv
Concatenating ./data/Reviews-3.csv
       Id   ProductId          UserId  \
0  566663  B000NV9VQU  A1SXMCMLHAVH0I   
1  566664  B000NV9VQU  A2CHHNIA58EPIG   
2  566665  B000NV9VQU  A1D0EJPEXE8ZOP   
3  566666  B000NV9VQU  A2F2MZW8EOGH5J   
4  566667  B000NV9VQU  A2DYW9Y2D8E43L   

                                        ProfileName  HelpfulnessNumerator  \
0                                              Biba                     2   
1                                            Tamara                     1   
2                                         Don Rubin                     1   
3  daemoncycler "When you arrive at a fork in th...                     0   
4                           Kris Cayocca "Cayotejr"                     1   

   HelpfulnessDenominator  Score        Time  \
0                       2      5  1226880000   
1                       1      5  1316908800   
2     

In [13]:
# number of null values per column
reviews_df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [14]:
# preprocessing for the dataframe 
def score(x):
    if x<3:
        return -1
    elif x==3:
        return 0
    else:
        return 1

def preprocessing(df):
    # drop columns
    new_df = df.drop(columns = ["ProductId", "UserId", "ProfileName", "HelpfulnessNumerator", "HelpfulnessDenominator", "Time"])

    # drop na values
    new_df = new_df.dropna(axis=0)

    # make a new column of sentiment: (-1/0/1) -- pos/neutral/neg -- 1,2/3/4,5
    new_df['Sentiment'] = new_df.apply(lambda x: score(x['Score']), axis=1)


    return new_df

In [15]:
df = preprocessing(reviews_df)
df.head()

Unnamed: 0,Id,Score,Summary,Text,Sentiment
0,566663,5,THE BEST TREAT YOU CAN BUY PERIOD!,There is NOTHING on the market like these trea...,1
1,566664,5,Awesome treats!,My pup can't have anything with grains in it a...,1
2,566665,4,"My dog loves this, but beware there are differ...",My elderly dog who has pretty much given up on...,1
3,566666,5,Awesome treats!,So glad we found Amazon. Used to travel to the...,1
4,566667,3,Good stuff,"Naturally, pooch scarfed this up. What's not ...",0


Text Preprocessing

In [16]:
'''
Removing punctuations like . , ! $( ) * % @
Removing URLs
Removing Stop words
Lower casing
Tokenization
Stemming
Lemmatization
'''
def remove_punctuation(text):
    punctuation_free = "".join([i for i in text if i not in string.punctuation])
    return punctuation_free

def tokenization(text):
    tokens = re.split('W+',text)
    return tokens

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

def text_preprocessing(df):
    # remove punctuation
    df["Clean_text"] = df["Text"].apply(lambda x: remove_punctuation(x))

    # tokenization
    # TODO: tokenization is a bit buggy rn, maybe don't use regular expressions
    df["Clean_text"] = df["Clean_text"].apply(lambda x: tokenization(x))

    # remove stopwords
    df["Clean_text"] = df["Clean_text"].apply(lambda x: remove_stopwords(x))
    
    return df

In [17]:
df = text_preprocessing(df)
df.head()

Unnamed: 0,Id,Score,Summary,Text,Sentiment,Clean_text
0,566663,5,THE BEST TREAT YOU CAN BUY PERIOD!,There is NOTHING on the market like these trea...,1,[There is NOTHING on the market like these tre...
1,566664,5,Awesome treats!,My pup can't have anything with grains in it a...,1,[My pup cant have anything with grains in it a...
2,566665,4,"My dog loves this, but beware there are differ...",My elderly dog who has pretty much given up on...,1,[My elderly dog who has pretty much given up o...
3,566666,5,Awesome treats!,So glad we found Amazon. Used to travel to the...,1,[So glad we found Amazon Used to travel to the...
4,566667,3,Good stuff,"Naturally, pooch scarfed this up. What's not ...",0,"[Naturally pooch scarfed this up , hats not t..."


In [18]:
df['Clean_text'][0]

['There is NOTHING on the market like these treats by Canz My dogs will do anything for them The price that Amazon charges is HALF of what I pay at my local pet store if I have to buy them there The ingredients are pure natural and healthy The jerky is not dried out and brittle it is moist and chewy No carbohydrates to make your pet gain weight either My cats even like these I break them into small pieces for them The beef ones almost smell like they have a little bacon in them but they dont My dogs come running the minute they hear the rustle of this bag when I pick it up I know many dog trainers that swear by these too These will get your dog to do ANYTHING I highly recommend these']