In [322]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [323]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

## Keep Reviews and Ratings

In [324]:
# reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_headline', 'review_body']]
# reviews_ratings_df.reset_index(drop=True)
# reviews_ratings_df

 ## We form three classes and select 20000 reviews randomly from each class.
- [ ] 100,000 each


In [325]:
reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

valid = ["1","2","3","4","5"]
from copy import deepcopy

stars = deepcopy(reviews_ratings_df).star_rating.astype(str) # turn entries to strings
where_valid = stars.index[stars.isin(valid)].tolist() # check valid list and see which of our stars match
reviews_ratings_df = reviews_ratings_df.iloc[where_valid]

reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [326]:
reviews_ratings_df.star_rating = reviews_ratings_df.star_rating.apply(lambda x: int(x))

In [327]:
reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [328]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):
    """Categorizes reviews by adding a rating
    
    """


    if sentiment_type == 'positive_sentiment':
        positive_review_threshold = df[rating_col] > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'negative_sentiment':
        positive_review_threshold = df[rating_col] < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 0

    elif sentiment_type == 'neutral_sentiment':
        positive_review_threshold = df[rating_col] == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [329]:
positive_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'positive_sentiment')
positive_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1


Unnamed: 0,star_rating,review_body,positive_sentiment
0,5,Great product.,1
1,5,What's to say about this commodity item except...,1
2,5,"Haven't used yet, but I am sure I will like it.",1
4,4,Gorgeous colors and easy to use,1
5,5,Perfect for planning weekly meals. Removrd the...,1
...,...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...,1
2640250,4,Although the Palm Pilot is thin and compact it...,1
2640251,4,This book had a lot of great content without b...,1
2640252,5,I am teaching a course in Excel and am using t...,1


In [330]:
negative_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'negative_sentiment')
negative_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 0


Unnamed: 0,star_rating,review_body,negative_sentiment
3,1,Although this was labeled as &#34;new&#34; the...,0
13,1,worked about a month then died,0
20,1,The phone did not work. No Dial Tone. Not wo...,0
27,1,Not laminated and no reinforced holes for hang...,0
28,1,"Cartridge was over filled, black smears on pap...",0
...,...,...,...
2640139,2,This purchase was intended for a home office s...,0
2640149,2,I bought a Palm V from Amazon and thought it w...,0
2640151,1,The display is excellent - it's a good size an...,0
2640201,1,All the CE based hand held or palm computers h...,0


In [331]:
neutral_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'neutral_sentiment')
neutral_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


Unnamed: 0,star_rating,review_body,neutral_sentiment
48,3,Nice quality. Happy with the item,3
64,3,The batch I had exploded all over when I tried...,3
95,3,"It is ok, but considering the price plus shipp...",3
133,3,Delighted to receive a sample of these to try ...,3
145,3,I use this light in a dark area of my closet. ...,3
...,...,...,...
2640209,3,I was VERY disappointed to receive my Palm V a...,3
2640219,3,Very basic. The book spends a lot of time des...,3
2640225,3,"Being a Newton devotee, switching to the Palm ...",3
2640234,3,I have a US Robotics Palm Pro (we go back a wa...,3


In [332]:
pos_rand_sampled_df = positive_sentiment_df.sample(100000)
pos_rand_sampled_df

Unnamed: 0,star_rating,review_body,positive_sentiment
386177,5,perfect. it makes me feel so much safer.,1
2120941,5,"Fast shipment, great product. (Why do I need t...",1
626263,5,Great Quality Bags,1
1370040,5,This is perfect! We use it for our to do lists...,1
434926,5,This product is awesome.,1
...,...,...,...
708250,5,great deal,1
942972,5,just what I needed,1
1210325,5,The only pen I ever use.,1
872285,5,I make all of my own cards. This weight of car...,1


In [333]:
neg_rand_sampled_df = negative_sentiment_df.sample(100000)
neg_rand_sampled_df

Unnamed: 0,star_rating,review_body,negative_sentiment
2556087,1,Ithought that when bought this case it would ...,0
1124476,1,Really frustrating. Just picked these up. Era...,0
780118,1,Tried the black cartridge--it worked. The cya...,0
1665680,1,Didn't even get 6 months use of this piece of ...,0
1868606,1,I was sent the wrong cartridges for my Epson P...,0
...,...,...,...
457919,1,Very touchy with ink installment.,0
1814465,1,"Print quality was surprisingly good, better th...",0
2301883,1,Finally got to use it after 10 months in the b...,0
1448143,2,It's just fair. Doesn't open all the way over ...,0


In [334]:
reviews_ratings_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])
reviews_ratings_df

Unnamed: 0,star_rating,review_body,positive_sentiment,negative_sentiment
386177,5,perfect. it makes me feel so much safer.,1.0,
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,
626263,5,Great Quality Bags,1.0,
1370040,5,This is perfect! We use it for our to do lists...,1.0,
434926,5,This product is awesome.,1.0,
...,...,...,...,...
457919,1,Very touchy with ink installment.,,0.0
1814465,1,"Print quality was surprisingly good, better th...",,0.0
2301883,1,Finally got to use it after 10 months in the b...,,0.0
1448143,2,It's just fair. Doesn't open all the way over ...,,0.0


In [335]:
pos_sentiment = reviews_ratings_df['positive_sentiment'].dropna()
pos_sentiment

386177     1.0
2120941    1.0
626263     1.0
1370040    1.0
434926     1.0
          ... 
708250     1.0
942972     1.0
1210325    1.0
872285     1.0
1414645    1.0
Name: positive_sentiment, Length: 100000, dtype: float64

In [336]:
neg_sentiment = reviews_ratings_df['negative_sentiment'].dropna()
neg_sentiment

2556087    0.0
1124476    0.0
780118     0.0
1665680    0.0
1868606    0.0
          ... 
457919     0.0
1814465    0.0
2301883    0.0
1448143    0.0
739477     0.0
Name: negative_sentiment, Length: 100000, dtype: float64

In [337]:
reviews_ratings_df['sentiment'] = pd.concat([pos_sentiment, neg_sentiment])

In [338]:
reviews_ratings_df

Unnamed: 0,star_rating,review_body,positive_sentiment,negative_sentiment,sentiment
386177,5,perfect. it makes me feel so much safer.,1.0,,1.0
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,,1.0
626263,5,Great Quality Bags,1.0,,1.0
1370040,5,This is perfect! We use it for our to do lists...,1.0,,1.0
434926,5,This product is awesome.,1.0,,1.0
...,...,...,...,...,...
457919,1,Very touchy with ink installment.,,0.0,0.0
1814465,1,"Print quality was surprisingly good, better th...",,0.0,0.0
2301883,1,Finally got to use it after 10 months in the b...,,0.0,0.0
1448143,2,It's just fair. Doesn't open all the way over ...,,0.0,0.0


In [339]:
reviews_sentiment_df = reviews_ratings_df.drop(columns=['positive_sentiment', 'negative_sentiment'])
reviews_sentiment_df

Unnamed: 0,star_rating,review_body,sentiment
386177,5,perfect. it makes me feel so much safer.,1.0
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0
626263,5,Great Quality Bags,1.0
1370040,5,This is perfect! We use it for our to do lists...,1.0
434926,5,This product is awesome.,1.0
...,...,...,...
457919,1,Very touchy with ink installment.,0.0
1814465,1,"Print quality was surprisingly good, better th...",0.0
2301883,1,Finally got to use it after 10 months in the b...,0.0
1448143,2,It's just fair. Doesn't open all the way over ...,0.0


# Data Cleaning

## Lower case
- NOTE: Not all reviews are a string. To solve,
    - [ ] Filter out non-strings when/before randomly sampling

In [340]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        if type(text_review) != str:
            converted_str = str(text_review)
            # update_text_review = converted_str.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()

    updated_df['lower_cased'] = lower_case_reviews
    
    # updated_df = df
    return updated_df

In [341]:
reviews_lower_cased = convert_reviews_to_lower_case(reviews_sentiment_df, 'review_body')

In [342]:
reviews_lower_cased

Unnamed: 0,star_rating,review_body,sentiment,lower_cased
386177,5,perfect. it makes me feel so much safer.,1.0,perfect. it makes me feel so much safer.
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,"fast shipment, great product. (why do i need t..."
626263,5,Great Quality Bags,1.0,great quality bags
1370040,5,This is perfect! We use it for our to do lists...,1.0,this is perfect! we use it for our to do lists...
434926,5,This product is awesome.,1.0,this product is awesome.
...,...,...,...,...
457919,1,Very touchy with ink installment.,0.0,very touchy with ink installment.
1814465,1,"Print quality was surprisingly good, better th...",0.0,"print quality was surprisingly good, better th..."
2301883,1,Finally got to use it after 10 months in the b...,0.0,finally got to use it after 10 months in the b...
1448143,2,It's just fair. Doesn't open all the way over ...,0.0,it's just fair. doesn't open all the way over ...


## Remove HTML and URLs
- [x] Verify by finding a specific entry with HTML, URL.

In [344]:
def remove_html_and_urls(df:pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    cleaned_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check and remove HTML tags
        has_html = bool(re.search('<.*?>', text_review))
        if has_html == True:
            # print("Review", text_reviews_idx, "has HTML -- ", text_review)
            pass

        no_html_review = re.sub('<.*?>', ' ', text_review)
        # print("Review", text_reviews_idx, "without HTML -- ", no_html_review)
    
        # Check and remove URLs
        has_url = bool(re.search(r'http\S+', no_html_review))
        if has_url == True:
            # print("Review", text_reviews_idx, "has URL --", no_html_review)
            pass

        no_html_url_review = re.sub(r'http\S+', '', no_html_review)
        # print("Review", text_reviews_idx, "without HTML, URL -- ", no_html_url_review)
        # print()
        cleaned_reviews.append(no_html_url_review)

    updated_df['no_html_urls'] = cleaned_reviews
    return updated_df

In [345]:
no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')

In [346]:
no_html_urls_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls
386177,5,perfect. it makes me feel so much safer.,1.0,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,"fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t..."
626263,5,Great Quality Bags,1.0,great quality bags,great quality bags
1370040,5,This is perfect! We use it for our to do lists...,1.0,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...
434926,5,This product is awesome.,1.0,this product is awesome.,this product is awesome.
...,...,...,...,...,...
457919,1,Very touchy with ink installment.,0.0,very touchy with ink installment.,very touchy with ink installment.
1814465,1,"Print quality was surprisingly good, better th...",0.0,"print quality was surprisingly good, better th...","print quality was surprisingly good, better th..."
2301883,1,Finally got to use it after 10 months in the b...,0.0,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...
1448143,2,It's just fair. Doesn't open all the way over ...,0.0,it's just fair. doesn't open all the way over ...,it's just fair. doesn't open all the way over ...


## Remove Contractions
- [ ] Need to update; make my own

In [358]:
contraction_mapping = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he's": "he is",
        "isn't": "is not",
        "it's": "it is",
        "let's": "let us",
        "mustn't": "must not",
        "shan't": "shall not",
        "she's": "she is",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they're": "they are",
        "wasn't": "was not",
        "we're": "we are",
        "weren't": "were not",
        "won't": "will not",
        "wouldn't": "would not",
        "you're": "you are",
        "you'll": "you will",
        "you'd": "you would"
    }

In [369]:
def expand_contractions(input_idx, input_text):
    # Function to replace contractions with their expanded forms
    def replace(match):
        # print("Review", input_idx, "with contraction -- ", input_text)
        return contraction_mapping[match.group(0)]

    # Use regular expression to find contractions and replace them
    contraction_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
    expanded_text = contraction_pattern.sub(replace, input_text)

    return expanded_text

In [370]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_contractions_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        without_contraction = expand_contractions(text_reviews_idx, text_review)
        # print("Review", text_reviews_idx, "without contraction -- ", without_contraction)
        # print()
        without_contractions_reviews.append(without_contraction)

    updated_df['no_contractions'] = without_contractions_reviews
    return updated_df

In [371]:
reviews_no_contractions_df = remove_contractions(no_html_urls_df, 'no_html_urls')

In [372]:
reviews_no_contractions_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions
386177,5,perfect. it makes me feel so much safer.,1.0,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,"fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t..."
626263,5,Great Quality Bags,1.0,great quality bags,great quality bags,great quality bags
1370040,5,This is perfect! We use it for our to do lists...,1.0,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...
434926,5,This product is awesome.,1.0,this product is awesome.,this product is awesome.,this product is awesome.
...,...,...,...,...,...,...
457919,1,Very touchy with ink installment.,0.0,very touchy with ink installment.,very touchy with ink installment.,very touchy with ink installment.
1814465,1,"Print quality was surprisingly good, better th...",0.0,"print quality was surprisingly good, better th...","print quality was surprisingly good, better th...","print quality was surprisingly good, better th..."
2301883,1,Finally got to use it after 10 months in the b...,0.0,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...
1448143,2,It's just fair. Doesn't open all the way over ...,0.0,it's just fair. doesn't open all the way over ...,it's just fair. doesn't open all the way over ...,it is just fair. does not open all the way ove...


## Remove Non-alphabetical characters
- [ ] If entry has no letters, leave blank?

In [409]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """

    alphabetical_char_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check for non-alphabetical characters
        has_non_alphabetical_char = bool(re.search(r'[^a-zA-Z]', text_review))
        if has_non_alphabetical_char == True:
            # print("Review", text_reviews_idx, "has HTML -- ", text_review)
            pass
        
        # Remove non-alphabetical characters
        with_alphabetical_char = re.sub(r'[^a-zA-Z\s]', ' ', text_review)
        # print("Review", text_reviews_idx, "has HTML -- ", with_alphabetical_char)
        alphabetical_char_reviews.append(with_alphabetical_char)

    updated_df['only_alpha_chars'] = alphabetical_char_reviews
    return updated_df

In [410]:
only_alpha_chars_df = remove_non_alphabetical_characters(reviews_no_contractions_df, 'no_html_urls')

In [411]:
only_alpha_chars_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars
386177,5,perfect. it makes me feel so much safer.,1.0,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.,perfect it makes me feel so much safer
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,"fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t...",fast shipment great product why do i need t...
626263,5,Great Quality Bags,1.0,great quality bags,great quality bags,great quality bags,great quality bags
1370040,5,This is perfect! We use it for our to do lists...,1.0,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...,this is perfect we use it for our to do lists...
434926,5,This product is awesome.,1.0,this product is awesome.,this product is awesome.,this product is awesome.,this product is awesome
...,...,...,...,...,...,...,...
457919,1,Very touchy with ink installment.,0.0,very touchy with ink installment.,very touchy with ink installment.,very touchy with ink installment.,very touchy with ink installment
1814465,1,"Print quality was surprisingly good, better th...",0.0,"print quality was surprisingly good, better th...","print quality was surprisingly good, better th...","print quality was surprisingly good, better th...",print quality was surprisingly good better th...
2301883,1,Finally got to use it after 10 months in the b...,0.0,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...,finally got to use it after months in the b...
1448143,2,It's just fair. Doesn't open all the way over ...,0.0,it's just fair. doesn't open all the way over ...,it's just fair. doesn't open all the way over ...,it is just fair. does not open all the way ove...,it s just fair doesn t open all the way over ...


## Remove extra spaces
- [ ] Verify with a specific entry

In [405]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    single_spaced_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check if there are any extra spaces
        has_extra_space = bool(re.search(r' +', text_review))
        if has_extra_space == True:
            # print("Review", text_reviews_idx, "has extra space -- ", text_review)
            pass
        
        # Remove extra spaces
        single_spaced_review = re.sub(r' +', ' ', text_review)
        # print("Review", text_reviews_idx, "without extra space -- ", single_spaced_review)
        # print()
        
        single_spaced_reviews.append(single_spaced_review)

    updated_df['no_extra_space'] = single_spaced_reviews
    return updated_df

In [414]:
no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'only_alpha_chars')

In [415]:
no_extra_space_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space_df
386177,5,perfect. it makes me feel so much safer.,1.0,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.,perfect. it makes me feel so much safer.,perfect it makes me feel so much safer,perfect it makes me feel so much safer
2120941,5,"Fast shipment, great product. (Why do I need t...",1.0,"fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t...","fast shipment, great product. (why do i need t...",fast shipment great product why do i need t...,fast shipment great product why do i need to g...
626263,5,Great Quality Bags,1.0,great quality bags,great quality bags,great quality bags,great quality bags,great quality bags
1370040,5,This is perfect! We use it for our to do lists...,1.0,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...,this is perfect! we use it for our to do lists...,this is perfect we use it for our to do lists...,this is perfect we use it for our to do lists ...
434926,5,This product is awesome.,1.0,this product is awesome.,this product is awesome.,this product is awesome.,this product is awesome,this product is awesome
...,...,...,...,...,...,...,...,...
457919,1,Very touchy with ink installment.,0.0,very touchy with ink installment.,very touchy with ink installment.,very touchy with ink installment.,very touchy with ink installment,very touchy with ink installment
1814465,1,"Print quality was surprisingly good, better th...",0.0,"print quality was surprisingly good, better th...","print quality was surprisingly good, better th...","print quality was surprisingly good, better th...",print quality was surprisingly good better th...,print quality was surprisingly good better tha...
2301883,1,Finally got to use it after 10 months in the b...,0.0,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...,finally got to use it after 10 months in the b...,finally got to use it after months in the b...,finally got to use it after months in the box ...
1448143,2,It's just fair. Doesn't open all the way over ...,0.0,it's just fair. doesn't open all the way over ...,it's just fair. doesn't open all the way over ...,it is just fair. does not open all the way ove...,it s just fair doesn t open all the way over ...,it s just fair doesn t open all the way over t...


# Pre-processing

## remove the stop words 

In [179]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [180]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        text_review_words = word_tokenize(text_review) 

        filtered_review = []

        for text_review_words_idx in range(len(text_review_words)):
            text_review_word = text_review_words[text_review_words_idx]

            if text_review_word not in stop_words:
                filtered_review.append(text_review_word)

        filtered_review = " ".join(filtered_review)
        updated_reviews.append(filtered_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [181]:
without_stop_words_rb_df = filter_stop_words(reviews_no_extra_space_df, 'review_body_lower_cased')

In [182]:
without_stop_words_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
1870573,5,Took my HP 2600n out of motthballs!,"Couldn't afford $300 for hp toner, especially ...",1.0,,took my hp n out of motthballs,couldnt afford hp toner especially considering...
287543,5,Five Stars,"Excellent services, and products, I'm enjoy us...",1.0,,five stars,excellent services products im enjoy using thanks
2299862,5,easy to setup and quick warm up time,I just bought this product. And I read insta...,1.0,,easy to setup and quick warm up time,bought product read installation manual carefu...
1307137,5,Five Stars,good value,1.0,,five stars,good value
1013791,5,Great for Sudoku Fans,"Upon reading the reviews of this eraser stick,...",1.0,,great for sudoku fans,upon reading reviews eraser stick decided try ...
...,...,...,...,...,...,...,...
1246283,1,One Star,My order was for fine point and they were all ...,,0.0,one star,order fine point medium point
155683,1,Doesn't fit anything,This device's jack is useless for anything. It...,,0.0,doesnt fit anything,devices jack useless anything small fit jack c...
303961,1,Can't wait until it dies,This printer is not a good unit. I have used ...,,0.0,cant wait until it dies,printer good unit used previous hp ones busine...
1179082,1,One Star,malfunctioned after 4 months. answering system...,,0.0,one star,malfunctioned months answering system longer w...


## perform lemmatization  

- "I was jogging with Aman for 3 miles"
- "I was jog with Aman for 3 miles"

In [183]:
from nltk.stem import WordNetLemmatizer

def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    
    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print("Review: ", text_review)
        
        text_review_words = word_tokenize(text_review) 

        lemmed_sentence = []

        for text_review_words_idx in range(len(text_review_words)): # get each word in the string review
            text_review_word = text_review_words[text_review_words_idx]
            lemmed_word = lem.lemmatize(text_review_word)
            lemmed_sentence.append(lemmed_word)
            filtered_review = " ".join(lemmed_sentence)
    
        # print("New sentence", filtered_review)
        # print()

        updated_reviews.append(filtered_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [184]:
lemmed_df = lemmentize_review(without_stop_words_rb_df, 'review_body_lower_cased')

In [185]:
lemmed_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
1870573,5,Took my HP 2600n out of motthballs!,"Couldn't afford $300 for hp toner, especially ...",1.0,,took my hp n out of motthballs,couldnt afford hp toner especially considering...
287543,5,Five Stars,"Excellent services, and products, I'm enjoy us...",1.0,,five stars,excellent service product im enjoy using thanks
2299862,5,easy to setup and quick warm up time,I just bought this product. And I read insta...,1.0,,easy to setup and quick warm up time,bought product read installation manual carefu...
1307137,5,Five Stars,good value,1.0,,five stars,good value
1013791,5,Great for Sudoku Fans,"Upon reading the reviews of this eraser stick,...",1.0,,great for sudoku fans,upon reading review eraser stick decided try s...
...,...,...,...,...,...,...,...
1246283,1,One Star,My order was for fine point and they were all ...,,0.0,one star,order fine point medium point
155683,1,Doesn't fit anything,This device's jack is useless for anything. It...,,0.0,doesnt fit anything,device jack useless anything small fit jack co...
303961,1,Can't wait until it dies,This printer is not a good unit. I have used ...,,0.0,cant wait until it dies,printer good unit used previous hp one busines...
1179082,1,One Star,malfunctioned after 4 months. answering system...,,0.0,one star,malfunctioned month answering system longer wo...


In [186]:
pos_sentiment = lemmed_df['positive_sentiment'].dropna()
pos_sentiment

1870573    1.0
287543     1.0
2299862    1.0
1307137    1.0
1013791    1.0
          ... 
9057       1.0
2581298    1.0
1959297    1.0
925800     1.0
1900855    1.0
Name: positive_sentiment, Length: 100000, dtype: float64

In [187]:
neg_sentiment = lemmed_df['negative_sentiment'].dropna()
neg_sentiment

1345310    0.0
1106392    0.0
1758357    0.0
256733     0.0
1678678    0.0
          ... 
1246283    0.0
155683     0.0
303961     0.0
1179082    0.0
1310093    0.0
Name: negative_sentiment, Length: 100000, dtype: float64

In [188]:
lemmed_df['sentiment'] = pd.concat([pos_sentiment, neg_sentiment])

In [189]:
lemmed_df['sentiment'].unique()

array([1., 0.])

In [190]:
lemmed_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased,sentiment
1870573,5,Took my HP 2600n out of motthballs!,"Couldn't afford $300 for hp toner, especially ...",1.0,,took my hp n out of motthballs,couldnt afford hp toner especially considering...,1.0
287543,5,Five Stars,"Excellent services, and products, I'm enjoy us...",1.0,,five stars,excellent service product im enjoy using thanks,1.0
2299862,5,easy to setup and quick warm up time,I just bought this product. And I read insta...,1.0,,easy to setup and quick warm up time,bought product read installation manual carefu...,1.0
1307137,5,Five Stars,good value,1.0,,five stars,good value,1.0
1013791,5,Great for Sudoku Fans,"Upon reading the reviews of this eraser stick,...",1.0,,great for sudoku fans,upon reading review eraser stick decided try s...,1.0
...,...,...,...,...,...,...,...,...
1246283,1,One Star,My order was for fine point and they were all ...,,0.0,one star,order fine point medium point,0.0
155683,1,Doesn't fit anything,This device's jack is useless for anything. It...,,0.0,doesnt fit anything,device jack useless anything small fit jack co...,0.0
303961,1,Can't wait until it dies,This printer is not a good unit. I have used ...,,0.0,cant wait until it dies,printer good unit used previous hp one busines...,0.0
1179082,1,One Star,malfunctioned after 4 months. answering system...,,0.0,one star,malfunctioned month answering system longer wo...,0.0


# TF-IDF Feature Extraction

In [191]:
def tf_idf_feature_extraction(df:pd.DataFrame, col_name: str):
    """Extract the TF-IDF features from the reviews.

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    tf_idf_features:
        A matrix containing the TF-IDF features extracted
        
    """

    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(df[col_name])

    return tf_idf_features


In [192]:
tf_idf_features = tf_idf_feature_extraction(lemmed_df, 'review_body_lower_cased')


In [193]:
tf_idf_features[0]

<1x101634 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

## Split Features and Sentiment Labels

In [194]:
sentiments = lemmed_df['sentiment']
sentiments.shape

(200000,)

In [195]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tf_idf_features, sentiments, test_size=0.2, random_state=42)

In [196]:
X_train.shape, X_test.shape

((160000, 101634), (40000, 101634))

In [197]:
y_train.shape

(160000,)

# Models

In [198]:
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

## Evaluation Metrics

In [199]:
def eval_accuracy(y_true, y_prediction):
    return sklearn.metrics.accuracy_score(y_true, y_prediction)

def eval_precision(y_true, y_prediction):
    return sklearn.metrics.precision_score(y_true, y_prediction)

def eval_recall(y_true, y_prediction):
    return sklearn.metrics.recall_score(y_true, y_prediction)

def eval_f1_score(y_true, y_prediction):
    return sklearn.metrics.f1_score(y_true, y_prediction)

In [200]:
def train_eval_metric(y_train_true, y_train_predictions):
    accuracy = eval_accuracy(y_train_true, y_train_predictions)
    precision = eval_precision(y_train_true, y_train_predictions)
    recall = eval_recall(y_train_true, y_train_predictions)
    f1 = eval_f1_score(y_train_true, y_train_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

def test_eval_metric(y_test_true, y_test_predictions):
    accuracy = eval_accuracy(y_test_true, y_test_predictions)
    precision = eval_precision(y_test_true, y_test_predictions)
    recall = eval_recall(y_test_true, y_test_predictions)
    f1 = eval_f1_score(y_test_true, y_test_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

# Perceptron

In [201]:
def perceptron_model(X_train, X_test, y_train, y_test): 

    technique = Perceptron(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [202]:
perceptron_train_metrics, perceptron_test_metrics = perceptron_model(X_train, X_test, y_train, y_test)

In [203]:
perceptron_train_metrics, perceptron_test_metrics

({'Accuracy': 0.9108,
  'Precision': 0.8809267178934437,
  'Recall': 0.9500293724299124,
  'F1 Score': 0.9141740333152926},
 {'Accuracy': 0.850675,
  'Precision': 0.8215006420840213,
  'Recall': 0.8959135697494123,
  'F1 Score': 0.8570950068186712})

# SVM

In [204]:
def svm_model(X_train, X_test, y_train, y_test): 

    technique = LinearSVC(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [205]:
svm_train_metrics, svm_test_metrics = svm_model(X_train, X_test, y_train, y_test)



In [206]:
svm_train_metrics, svm_test_metrics

({'Accuracy': 0.93770625,
  'Precision': 0.9384954422518281,
  'Recall': 0.936818028422513,
  'F1 Score': 0.9376559851380175},
 {'Accuracy': 0.895025,
  'Precision': 0.89659501807955,
  'Recall': 0.8929625368879107,
  'F1 Score': 0.8947750908407468})

# Logistic Regression

In [207]:
def logistic_regression_model(X_train, X_test, y_train, y_test): 

    technique = LogisticRegression(random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics

In [208]:
logistic_regression_train_metrics, logistic_regression_test_metrics = logistic_regression_model(X_train, X_test, y_train, y_test)

In [209]:
logistic_regression_train_metrics, logistic_regression_test_metrics

({'Accuracy': 0.91105,
  'Precision': 0.9125221077982515,
  'Recall': 0.9092829377429474,
  'F1 Score': 0.9108996431478119},
 {'Accuracy': 0.89495,
  'Precision': 0.8990649481930756,
  'Recall': 0.8897113989896464,
  'F1 Score': 0.8943637186384433})

# Naive Bayes

In [210]:
def naive_bayes_model(X_train, X_test, y_train, y_test): 

    technique = MultinomialNB(random_state=0)
    technique.fit(X_train.toarray(), y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)

    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [211]:
# naive_bayes_train_metrics, naive_bayes_test_metrics = naive_bayes_model(X_train, X_test, y_train, y_test)