In [88]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [89]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

## Keep Reviews and Ratings

In [90]:
# reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_headline', 'review_body']]
# reviews_ratings_df.reset_index(drop=True)
# reviews_ratings_df

 ## We form three classes and select 20000 reviews randomly from each class.
- [ ] 100,000 each


In [91]:
reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

valid = ["1","2","3","4","5"]
from copy import deepcopy

stars = deepcopy(reviews_ratings_df).star_rating.astype(str) # turn entries to strings
where_valid = stars.index[stars.isin(valid)].tolist() # check valid list and see which of our stars match
reviews_ratings_df = reviews_ratings_df.iloc[where_valid]

reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [92]:
reviews_ratings_df.star_rating = reviews_ratings_df.star_rating.apply(lambda x: int(x))

In [93]:
reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [94]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):
    """Categorizes reviews by adding a rating
    
    """


    if sentiment_type == 'positive_sentiment':
        positive_review_threshold = df[rating_col] > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'negative_sentiment':
        positive_review_threshold = df[rating_col] < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 0

    elif sentiment_type == 'neutral_sentiment':
        positive_review_threshold = df[rating_col] == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [95]:
positive_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'positive_sentiment')
positive_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1


Unnamed: 0,star_rating,review_body,positive_sentiment
0,5,Great product.,1
1,5,What's to say about this commodity item except...,1
2,5,"Haven't used yet, but I am sure I will like it.",1
4,4,Gorgeous colors and easy to use,1
5,5,Perfect for planning weekly meals. Removrd the...,1
...,...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...,1
2640250,4,Although the Palm Pilot is thin and compact it...,1
2640251,4,This book had a lot of great content without b...,1
2640252,5,I am teaching a course in Excel and am using t...,1


In [96]:
negative_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'negative_sentiment')
negative_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 0


Unnamed: 0,star_rating,review_body,negative_sentiment
3,1,Although this was labeled as &#34;new&#34; the...,0
13,1,worked about a month then died,0
20,1,The phone did not work. No Dial Tone. Not wo...,0
27,1,Not laminated and no reinforced holes for hang...,0
28,1,"Cartridge was over filled, black smears on pap...",0
...,...,...,...
2640139,2,This purchase was intended for a home office s...,0
2640149,2,I bought a Palm V from Amazon and thought it w...,0
2640151,1,The display is excellent - it's a good size an...,0
2640201,1,All the CE based hand held or palm computers h...,0


In [97]:
neutral_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'neutral_sentiment')
neutral_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


Unnamed: 0,star_rating,review_body,neutral_sentiment
48,3,Nice quality. Happy with the item,3
64,3,The batch I had exploded all over when I tried...,3
95,3,"It is ok, but considering the price plus shipp...",3
133,3,Delighted to receive a sample of these to try ...,3
145,3,I use this light in a dark area of my closet. ...,3
...,...,...,...
2640209,3,I was VERY disappointed to receive my Palm V a...,3
2640219,3,Very basic. The book spends a lot of time des...,3
2640225,3,"Being a Newton devotee, switching to the Palm ...",3
2640234,3,I have a US Robotics Palm Pro (we go back a wa...,3


In [98]:
pos_rand_sampled_df = positive_sentiment_df.sample(100000)
pos_rand_sampled_df

Unnamed: 0,star_rating,review_body,positive_sentiment
1646953,4,I ordered these borders for my classroom and I...,1
1335626,5,This item was easy to install and started prod...,1
1351479,5,The curtains I received were not as pink as in...,1
912488,5,Refills for study pencil.,1
326487,5,Easy to install. Good ink.,1
...,...,...,...
1312252,5,Good product and delivery on time.,1
86114,5,"Probably should be a monocular, as focus range...",1
1576139,5,This is a must for I anything users. Once it ...,1
1392975,5,Great replacement. Works well. Have been usi...,1


In [99]:
neg_rand_sampled_df = negative_sentiment_df.sample(100000)
neg_rand_sampled_df

Unnamed: 0,star_rating,review_body,negative_sentiment
402447,1,Could never get it to work with my Epson Stylu...,0
378464,1,"I bought a used unit, and it didn't work. I t...",0
2409844,1,"My printer is two years old, and still basical...",0
2110056,1,"Pros: it worked once in a while, and yes when ...",0
2497035,2,This is a very nice looking product. Unfortuna...,0
...,...,...,...
46907,1,The stones were already off of it when I recei...,0
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0
2170048,1,I got fooled by the picture of both black and ...,0
2147611,1,I got this notepads from Amazon since I found ...,0


In [100]:
reviews_ratings_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])
reviews_ratings_df

Unnamed: 0,star_rating,review_body,positive_sentiment,negative_sentiment
1646953,4,I ordered these borders for my classroom and I...,1.0,
1335626,5,This item was easy to install and started prod...,1.0,
1351479,5,The curtains I received were not as pink as in...,1.0,
912488,5,Refills for study pencil.,1.0,
326487,5,Easy to install. Good ink.,1.0,
...,...,...,...,...
46907,1,The stones were already off of it when I recei...,,0.0
1392857,2,"It's a nice looking stand, but doesn't seem ve...",,0.0
2170048,1,I got fooled by the picture of both black and ...,,0.0
2147611,1,I got this notepads from Amazon since I found ...,,0.0


In [101]:
pos_sentiment = reviews_ratings_df['positive_sentiment'].dropna()
pos_sentiment

1646953    1.0
1335626    1.0
1351479    1.0
912488     1.0
326487     1.0
          ... 
1312252    1.0
86114      1.0
1576139    1.0
1392975    1.0
2017615    1.0
Name: positive_sentiment, Length: 100000, dtype: float64

In [102]:
neg_sentiment = reviews_ratings_df['negative_sentiment'].dropna()
neg_sentiment

402447     0.0
378464     0.0
2409844    0.0
2110056    0.0
2497035    0.0
          ... 
46907      0.0
1392857    0.0
2170048    0.0
2147611    0.0
2447202    0.0
Name: negative_sentiment, Length: 100000, dtype: float64

In [103]:
reviews_ratings_df['sentiment'] = pd.concat([pos_sentiment, neg_sentiment])

In [104]:
reviews_ratings_df

Unnamed: 0,star_rating,review_body,positive_sentiment,negative_sentiment,sentiment
1646953,4,I ordered these borders for my classroom and I...,1.0,,1.0
1335626,5,This item was easy to install and started prod...,1.0,,1.0
1351479,5,The curtains I received were not as pink as in...,1.0,,1.0
912488,5,Refills for study pencil.,1.0,,1.0
326487,5,Easy to install. Good ink.,1.0,,1.0
...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,,0.0,0.0
1392857,2,"It's a nice looking stand, but doesn't seem ve...",,0.0,0.0
2170048,1,I got fooled by the picture of both black and ...,,0.0,0.0
2147611,1,I got this notepads from Amazon since I found ...,,0.0,0.0


In [105]:
reviews_sentiment_df = reviews_ratings_df.drop(columns=['positive_sentiment', 'negative_sentiment'])
reviews_sentiment_df

Unnamed: 0,star_rating,review_body,sentiment
1646953,4,I ordered these borders for my classroom and I...,1.0
1335626,5,This item was easy to install and started prod...,1.0
1351479,5,The curtains I received were not as pink as in...,1.0
912488,5,Refills for study pencil.,1.0
326487,5,Easy to install. Good ink.,1.0
...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0
2170048,1,I got fooled by the picture of both black and ...,0.0
2147611,1,I got this notepads from Amazon since I found ...,0.0


# Data Cleaning

## Lower case
- NOTE: Not all reviews are a string. To solve,
    - [ ] Filter out non-strings when/before randomly sampling

In [106]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        if type(text_review) != str:
            converted_str = str(text_review)
            # update_text_review = converted_str.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()

    updated_df['lower_cased'] = lower_case_reviews
    
    # updated_df = df
    return updated_df

In [107]:
reviews_lower_cased = convert_reviews_to_lower_case(reviews_sentiment_df, 'review_body')

In [108]:
reviews_lower_cased

Unnamed: 0,star_rating,review_body,sentiment,lower_cased
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...
912488,5,Refills for study pencil.,1.0,refills for study pencil.
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.
...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve..."
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...


## Remove HTML and URLs
- [x] Verify by finding a specific entry with HTML, URL.

In [109]:
def remove_html_and_urls(df:pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    cleaned_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check and remove HTML tags
        has_html = bool(re.search('<.*?>', text_review))
        if has_html == True:
            # print("Review", text_reviews_idx, "has HTML -- ", text_review)
            pass

        no_html_review = re.sub('<.*?>', ' ', text_review)
        # print("Review", text_reviews_idx, "without HTML -- ", no_html_review)
    
        # Check and remove URLs
        has_url = bool(re.search(r'http\S+', no_html_review))
        if has_url == True:
            # print("Review", text_reviews_idx, "has URL --", no_html_review)
            pass

        no_html_url_review = re.sub(r'http\S+', '', no_html_review)
        # print("Review", text_reviews_idx, "without HTML, URL -- ", no_html_url_review)
        # print()
        cleaned_reviews.append(no_html_url_review)

    updated_df['no_html_urls'] = cleaned_reviews
    return updated_df

In [110]:
no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')

In [111]:
no_html_urls_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...,this item was easy to install and started prod...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...
912488,5,Refills for study pencil.,1.0,refills for study pencil.,refills for study pencil.
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.,easy to install. good ink.
...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...,the stones were already off of it when i recei...
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve...","it's a nice looking stand, but doesn't seem ve..."
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...


## Remove Contractions
- [ ] Need to update; make my own

In [112]:
contraction_mapping = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he's": "he is",
        "isn't": "is not",
        "it's": "it is",
        "let's": "let us",
        "mustn't": "must not",
        "shan't": "shall not",
        "she's": "she is",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they're": "they are",
        "wasn't": "was not",
        "we're": "we are",
        "weren't": "were not",
        "won't": "will not",
        "wouldn't": "would not",
        "you're": "you are",
        "you'll": "you will",
        "you'd": "you would"
    }

In [113]:
def expand_contractions(input_idx, input_text):
    # Function to replace contractions with their expanded forms
    def replace(match):
        # print("Review", input_idx, "with contraction -- ", input_text)
        return contraction_mapping[match.group(0)]

    # Use regular expression to find contractions and replace them
    contraction_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
    expanded_text = contraction_pattern.sub(replace, input_text)

    return expanded_text

In [114]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_contractions_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        without_contraction = expand_contractions(text_reviews_idx, text_review)
        # print("Review", text_reviews_idx, "without contraction -- ", without_contraction)
        # print()
        without_contractions_reviews.append(without_contraction)

    updated_df['no_contractions'] = without_contractions_reviews
    return updated_df

In [115]:
reviews_no_contractions_df = remove_contractions(no_html_urls_df, 'no_html_urls')

In [116]:
reviews_no_contractions_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...
912488,5,Refills for study pencil.,1.0,refills for study pencil.,refills for study pencil.,refills for study pencil.
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.,easy to install. good ink.,easy to install. good ink.
...,...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve...","it's a nice looking stand, but doesn't seem ve...","it is a nice looking stand, but does not seem ..."
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...


## Remove Non-alphabetical characters
- [ ] If entry has no letters, leave blank?

In [117]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """

    alphabetical_char_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check for non-alphabetical characters
        has_non_alphabetical_char = bool(re.search(r'[^a-zA-Z]', text_review))
        if has_non_alphabetical_char == True:
            # print("Review", text_reviews_idx, "has HTML -- ", text_review)
            pass
        
        # Remove non-alphabetical characters
        with_alphabetical_char = re.sub(r'[^a-zA-Z\s]', ' ', text_review)
        # print("Review", text_reviews_idx, "has HTML -- ", with_alphabetical_char)
        alphabetical_char_reviews.append(with_alphabetical_char)

    updated_df['only_alpha_chars'] = alphabetical_char_reviews
    return updated_df

In [118]:
only_alpha_chars_df = remove_non_alphabetical_characters(reviews_no_contractions_df, 'no_html_urls')

In [119]:
only_alpha_chars_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...
912488,5,Refills for study pencil.,1.0,refills for study pencil.,refills for study pencil.,refills for study pencil.,refills for study pencil
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.,easy to install. good ink.,easy to install. good ink.,easy to install good ink
...,...,...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve...","it's a nice looking stand, but doesn't seem ve...","it is a nice looking stand, but does not seem ...",it s a nice looking stand but doesn t seem ve...
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...


## Remove extra spaces
- [ ] Verify with a specific entry

In [120]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    single_spaced_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check if there are any extra spaces
        has_extra_space = bool(re.search(r' +', text_review))
        if has_extra_space == True:
            # print("Review", text_reviews_idx, "has extra space -- ", text_review)
            pass
        
        # Remove extra spaces
        single_spaced_review = re.sub(r' +', ' ', text_review)
        # print("Review", text_reviews_idx, "without extra space -- ", single_spaced_review)
        # print()
        
        single_spaced_reviews.append(single_spaced_review)

    updated_df['no_extra_space'] = single_spaced_reviews
    return updated_df

In [121]:
no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'only_alpha_chars')

In [122]:
no_extra_space_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...
912488,5,Refills for study pencil.,1.0,refills for study pencil.,refills for study pencil.,refills for study pencil.,refills for study pencil,refills for study pencil
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.,easy to install. good ink.,easy to install. good ink.,easy to install good ink,easy to install good ink
...,...,...,...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve...","it's a nice looking stand, but doesn't seem ve...","it is a nice looking stand, but does not seem ...",it s a nice looking stand but doesn t seem ve...,it s a nice looking stand but doesn t seem ver...
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...


# Pre-processing

## remove the stop words 

In [123]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [124]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_stop_words_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        text_review_words = word_tokenize(text_review) 

        # print("Before stop word removal", text_reviews_idx, " -- ", text_review)

        filtered_review = []

        for text_review_words_idx in range(len(text_review_words)):
            text_review_word = text_review_words[text_review_words_idx]
            
            # Check if review word is a stop word
            if text_review_word in stop_words:
                # print("  Stop word -- ", text_review_word)
                pass
            else:
                # print(text_review_word, " -- is NOT a stop word in review")
                filtered_review.append(text_review_word)

        
        filtered_review = " ".join(filtered_review)
        # print("After stop word removal", text_reviews_idx, " -- ", filtered_review)
        # print()
        
        without_stop_words_reviews.append(filtered_review)

    updated_df['without_stop_words'] = without_stop_words_reviews
    return updated_df

In [125]:
no_stop_words_df = filter_stop_words(no_extra_space_df, 'no_extra_space')

In [126]:
no_stop_words_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space,without_stop_words
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,ordered borders classroom pleased little wider...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,item easy install started producing right away...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,curtains received pink picture pink enough wor...
912488,5,Refills for study pencil.,1.0,refills for study pencil.,refills for study pencil.,refills for study pencil.,refills for study pencil,refills for study pencil,refills study pencil
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.,easy to install. good ink.,easy to install. good ink.,easy to install good ink,easy to install good ink,easy install good ink
...,...,...,...,...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,stones already received
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve...","it's a nice looking stand, but doesn't seem ve...","it is a nice looking stand, but does not seem ...",it s a nice looking stand but doesn t seem ve...,it s a nice looking stand but doesn t seem ver...,nice looking stand seem sturdy would never put...
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,got fooled picture black color cartridges unfo...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,got notepads amazon since found much cheaper o...


## perform lemmatization  

- "I was jogging with Aman for 3 miles"
- "I was jog with Aman for 3 miles"
- [ ] NOT working with "ing". Why?
- [ ] Workig with "words" -> word

In [127]:
from nltk.stem import WordNetLemmatizer

def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    lemmed_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]        
        lemmed_words = word_tokenize(text_review) 

        # print("Before lem update", text_reviews_idx, " -- ", text_review)
        # print("Lemmed words", lemmed_words)
        # lemmatized_words = [lem.lemmatize(word) for word in lemmed_words]
        # lemmatized_sentence = ' '.join(lemmatized_words)
        # print("After lem update", text_reviews_idx, " -- ", lemmatized_sentence)
        

        lemmed_sentence = []

        for lemmed_words_idx in range(len(lemmed_words)):
            lemmed_word = lemmed_words[lemmed_words_idx]
            
            apply_lemmatization = lem.lemmatize(lemmed_word)
            # print(apply_lemmatization)
            
            lemmed_sentence.append(apply_lemmatization)
            filtered_review = " ".join(lemmed_sentence)
    
        # print("After lem update -- ", filtered_review)
        # print()

        lemmed_reviews.append(filtered_review)

    updated_df['lemmed_reviews'] = lemmed_reviews

    return updated_df

In [128]:
lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')

In [129]:
lemmed_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space,without_stop_words,lemmed_reviews
1646953,4,I ordered these borders for my classroom and I...,1.0,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,i ordered these borders for my classroom and i...,ordered borders classroom pleased little wider...,ordered border classroom pleased little wider ...
1335626,5,This item was easy to install and started prod...,1.0,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,this item was easy to install and started prod...,item easy install started producing right away...,item easy install started producing right away...
1351479,5,The curtains I received were not as pink as in...,1.0,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,the curtains i received were not as pink as in...,curtains received pink picture pink enough wor...,curtain received pink picture pink enough work...
912488,5,Refills for study pencil.,1.0,refills for study pencil.,refills for study pencil.,refills for study pencil.,refills for study pencil,refills for study pencil,refills study pencil,refill study pencil
326487,5,Easy to install. Good ink.,1.0,easy to install. good ink.,easy to install. good ink.,easy to install. good ink.,easy to install good ink,easy to install good ink,easy install good ink,easy install good ink
...,...,...,...,...,...,...,...,...,...,...
46907,1,The stones were already off of it when I recei...,0.0,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,the stones were already off of it when i recei...,stones already received,stone already received
1392857,2,"It's a nice looking stand, but doesn't seem ve...",0.0,"it's a nice looking stand, but doesn't seem ve...","it's a nice looking stand, but doesn't seem ve...","it is a nice looking stand, but does not seem ...",it s a nice looking stand but doesn t seem ve...,it s a nice looking stand but doesn t seem ver...,nice looking stand seem sturdy would never put...,nice looking stand seem sturdy would never put...
2170048,1,I got fooled by the picture of both black and ...,0.0,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,i got fooled by the picture of both black and ...,got fooled picture black color cartridges unfo...,got fooled picture black color cartridge unfor...
2147611,1,I got this notepads from Amazon since I found ...,0.0,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,i got this notepads from amazon since i found ...,got notepads amazon since found much cheaper o...,got notepad amazon since found much cheaper on...


# TF-IDF Feature Extraction

In [130]:
def tf_idf_feature_extraction(df:pd.DataFrame, col_name: str):
    """Extract the TF-IDF features from the reviews.

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    tf_idf_features:
        A matrix containing the TF-IDF features extracted
        
    """

    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(df[col_name])

    return tf_idf_features


In [131]:
tf_idf_features = tf_idf_feature_extraction(lemmed_df, 'lemmed_reviews')


In [132]:
tf_idf_features[0]

<1x56508 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

## Split Features and Sentiment Labels

In [133]:
sentiments = lemmed_df['sentiment']
sentiments.shape

(200000,)

In [134]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tf_idf_features, sentiments, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160000, 56508), (40000, 56508), (160000,), (40000,))

# Models

In [140]:
import sklearn
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

## Evaluation Metrics

In [141]:
def eval_accuracy(y_true, y_prediction):
    return sklearn.metrics.accuracy_score(y_true, y_prediction)

def eval_precision(y_true, y_prediction):
    return sklearn.metrics.precision_score(y_true, y_prediction)

def eval_recall(y_true, y_prediction):
    return sklearn.metrics.recall_score(y_true, y_prediction)

def eval_f1_score(y_true, y_prediction):
    return sklearn.metrics.f1_score(y_true, y_prediction)

In [142]:
def train_eval_metric(y_train_true, y_train_predictions):
    accuracy = eval_accuracy(y_train_true, y_train_predictions)
    precision = eval_precision(y_train_true, y_train_predictions)
    recall = eval_recall(y_train_true, y_train_predictions)
    f1 = eval_f1_score(y_train_true, y_train_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

def test_eval_metric(y_test_true, y_test_predictions):
    accuracy = eval_accuracy(y_test_true, y_test_predictions)
    precision = eval_precision(y_test_true, y_test_predictions)
    recall = eval_recall(y_test_true, y_test_predictions)
    f1 = eval_f1_score(y_test_true, y_test_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

# Perceptron

In [143]:
def perceptron_model(X_train, X_test, y_train, y_test): 

    technique = Perceptron(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [144]:
perceptron_train_metrics, perceptron_test_metrics = perceptron_model(X_train, X_test, y_train, y_test)

In [145]:
perceptron_train_metrics, perceptron_test_metrics

({'Accuracy': 0.903625,
  'Precision': 0.9143539012279148,
  'Recall': 0.8906970640068994,
  'F1 Score': 0.90237046041635},
 {'Accuracy': 0.851625,
  'Precision': 0.8628432789593228,
  'Recall': 0.8360426149152204,
  'F1 Score': 0.8492315508700622})

# SVM

In [146]:
def svm_model(X_train, X_test, y_train, y_test): 

    technique = LinearSVC(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [147]:
svm_train_metrics, svm_test_metrics = svm_model(X_train, X_test, y_train, y_test)



In [148]:
svm_train_metrics, svm_test_metrics

({'Accuracy': 0.9307125,
  'Precision': 0.9316140829899425,
  'Recall': 0.9296811528991213,
  'F1 Score': 0.9306466142835694},
 {'Accuracy': 0.893275,
  'Precision': 0.8936116952037649,
  'Recall': 0.8927624668634022,
  'F1 Score': 0.8931868791753197})

# Logistic Regression

In [149]:
def logistic_regression_model(X_train, X_test, y_train, y_test): 

    technique = LogisticRegression(random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics

In [150]:
logistic_regression_train_metrics, logistic_regression_test_metrics = logistic_regression_model(X_train, X_test, y_train, y_test)

In [151]:
logistic_regression_train_metrics, logistic_regression_test_metrics

({'Accuracy': 0.90973125,
  'Precision': 0.912331455021131,
  'Recall': 0.9065956728786231,
  'F1 Score': 0.909454520377905},
 {'Accuracy': 0.895275,
  'Precision': 0.8972850678733032,
  'Recall': 0.8926624318511479,
  'F1 Score': 0.8949677807587193})

# Naive Bayes

In [152]:
def naive_bayes_model(X_train, X_test, y_train, y_test): 

    technique = MultinomialNB(random_state=0)
    technique.fit(X_train.toarray(), y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)

    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [153]:
# naive_bayes_train_metrics, naive_bayes_test_metrics = naive_bayes_model(X_train, X_test, y_train, y_test)