In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [3]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

## Keep Reviews and Ratings

In [76]:
reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


 ## We form three classes and select 20000 reviews randomly from each class.
- [ ] 100,000 each


In [5]:


valid = ["1","2","3","4","5"]
from copy import deepcopy

stars = deepcopy(reviews_ratings_df).star_rating.astype(str) # turn entries to strings
where_valid = stars.index[stars.isin(valid)].tolist() # check valid list and see which of our stars match
reviews_ratings_df = reviews_ratings_df.iloc[where_valid]

reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [6]:
reviews_ratings_df.star_rating = reviews_ratings_df.star_rating.apply(lambda x: int(x))

In [7]:
reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [8]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):
    """Categorizes reviews by adding a rating
    
    """


    if sentiment_type == 'positive_sentiment':
        positive_review_threshold = df[rating_col] > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'negative_sentiment':
        positive_review_threshold = df[rating_col] < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 0

    elif sentiment_type == 'neutral_sentiment':
        positive_review_threshold = df[rating_col] == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [9]:
positive_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'positive_sentiment')
positive_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1


Unnamed: 0,star_rating,review_body,positive_sentiment
0,5,Great product.,1
1,5,What's to say about this commodity item except...,1
2,5,"Haven't used yet, but I am sure I will like it.",1
4,4,Gorgeous colors and easy to use,1
5,5,Perfect for planning weekly meals. Removrd the...,1
...,...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...,1
2640250,4,Although the Palm Pilot is thin and compact it...,1
2640251,4,This book had a lot of great content without b...,1
2640252,5,I am teaching a course in Excel and am using t...,1


In [10]:
negative_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'negative_sentiment')
negative_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 0


Unnamed: 0,star_rating,review_body,negative_sentiment
3,1,Although this was labeled as &#34;new&#34; the...,0
13,1,worked about a month then died,0
20,1,The phone did not work. No Dial Tone. Not wo...,0
27,1,Not laminated and no reinforced holes for hang...,0
28,1,"Cartridge was over filled, black smears on pap...",0
...,...,...,...
2640139,2,This purchase was intended for a home office s...,0
2640149,2,I bought a Palm V from Amazon and thought it w...,0
2640151,1,The display is excellent - it's a good size an...,0
2640201,1,All the CE based hand held or palm computers h...,0


In [11]:
neutral_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'neutral_sentiment')
neutral_sentiment_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


Unnamed: 0,star_rating,review_body,neutral_sentiment
48,3,Nice quality. Happy with the item,3
64,3,The batch I had exploded all over when I tried...,3
95,3,"It is ok, but considering the price plus shipp...",3
133,3,Delighted to receive a sample of these to try ...,3
145,3,I use this light in a dark area of my closet. ...,3
...,...,...,...
2640209,3,I was VERY disappointed to receive my Palm V a...,3
2640219,3,Very basic. The book spends a lot of time des...,3
2640225,3,"Being a Newton devotee, switching to the Palm ...",3
2640234,3,I have a US Robotics Palm Pro (we go back a wa...,3


In [12]:
pos_rand_sampled_df = positive_sentiment_df.sample(100000)
pos_rand_sampled_df

Unnamed: 0,star_rating,review_body,positive_sentiment
513036,5,"Ok and very very fast delivery (dearly paid, a...",1
348810,5,cheez it,1
151383,5,I really like this pen. It can write even when...,1
1300333,5,very good thanks,1
2406033,5,I think I've bought about half the things Aver...,1
...,...,...,...
1777581,5,They are huge. I love them so much I made a se...,1
990536,5,These greeting cards are wonderful. I love Deb...,1
594520,4,Click and done!,1
711296,5,Love these pens,1


In [13]:
neg_rand_sampled_df = negative_sentiment_df.sample(100000)
neg_rand_sampled_df

Unnamed: 0,star_rating,review_body,negative_sentiment
523975,1,Seams are tearing first time I tried on with o...,0
2630874,1,"We thought we had bought a good prodcut, Multi...",0
1255475,1,"The driver for iMac, Os X, does NOT work! Ther...",0
2638943,1,This is one of the wost scanner i have ever ha...,0
1363039,1,I received the printer yesterday only to find ...,0
...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0
1280158,1,Horrible! The pad is way too thin and keep sli...,0
2341521,1,The first cartridge to be replaced leaked all ...,0
2322330,2,Very Cheaply made. The Cork is fraying along t...,0


In [14]:
reviews_ratings_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])
reviews_ratings_df

Unnamed: 0,star_rating,review_body,positive_sentiment,negative_sentiment
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,
348810,5,cheez it,1.0,
151383,5,I really like this pen. It can write even when...,1.0,
1300333,5,very good thanks,1.0,
2406033,5,I think I've bought about half the things Aver...,1.0,
...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,,0.0
1280158,1,Horrible! The pad is way too thin and keep sli...,,0.0
2341521,1,The first cartridge to be replaced leaked all ...,,0.0
2322330,2,Very Cheaply made. The Cork is fraying along t...,,0.0


In [15]:
pos_sentiment = reviews_ratings_df['positive_sentiment'].dropna()
pos_sentiment

513036     1.0
348810     1.0
151383     1.0
1300333    1.0
2406033    1.0
          ... 
1777581    1.0
990536     1.0
594520     1.0
711296     1.0
1100       1.0
Name: positive_sentiment, Length: 100000, dtype: float64

In [16]:
neg_sentiment = reviews_ratings_df['negative_sentiment'].dropna()
neg_sentiment

523975     0.0
2630874    0.0
1255475    0.0
2638943    0.0
1363039    0.0
          ... 
2164856    0.0
1280158    0.0
2341521    0.0
2322330    0.0
2213676    0.0
Name: negative_sentiment, Length: 100000, dtype: float64

In [17]:
reviews_ratings_df['sentiment'] = pd.concat([pos_sentiment, neg_sentiment])

In [18]:
reviews_ratings_df

Unnamed: 0,star_rating,review_body,positive_sentiment,negative_sentiment,sentiment
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,,1.0
348810,5,cheez it,1.0,,1.0
151383,5,I really like this pen. It can write even when...,1.0,,1.0
1300333,5,very good thanks,1.0,,1.0
2406033,5,I think I've bought about half the things Aver...,1.0,,1.0
...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,,0.0,0.0
1280158,1,Horrible! The pad is way too thin and keep sli...,,0.0,0.0
2341521,1,The first cartridge to be replaced leaked all ...,,0.0,0.0
2322330,2,Very Cheaply made. The Cork is fraying along t...,,0.0,0.0


In [19]:
reviews_sentiment_df = reviews_ratings_df.drop(columns=['positive_sentiment', 'negative_sentiment'])
reviews_sentiment_df

Unnamed: 0,star_rating,review_body,sentiment
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0
348810,5,cheez it,1.0
151383,5,I really like this pen. It can write even when...,1.0
1300333,5,very good thanks,1.0
2406033,5,I think I've bought about half the things Aver...,1.0
...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0
2341521,1,The first cartridge to be replaced leaked all ...,0.0
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0


# Data Cleaning

## Lower case
- NOTE: Not all reviews are a string. To solve,
    - [ ] Filter out non-strings when/before randomly sampling

In [20]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        if type(text_review) != str:
            converted_str = str(text_review)
            # update_text_review = converted_str.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()

    updated_df['lower_cased'] = lower_case_reviews
    
    # updated_df = df
    return updated_df

In [21]:
reviews_lower_cased = convert_reviews_to_lower_case(reviews_sentiment_df, 'review_body')

In [22]:
reviews_lower_cased

Unnamed: 0,star_rating,review_body,sentiment,lower_cased
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a..."
348810,5,cheez it,1.0,cheez it
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...
1300333,5,very good thanks,1.0,very good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...
...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...


## Remove HTML and URLs
- [x] Verify by finding a specific entry with HTML, URL.

In [23]:
def remove_html_and_urls(df: pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    cleaned_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check and remove HTML tags
        has_html = bool(re.search('<.*?>', text_review))
        if has_html == True:
            # print("Review", text_reviews_idx, "has HTML -- ", text_review)
            pass

        no_html_review = re.sub('<.*?>', ' ', text_review)
        # print("Review", text_reviews_idx, "without HTML -- ", no_html_review)
    
        # Check and remove URLs
        has_url = bool(re.search(r'http\S+', no_html_review))
        if has_url == True:
            # print("Review", text_reviews_idx, "has URL --", no_html_review)
            pass

        no_html_url_review = re.sub(r'http\S+', '', no_html_review)
        # print("Review", text_reviews_idx, "without HTML, URL -- ", no_html_url_review)
        # print()
        cleaned_reviews.append(no_html_url_review)

    updated_df['no_html_urls'] = cleaned_reviews
    return updated_df

In [24]:
no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')

In [25]:
no_html_urls_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a..."
348810,5,cheez it,1.0,cheez it,cheez it
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...,i really like this pen. it can write even when...
1300333,5,very good thanks,1.0,very good thanks,very good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...,i think i've bought about half the things aver...
...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...


## Remove Contractions
- [ ] Need to update; make my own

In [26]:
contraction_mapping = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "couldn't": "could not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he's": "he is",
        "isn't": "is not",
        "it's": "it is",
        "let's": "let us",
        "mustn't": "must not",
        "shan't": "shall not",
        "she's": "she is",
        "shouldn't": "should not",
        "that's": "that is",
        "there's": "there is",
        "they're": "they are",
        "wasn't": "was not",
        "we're": "we are",
        "weren't": "were not",
        "won't": "will not",
        "wouldn't": "would not",
        "you're": "you are",
        "you'll": "you will",
        "you'd": "you would"
    }

In [27]:
def expand_contractions(input_idx, input_text):
    # Function to replace contractions with their expanded forms
    def replace(match):
        # print("Review", input_idx, "with contraction -- ", input_text)
        return contraction_mapping[match.group(0)]

    # Use regular expression to find contractions and replace them
    contraction_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
    expanded_text = contraction_pattern.sub(replace, input_text)

    return expanded_text

In [28]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_contractions_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        without_contraction = expand_contractions(text_reviews_idx, text_review)
        # print("Review", text_reviews_idx, "without contraction -- ", without_contraction)
        # print()
        without_contractions_reviews.append(without_contraction)

    updated_df['no_contractions'] = without_contractions_reviews
    return updated_df

In [29]:
reviews_no_contractions_df = remove_contractions(no_html_urls_df, 'no_html_urls')

In [30]:
reviews_no_contractions_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a..."
348810,5,cheez it,1.0,cheez it,cheez it,cheez it
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen. it can write even when...
1300333,5,very good thanks,1.0,very good thanks,very good thanks,very good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i've bought about half the things aver...
...,...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...


## Remove Non-alphabetical characters
- [ ] If entry has no letters, leave blank?

In [31]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """

    alphabetical_char_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check for non-alphabetical characters
        has_non_alphabetical_char = bool(re.search(r'[^a-zA-Z]', text_review))
        if has_non_alphabetical_char == True:
            # print("Review", text_reviews_idx, "has HTML -- ", text_review)
            pass
        
        # Remove non-alphabetical characters
        with_alphabetical_char = re.sub(r'[^a-zA-Z\s]', ' ', text_review)
        # print("Review", text_reviews_idx, "has HTML -- ", with_alphabetical_char)
        alphabetical_char_reviews.append(with_alphabetical_char)

    updated_df['only_alpha_chars'] = alphabetical_char_reviews
    return updated_df

In [32]:
only_alpha_chars_df = remove_non_alphabetical_characters(reviews_no_contractions_df, 'no_html_urls')

In [33]:
only_alpha_chars_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...",ok and very very fast delivery dearly paid a...
348810,5,cheez it,1.0,cheez it,cheez it,cheez it,cheez it
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen it can write even when...
1300333,5,very good thanks,1.0,very good thanks,very good thanks,very good thanks,very good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i ve bought about half the things aver...
...,...,...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible the pad is way too thin and keep sli...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made the cork is fraying along t...


## Remove extra spaces
- [ ] Verify with a specific entry

In [34]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    single_spaced_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # Check if there are any extra spaces
        has_extra_space = bool(re.search(r' +', text_review))
        if has_extra_space == True:
            # print("Review", text_reviews_idx, "has extra space -- ", text_review)
            pass
        
        # Remove extra spaces
        single_spaced_review = re.sub(r' +', ' ', text_review)
        # print("Review", text_reviews_idx, "without extra space -- ", single_spaced_review)
        # print()
        
        single_spaced_reviews.append(single_spaced_review)

    updated_df['no_extra_space'] = single_spaced_reviews
    return updated_df

In [35]:
no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'only_alpha_chars')

In [36]:
no_extra_space_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...",ok and very very fast delivery dearly paid a...,ok and very very fast delivery dearly paid any...
348810,5,cheez it,1.0,cheez it,cheez it,cheez it,cheez it,cheez it
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen it can write even when...,i really like this pen it can write even when ...
1300333,5,very good thanks,1.0,very good thanks,very good thanks,very good thanks,very good thanks,very good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i ve bought about half the things aver...,i think i ve bought about half the things aver...
...,...,...,...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible the pad is way too thin and keep sli...,horrible the pad is way too thin and keep slid...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made the cork is fraying along t...,very cheaply made the cork is fraying along th...


# Pre-processing

## remove the stop words 

In [37]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [38]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_stop_words_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        text_review_words = word_tokenize(text_review) 

        # print("Before stop word removal", text_reviews_idx, " -- ", text_review)

        filtered_review = []

        for text_review_words_idx in range(len(text_review_words)):
            text_review_word = text_review_words[text_review_words_idx]
            
            # Check if review word is a stop word
            if text_review_word in stop_words:
                # print("  Stop word -- ", text_review_word)
                pass
            else:
                # print(text_review_word, " -- is NOT a stop word in review")
                filtered_review.append(text_review_word)

        
        filtered_review = " ".join(filtered_review)
        # print("After stop word removal", text_reviews_idx, " -- ", filtered_review)
        # print()
        
        without_stop_words_reviews.append(filtered_review)

    updated_df['without_stop_words'] = without_stop_words_reviews
    return updated_df

In [39]:
no_stop_words_df = filter_stop_words(no_extra_space_df, 'no_extra_space')

In [40]:
no_stop_words_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space,without_stop_words
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...",ok and very very fast delivery dearly paid a...,ok and very very fast delivery dearly paid any...,ok fast delivery dearly paid anyway
348810,5,cheez it,1.0,cheez it,cheez it,cheez it,cheez it,cheez it,cheez
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen it can write even when...,i really like this pen it can write even when ...,really like pen write even lying
1300333,5,very good thanks,1.0,very good thanks,very good thanks,very good thanks,very good thanks,very good thanks,good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i ve bought about half the things aver...,i think i ve bought about half the things aver...,think bought half things avery sells since rea...
...,...,...,...,...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,problem earlier customer bastexwireless screen...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible the pad is way too thin and keep sli...,horrible the pad is way too thin and keep slid...,horrible pad way thin keep sliding worth price...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,first cartridge replaced leaked desk clothing ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made the cork is fraying along t...,very cheaply made the cork is fraying along th...,cheaply made cork fraying along dry erase fram...


## perform lemmatization  

- "I was jogging with Aman for 3 miles"
- "I was jog with Aman for 3 miles"
- [ ] NOT working with "ing". Why?
- [ ] Working with "words" -> word

In [None]:
# lemmatized_words = [lem.lemmatize(word) for word in lemmed_words]
        # lemmatized_sentence = ' '.join(lemmatized_words)
        # print("After lem update", text_reviews_idx, " -- ", lemmatized_sentence)
        

In [46]:
from nltk.stem import WordNetLemmatizer

def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    lemmed_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]        
        words_in_review = word_tokenize(text_review) 

        # print("Before lem update", text_reviews_idx, " -- ", text_review)
        # print("Lemmed words", words_in_review)
        

        lemmed_sentence = []

        # Split review into words
        for lemmed_words_idx in range(len(words_in_review)):
            word = words_in_review[lemmed_words_idx]
            
            apply_lemmatization = lem.lemmatize(word)
            # print(apply_lemmatization)
            
            lemmed_sentence.append(apply_lemmatization)
            filtered_review = " ".join(lemmed_sentence)
    
        # print("After lem update -- ", filtered_review)
        # print()

        lemmed_reviews.append(filtered_review)

    updated_df['lemmed_reviews'] = lemmed_reviews

    return updated_df

In [54]:
lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')

In [55]:
lemmed_df

Unnamed: 0,star_rating,review_body,sentiment,lower_cased,no_html_urls,no_contractions,only_alpha_chars,no_extra_space,without_stop_words,lemmed_reviews
513036,5,"Ok and very very fast delivery (dearly paid, a...",1.0,"ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...","ok and very very fast delivery (dearly paid, a...",ok and very very fast delivery dearly paid a...,ok and very very fast delivery dearly paid any...,ok fast delivery dearly paid anyway,ok fast delivery dearly paid anyway
348810,5,cheez it,1.0,cheez it,cheez it,cheez it,cheez it,cheez it,cheez,cheez
151383,5,I really like this pen. It can write even when...,1.0,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen. it can write even when...,i really like this pen it can write even when...,i really like this pen it can write even when ...,really like pen write even lying,really like pen write even lying
1300333,5,very good thanks,1.0,very good thanks,very good thanks,very good thanks,very good thanks,very good thanks,good thanks,good thanks
2406033,5,I think I've bought about half the things Aver...,1.0,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i've bought about half the things aver...,i think i ve bought about half the things aver...,i think i ve bought about half the things aver...,think bought half things avery sells since rea...,think bought half thing avery sell since reall...
...,...,...,...,...,...,...,...,...,...,...
2164856,1,Same problem as earlier customer of BastexWire...,0.0,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,same problem as earlier customer of bastexwire...,problem earlier customer bastexwireless screen...,problem earlier customer bastexwireless screen...
1280158,1,Horrible! The pad is way too thin and keep sli...,0.0,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible! the pad is way too thin and keep sli...,horrible the pad is way too thin and keep sli...,horrible the pad is way too thin and keep slid...,horrible pad way thin keep sliding worth price...,horrible pad way thin keep sliding worth price...
2341521,1,The first cartridge to be replaced leaked all ...,0.0,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,the first cartridge to be replaced leaked all ...,first cartridge replaced leaked desk clothing ...,first cartridge replaced leaked desk clothing ...
2322330,2,Very Cheaply made. The Cork is fraying along t...,0.0,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made. the cork is fraying along t...,very cheaply made the cork is fraying along t...,very cheaply made the cork is fraying along th...,cheaply made cork fraying along dry erase fram...,cheaply made cork fraying along dry erase fram...


# TF-IDF Feature Extraction

In [56]:
def tf_idf_feature_extraction(df: pd.DataFrame, col_name: str):
    """Extract the TF-IDF features from the reviews.

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    tf_idf_features:
        A matrix containing the TF-IDF features extracted
        
    """

    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(df[col_name])

    return tf_idf_features


In [57]:
tf_idf_features = tf_idf_feature_extraction(lemmed_df, 'lemmed_reviews')


In [58]:
tf_idf_features[0]

<1x56588 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

## Split Features and Sentiment Labels

In [59]:
sentiments = lemmed_df['sentiment']
sentiments.shape

(200000,)

In [60]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tf_idf_features, sentiments, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160000, 56588), (40000, 56588), (160000,), (40000,))

# Models

In [61]:
import sklearn
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

## Evaluation Metrics

In [62]:
def eval_accuracy(y_true, y_prediction):
    return sklearn.metrics.accuracy_score(y_true, y_prediction)

def eval_precision(y_true, y_prediction):
    return sklearn.metrics.precision_score(y_true, y_prediction)

def eval_recall(y_true, y_prediction):
    return sklearn.metrics.recall_score(y_true, y_prediction)

def eval_f1_score(y_true, y_prediction):
    return sklearn.metrics.f1_score(y_true, y_prediction)

In [63]:
def train_eval_metric(y_train_true, y_train_predictions):
    accuracy = eval_accuracy(y_train_true, y_train_predictions)
    precision = eval_precision(y_train_true, y_train_predictions)
    recall = eval_recall(y_train_true, y_train_predictions)
    f1 = eval_f1_score(y_train_true, y_train_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

def test_eval_metric(y_test_true, y_test_predictions):
    accuracy = eval_accuracy(y_test_true, y_test_predictions)
    precision = eval_precision(y_test_true, y_test_predictions)
    recall = eval_recall(y_test_true, y_test_predictions)
    f1 = eval_f1_score(y_test_true, y_test_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

# Perceptron

In [64]:
def perceptron_model(X_train, X_test, y_train, y_test): 

    technique = Perceptron(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [65]:
perceptron_train_metrics, perceptron_test_metrics = perceptron_model(X_train, X_test, y_train, y_test)

In [66]:
perceptron_train_metrics, perceptron_test_metrics

({'Accuracy': 0.89968125,
  'Precision': 0.9227992701695005,
  'Recall': 0.8723611683977652,
  'F1 Score': 0.8968716469310786},
 {'Accuracy': 0.8538,
  'Precision': 0.8757371301067842,
  'Recall': 0.82448857099985,
  'F1 Score': 0.8493404781533388})

# SVM

In [67]:
def svm_model(X_train, X_test, y_train, y_test): 

    technique = LinearSVC(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [68]:
svm_train_metrics, svm_test_metrics = svm_model(X_train, X_test, y_train, y_test)



In [69]:
svm_train_metrics, svm_test_metrics

({'Accuracy': 0.93075625,
  'Precision': 0.9315119947914058,
  'Recall': 0.9298936343069981,
  'F1 Score': 0.9307021110242377},
 {'Accuracy': 0.89385,
  'Precision': 0.8952758672624128,
  'Recall': 0.8919621867653679,
  'F1 Score': 0.8936159551012227})

# Logistic Regression

In [70]:
def logistic_regression_model(X_train, X_test, y_train, y_test): 

    technique = LogisticRegression(random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics

In [71]:
logistic_regression_train_metrics, logistic_regression_test_metrics = logistic_regression_model(X_train, X_test, y_train, y_test)

In [72]:
logistic_regression_train_metrics, logistic_regression_test_metrics

({'Accuracy': 0.90866875,
  'Precision': 0.9117284106077014,
  'Recall': 0.9049708150536828,
  'F1 Score': 0.9083370446804373},
 {'Accuracy': 0.89635,
  'Precision': 0.8988272008858912,
  'Recall': 0.8931626069124193,
  'F1 Score': 0.8959859508278977})

# Naive Bayes

In [73]:
def naive_bayes_model(X_train, X_test, y_train, y_test): 

    technique = MultinomialNB()
    technique.fit(X_train.toarray(), y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)

    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [74]:
naive_bayes_train_metrics, naive_bayes_test_metrics = naive_bayes_model(X_train, X_test, y_train, y_test)

In [75]:
# naive_bayes_train_metrics, naive_bayes_test_metrics

NameError: name 'naive_bayes_train_metrics' is not defined