In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [2]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

In [3]:
# amazon_reviews_copy_df = amazon_reviews_df.copy()

## Keep Reviews and Ratings

In [4]:
# reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_headline', 'review_body']]
# reviews_ratings_df.reset_index(drop=True)
# reviews_ratings_df

 ## We form three classes and select 20000 reviews randomly from each class.
- [ ] 100,000 each


In [5]:
reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_headline', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

valid = ["1","2","3","4","5"]
from copy import deepcopy

stars = deepcopy(reviews_ratings_df).star_rating.astype(str) # turn entries to strings
where_valid = stars.index[stars.isin(valid)].tolist() # check valid list and see which of our stars match
reviews_ratings_df = reviews_ratings_df.iloc[where_valid]

reviews_ratings_df

Unnamed: 0,star_rating,review_headline,review_body
0,5,Five Stars,Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it."
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...
4,4,Four Stars,Gorgeous colors and easy to use
...,...,...,...
2640249,4,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...
2640250,4,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...
2640251,4,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...
2640252,5,class text,I am teaching a course in Excel and am using t...


In [6]:
reviews_ratings_df.star_rating = reviews_ratings_df.star_rating.apply(lambda x: int(x))

In [7]:
reviews_ratings_df

Unnamed: 0,star_rating,review_headline,review_body
0,5,Five Stars,Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it."
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...
4,4,Four Stars,Gorgeous colors and easy to use
...,...,...,...
2640249,4,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...
2640250,4,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...
2640251,4,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...
2640252,5,class text,I am teaching a course in Excel and am using t...


In [8]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):


    if sentiment_type == 'positive_sentiment':
        positive_review_threshold = df[rating_col] > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'negative_sentiment':
        positive_review_threshold = df[rating_col] < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 0

    elif sentiment_type == 'neutral_sentiment':
        positive_review_threshold = df[rating_col] == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

positive_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'positive_sentiment')
negative_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'negative_sentiment')
neutral_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'neutral_sentiment')
# separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'n_sentiment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


In [9]:
# positive_sentiment_df

In [10]:
# positive_sentiment_df["star_rating"].value_counts()

In [11]:
# negative_sentiment_df

In [12]:
# negative_sentiment_df["star_rating"].value_counts()

In [13]:
# neutral_sentiment_df

In [14]:
# neutral_sentiment_df["star_rating"].value_counts()

In [15]:
pos_rand_sampled_df = positive_sentiment_df.sample(100000)
pos_rand_sampled_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1
174966,5,Wow!,Shipment and expectations were far exceeded!,1
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1
1006338,5,Five Stars,Very useful around the house.,1
...,...,...,...,...
912900,4,Four Stars,"It has worked well without incident. However, ...",1
2246444,5,Best product for the cost!,"These little pens are wonderful, and usually t...",1
2273410,5,Quiet Quality (I'm THRILLED) Printer,"I LOVE THIS PRINTER!!!! It's very quiet, easy ...",1
1419150,4,Good Product,Product works great for DVD labels. Everything...,1


In [16]:
neg_rand_sampled_df = negative_sentiment_df.sample(100000)
neg_rand_sampled_df

Unnamed: 0,star_rating,review_headline,review_body,negative_sentiment
2585018,2,NOT......for desktop faxing,This printer was great for typical ink jet pri...,0
120833,1,be sure to use the cumbersome individual inks ...,I've tried for a week to connect this Brother ...,0
1624777,1,Piece of Junk!,This electric eraser is a piece of junk. The m...,0
1656875,2,Priced too high,Kind of like a velvet painting...looks okay fr...,0
2061752,1,Connection,Good voice quality. But costumer service is no...,0
...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,0
2526240,1,rv,I bought it as recommended to be used with my ...,0
1795722,1,Not sure about this,After talking with you i think you are not mak...,0
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,0


In [17]:
# reviews_ratings_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])

In [18]:
reviews_ratings_with_sentiment_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])

In [19]:
reviews_ratings_with_sentiment_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,
1006338,5,Five Stars,Very useful around the house.,1.0,
...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0


# Data Cleaning

## Lower case
- NOTE: Not all reviews are a string. To solve,
    - [ ] Filter out non-strings when/before randomly sampling

In [20]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        if type(text_review) != str:
            converted_str = str(text_review)
            # update_text_review = converted_str.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()

    update_col_name = col_name + '_lower_cased'
    updated_df[update_col_name] = lower_case_reviews
    
    # updated_df = df
    return updated_df

In [21]:
# reviews_ratings_with_sentiment_df

In [22]:
reviews_lc_hl_df = convert_reviews_to_lower_case(reviews_ratings_with_sentiment_df, 'review_headline')

In [23]:
# reviews_lc_hl_df.head(10)

In [24]:
reviews_lc_rb_df = convert_reviews_to_lower_case(reviews_lc_hl_df, 'review_body')

In [25]:
reviews_lc_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,it was great nice and pretty i think i'm going...
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens!,let me start by saying that i look for everyth...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow!,shipment and expectations were far exceeded!
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice and sturdy. easy to assemble. the nice ...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,very useful around the house.
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,i bought the item thinking it was appropriate ...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,i bought it as recommended to be used with my ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,after talking with you i think you are not mak...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine?,everything is just a bit too crowded and small...


## Remove HTML and URLs
- [ ] Verify by finding a specific entry with HTML, URL.

In [26]:
def remove_html_and_urls(df:pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    # updated_reviews = []
    # updated_df = df.copy()
    # text_reviews = df[col_name].values

    # for text_reviews_idx in range(len(text_reviews)):
    #     text_review = text_reviews[text_reviews_idx]

    #     if type(text_review) != str:
    #         updated_reviews.append(text_review)
    #     else:
    #         update_text_review = url_pattern.sub(r'', text_review)
    #         updated_reviews.append(update_text_review)

    # updated_df[col_name] = updated_reviews
    # return updated_df

    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def clean_text(text):
        if isinstance(text, str):
            return url_pattern.sub('', text)
        return text

    df[col_name] = df[col_name].apply(clean_text)
    return df

In [27]:
reviews_no_html_urls_df = remove_html_and_urls(reviews_lc_rb_df, 'review_headline')

In [28]:
# reviews_no_html_urls_df

In [29]:
reviews_no_html_urls_rb_df = remove_html_and_urls(reviews_no_html_urls_df, 'review_body')

In [30]:
reviews_no_html_urls_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,it was great nice and pretty i think i'm going...
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens!,let me start by saying that i look for everyth...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow!,shipment and expectations were far exceeded!
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice and sturdy. easy to assemble. the nice ...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,very useful around the house.
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,i bought the item thinking it was appropriate ...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,i bought it as recommended to be used with my ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,after talking with you i think you are not mak...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine?,everything is just a bit too crowded and small...


## Remove Non-alphabetical characters
- [ ] If entry has no letters, leave blank?

In [31]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """
    


    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_review)

        # if type(text_review) != str:
        #     updated_reviews.append(text_review)
        # else:
        update_text_review = re.sub(r'[^a-zA-Z\s]', '', text_review)
        # print(text_review)
        updated_reviews.append(update_text_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [32]:
reviews_alph_chars_df = remove_non_alphabetical_characters(reviews_no_html_urls_rb_df, 'review_headline_lower_cased')

In [33]:
reviews_alph_chars_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,it was great nice and pretty i think i'm going...
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let me start by saying that i look for everyth...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment and expectations were far exceeded!
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice and sturdy. easy to assemble. the nice ...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,very useful around the house.
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,i bought the item thinking it was appropriate ...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,i bought it as recommended to be used with my ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,after talking with you i think you are not mak...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything is just a bit too crowded and small...


In [34]:
reviews_alph_chars_rb_df = remove_non_alphabetical_characters(reviews_alph_chars_df, 'review_body_lower_cased')

In [35]:
reviews_alph_chars_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,it was great nice and pretty i think im going ...
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let me start by saying that i look for everyth...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment and expectations were far exceeded
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice and sturdy easy to assemble the nice th...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,very useful around the house
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,i bought the item thinking it was appropriate ...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,i bought it as recommended to be used with my ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,after talking with you i think you are not mak...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything is just a bit too crowded and small...


## Remove extra spaces
- [ ] Verify with a specific entry

In [36]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_review)

        # if type(text_review) != str:
        #     updated_reviews.append(text_review)
        # else:
        update_text_review = re.sub(r' +', ' ', text_review)
        # print(text_review)
        updated_reviews.append(update_text_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [37]:
reviews_no_extra_space_df = remove_extra_spaces(reviews_alph_chars_rb_df, 'review_headline_lower_cased')

In [38]:
# reviews_no_extra_space_df

In [39]:
reviews_no_extra_space_rb_df = remove_extra_spaces(reviews_no_extra_space_df, 'review_body_lower_cased')

In [40]:
reviews_no_extra_space_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,it was great nice and pretty i think im going ...
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let me start by saying that i look for everyth...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment and expectations were far exceeded
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice and sturdy easy to assemble the nice thin...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,very useful around the house
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,i bought the item thinking it was appropriate ...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,i bought it as recommended to be used with my ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,after talking with you i think you are not mak...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything is just a bit too crowded and small...


## Remove Contractions
- [ ] Need to update; make my own

In [41]:
contraction_mapping = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "don't": "do not"
}

In [42]:
def expand_contractions(input_text):
    # Function to replace contractions with their expanded forms
    def replace(match):
        print("match:", match)
        return contraction_mapping[match.group(0)]

    # Use regular expression to find contractions and replace them
    contraction_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
    expanded_text = contraction_pattern.sub(replace, input_text)

    return expanded_text

In [43]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_review)

        # if type(text_review) != str:
        #     updated_reviews.append(text_review)
        # else:
        without_contraction = expand_contractions(text_review)
        # print(text_review)
        updated_reviews.append(without_contraction)

    updated_df[col_name] = updated_reviews
    return updated_df

In [44]:
reviews_no_contractions_df = remove_contractions(reviews_alph_chars_rb_df, 'review_headline_lower_cased')

In [45]:
# reviews_no_contractions_df

In [46]:
reviews_no_contractions_rb_df = remove_contractions(reviews_no_contractions_df, 'review_body_lower_cased')

In [47]:
reviews_no_contractions_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,it was great nice and pretty i think im going ...
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let me start by saying that i look for everyth...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment and expectations were far exceeded
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice and sturdy easy to assemble the nice th...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,very useful around the house
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,i bought the item thinking it was appropriate ...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,i bought it as recommended to be used with my ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,after talking with you i think you are not mak...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything is just a bit too crowded and small...


# Pre-processing

## remove the stop words 

In [48]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [49]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        text_review_words = word_tokenize(text_review) 

        filtered_review = []

        for text_review_words_idx in range(len(text_review_words)):
            text_review_word = text_review_words[text_review_words_idx]

            if text_review_word not in stop_words:
                filtered_review.append(text_review_word)

        filtered_review = " ".join(filtered_review)
        updated_reviews.append(filtered_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [50]:
without_stop_words_rb_df = filter_stop_words(reviews_no_contractions_rb_df, 'review_body_lower_cased')

In [51]:
without_stop_words_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,great nice pretty think im going order
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let start saying look everything pink saw pink...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment expectations far exceeded
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice sturdy easy assemble nice thing boxes lid...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,useful around house
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,bought item thinking appropriate home delivere...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,bought recommended used purchase sony ereader ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,talking think making clear actually cover item...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything bit crowded small also lot mistakes...


## perform lemmatization  

- "I was jogging with Aman for 3 miles"
- "I was jog with Aman for 3 miles"

In [52]:
from nltk.stem import WordNetLemmatizer

def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    
    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print("Review: ", text_review)
        
        text_review_words = word_tokenize(text_review) 

        lemmed_sentence = []

        for text_review_words_idx in range(len(text_review_words)): # get each word in the string review
            text_review_word = text_review_words[text_review_words_idx]
            lemmed_word = lem.lemmatize(text_review_word)
            lemmed_sentence.append(lemmed_word)
            filtered_review = " ".join(lemmed_sentence)
    
        # print("New sentence", filtered_review)
        # print()

        updated_reviews.append(filtered_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [53]:
lemmed_df = lemmentize_review(without_stop_words_rb_df, 'review_body_lower_cased')

In [54]:
lemmed_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,great nice pretty think im going order
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let start saying look everything pink saw pink...
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment expectation far exceeded
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice sturdy easy assemble nice thing box lid l...
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,useful around house
...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,bought item thinking appropriate home delivere...
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,bought recommended used purchase sony ereader ...
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,talking think making clear actually cover item...
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything bit crowded small also lot mistake ...


In [55]:
pos_sentiment = lemmed_df['positive_sentiment'].dropna()
pos_sentiment

725969     1.0
1991611    1.0
174966     1.0
2334158    1.0
1006338    1.0
          ... 
912900     1.0
2246444    1.0
2273410    1.0
1419150    1.0
1600259    1.0
Name: positive_sentiment, Length: 100000, dtype: float64

In [56]:
neg_sentiment = lemmed_df['negative_sentiment'].dropna()
neg_sentiment

2585018    0.0
120833     0.0
1624777    0.0
1656875    0.0
2061752    0.0
          ... 
1773240    0.0
2526240    0.0
1795722    0.0
1669592    0.0
586686     0.0
Name: negative_sentiment, Length: 100000, dtype: float64

In [57]:
lemmed_df['sentiment'] = pd.concat([pos_sentiment, neg_sentiment])

In [58]:
lemmed_df['sentiment'].unique()

array([1., 0.])

In [59]:
lemmed_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_headline_lower_cased,review_body_lower_cased,sentiment
725969,5,Five Stars,It was great nice and pretty I think I'm going...,1.0,,five stars,great nice pretty think im going order,1.0
1991611,5,Love these pens!,Let me start by saying that I look for everyth...,1.0,,love these pens,let start saying look everything pink saw pink...,1.0
174966,5,Wow!,Shipment and expectations were far exceeded!,1.0,,wow,shipment expectation far exceeded,1.0
2334158,5,Nice and sturdy,Nice and sturdy. Easy to assemble. The nice ...,1.0,,nice and sturdy,nice sturdy easy assemble nice thing box lid l...,1.0
1006338,5,Five Stars,Very useful around the house.,1.0,,five stars,useful around house,1.0
...,...,...,...,...,...,...,...,...
1773240,1,Very dissatisfied customer,I bought the item thinking it was appropriate ...,,0.0,very dissatisfied customer,bought item thinking appropriate home delivere...,0.0
2526240,1,rv,I bought it as recommended to be used with my ...,,0.0,rv,bought recommended used purchase sony ereader ...,0.0
1795722,1,Not sure about this,After talking with you i think you are not mak...,,0.0,not sure about this,talking think making clear actually cover item...,0.0
1669592,2,Anyone want mine?,Everything is just a bit too crowded and small...,,0.0,anyone want mine,everything bit crowded small also lot mistake ...,0.0


# TF-IDF Feature Extraction

In [60]:
# def compute_tf_idf_for_feature_extraction(df:pd.DataFrame, col_name: str):
#     """Extract the TF-IDF features from the reviews.

#     Parameters
#     ----------
#     df: `pd.DataFrame`
#         The data
    
#     col_name: `str`
#         Column with reviews

#     Return
#     ------
#     df: `pd.DataFrame`
#         An updated DataFrame with the extra spaces removed
#     """
    
#     feature_vector = []
#     update_text_review = []
#     text_reviews = df[col_name].values

#     vectorizer = TfidfVectorizer(input='content', max_df=1, min_df=1)
    

#     # for text_reviews_idx in range(len(text_reviews)):
#     #     text_review = text_reviews[text_reviews_idx]
#     #     # print("Review: ", text_review)
#     #     update_text_review.append(text_review)
#         # text_review_list = text_review.split(' ')
#         # print("Review: ", text_review_list)

    
#     X = vectorizer.fit_transform(update_text_review)

#     return X
# X = compute_tf_idf_for_feature_extraction(lemmed_df, 'review_body_lower_cased')


In [61]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(lemmed_df['review_body_lower_cased'])

In [62]:
X[0]

<1x101572 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [63]:
y = lemmed_df['sentiment']
y.shape

(200000,)

In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
X_train.shape, X_test.shape

((160000, 101572), (40000, 101572))

In [66]:
y_train.shape

(160000,)

In [67]:
import sklearn

In [68]:
def evaulate(y_label, y_predicted):
    accuracy = sklearn.metrics.accuracy_score(y_label, y_predicted)
    precision = sklearn.metrics.precision_score(y_label, y_predicted, average='binary')
    recall = sklearn.metrics.recall_score(y_label, y_predicted, average='binary')
    f1 = sklearn.metrics.f1_score(y_label, y_predicted, average='binary')

    return accuracy, precision, recall,f1

# Perceptron

In [69]:
clf = sklearn.linear_model.Perceptron(tol=1e-3, random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))


Training: Accuracy: 0.9137, Precision: 0.9312, Recall: 0.8936, F1-Score: 0.9120
 Testing: Accuracy: 0.8560, Precision: 0.8715, Recall: 0.8351, F1-Score: 0.8529


# SVM

In [72]:
clf = sklearn.svm.LinearSVC(random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))




Training: Accuracy: 0.9385, Precision: 0.9399, Recall: 0.9369, F1-Score: 0.9384
 Testing: Accuracy: 0.8968, Precision: 0.8955, Recall: 0.8985, F1-Score: 0.8970


# Logistic Regression

In [73]:
clf = sklearn.linear_model.LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))


Training: Accuracy: 0.9111, Precision: 0.9145, Recall: 0.9071, F1-Score: 0.9108
 Testing: Accuracy: 0.9002, Precision: 0.9010, Recall: 0.8991, F1-Score: 0.9000


# Naive Bayes

In [76]:
from sklearn.naive_bayes import MultinomialNB

In [78]:
clf = MultinomialNB()
clf.fit(X_train.toarray(), y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

tr_acc, tr_prec, tr_rec, tr_f1 = evaulate(y_train, y_pred_train)
te_acc, te_prec, te_rec, te_f1 = evaulate(y_test, y_pred_test)

print("Training: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(tr_acc, tr_prec, tr_rec, tr_f1))
print(" Testing: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1-Score: {:.4f}".format(te_acc, te_prec, te_rec, te_f1))


Training: Accuracy: 0.8798, Precision: 0.8980, Recall: 0.8570, F1-Score: 0.8770
 Testing: Accuracy: 0.8630, Precision: 0.8769, Recall: 0.8444, F1-Score: 0.8603
