In [165]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [166]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

In [167]:
# amazon_reviews_copy_df = amazon_reviews_df.copy()

## Keep Reviews and Ratings

In [168]:
reviews_ratings_df = amazon_reviews_copy_df[['star_rating', 'review_headline', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

Unnamed: 0,star_rating,review_headline,review_body
0,5,Five Stars,Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it."
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...
4,4,Four Stars,Gorgeous colors and easy to use
...,...,...,...
2640249,4,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...
2640250,4,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...
2640251,4,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...
2640252,5,class text,I am teaching a course in Excel and am using t...


 ## We form three classes and select 20000 reviews randomly from each class.
- [ ] 100,000 each


In [169]:
from copy import deepcopy
valid = ["1","2","3","4","5"]


stars = deepcopy(reviews_ratings_df).star_rating.astype(str) # turn entries to strings
where_valid = stars.index[stars.isin(valid)].tolist() # check valid list and see which of our stars match
reviews_ratings_df = reviews_ratings_df.iloc[where_valid]

reviews_ratings_df

Unnamed: 0,star_rating,review_headline,review_body
0,5,Five Stars,Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it."
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...
4,4,Four Stars,Gorgeous colors and easy to use
...,...,...,...
2640249,4,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...
2640250,4,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...
2640251,4,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...
2640252,5,class text,I am teaching a course in Excel and am using t...


In [170]:
reviews_ratings_df.star_rating = reviews_ratings_df.star_rating.apply(lambda x: int(x))

In [171]:
reviews_ratings_df

Unnamed: 0,star_rating,review_headline,review_body
0,5,Five Stars,Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it."
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...
4,4,Four Stars,Gorgeous colors and easy to use
...,...,...,...
2640249,4,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...
2640250,4,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...
2640251,4,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...
2640252,5,class text,I am teaching a course in Excel and am using t...


In [172]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):


    if sentiment_type == 'positive_sentiment':
        positive_review_threshold = df[rating_col] > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'negative_sentiment':
        positive_review_threshold = df[rating_col] < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 0

    elif sentiment_type == 'neutral_sentiment':
        positive_review_threshold = df[rating_col] == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [173]:
positive_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'positive_sentiment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1


In [174]:
positive_sentiment_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment
0,5,Five Stars,Great product.,1
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,1
2,5,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",1
4,4,Four Stars,Gorgeous colors and easy to use,1
5,5,Five Stars,Perfect for planning weekly meals. Removrd the...,1
...,...,...,...,...
2640249,4,Great value! A must if you hate to carry thing...,I can't live anymore whithout my Palm III. But...,1
2640250,4,Attaches the Palm Pilot like an appendage,Although the Palm Pilot is thin and compact it...,1
2640251,4,"Excellent information, pictures and stories, I...",This book had a lot of great content without b...,1
2640252,5,class text,I am teaching a course in Excel and am using t...,1


In [175]:
positive_sentiment_df["star_rating"].value_counts()

star_rating
5    1582812
4     418371
Name: count, dtype: int64

In [176]:
negative_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'negative_sentiment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 0


In [177]:
negative_sentiment_df

Unnamed: 0,star_rating,review_headline,review_body,negative_sentiment
3,1,and the shredder was dirty and the bin was par...,Although this was labeled as &#34;new&#34; the...,0
13,1,One Star,worked about a month then died,0
20,1,One Star,The phone did not work. No Dial Tone. Not wo...,0
27,1,One Star,Not laminated and no reinforced holes for hang...,0
28,1,One Star,"Cartridge was over filled, black smears on pap...",0
...,...,...,...,...
2640139,2,Did not work from the moment it was set up.,This purchase was intended for a home office s...,0
2640149,2,Problems crashing,I bought a Palm V from Amazon and thought it w...,0
2640151,1,"Nice display, but the case is poorly engineered",The display is excellent - it's a good size an...,0
2640201,1,Pseudo Compatability Still a Serious Problem,All the CE based hand held or palm computers h...,0


In [178]:
negative_sentiment_df["star_rating"].value_counts()

star_rating
1    306979
2    138384
Name: count, dtype: int64

In [179]:
neutral_sentiment_df = separate_reviews_by_rating(reviews_ratings_df, 'star_rating', 3, 'neutral_sentiment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


In [180]:
neutral_sentiment_df["star_rating"].value_counts()

star_rating
3    193691
Name: count, dtype: int64

- [ ] Be sure to properly sample to correct amount.

In [181]:
pos_rand_sampled_df = positive_sentiment_df.sample(2000)
pos_rand_sampled_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1
148474,5,Love these.,love theses pens they write so smoothly with n...,1
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1
...,...,...,...,...
160980,5,This is a notepad and not a 'journal'. Paper a...,It is important to realize that this is a note...,1
1035261,5,Five Stars,Love these pens. Write smooth and erase easily!,1
1937610,5,Uses the latest GPS chip with outstanding sens...,I have used GPS devices for many years and thi...,1
1887687,5,So far so good,It's doing a great job so far. I don't use it ...,1


In [182]:
neg_rand_sampled_df = negative_sentiment_df.sample(2000)
neg_rand_sampled_df

Unnamed: 0,star_rating,review_headline,review_body,negative_sentiment
7035,2,No good,Not what I was expected. I thought it would be...,0
1843416,1,Poor quality printer,Do not buy! Poor quality (I guess you get wha...,0
594919,1,Neat is not a software company...,"I have had this scanner for far too long, and ...",0
2054187,1,Not so good,Does not product a staple (miss fires) every o...,0
1394962,1,Does not work,We tried to use this for a music festival and ...,0
...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,0
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",0
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",0
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",0


In [183]:
# reviews_ratings_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])

In [184]:
reviews_ratings_with_sentiment_df = pd.concat([pos_rand_sampled_df, neg_rand_sampled_df])

In [185]:
reviews_ratings_with_sentiment_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,
...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0


# Data Cleaning

## Lower case
- NOTE: Not all reviews are a string. To solve,
    - [ ] Filter out non-strings when/before randomly sampling

In [186]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        if type(text_review) != str:
            converted_str = str(text_review)
            # update_text_review = converted_str.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)
            # print(text_reviews_idx, update_text_review)
            # print()

    update_col_name = col_name + '_lower_cased'
    updated_df[update_col_name] = lower_case_reviews
    
    # updated_df = df
    return updated_df

In [187]:
reviews_df = convert_reviews_to_lower_case(reviews_ratings_with_sentiment_df, 'review_body')

In [188]:
reviews_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,this is the second piece of furniture i've bou...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,"bought this for my daughter, who is an artist ..."
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love theses pens they write so smoothly with n...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,"i never knew i needed this, but this comes in ..."
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product. would recommend. only reason ...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,not what i expected.i bought this for my 9 yea...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,"hey folks, this ink is good factory ink... b..."
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,"first, the 4 ink system is a pain. it tends to..."
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,"i bought the hd2 handset on dec. 5, 2013. my w..."


## Remove HTML and URLs
- [ ] Verify by finding a specific entry with HTML, URL.

In [189]:
def remove_html_and_urls(df:pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    # updated_reviews = []
    # updated_df = df.copy()
    # text_reviews = df[col_name].values

    # for text_reviews_idx in range(len(text_reviews)):
    #     text_review = text_reviews[text_reviews_idx]

    #     if type(text_review) != str:
    #         updated_reviews.append(text_review)
    #     else:
    #         update_text_review = url_pattern.sub(r'', text_review)
    #         updated_reviews.append(update_text_review)

    # updated_df[col_name] = updated_reviews
    # return updated_df

    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def clean_text(text):
        if isinstance(text, str):
            return url_pattern.sub('', text)
        return text

    df[col_name] = df[col_name].apply(clean_text)
    return df

In [190]:
reviews_no_html_urls_rb_df = remove_html_and_urls(reviews_df, 'review_body_lower_cased')

In [191]:
reviews_no_html_urls_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,this is the second piece of furniture i've bou...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,"bought this for my daughter, who is an artist ..."
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love theses pens they write so smoothly with n...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,"i never knew i needed this, but this comes in ..."
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product. would recommend. only reason ...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,not what i expected.i bought this for my 9 yea...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,"hey folks, this ink is good factory ink... b..."
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,"first, the 4 ink system is a pain. it tends to..."
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,"i bought the hd2 handset on dec. 5, 2013. my w..."


## Remove Non-alphabetical characters
- [ ] If entry has no letters, leave blank?

In [192]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """
    


    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_review)

        # if type(text_review) != str:
        #     updated_reviews.append(text_review)
        # else:
        update_text_review = re.sub(r'[^a-zA-Z\s]', '', text_review)
        # print(text_review)
        updated_reviews.append(update_text_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [193]:
reviews_alph_chars_rb_df = remove_non_alphabetical_characters(reviews_no_html_urls_rb_df, 'review_body_lower_cased')

In [194]:
reviews_alph_chars_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,this is the second piece of furniture ive boug...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,bought this for my daughter who is an artist w...
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love theses pens they write so smoothly with n...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,i never knew i needed this but this comes in s...
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product would recommend only reason fo...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,not what i expectedi bought this for my year ...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,hey folks this ink is good factory ink but w...
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,first the ink system is a pain it tends to us...
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,i bought the hd handset on dec my wife was i...


## Remove extra spaces
- [ ] Verify with a specific entry

In [195]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_review)

        # if type(text_review) != str:
        #     updated_reviews.append(text_review)
        # else:
        update_text_review = re.sub(r' +', ' ', text_review)
        # print(text_review)
        updated_reviews.append(update_text_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [196]:
reviews_no_extra_space_rb_df = remove_extra_spaces(reviews_alph_chars_rb_df, 'review_body_lower_cased')

In [197]:
reviews_no_extra_space_rb_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,this is the second piece of furniture ive boug...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,bought this for my daughter who is an artist w...
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love theses pens they write so smoothly with n...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,i never knew i needed this but this comes in s...
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product would recommend only reason for ...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,not what i expectedi bought this for my year o...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,hey folks this ink is good factory ink but way...
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,first the ink system is a pain it tends to use...
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,i bought the hd handset on dec my wife was in ...


## Remove Contractions
- [ ] Need to update; make my own
- [ ] Beware that removing alpha characters may remove apostrophes, thus need to improve `contraction_mapping`.

In [198]:
contraction_mapping = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "don't": "do not",
    "didn't": "did not",
    "didnt": "did not"
}

In [199]:
def expand_contractions(input_text):
    # Function to replace contractions with their expanded forms
    def replace(match):
        # print("match:", match)
        return contraction_mapping[match.group(0)]

    # Use regular expression to find contractions and replace them
    contraction_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
    expanded_text = contraction_pattern.sub(replace, input_text)

    return expanded_text

In [200]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_review)

        # if type(text_review) != str:
        #     updated_reviews.append(text_review)
        # else:
        without_contraction = expand_contractions(text_review)
        # print(text_review)
        updated_reviews.append(without_contraction)

    updated_df[col_name] = updated_reviews
    return updated_df

In [201]:
no_contractions_df = remove_contractions(reviews_no_extra_space_rb_df, 'review_body_lower_cased')

In [202]:
no_contractions_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,this is the second piece of furniture ive boug...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,bought this for my daughter who is an artist w...
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love theses pens they write so smoothly with n...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,i never knew i needed this but this comes in s...
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product would recommend only reason for ...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,not what i expectedi bought this for my year o...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,hey folks this ink is good factory ink but way...
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,first the ink system is a pain it tends to use...
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,i bought the hd handset on dec my wife was in ...


# Pre-processing

## remove the stop words 

In [203]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [204]:
def remove_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        text_review_words = word_tokenize(text_review) 

        filtered_review = []

        for text_review_words_idx in range(len(text_review_words)):
            text_review_word = text_review_words[text_review_words_idx]

            if text_review_word not in stop_words:
                filtered_review.append(text_review_word)
            else:
                print(text_review_word)

        filtered_review = " ".join(filtered_review)
        updated_reviews.append(filtered_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [205]:
no_stop_words_df = remove_stop_words(no_contractions_df, 'review_body_lower_cased')

this
is
the
of
from
where
i
the
with
my
than
a
with
is
to
be
or
the
is
very
that
i
the
am
just
very
with
it
this
for
my
who
is
an
with
the
is
and
up
she
this
and
that
she
can
it
a
of
her
very
with
it
they
so
with
no
for
and
just
i
i
this
but
this
in
so
when
the
are
doing
at
the
and
they
a
the
it
a
more
the
you
with
the
i
have
to
this
of
a
more
all
you
have
to
do
is
it
into
a
and
than
up
and
to
a
with
the
of
you
can
it
as
a
it
has
our
of
at
our
only
for
is
it
with
a
but
it
did
not
and
with
to
a
i
the
and
at
the
same
be
while
both
the
same
the
out
to
be
more
and
the
a
the
is
and
more
and
the
are
while
the
same
the
as
the
this
is
the
you
very
to
in
my
very
very
and
very
i
this
me
from
having
to
a
i
these
to
them
they
are
so
and
very
and
very
for
the
this
up
to
by
the
of
the
is
the
same
and
the
only
i
have
into
and
not
just
this
is
that
to
not
be
as
at
when
i
to
this
to
my
as
i
have
it
with
my
i
this
for
who
is
with
a
and
have
the
who
to
from
the
of
most
have
and
the
will
through
that
to
b

these
were
a
i
and
you
can
through
the
to
a
and
then
have
it
off
as
as
it
these
are
we
have
had
this
for
than
and
it
has
in
it
the
is
all
the
at
my
this
at
and
they
are
all
the
same
this
i
in
the
not
in
in
the
when
i
the
of
the
with
the
in
the
its
so
it
as
very
with
the
for
more
that
i
was
of
the
but
i
only
in
very
i
should
have
just
to
down
the
are
not
and
as
they
on
on
the
other
at
i
it
its
in
and
the
is
and
and
the
itself
is
and
the
for
the
is
that
the
off
i
am
with
a
with
of
on
it
i
have
and
a
if
what
i
very
to
a
at
not
a
my
in
a
had
and
in
the
for
the
because
of
the
the
to
if
i
the
in
i
should
be
this
will
not
to
the
you
to
off
to
the
as
you
should
to
itself
my
for
this
is
to
very
the
is
very
to
you
to
and
at
it
if
you
what
i
once
you
the
you
to
off
the
other
to
it
all
my
did
the
but
is
was
a
i
to
be
about
the
and
it
on
what
you
to
with
the
if
your
and
to
the
will
on
what
you
to
i
with
the
of
them
as
for
my
its
an
of
it
and
to
the
is
a
with
the
the
to
i
did
not
them
on
the
and
the

In [206]:
no_stop_words_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,second piece furniture ive bought amazon accid...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,bought daughter artist mad marker skills paper...
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love theses pens write smoothly skipping perfe...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,never knew needed comes handy kids homework co...
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product would recommend reason stars com...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,expectedi bought year old daughter accompany d...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,hey folks ink good factory ink way overpriced ...
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,first ink system pain tends use photo black ev...
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,bought hd handset dec wife middle long distanc...


## perform lemmatization  

- "I was jogging with Aman for 3 miles"
- "I was jog with Aman for 3 miles"

In [207]:
from nltk.stem import WordNetLemmatizer

def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    updated_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    
    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print("Review: ", text_review)
        
        text_review_words = word_tokenize(text_review) 

        lemmed_sentence = []

        for text_review_words_idx in range(len(text_review_words)): # get each word in the string review
            text_review_word = text_review_words[text_review_words_idx]
            lemmed_word = lem.lemmatize(text_review_word)
            print(text_review_word, lemmed_word)
            lemmed_sentence.append(lemmed_word)
            filtered_review = " ".join(lemmed_sentence)
    
        # print("New sentence", filtered_review)
        # print()

        updated_reviews.append(filtered_review)

    updated_df[col_name] = updated_reviews
    return updated_df

In [208]:
lemmed_df = lemmentize_review(no_stop_words_df, 'review_body_lower_cased')

second second
piece piece
furniture furniture
ive ive
bought bought
amazon amazon
accidentally accidentally
bent bent
metal metal
parts part
hands hand
either either
im im
way way
stronger stronger
middleaged middleaged
woman woman
arthritis arthritis
expected expected
metal metal
weak weak
said said
actually actually
really really
like like
desk desk
careful careful
bought bought
daughter daughter
artist artist
mad mad
marker marker
skills skill
paper paper
nice nice
thick thick
holds hold
well well
loves love
book book
loves love
use use
like like
portfolio portfolio
best best
marker marker
work work
happy happy
love love
theses thesis
pens pen
write write
smoothly smoothly
skipping skipping
perfect perfect
zentangle zentangle
writing writing
never never
knew knew
needed needed
comes come
handy handy
kids kid
homework homework
computer computer
need need
pencil pencil
sharpener sharpener
works work
great great
doesnt doesnt
make make
long long
pointy pointy
ends end
like like
regular

In [209]:
lemmed_df

Unnamed: 0,star_rating,review_headline,review_body,positive_sentiment,negative_sentiment,review_body_lower_cased
85285,4,weak metal but otherwise good,This is the second piece of furniture I've bou...,1.0,,second piece furniture ive bought amazon accid...
1774320,5,perfect for serious marker artist,"Bought this for my daughter, who is an artist ...",1.0,,bought daughter artist mad marker skill paper ...
148474,5,Love these.,love theses pens they write so smoothly with n...,1.0,,love thesis pen write smoothly skipping perfec...
1501212,5,So Handy,"I never knew I needed this, but this comes in ...",1.0,,never knew needed come handy kid homework comp...
1671808,4,Good for price,Great product. Would recommend. Only reason ...,1.0,,great product would recommend reason star come...
...,...,...,...,...,...,...
1105283,1,Not what I expected. i bought this for my ...,Not what I expected.i bought this for my 9 yea...,,0.0,expectedi bought year old daughter accompany d...
216553,2,"Overpriced, exact same, over the counter, 5 bu...","Hey folks, this ink is good factory ink... b...",,0.0,hey folk ink good factory ink way overpriced t...
2121968,1,Printer Failed after 1 year,"First, the 4 ink system is a pain. It tends to...",,0.0,first ink system pain tends use photo black ev...
1583051,1,Fails after 30 days,"I bought the HD2 handset on Dec. 5, 2013. My w...",,0.0,bought hd handset dec wife middle long distanc...


In [210]:
training_size = int(len(lemmed_df) * .80)
training_size

3200

In [211]:
testing_size = int(len(lemmed_df) - training_size)
testing_size

800

In [212]:
training_size + testing_size


4000

In [213]:
X_train_series = lemmed_df['review_body_lower_cased'][:training_size]
len(X_train_series), X_train_series

(3200,
 85285      second piece furniture ive bought amazon accid...
 1774320    bought daughter artist mad marker skill paper ...
 148474     love thesis pen write smoothly skipping perfec...
 1501212    never knew needed come handy kid homework comp...
 1671808    great product would recommend reason star come...
                                  ...                        
 449878     worst printer ever owned first quality issue f...
 495261     easy install new imac wirlessly color quality ...
 2272495    toner already lasted month heavy printing mayb...
 2607051    recently bought hp scanjet c work o x mac old ...
 351438             wrong picture web sitebr satisfie produit
 Name: review_body_lower_cased, Length: 3200, dtype: object)

In [214]:
X_test_series = lemmed_df['review_body_lower_cased'][training_size:]
len(X_test_series), X_test_series

(800,
 428380     purchased printer gift significant december ha...
 223896     opened black ink cartridge printer detect eith...
 203730     ran one full day reboots waited two business d...
 301045         damaged arrived looked fragile decided return
 819098     advertised flimsy cheaply made kept accidental...
                                  ...                        
 1105283    expectedi bought year old daughter accompany d...
 216553     hey folk ink good factory ink way overpriced t...
 2121968    first ink system pain tends use photo black ev...
 1583051    bought hd handset dec wife middle long distanc...
 266584       corner wear becomes deadly weapon stab u pocket
 Name: review_body_lower_cased, Length: 800, dtype: object)

# TF-IDF Feature Extraction

In [215]:
def compute_tf_idf_for_feature_extraction(df:pd.DataFrame):
    """Extract the TF-IDF features from the reviews.

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    vectorized_reviews = []
    feature_vector = []
    updated_reviews = []
    text_reviews = df.values

    vectorizer = TfidfVectorizer(input='content')
    

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print("Review: ", text_review)
        updated_reviews.append(text_review)
        # text_review_list = text_review.split(' ')
        # print("Review2: ", text_review_list)
        
    updated_reviews = ", ".join(updated_reviews)
    # print(type(updated_reviews))
    text_review_list = updated_reviews.split(' ')
    # print(text_review_list)
    X = vectorizer.fit_transform(text_review_list)
    
    # vectorized_reviews = vectorizer.get_feature_names_out()
    # vectorized_reviews.append(vectorized_review)
    # feature_vector.append(X)

    return X

In [216]:
X_train = compute_tf_idf_for_feature_extraction(X_train_series)


In [217]:
X_train.get_shape()

(89966, 9442)

In [218]:
X_test = compute_tf_idf_for_feature_extraction(X_test_series)


In [219]:
X_test

<29995x4987 sparse matrix of type '<class 'numpy.float64'>'
	with 29915 stored elements in Compressed Sparse Row format>

In [220]:
y_train = lemmed_df['sentiment'][:training_size]
len(y_train), y_train

KeyError: 'sentiment'

In [221]:
from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
clf.fit(X_train, y_train)
# pred = clf.predict(X_test)

ValueError: Found input variables with inconsistent numbers of samples: [89966, 3200]

In [56]:
positive_negative_sentiment_split = int(len(lemmed_df)/2)
positive_negative_sentiment_splita

2000

In [None]:
pos_sentiment_features = feature_vector[:positive_negative_sentiment_split]
len(pos_sentiment_features), pos_sentiment_features

In [108]:
pos_sentiment_df = lemmed_df[:positive_negative_sentiment_split]

In [110]:
# pos_sentiment_df

In [None]:
neg_sentiment_df = lemmed_df[positive_negative_sentiment_split:]
neg_sentiment_df

In [None]:
neg_sentiment_features = feature_vector[positive_negative_sentiment_split:]
len(neg_sentiment_features), neg_sentiment_features

In [None]:
pos_sentiment = lemmed_df['positive_sentiment'].dropna()
pos_sentiment

neg_sentiment = lemmed_df['negative_sentiment'].dropna()
neg_sentiment

lemmed_df['sentiment'] = pd.concat([pos_sentiment, neg_sentiment])

lemmed_df['sentiment'].unique()

array([1., 0.])

In [100]:
X_train.get_shape()

AttributeError: 'list' object has no attribute 'get_shape'

In [97]:
y_train = lemmed_df['sentiment'][:training_size]
len(y_train), y_train

(3200,
 1180441    1.0
 1465059    1.0
 1920768    1.0
 399708     1.0
 386037     1.0
           ... 
 1674894    0.0
 1666326    0.0
 2306817    0.0
 1359159    0.0
 1515367    0.0
 Name: sentiment, Length: 3200, dtype: float64)

ValueError: Found input variables with inconsistent numbers of samples: [89608, 3200]

# Perceptron

# SVM

# Logistic Regression

# Naive Bayes