# HW2- Binary Classification 
- Detravious Jamari Brinkley
- CSCI-544: Applied Natural Language Processing
- python version: 3.11.4

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

import sklearn
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [2]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

## Keep Reviews and Ratings

In [3]:
reviews_ratings_df = amazon_reviews_copy_df.loc[0:, ['star_rating', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [4]:
def generate_sample_reviews(df: pd.DataFrame, review_col_name: str, number_of_reviews: int = 3):
    """Include reviews and ratings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    review_col_name: `str`
        The specific_column to get the reviews and ratings of
    
    number_of_reviews: `int`
        Number of samples to include


    Return
    ------
    Nothing; instead, print the reviews with ratings
    """


    columns_to_include = [review_col_name, 'star_rating']

    # Initialize an empty list to store dictionaries
    list_of_dicts = []

    # Iterate over the specified columns and retrieve the first three rows
    for row in df[columns_to_include].head(3).to_dict(orient='records'):
        list_of_dicts.append({'star_rating': row['star_rating'], review_col_name: row[review_col_name]})

    for dictionary in list_of_dicts:
        print(dictionary)

 ## Select 100000 reviews randomly from positive and negative classes


In [5]:
def update_data_type(df: pd.DataFrame, col_name: str):
    """Update the data type of the star ratings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with rating values

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the new sentiment appened

    """

    valid_ratings = ['1','2','3','4','5']
    star_rating_series = df[col_name].copy()

    # Convert type to strings
    star_rating_series.astype('str')

    # Check valid list and see which of our stars match
    rows = star_rating_series.index
    is_rating_in_valid_ratings = rows[star_rating_series.isin(valid_ratings)]

    # Convert to list
    is_rating_in_valid_ratings = is_rating_in_valid_ratings.to_list()

    updated_df = df.iloc[is_rating_in_valid_ratings]
    updated_df[col_name] = updated_df[col_name].astype(int)
    return updated_df

In [6]:
updated_reviews_ratings_df = update_data_type(reviews_ratings_df, 'star_rating')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_df[col_name] = updated_df[col_name].astype(int)


In [7]:
updated_reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [8]:
updated_reviews_ratings_df = updated_reviews_ratings_df.dropna()
updated_reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [9]:
# Check for NaN values
nan_check = updated_reviews_ratings_df.isna()

# Display the DataFrame with True where NaN values exist
print(nan_check)

# Check if any NaN value exists in the DataFrame
if nan_check.any().any():
    print("There are NaN values in the DataFrame.")
else:
    print("There are no NaN values in the DataFrame.")

         star_rating  review_body
0              False        False
1              False        False
2              False        False
3              False        False
4              False        False
...              ...          ...
2640249        False        False
2640250        False        False
2640251        False        False
2640252        False        False
2640253        False        False

[2640080 rows x 2 columns]
There are no NaN values in the DataFrame.


In [10]:
print("# reviews per rating", updated_reviews_ratings_df['star_rating'].value_counts())

# reviews per rating star_rating
5    1582704
4     418348
1     306967
3     193680
2     138381
Name: count, dtype: int64


In [11]:
def sample_star_ratings(df: pd.DataFrame, col_name: str, star_value: int, number_of_reviews: int):
    """Build a subset balanced dataset with reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The dataframe to use
    col_name: `str`
        The name of the column to get reviews from
    star_value: `int`
        The star rating of the review
    number_of_reviews: `int`
        The number of sub reviews to include in sample

    Return
    ------
    rating_df, sampled_rating_df: `tuple`
        All reviews with that rating and the subset reviews with that rating
    """
    
    rating_df = df[df[col_name] == star_value]
    sampled_rating_df = rating_df.sample(n=number_of_reviews)
    return rating_df, sampled_rating_df

In [12]:
subset_reviews = 50000

one_star = 1
rating_one, rating_one_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', one_star, subset_reviews)
two_stars = 2
rating_two, rating_two_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', two_stars, subset_reviews)
three_stars = 3
rating_three, rating_three_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', three_stars, subset_reviews)
four_stars = 4
rating_four, rating_four_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', four_stars, subset_reviews)
five_stars = 5
rating_five, rating_five_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', five_stars, subset_reviews)

In [13]:
sampled_reviews_df = pd.concat([rating_one_sampled, rating_two_sampled, rating_three_sampled, rating_four_sampled, rating_five_sampled])

In [14]:
sampled_reviews_df

Unnamed: 0,star_rating,review_body
2130484,1,This item is extremely cheap...and it shows.<b...
2303466,1,"Phone was not consistent, some callers could n..."
568600,1,Delivery was correct. The product didn't work....
1165512,1,I should have looked at the product descriptio...
790124,1,"I simply could not get it to work, and gave up..."
...,...,...
462789,5,This is a lot better than the various scientif...
468539,5,Nice compact printing calculator.<br />It is s...
1380882,5,Had a little problem with one of the cartridge...
1318890,5,"Great color, quality prints. Would prefer to ..."


In [15]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):
    """Categorizes reviews by adding a rating

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    rating_col: `str`
        Column with rating values
    
    threshold: `int`
        Where to split the ratings such that categories can be formed

    sentiment_type: `str`
        One of three types of sentiment: positive, negative, or neural

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the new sentiment appened
    """


    if sentiment_type == 'negative_review_class':
        positive_review_threshold = df[rating_col].astype('int32') > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'neutral_review_class':
        positive_review_threshold = df[rating_col].astype('int32') == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 2

    elif sentiment_type == 'positive_review_class':
        positive_review_threshold = df[rating_col].astype('int32') < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [16]:
negative_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'negative_review_class')
neutral_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'neutral_review_class')
positive_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'positive_review_class')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


In [17]:
sampled_reviews_ratings_df = pd.concat([negative_review_class_df, neutral_review_class_df, positive_review_class_df])
sampled_reviews_ratings_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,
2082312,4,I purchased this to use for my study abroad tr...,1.0,,
...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0


In [18]:
negative_reviews_df = sampled_reviews_ratings_df['negative_review_class'].dropna()
neutral_reviews_df = sampled_reviews_ratings_df['neutral_review_class'].dropna()
positive_reviews_df = sampled_reviews_ratings_df['positive_review_class'].dropna()

In [19]:
sampled_reviews_ratings_df['review_class'] = pd.concat([negative_reviews_df, neutral_reviews_df, positive_reviews_df])

In [20]:
sampled_reviews_ratings_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0
...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0


# Ignore below

# Data Cleaning

## Lower case

In [21]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        # NOT all reviews are strings, thus all can't be converted to lower cased
        if type(text_review) != str:
            print(True, text_review)
            converted_str = str(text_review)
            lower_case_reviews.append(text_review)
         
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)

    updated_df['lower_cased'] = lower_case_reviews
    return updated_df

In [22]:
reviews_lower_cased = convert_reviews_to_lower_case(sampled_reviews_ratings_df, 'review_body')

In [23]:
reviews_lower_cased

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...
...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...


In [24]:
print("reviews_lower_cased:")
generate_sample_reviews(reviews_lower_cased, 'lower_cased', 3)

reviews_lower_cased:
{'star_rating': 4, 'lower_cased': "i purchased this to lift my 3rd monitor off my desk. the only flaw it has is that it tends to slip down when you adjust the monitor left-to-right. if you're purchasing this to swivel a lot, you may want to consider something else."}
{'star_rating': 4, 'lower_cased': "i was afraid this hanger wouldn't fit over the door that i wanted to close, but it did.  it is a very solid product.  the only downside is that since it is metal, if you close the door with nothing to weigh it down, it makes a lot of jangling noises.  but a plastic hanger would likely be too flimsy, so i don't know how they could get away from that problem."}
{'star_rating': 4, 'lower_cased': "i like my go bible.  i really enjoy listening to scourby read the king james.  the only thing i wish (and maybe i just haven't figured out how yet) is to be able to mark the place i left off, so i can start there the next time without having to remember where i was or search to 

## Remove HTML and URLs

In [25]:
def remove_html_and_urls(df: pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    cleaned_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
            # Check and remove HTML tags
            has_html = bool(re.search('<.*?>', text_review))
            if has_html == True:
                # print("Review", text_reviews_idx, "has HTML -- ", text_review)
                pass

            no_html_review = re.sub('<.*?>', ' ', text_review)
            # print("Review", text_reviews_idx, "without HTML -- ", no_html_review)
        
            # Check and remove URLs
            has_url = bool(re.search(r'http\S+', no_html_review))
            if has_url == True:
                # print("Review", text_reviews_idx, "has URL --", no_html_review)
                pass

            no_html_url_review = re.sub(r'http\S+', '', no_html_review)
            # print("Review", text_reviews_idx, "without HTML, URL -- ", no_html_url_review)
            # print()
            cleaned_reviews.append(no_html_url_review)
        else:
            # print(text_reviews_idx, text_review)
            cleaned_reviews.append(text_review)
            

    updated_df['without_html_urls'] = cleaned_reviews
    return updated_df

In [26]:
no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')

In [27]:
no_html_urls_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased,without_html_urls
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger wouldn't fit over the...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening ...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i didn...
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...
...,...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one,it doesn't have a concordance. need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...,doesn't have very good suction and it'll eat u...


In [28]:
print("without_html_urls:")
generate_sample_reviews(no_html_urls_df, 'without_html_urls', 3)

without_html_urls:
{'star_rating': 4, 'without_html_urls': "i purchased this to lift my 3rd monitor off my desk. the only flaw it has is that it tends to slip down when you adjust the monitor left-to-right. if you're purchasing this to swivel a lot, you may want to consider something else."}
{'star_rating': 4, 'without_html_urls': "i was afraid this hanger wouldn't fit over the door that i wanted to close, but it did.  it is a very solid product.  the only downside is that since it is metal, if you close the door with nothing to weigh it down, it makes a lot of jangling noises.  but a plastic hanger would likely be too flimsy, so i don't know how they could get away from that problem."}
{'star_rating': 4, 'without_html_urls': "i like my go bible.  i really enjoy listening to scourby read the king james.  the only thing i wish (and maybe i just haven't figured out how yet) is to be able to mark the place i left off, so i can start there the next time without having to remember where i w

## Remove Contractions

In [29]:
store_contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he's": "he is",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "mustn't": "must not",
    "shan't": "shall not",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they're": "they are",
    "wasn't": "was not",
    "we're": "we are",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    "you're": "you are",
    "you'll": "you will",
    "you'd": "you would",
    "we'll": "we will",
    "we've": "we have",
    "we'd": "we would",
    "I'm": "I am",
    "i've": "I have",
    "I've": "I have",
    "I'd": "I would",
    "it'll": "it will",
    "they'll": "they will",
    "they've": "they have",
    "they'd": "they would",
    "he'll": "he will",
    "he'd": "he would",
    "she'll": "she will",
    "we'd": "we would",
    "we'll": "we will",
    "you've": "you have",
    "you'd": "you would",
    "you'll": "you will",
    "I'll": "I will",
    "I'd": "I would",
    "it's": "it is",
    "it'd": "it would",
    "i'm": "I am",
    "he's": "he is",
    "he'll": "he will",
    "she's": "she is",
    "she'll": "she will",
    "we're": "we are",
    "we've": "we have",
    "we'll": "we will",
    "you're": "you are",
    "you've": "you have",
    "you'll": "you will",
    "they're": "they are",
    "they've": "they have",
    "they'll": "they will",
    "that's": "that is",
    "that'll": "that will",
    "that'd": "that would",
    "who's": "who is",
    "who'll": "who will",
    "who'd": "who would",
    "what's": "what is",
    "what'll": "what will",
    "what'd": "what would",
    "when's": "when is",
    "when'll": "when will",
    "when'd": "when would",
    "where's": "where is",
    "where'll": "where will",
    "where'd": "where would",
    "why's": "why is",
    "why'll": "why will",
    "why'd": "why would",
    "how's": "how is",
    "how'll": "how will",
    "how'd": "how would"
}


In [30]:
def locate_and_replace_contractions(review):
    """Find the contractions to replace from a specific review

    Parameters
    ----------
    review: `str`
        A specific review

    Return
    ------
    non_contraction_review: `str`
        The updated specific review with contractions expanded
    
    """
    if isinstance(review, str):
        get_words = review.split()

        store_non_contraction_words = []

        for word in get_words:
            if word in store_contractions:
                non_contraction_form = store_contractions[word]
                # print(word, "-->", non_contraction_form)

                store_non_contraction_words.append(non_contraction_form)

            else:
                # print(word)
                store_non_contraction_words.append(word)

        non_contraction_review = ' '.join(store_non_contraction_words)
        return non_contraction_review
    else:
        return review


In [31]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_contractions_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # print("Review", text_reviews_idx, "with possible contraction(s) -- ", text_review)

        without_contraction = locate_and_replace_contractions(text_review)

        # print("Review", text_reviews_idx, "without contraction -- ", without_contraction)
        # print()

        without_contractions_reviews.append(without_contraction)

    updated_df['without_contractions'] = without_contractions_reviews
    return updated_df

In [32]:
no_contractions_df = remove_contractions(no_html_urls_df, 'without_html_urls')

In [33]:
no_contractions_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased,without_html_urls,without_contractions
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger would not fit over th...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening t...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i did ...
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...
...,...,...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one,it doesn't have a concordance. need one,it does not have a concordance. need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...,doesn't have very good suction and it'll eat u...,does not have very good suction and it will ea...


In [34]:
print("without_contractions:")
generate_sample_reviews(no_contractions_df, 'without_contractions', 3)

without_contractions:
{'star_rating': 4, 'without_contractions': 'i purchased this to lift my 3rd monitor off my desk. the only flaw it has is that it tends to slip down when you adjust the monitor left-to-right. if you are purchasing this to swivel a lot, you may want to consider something else.'}
{'star_rating': 4, 'without_contractions': 'i was afraid this hanger would not fit over the door that i wanted to close, but it did. it is a very solid product. the only downside is that since it is metal, if you close the door with nothing to weigh it down, it makes a lot of jangling noises. but a plastic hanger would likely be too flimsy, so i do not know how they could get away from that problem.'}
{'star_rating': 4, 'without_contractions': 'i like my go bible. i really enjoy listening to scourby read the king james. the only thing i wish (and maybe i just have not figured out how yet) is to be able to mark the place i left off, so i can start there the next time without having to remembe

## Remove Non-alphabetical characters

In [35]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """

    alphabetical_char_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        
        if isinstance(text_review, str):

            # Check for non-alphabetical characters
            has_non_alphabetical_char = bool(re.search(r'[^a-zA-Z]', text_review))
            if has_non_alphabetical_char == True:
                # print("Review", text_reviews_idx, "has HTML -- ", text_review)
                pass
            
            # Remove non-alphabetical characters
            with_alphabetical_char = re.sub(r'[^a-zA-Z\s]', ' ', text_review)
            # print("Review", text_reviews_idx, "has HTML -- ", with_alphabetical_char)
            alphabetical_char_reviews.append(with_alphabetical_char)
        else:
            alphabetical_char_reviews.append(text_review)

    updated_df['with_alpha_chars_only'] = alphabetical_char_reviews
    return updated_df

In [36]:
only_alpha_chars_df = remove_non_alphabetical_characters(no_contractions_df, 'without_contractions')

In [37]:
only_alpha_chars_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased,without_html_urls,without_contractions,with_alpha_chars_only
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my rd monitor off my...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening t...,i like my go bible i really enjoy listening t...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i did ...,this item is marked small but i did ...
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...
...,...,...,...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one,it doesn't have a concordance. need one,it does not have a concordance. need one,it does not have a concordance need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster at ...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...,doesn't have very good suction and it'll eat u...,does not have very good suction and it will ea...,does not have very good suction and it will ea...


In [38]:
print("with_alpha_chars_only:")
generate_sample_reviews(only_alpha_chars_df, 'with_alpha_chars_only', 3)

with_alpha_chars_only:
{'star_rating': 4, 'with_alpha_chars_only': 'i purchased this to lift my  rd monitor off my desk  the only flaw it has is that it tends to slip down when you adjust the monitor left to right  if you are purchasing this to swivel a lot  you may want to consider something else '}
{'star_rating': 4, 'with_alpha_chars_only': 'i was afraid this hanger would not fit over the door that i wanted to close  but it did  it is a very solid product  the only downside is that since it is metal  if you close the door with nothing to weigh it down  it makes a lot of jangling noises  but a plastic hanger would likely be too flimsy  so i do not know how they could get away from that problem '}
{'star_rating': 4, 'with_alpha_chars_only': 'i like my go bible  i really enjoy listening to scourby read the king james  the only thing i wish  and maybe i just have not figured out how yet  is to be able to mark the place i left off  so i can start there the next time without having to rem

## Remove extra spaces

In [39]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    single_spaced_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
        # Check if there are any extra spaces
            has_extra_space = bool(re.search(r' +', text_review))
            if has_extra_space == True:
                # print("Review", text_reviews_idx, "has extra space -- ", text_review)
                pass
            
            # Remove extra spaces
            single_spaced_review = re.sub(r' +', ' ', text_review)
            # print("Review", text_reviews_idx, "without extra space -- ", single_spaced_review)
            # print()
            
            single_spaced_reviews.append(single_spaced_review)
        else:
            single_spaced_reviews.append(text_review)

    updated_df['without_extra_space'] = single_spaced_reviews
    return updated_df

In [40]:
no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'with_alpha_chars_only')

In [41]:
no_extra_space_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased,without_html_urls,without_contractions,with_alpha_chars_only,without_extra_space
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my rd monitor off my...,i purchased this to lift my rd monitor off my ...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening t...,i like my go bible i really enjoy listening t...,i like my go bible i really enjoy listening to...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i did ...,this item is marked small but i did ...,this item is marked small but i did not realiz...
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...
...,...,...,...,...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one,it doesn't have a concordance. need one,it does not have a concordance. need one,it does not have a concordance need one,it does not have a concordance need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster at ...,stuck in the drawer until i get a poster at th...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...,doesn't have very good suction and it'll eat u...,does not have very good suction and it will ea...,does not have very good suction and it will ea...,does not have very good suction and it will ea...


In [42]:
print("without_extra_space:")
generate_sample_reviews(no_extra_space_df, 'without_extra_space', 3)

without_extra_space:
{'star_rating': 4, 'without_extra_space': 'i purchased this to lift my rd monitor off my desk the only flaw it has is that it tends to slip down when you adjust the monitor left to right if you are purchasing this to swivel a lot you may want to consider something else '}
{'star_rating': 4, 'without_extra_space': 'i was afraid this hanger would not fit over the door that i wanted to close but it did it is a very solid product the only downside is that since it is metal if you close the door with nothing to weigh it down it makes a lot of jangling noises but a plastic hanger would likely be too flimsy so i do not know how they could get away from that problem '}
{'star_rating': 4, 'without_extra_space': 'i like my go bible i really enjoy listening to scourby read the king james the only thing i wish and maybe i just have not figured out how yet is to be able to mark the place i left off so i can start there the next time without having to remember where i was or sea

# Pre-processing

## remove the stop words 

In [43]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_stop_words_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
            text_review_words = word_tokenize(text_review) 

        

            # print("Before stop word removal", text_reviews_idx, " -- ", text_review)

            filtered_review = []

            for text_review_words_idx in range(len(text_review_words)):
                text_review_word = text_review_words[text_review_words_idx]
                
                # Check if review word is a stop word
                if text_review_word in stop_words:
                    # print("  Stop word -- ", text_review_word)
                    pass
                else:
                    # print(text_review_word, " -- is NOT a stop word in review")
                    filtered_review.append(text_review_word)

            
            filtered_review = " ".join(filtered_review)
            # print("After stop word removal", text_reviews_idx, " -- ", filtered_review)
            # print()
            
            without_stop_words_reviews.append(filtered_review)
        else:
            without_stop_words_reviews.append(text_review)
        

    updated_df['without_stop_words'] = without_stop_words_reviews
    return updated_df

In [44]:
no_stop_words_df = filter_stop_words(no_extra_space_df, 'without_extra_space')

In [45]:
no_stop_words_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased,without_html_urls,without_contractions,with_alpha_chars_only,without_extra_space,without_stop_words
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my rd monitor off my...,i purchased this to lift my rd monitor off my ...,purchased lift rd monitor desk flaw tends slip...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...,afraid hanger would fit door wanted close soli...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening t...,i like my go bible i really enjoy listening t...,i like my go bible i really enjoy listening to...,like go bible really enjoy listening scourby r...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i did ...,this item is marked small but i did ...,this item is marked small but i did not realiz...,item marked small realize size wallet
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,purchased use study abroad trip coming almost ...
...,...,...,...,...,...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one,it doesn't have a concordance. need one,it does not have a concordance. need one,it does not have a concordance need one,it does not have a concordance need one,concordance need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster at ...,stuck in the drawer until i get a poster at th...,stuck drawer get poster ways hang picture seem...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,hanging refrigerator fell magnet get fed produ...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...,doesn't have very good suction and it'll eat u...,does not have very good suction and it will ea...,does not have very good suction and it will ea...,does not have very good suction and it will ea...,good suction eat pencils


In [46]:
print("without_stop_words:")
generate_sample_reviews(no_stop_words_df, 'without_stop_words', 3)

without_stop_words:
{'star_rating': 4, 'without_stop_words': 'purchased lift rd monitor desk flaw tends slip adjust monitor left right purchasing swivel lot may want consider something else'}
{'star_rating': 4, 'without_stop_words': 'afraid hanger would fit door wanted close solid product downside since metal close door nothing weigh makes lot jangling noises plastic hanger would likely flimsy know could get away problem'}
{'star_rating': 4, 'without_stop_words': 'like go bible really enjoy listening scourby read king james thing wish maybe figured yet able mark place left start next time without remember search figure add bookmarks would give five stars'}


## perform lemmatization  

- "A sentence with many words"
    - "words" -> word

In [47]:
def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    lemmed_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]   
        if isinstance(text_review, str):     
            words_in_review = word_tokenize(text_review) 

            # print("Before lem update", text_reviews_idx, " -- ", text_review)
            # print("Lemmed words", words_in_review)
            

            lemmed_sentence = []

            # Split review into words
            for lemmed_words_idx in range(len(words_in_review)):
                word = words_in_review[lemmed_words_idx]
                
                apply_lemmatization = lem.lemmatize(word)
                # print(apply_lemmatization)
                
                lemmed_sentence.append(apply_lemmatization)
                filtered_review = " ".join(lemmed_sentence)
        
            # print("After lem update -- ", filtered_review)
            # print()

            lemmed_reviews.append(filtered_review)
        else:
            lemmed_reviews.append(text_review)

    updated_df['lemmed_reviews'] = lemmed_reviews
    return updated_df

In [48]:
lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')

In [49]:
lemmed_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,review_class,lower_cased,without_html_urls,without_contractions,with_alpha_chars_only,without_extra_space,without_stop_words,lemmed_reviews
2269004,4,I purchased this to lift my 3rd monitor off my...,1.0,,,1.0,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my 3rd monitor off my...,i purchased this to lift my rd monitor off my...,i purchased this to lift my rd monitor off my ...,purchased lift rd monitor desk flaw tends slip...,purchased lift rd monitor desk flaw tends slip...
1595911,4,I was afraid this hanger wouldn't fit over the...,1.0,,,1.0,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger wouldn't fit over the...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...,i was afraid this hanger would not fit over th...,afraid hanger would fit door wanted close soli...,afraid hanger would fit door wanted close soli...
329899,4,I like my Go Bible. I really enjoy listening ...,1.0,,,1.0,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening ...,i like my go bible. i really enjoy listening t...,i like my go bible i really enjoy listening t...,i like my go bible i really enjoy listening to...,like go bible really enjoy listening scourby r...,like go bible really enjoy listening scourby r...
624883,4,This item is marked &#34;small&#34; but I didn...,1.0,,,1.0,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i didn...,this item is marked &#34;small&#34; but i did ...,this item is marked small but i did ...,this item is marked small but i did not realiz...,item marked small realize size wallet,item marked small realize size wallet
2082312,4,I purchased this to use for my study abroad tr...,1.0,,,1.0,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,i purchased this to use for my study abroad tr...,purchased use study abroad trip coming almost ...,purchased use study abroad trip coming almost ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
170042,2,It doesn't have a concordance. Need one,,,3.0,3.0,it doesn't have a concordance. need one,it doesn't have a concordance. need one,it does not have a concordance. need one,it does not have a concordance need one,it does not have a concordance need one,concordance need one,concordance need one
720894,2,Stuck in the drawer until I get a poster. At 5...,,,3.0,3.0,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster. at 5...,stuck in the drawer until i get a poster at ...,stuck in the drawer until i get a poster at th...,stuck drawer get poster ways hang picture seem...,stuck drawer get poster way hang picture seem ...
1415347,2,This was hanging on my refrigerator and fell o...,,,3.0,3.0,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,this was hanging on my refrigerator and fell o...,hanging refrigerator fell magnet get fed produ...,hanging refrigerator fell magnet get fed produ...
1053551,2,Doesn't have very good suction and it'll eat u...,,,3.0,3.0,doesn't have very good suction and it'll eat u...,doesn't have very good suction and it'll eat u...,does not have very good suction and it will ea...,does not have very good suction and it will ea...,does not have very good suction and it will ea...,good suction eat pencils,good suction eat pencil


In [50]:
print("without_unlemmed_words:")
generate_sample_reviews(lemmed_df, 'lemmed_reviews', 3)

without_unlemmed_words:
{'star_rating': 4, 'lemmed_reviews': 'purchased lift rd monitor desk flaw tends slip adjust monitor left right purchasing swivel lot may want consider something else'}
{'star_rating': 4, 'lemmed_reviews': 'afraid hanger would fit door wanted close solid product downside since metal close door nothing weigh make lot jangling noise plastic hanger would likely flimsy know could get away problem'}
{'star_rating': 4, 'lemmed_reviews': 'like go bible really enjoy listening scourby read king james thing wish maybe figured yet able mark place left start next time without remember search figure add bookmark would give five star'}


# Extract Word Embeddings

In [109]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import gensim.downloader as api
pretrained_word_two_vec_model = api.load('word2vec-google-news-300')

2024-02-03 12:03:49,811 : INFO : loading projection weights from /Users/brinkley97/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2024-02-03 12:04:28,843 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/brinkley97/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-02-03T12:04:28.843581', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.6.3-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


In [75]:
# retrieve the vocabulary of a model
def get_model_vocabulary(get_model):
    words_in_model = []
    for index, word in enumerate(get_model.index_to_key):
        words_in_model.append(word)
        # if index == 7:
        #     break
        # print(f"word #{index}/{len(get_model.index_to_key)} is {word}")
    return words_in_model

In [79]:
pre_trained_words = get_model_vocabulary(pretrained_word_two_vec_model)
pre_trained_words[:3]

['</s>', 'in', 'for']

## My Model

In [89]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __init__(self, df: pd.DataFrame, col_name: str):
        self.df = df
        self.col_name = col_name

    def __iter__(self):
        """

        Parameters
        ----------
        df: `pd.DataFrame`
            The data
        
        col_name: `str`
            Column with reviews

        words_in_model: `list`
            Words in Word2Vec model

        Return
        ------
        
        """

        text_reviews = self.df[self.col_name].values

        for text_reviews_idx in range(len(text_reviews)):
            text_review = text_reviews[text_reviews_idx]
            # print(text_reviews_idx, "--", text_review)
    
    
            yield utils.simple_preprocess(text_review)

        

In [111]:
import gensim.models

sentences = MyCorpus(lemmed_df, 'lemmed_reviews')
print("\nSentences", sentences)
my_model = gensim.models.Word2Vec(sentences=sentences, vector_size=300, window=11, min_count=10)

2024-02-03 12:08:24,424 : INFO : collecting all words and their counts
2024-02-03 12:08:24,425 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



Sentences <__main__.MyCorpus object at 0x20df53590>


2024-02-03 12:08:24,871 : INFO : PROGRESS: at sentence #10000, processed 308863 words, keeping 14063 word types
2024-02-03 12:08:25,302 : INFO : PROGRESS: at sentence #20000, processed 619757 words, keeping 19391 word types
2024-02-03 12:08:25,740 : INFO : PROGRESS: at sentence #30000, processed 940990 words, keeping 23322 word types
2024-02-03 12:08:26,147 : INFO : PROGRESS: at sentence #40000, processed 1253427 words, keeping 26607 word types
2024-02-03 12:08:26,551 : INFO : PROGRESS: at sentence #50000, processed 1567861 words, keeping 29555 word types
2024-02-03 12:08:26,839 : INFO : PROGRESS: at sentence #60000, processed 1791134 words, keeping 31787 word types
2024-02-03 12:08:27,112 : INFO : PROGRESS: at sentence #70000, processed 2007511 words, keeping 33716 word types
2024-02-03 12:08:27,374 : INFO : PROGRESS: at sentence #80000, processed 2224876 words, keeping 35390 word types
2024-02-03 12:08:27,626 : INFO : PROGRESS: at sentence #90000, processed 2435456 words, keeping 371

### Similar scores

[ ] Write summary of differences between their model and my model

### PRetrained

In [122]:
# pretrained_word_two_vec_model['hello'].shape

result = pretrained_word_two_vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
result


[('queen', 0.7118193507194519)]

### Our model

In [121]:
# Must extract our word vectors from our model 
# my_model.wv['hello']


my_result = my_model.wv.most_similar(positive=['woman', 'king'], negative=['man', 'men'], topn=10)
my_result

[('listed', 0.4044250249862671),
 ('spec', 0.38941407203674316),
 ('published', 0.3860199749469757),
 ('specification', 0.3699833154678345),
 ('shrunk', 0.3680395185947418),
 ('misleading', 0.35684555768966675),
 ('listing', 0.3551398813724518),
 ('implies', 0.3504006564617157),
 ('tabloid', 0.3420540988445282),
 ('largest', 0.3413541913032532)]

In [105]:
my_model_words = get_model_vocabulary(my_model.wv)
# my_model_words[:3]

In [85]:
def extract_word_embeddings(sentence: str, words_in_model, get_model):
    """Extract word embeddings per sentence
    
    """
    sentence_vectorized = []
    updated_sentence = sentence.split(" ")
    print("Before -- ", sentence, "\n", updated_sentence)

    # Split review into words
    for word_idx in range(len(updated_sentence)):
        word = updated_sentence[word_idx]
        # print(word)

        if word in words_in_model:
            print(word)
            vector = get_model[word]
            print("vectorized word")
            sentence_vectorized.append(vector)
        # else:
            # print(False, word)
        
    return sentence_vectorized


In [86]:
def word_embeddings(df: pd.DataFrame, col_name: str, words_in_model: list, model_to_use):
    """Extract word embeddings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    words_in_model: `list`
        Words in Word2Vec model

    Return
    ------
    
    """
    
    sentence_vectorized = []
    # updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        extracted_for_sentence = extract_word_embeddings(text_review, words_in_model,model_to_use)        
        sentence_vectorized.append(extracted_for_sentence)
        
    return sentence_vectorized

In [106]:
word_embedding_features = word_embeddings(lemmed_df[:3], 'lemmed_reviews', pre_trained_words, pretrained_word_two_vec_model)
my_word_embedding_features = word_embeddings(lemmed_df[:3], 'lemmed_reviews', my_model_words, my_model.wv)

Before --  purchased lift rd monitor desk flaw tends slip adjust monitor left right purchasing swivel lot may want consider something else 
 ['purchased', 'lift', 'rd', 'monitor', 'desk', 'flaw', 'tends', 'slip', 'adjust', 'monitor', 'left', 'right', 'purchasing', 'swivel', 'lot', 'may', 'want', 'consider', 'something', 'else']
purchased
vectorized word
lift
vectorized word
rd
vectorized word
monitor
vectorized word
desk
vectorized word
flaw
vectorized word
tends
vectorized word
slip
vectorized word
adjust
vectorized word
monitor
vectorized word
left
vectorized word
right
vectorized word
purchasing
vectorized word
swivel
vectorized word
lot
vectorized word
may
vectorized word
want
vectorized word
consider
vectorized word
something
vectorized word
else
vectorized word
Before --  afraid hanger would fit door wanted close solid product downside since metal close door nothing weigh make lot jangling noise plastic hanger would likely flimsy know could get away problem 
 ['afraid', 'hanger',

In [None]:
# document to vector mapping
    # Filter out words that are not in vocabulary (2 for loop)
    # If document contains no words in the model vocabulary return -- no for loop
    # Avg word embbeding of the whole sentence -- no for loop

In [163]:
def extract_word_embeddings_2(get_model, row):
    
    # print('Before:', row)
    row = row.split()
    word_in_vocab = []

    ## COME BACK TO
    if len(row) == 0:
        pass


    for word in row:
        if word in get_model.key_to_index:
            word_in_vocab.append(word)
        
    #word_in_vocab = " ".join(word_in_vocab)
    # print("After: ", word_in_vocab)
    # print()

    vector_of_sentence = get_model[word_in_vocab]
    # print(type(vector_of_sentence))
    
    return np.mean(vector_of_sentence, axis=0)

### MAY  NEED TO RUN ''.join(the list you append to)
lemmed_df['pretrained_embeddings'] = lemmed_df['lemmed_reviews'][:3].apply(lambda row: extract_word_embeddings_2(pretrained_word_two_vec_model, row))
lemmed_df['my_model_embeddings'] = lemmed_df['lemmed_reviews'][:3].apply(lambda row: extract_word_embeddings_2(my_model.wv, row))
lemmed_df['pretrained_embeddings'].iloc[0].shape

(300,)

In [None]:
lemmed_df

In [129]:
my_word_embedding_features[0][0]

array([-2.03638864e+00,  5.31829059e-01,  1.69695723e+00,  6.77807331e-01,
        1.42202950e+00,  6.82132661e-01,  1.87303555e+00, -2.73913355e-03,
        1.25931478e+00, -6.24008596e-01,  7.06488311e-01, -6.46920145e-01,
       -5.11909127e-01, -2.69071251e-01, -1.77490568e+00,  1.63033471e-01,
        1.34481096e+00,  8.67999077e-01, -3.58683020e-01, -1.43647000e-01,
        1.19896972e+00,  1.20162141e+00,  1.89440501e+00,  1.14940119e+00,
        2.07386994e+00,  6.49928629e-01,  2.51353770e-01, -1.79788578e+00,
       -2.53321439e-01, -4.36661392e-01,  1.52669385e-01, -2.52107650e-01,
        1.27619195e+00, -4.64881867e-01,  1.80250514e+00, -4.53138292e-01,
        6.07064426e-01,  5.54222882e-01, -1.68691683e+00, -1.46844506e+00,
        3.06803167e-01,  1.96174502e+00, -8.07956886e-03, -7.76250124e-01,
       -4.21950519e-01,  3.76216531e-01,  7.61391580e-01,  2.72318912e+00,
        1.03680348e+00, -1.90841615e+00, -6.94572806e-01,  1.39220023e+00,
       -3.76010954e-01, -

## Split Features and Sentiment Labels

In [None]:
sentiments = lemmed_df['sentiment']
sentiments.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_features, sentiments, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Models + Evaluation Metrics

In [None]:
def eval_accuracy(y_true, y_prediction):
    return sklearn.metrics.accuracy_score(y_true, y_prediction)

def eval_precision(y_true, y_prediction):
    return sklearn.metrics.precision_score(y_true, y_prediction)

def eval_recall(y_true, y_prediction):
    return sklearn.metrics.recall_score(y_true, y_prediction)

def eval_f1_score(y_true, y_prediction):
    return sklearn.metrics.f1_score(y_true, y_prediction)

In [None]:
def train_eval_metric(y_train_true, y_train_predictions):
    accuracy = eval_accuracy(y_train_true, y_train_predictions)
    precision = eval_precision(y_train_true, y_train_predictions)
    recall = eval_recall(y_train_true, y_train_predictions)
    f1 = eval_f1_score(y_train_true, y_train_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

def test_eval_metric(y_test_true, y_test_predictions):
    accuracy = eval_accuracy(y_test_true, y_test_predictions)
    precision = eval_precision(y_test_true, y_test_predictions)
    recall = eval_recall(y_test_true, y_test_predictions)
    f1 = eval_f1_score(y_test_true, y_test_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    return metrics_dict

# Perceptron

In [None]:
def perceptron_model(X_train, X_test, y_train, y_test): 

    technique = Perceptron(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [None]:
perceptron_train_metrics, perceptron_test_metrics = perceptron_model(X_train, X_test, y_train, y_test)

In [None]:
perceptron_train_metrics, perceptron_test_metrics

# SVM

In [None]:
def svm_model(X_train, X_test, y_train, y_test): 

    technique = LinearSVC(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [None]:
svm_train_metrics, svm_test_metrics = svm_model(X_train, X_test, y_train, y_test)

In [None]:
svm_train_metrics, svm_test_metrics

# Logistic Regression

In [None]:
def logistic_regression_model(X_train, X_test, y_train, y_test): 

    technique = LogisticRegression(random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics

In [None]:
logistic_regression_train_metrics, logistic_regression_test_metrics = logistic_regression_model(X_train, X_test, y_train, y_test)

In [None]:
logistic_regression_train_metrics, logistic_regression_test_metrics

# Naive Bayes

In [None]:
def naive_bayes_model(X_train, X_test, y_train, y_test): 

    technique = MultinomialNB()
    technique.fit(X_train.toarray(), y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)

    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [None]:
naive_bayes_train_metrics, naive_bayes_test_metrics = naive_bayes_model(X_train, X_test, y_train, y_test)

In [None]:
naive_bayes_train_metrics, naive_bayes_test_metrics