# HW2- Binary Classification 
- Detravious Jamari Brinkley
- CSCI-544: Applied Natural Language Processing
- python version: 3.11.4

In [285]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

import sklearn
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Read Data

In [286]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

## Keep Reviews and Ratings

In [287]:
reviews_ratings_df = amazon_reviews_copy_df.loc[0:, ['star_rating', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [288]:
def generate_sample_reviews(df: pd.DataFrame, review_col_name: str, number_of_reviews: int = 3):
    """Include reviews and ratings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    review_col_name: `str`
        The specific_column to get the reviews and ratings of
    
    number_of_reviews: `int`
        Number of samples to include


    Return
    ------
    Nothing; instead, print the reviews with ratings
    """


    columns_to_include = [review_col_name, 'star_rating']

    # Initialize an empty list to store dictionaries
    list_of_dicts = []

    # Iterate over the specified columns and retrieve the first three rows
    for row in df[columns_to_include].head(3).to_dict(orient='records'):
        list_of_dicts.append({'star_rating': row['star_rating'], review_col_name: row[review_col_name]})

    for dictionary in list_of_dicts:
        print(dictionary)

 ## Select 100000 reviews randomly from positive and negative classes


In [289]:
def update_data_type(df: pd.DataFrame, col_name: str):
    """Update the data type of the star ratings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with rating values

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the new sentiment appened

    """

    valid_ratings = ['1','2','3','4','5']
    star_rating_series = df[col_name].copy()

    # Convert type to strings
    star_rating_series.astype('str')

    # Check valid list and see which of our stars match
    rows = star_rating_series.index
    is_rating_in_valid_ratings = rows[star_rating_series.isin(valid_ratings)]

    # Convert to list
    is_rating_in_valid_ratings = is_rating_in_valid_ratings.to_list()

    updated_df = df.iloc[is_rating_in_valid_ratings]
    updated_df[col_name] = updated_df[col_name].astype(int)
    return updated_df

In [290]:
updated_reviews_ratings_df = update_data_type(reviews_ratings_df, 'star_rating')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_df[col_name] = updated_df[col_name].astype(int)


In [291]:
updated_reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [292]:
updated_reviews_ratings_df = updated_reviews_ratings_df.dropna()
updated_reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [293]:
# Check for NaN values
nan_check = updated_reviews_ratings_df.isna()

# Display the DataFrame with True where NaN values exist
print(nan_check)

# Check if any NaN value exists in the DataFrame
if nan_check.any().any():
    print("There are NaN values in the DataFrame.")
else:
    print("There are no NaN values in the DataFrame.")

         star_rating  review_body
0              False        False
1              False        False
2              False        False
3              False        False
4              False        False
...              ...          ...
2640249        False        False
2640250        False        False
2640251        False        False
2640252        False        False
2640253        False        False

[2640080 rows x 2 columns]
There are no NaN values in the DataFrame.


In [294]:
print("# reviews per rating", updated_reviews_ratings_df['star_rating'].value_counts())

# reviews per rating star_rating
5    1582704
4     418348
1     306967
3     193680
2     138381
Name: count, dtype: int64


In [295]:
def sample_star_ratings(df: pd.DataFrame, col_name: str, star_value: int, number_of_reviews: int):
    """Build a subset balanced dataset with reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The dataframe to use
    col_name: `str`
        The name of the column to get reviews from
    star_value: `int`
        The star rating of the review
    number_of_reviews: `int`
        The number of sub reviews to include in sample

    Return
    ------
    rating_df, sampled_rating_df: `tuple`
        All reviews with that rating and the subset reviews with that rating
    """
    
    rating_df = df[df[col_name] == star_value]
    sampled_rating_df = rating_df.sample(n=number_of_reviews)
    return rating_df, sampled_rating_df

In [296]:
# subset_reviews = 50000
subset_reviews = 300

one_star = 1
rating_one, rating_one_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', one_star, subset_reviews)
two_stars = 2
rating_two, rating_two_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', two_stars, subset_reviews)
three_stars = 3
rating_three, rating_three_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', three_stars, subset_reviews)
four_stars = 4
rating_four, rating_four_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', four_stars, subset_reviews)
five_stars = 5
rating_five, rating_five_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', five_stars, subset_reviews)

In [297]:
sampled_reviews_df = pd.concat([rating_one_sampled, rating_two_sampled, rating_three_sampled, rating_four_sampled, rating_five_sampled])

In [298]:
sampled_reviews_df

Unnamed: 0,star_rating,review_body
163153,1,Used it on a cruise for sign and sail card. 3 ...
1286562,1,"Based on the many excellent reviews, BIG disap..."
483876,1,horrible
815215,1,falls apart straight away
301971,1,These lamps are inexpensive and don't last lon...
...,...,...
134472,5,I placed my credit cards in the holders and th...
2180594,5,Works perfectly every time. I love the paper g...
1381784,5,I though we needed new phones and then decided...
793804,5,good price fast delivery Thanks


In [299]:

def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):
    """Categorizes reviews by adding a rating

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    rating_col: `str`
        Column with rating values
    
    threshold: `int`
        Where to split the ratings such that categories can be formed

    sentiment_type: `str`
        One of three types of sentiment: positive, negative, or neural

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the new sentiment appened
    """


    if sentiment_type == 'negative_review_class':
        positive_review_threshold = df[rating_col].astype('int32') > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'neutral_review_class':
        positive_review_threshold = df[rating_col].astype('int32') == threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 2

    elif sentiment_type == 'positive_review_class':
        positive_review_threshold = df[rating_col].astype('int32') < threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [300]:
negative_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'negative_review_class')
neutral_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'neutral_review_class')
positive_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'positive_review_class')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3


In [301]:
sampled_reviews_ratings_df = pd.concat([negative_review_class_df, neutral_review_class_df, positive_review_class_df])
sampled_reviews_ratings_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class
2528018,4,Bought this phone for my 86 year old uncle. Th...,1.0,,
67050,4,I had used one of these at work and liked it e...,1.0,,
2309113,4,"The size is about the same as an iPhone, its v...",1.0,,
1081311,4,I have found this stand to be very study for m...,1.0,,
1413852,4,This was the perfect refill for my printer. I...,1.0,,
...,...,...,...,...,...
1433552,2,"Do I return my printer and get a next one, I l...",,,3.0
835203,2,Pretty chicken s*** that it does NOT include N...,,,3.0
2553531,2,Use army time.When you set the date use the da...,,,3.0
1015071,2,It gets atleast 2 because of the sound quality...,,,3.0


In [302]:
negative_reviews_df = sampled_reviews_ratings_df['negative_review_class'].dropna()
neutral_reviews_df = sampled_reviews_ratings_df['neutral_review_class'].dropna()
positive_reviews_df = sampled_reviews_ratings_df['positive_review_class'].dropna()

In [303]:
sampled_reviews_ratings_df['binary_review_class'] = pd.concat([negative_reviews_df, positive_reviews_df])
sampled_reviews_ratings_df['ternary_review_class'] = pd.concat([negative_reviews_df, neutral_reviews_df, positive_reviews_df])

In [304]:
sampled_reviews_ratings_df['binary_review_class'].unique()

array([ 1., nan,  3.])

# Ignore below

# Data Cleaning

## Lower case

In [305]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        # NOT all reviews are strings, thus all can't be converted to lower cased
        if type(text_review) != str:
            print(True, text_review)
            converted_str = str(text_review)
            lower_case_reviews.append(text_review)
         
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)

    updated_df['lower_cased'] = lower_case_reviews
    return updated_df

In [306]:
# reviews_lower_cased = convert_reviews_to_lower_case(sampled_reviews_ratings_df, 'review_body')

In [307]:
# reviews_lower_cased

In [308]:
# print("reviews_lower_cased:")
# generate_sample_reviews(reviews_lower_cased, 'lower_cased', 3)

## Remove HTML and URLs

In [309]:
def remove_html_and_urls(df: pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    cleaned_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
            # Check and remove HTML tags
            has_html = bool(re.search('<.*?>', text_review))
            if has_html == True:
                # print("Review", text_reviews_idx, "has HTML -- ", text_review)
                pass

            no_html_review = re.sub('<.*?>', ' ', text_review)
            # print("Review", text_reviews_idx, "without HTML -- ", no_html_review)
        
            # Check and remove URLs
            has_url = bool(re.search(r'http\S+', no_html_review))
            if has_url == True:
                # print("Review", text_reviews_idx, "has URL --", no_html_review)
                pass

            no_html_url_review = re.sub(r'http\S+', '', no_html_review)
            # print("Review", text_reviews_idx, "without HTML, URL -- ", no_html_url_review)
            # print()
            cleaned_reviews.append(no_html_url_review)
        else:
            # print(text_reviews_idx, text_review)
            cleaned_reviews.append(text_review)
            

    updated_df['without_html_urls'] = cleaned_reviews
    return updated_df

In [310]:
# no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')

In [311]:
# no_html_urls_df

In [312]:
# print("without_html_urls:")
# generate_sample_reviews(no_html_urls_df, 'without_html_urls', 3)

## Remove Contractions

In [313]:
store_contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he's": "he is",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "mustn't": "must not",
    "shan't": "shall not",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they're": "they are",
    "wasn't": "was not",
    "we're": "we are",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    "you're": "you are",
    "you'll": "you will",
    "you'd": "you would",
    "we'll": "we will",
    "we've": "we have",
    "we'd": "we would",
    "I'm": "I am",
    "i've": "I have",
    "I've": "I have",
    "I'd": "I would",
    "it'll": "it will",
    "they'll": "they will",
    "they've": "they have",
    "they'd": "they would",
    "he'll": "he will",
    "he'd": "he would",
    "she'll": "she will",
    "we'd": "we would",
    "we'll": "we will",
    "you've": "you have",
    "you'd": "you would",
    "you'll": "you will",
    "I'll": "I will",
    "I'd": "I would",
    "it's": "it is",
    "it'd": "it would",
    "i'm": "I am",
    "he's": "he is",
    "he'll": "he will",
    "she's": "she is",
    "she'll": "she will",
    "we're": "we are",
    "we've": "we have",
    "we'll": "we will",
    "you're": "you are",
    "you've": "you have",
    "you'll": "you will",
    "they're": "they are",
    "they've": "they have",
    "they'll": "they will",
    "that's": "that is",
    "that'll": "that will",
    "that'd": "that would",
    "who's": "who is",
    "who'll": "who will",
    "who'd": "who would",
    "what's": "what is",
    "what'll": "what will",
    "what'd": "what would",
    "when's": "when is",
    "when'll": "when will",
    "when'd": "when would",
    "where's": "where is",
    "where'll": "where will",
    "where'd": "where would",
    "why's": "why is",
    "why'll": "why will",
    "why'd": "why would",
    "how's": "how is",
    "how'll": "how will",
    "how'd": "how would"
}


In [314]:
def locate_and_replace_contractions(review):
    """Find the contractions to replace from a specific review

    Parameters
    ----------
    review: `str`
        A specific review

    Return
    ------
    non_contraction_review: `str`
        The updated specific review with contractions expanded
    
    """
    if isinstance(review, str):
        get_words = review.split()

        store_non_contraction_words = []

        for word in get_words:
            if word in store_contractions:
                non_contraction_form = store_contractions[word]
                # print(word, "-->", non_contraction_form)

                store_non_contraction_words.append(non_contraction_form)

            else:
                # print(word)
                store_non_contraction_words.append(word)

        non_contraction_review = ' '.join(store_non_contraction_words)
        return non_contraction_review
    else:
        return review


In [315]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_contractions_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # print("Review", text_reviews_idx, "with possible contraction(s) -- ", text_review)

        without_contraction = locate_and_replace_contractions(text_review)

        # print("Review", text_reviews_idx, "without contraction -- ", without_contraction)
        # print()

        without_contractions_reviews.append(without_contraction)

    updated_df['without_contractions'] = without_contractions_reviews
    return updated_df

In [316]:
# no_contractions_df = remove_contractions(no_html_urls_df, 'without_html_urls')

In [317]:
# no_contractions_df

In [318]:
# print("without_contractions:")
# generate_sample_reviews(no_contractions_df, 'without_contractions', 3)

## Remove Non-alphabetical characters

In [319]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """

    alphabetical_char_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        
        if isinstance(text_review, str):

            # Check for non-alphabetical characters
            has_non_alphabetical_char = bool(re.search(r'[^a-zA-Z]', text_review))
            if has_non_alphabetical_char == True:
                # print("Review", text_reviews_idx, "has HTML -- ", text_review)
                pass
            
            # Remove non-alphabetical characters
            with_alphabetical_char = re.sub(r'[^a-zA-Z\s]', ' ', text_review)
            # print("Review", text_reviews_idx, "has HTML -- ", with_alphabetical_char)
            alphabetical_char_reviews.append(with_alphabetical_char)
        else:
            alphabetical_char_reviews.append(text_review)

    updated_df['with_alpha_chars_only'] = alphabetical_char_reviews
    return updated_df

In [320]:
# only_alpha_chars_df = remove_non_alphabetical_characters(no_contractions_df, 'without_contractions')

In [321]:
# only_alpha_chars_df

In [322]:
# print("with_alpha_chars_only:")
# generate_sample_reviews(only_alpha_chars_df, 'with_alpha_chars_only', 3)

## Remove extra spaces

In [323]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    single_spaced_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
        # Check if there are any extra spaces
            has_extra_space = bool(re.search(r' +', text_review))
            if has_extra_space == True:
                # print("Review", text_reviews_idx, "has extra space -- ", text_review)
                pass
            
            # Remove extra spaces
            single_spaced_review = re.sub(r' +', ' ', text_review)
            # print("Review", text_reviews_idx, "without extra space -- ", single_spaced_review)
            # print()
            
            single_spaced_reviews.append(single_spaced_review)
        else:
            single_spaced_reviews.append(text_review)

    updated_df['without_extra_space'] = single_spaced_reviews
    return updated_df

In [324]:
# no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'with_alpha_chars_only')

In [325]:
# no_extra_space_df

In [326]:
# print("without_extra_space:")
# generate_sample_reviews(no_extra_space_df, 'without_extra_space', 3)

# Pre-processing

## remove the stop words 

In [327]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_stop_words_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
            text_review_words = word_tokenize(text_review) 

        

            # print("Before stop word removal", text_reviews_idx, " -- ", text_review)

            filtered_review = []

            for text_review_words_idx in range(len(text_review_words)):
                text_review_word = text_review_words[text_review_words_idx]
                
                # Check if review word is a stop word
                if text_review_word in stop_words:
                    # print("  Stop word -- ", text_review_word)
                    pass
                else:
                    # print(text_review_word, " -- is NOT a stop word in review")
                    filtered_review.append(text_review_word)

            
            filtered_review = " ".join(filtered_review)
            # print("After stop word removal", text_reviews_idx, " -- ", filtered_review)
            # print()
            
            without_stop_words_reviews.append(filtered_review)
        else:
            without_stop_words_reviews.append(text_review)
        

    updated_df['without_stop_words'] = without_stop_words_reviews
    return updated_df

In [328]:
# no_stop_words_df = filter_stop_words(no_extra_space_df, 'without_extra_space')

In [329]:
# no_stop_words_df

In [330]:
# print("without_stop_words:")
# generate_sample_reviews(no_stop_words_df, 'without_stop_words', 3)

## perform lemmatization  

- "A sentence with many words"
    - "words" -> word

In [331]:
def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    lemmed_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]   
        if isinstance(text_review, str):     
            words_in_review = word_tokenize(text_review) 

            # print("Before lem update", text_reviews_idx, " -- ", text_review)
            # print("Lemmed words", words_in_review)
            

            lemmed_sentence = []

            # Split review into words
            for lemmed_words_idx in range(len(words_in_review)):
                word = words_in_review[lemmed_words_idx]
                
                apply_lemmatization = lem.lemmatize(word)
                # print(apply_lemmatization)
                
                lemmed_sentence.append(apply_lemmatization)
                filtered_review = " ".join(lemmed_sentence)
        
            # print("After lem update -- ", filtered_review)
            # print()

            lemmed_reviews.append(filtered_review)
        else:
            lemmed_reviews.append(text_review)

    updated_df['lemmed_reviews'] = lemmed_reviews
    return updated_df

In [332]:
# lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')

In [333]:
# lemmed_df

In [334]:
# print("without_unlemmed_words:")
# generate_sample_reviews(lemmed_df, 'lemmed_reviews', 3)

In [335]:
def preprocess_data(df, col_name):
    """Perform lower case, remove HTML and URLs, remove contractions, remove non-alphabetical characters, remove extra spaces, remove stop words, and lemmatize"""

    print("original reviews:")
    generate_sample_reviews(df, col_name, 3)

    reviews_lower_cased = convert_reviews_to_lower_case(df, col_name)
    print("reviews_lower_cased:")
    generate_sample_reviews(reviews_lower_cased, 'lower_cased', 3)

    no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')
    print("without_html_urls:")
    generate_sample_reviews(no_html_urls_df, 'without_html_urls', 3)

    no_contractions_df = remove_contractions(no_html_urls_df, 'without_html_urls')
    print("without_contractions:")
    generate_sample_reviews(no_contractions_df, 'without_contractions', 3)

    only_alpha_chars_df = remove_non_alphabetical_characters(no_contractions_df, 'without_contractions')
    print("with_alpha_chars_only:")
    generate_sample_reviews(only_alpha_chars_df, 'with_alpha_chars_only', 3)

    no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'with_alpha_chars_only')
    print("without_extra_space:")
    generate_sample_reviews(no_extra_space_df, 'without_extra_space', 3)

    no_stop_words_df = filter_stop_words(no_extra_space_df, 'without_extra_space')
    print("without_stop_words:")
    generate_sample_reviews(no_stop_words_df, 'without_stop_words', 3)
    
    lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')
    print("without_unlemmed_words:")
    generate_sample_reviews(lemmed_df, 'lemmed_reviews', 3)

    return lemmed_df


In [336]:
cleaned_reviews_df = preprocess_data(sampled_reviews_ratings_df, 'review_body')

original reviews:
{'star_rating': 4, 'review_body': 'Bought this phone for my 86 year old uncle. The amplification is fine and the phone is simple (simple is the key word). At least he can hear us now when the family calls to check on him.'}
{'star_rating': 4, 'review_body': "The size is about the same as an iPhone, its very light too. The packaging it comes in is very inspired by Apple and comes with a soft pouch case and some connection wires including the AC adapter. The instructions are terrible, There's not much too them and it really doesn't give you enough info. Specifically, what the various adapter plugs are for. Yes, there are a few that are obvious like the iPhone/ipod/ipad but there are 2 that i don't know what they would be for.<br /><br />There is no adapter included to work with my laptop. I have a macbook pro that uses dvi and a smaller netbook pc but it uses a regular HDMI cable that is way too big and the included plugs look like cell phone mini usb's. It does come wi

In [337]:
cleaned_reviews_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,binary_review_class,ternary_review_class,lower_cased,without_html_urls,without_contractions,with_alpha_chars_only,without_extra_space,without_stop_words,lemmed_reviews
2528018,4,Bought this phone for my 86 year old uncle. Th...,1.0,,,1.0,1.0,bought this phone for my 86 year old uncle. th...,bought this phone for my 86 year old uncle. th...,bought this phone for my 86 year old uncle. th...,bought this phone for my year old uncle th...,bought this phone for my year old uncle the am...,bought phone year old uncle amplification fine...,bought phone year old uncle amplification fine...
67050,4,I had used one of these at work and liked it e...,1.0,,,1.0,1.0,i had used one of these at work and liked it e...,i had used one of these at work and liked it e...,i had used one of these at work and liked it e...,i had used one of these at work and liked it e...,i had used one of these at work and liked it e...,used one work liked enough buy one home thing ...,used one work liked enough buy one home thing ...
2309113,4,"The size is about the same as an iPhone, its v...",1.0,,,1.0,1.0,"the size is about the same as an iphone, its v...","the size is about the same as an iphone, its v...","the size is about the same as an iphone, its v...",the size is about the same as an iphone its v...,the size is about the same as an iphone its ve...,size iphone light packaging comes inspired app...,size iphone light packaging come inspired appl...
1081311,4,I have found this stand to be very study for m...,1.0,,,1.0,1.0,i have found this stand to be very study for m...,i have found this stand to be very study for m...,i have found this stand to be very study for m...,i have found this stand to be very study for m...,i have found this stand to be very study for m...,found stand study large dell monitor love stan...,found stand study large dell monitor love stan...
1413852,4,This was the perfect refill for my printer. I...,1.0,,,1.0,1.0,this was the perfect refill for my printer. i...,this was the perfect refill for my printer. i...,this was the perfect refill for my printer. i ...,this was the perfect refill for my printer i ...,this was the perfect refill for my printer i h...,perfect refill printer issues certainly better...,perfect refill printer issue certainly better ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1433552,2,"Do I return my printer and get a next one, I l...",,,3.0,3.0,3.0,"do i return my printer and get a next one, i l...","do i return my printer and get a next one, i l...","do i return my printer and get a next one, i l...",do i return my printer and get a next one i l...,do i return my printer and get a next one i li...,return printer get next one like printer worki...,return printer get next one like printer worki...
835203,2,Pretty chicken s*** that it does NOT include N...,,,3.0,3.0,3.0,pretty chicken s*** that it does not include n...,pretty chicken s*** that it does not include n...,pretty chicken s*** that it does not include n...,pretty chicken s that it does not include n...,pretty chicken s that it does not include nece...,pretty chicken include necessary printer cable...,pretty chicken include necessary printer cable...
2553531,2,Use army time.When you set the date use the da...,,,3.0,3.0,3.0,use army time.when you set the date use the da...,use army time.when you set the date use the da...,use army time.when you set the date use the da...,use army time when you set the date use the da...,use army time when you set the date use the da...,use army time set date use day month first mon...,use army time set date use day month first mon...
1015071,2,It gets atleast 2 because of the sound quality...,,,3.0,3.0,3.0,it gets atleast 2 because of the sound quality...,it gets atleast 2 because of the sound quality...,it gets atleast 2 because of the sound quality...,it gets atleast because of the sound quality...,it gets atleast because of the sound quality t...,gets atleast sound quality quality plastic ban...,get atleast sound quality quality plastic band...


In [338]:
binary_embeddings_df = cleaned_reviews_df.dropna(subset=['binary_review_class'])
len(binary_embeddings_df), binary_embeddings_df['star_rating'].unique()

(1200, array([4, 5, 1, 2]))

In [339]:
binary_review_class = binary_embeddings_df['binary_review_class']
binary_review_class.unique()

array([1., 3.])

In [340]:
binary_text = binary_embeddings_df.loc[:, ['lemmed_reviews']]
binary_text

Unnamed: 0,lemmed_reviews
2528018,bought phone year old uncle amplification fine...
67050,used one work liked enough buy one home thing ...
2309113,size iphone light packaging come inspired appl...
1081311,found stand study large dell monitor love stan...
1413852,perfect refill printer issue certainly better ...
...,...
1433552,return printer get next one like printer worki...
835203,pretty chicken include necessary printer cable...
2553531,use army time set date use day month first mon...
1015071,get atleast sound quality quality plastic band...


In [341]:
### Train test split so I can have the same train
binary_X_train, binary_X_test, binary_y_train, binary_y_test = train_test_split(binary_text, binary_review_class, test_size=0.2, random_state=42)
binary_X_train.shape, binary_X_test.shape, binary_y_train.shape, binary_y_test.shape

((960, 1), (240, 1), (960,), (240,))

In [342]:
binary_X_train.head(7)

Unnamed: 0,lemmed_reviews
936833,know long last I week far good beat price
2099821,far month problem slipped right actually made ...
2005400,print excellent picture like kodak printer pro...
313758,work correctly button make good contact offend...
2345455,I owned printer day far extremely happy family...
55679,love tape use weekly seems last forever remove...
2325152,excited order compatible cartridge right tried...


# Extract Word Embeddings

In [343]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import gensim.downloader as api
pretrained_word_two_vec_model = api.load('word2vec-google-news-300')

2024-02-07 12:20:04,877 : INFO : loading projection weights from /Users/brinkley97/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2024-02-07 12:20:39,618 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /Users/brinkley97/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-02-07T12:20:39.618667', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.6.3-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


## My Model

In [344]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __init__(self, df: pd.DataFrame, col_name: str):
        self.df = df
        self.col_name = col_name

    def __iter__(self):
        """

        Parameters
        ----------
        df: `pd.DataFrame`
            The data
        
        col_name: `str`
            Column with reviews

        words_in_model: `list`
            Words in Word2Vec model

        Return
        ------
        
        """

        text_reviews = self.df[self.col_name].values

        for text_reviews_idx in range(len(text_reviews)):
            text_review = text_reviews[text_reviews_idx]
            # print(text_reviews_idx, "--", text_review)

            yield utils.simple_preprocess(text_review)
        

In [347]:
import gensim.models

# X train
binary_X_train_sentences = MyCorpus(binary_X_train, 'lemmed_reviews')
# sentences = MyCorpus(sampled_reviews_ratings_df, 'review_body')
# print("\nSentences", binary_X_train_sentences)
my_binary_X_train_model = gensim.models.Word2Vec(sentences=binary_X_train_sentences, vector_size=300, window=11, min_count=10)

# X test - get embeddings from my_binary_X_train_model -- vec_king = my_binary_X_train_model.wv['king']
# binary_X_test_sentences = MyCorpus(binary_X_test, 'lemmed_reviews')
# sentences = MyCorpus(sampled_reviews_ratings_df, 'review_body')
# print("\nSentences", binary_X_test_sentences)

# Terneary Case



2024-02-07 12:21:17,505 : INFO : collecting all words and their counts
2024-02-07 12:21:17,506 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-02-07 12:21:17,549 : INFO : collected 4423 word types from a corpus of 27478 raw words and 960 sentences
2024-02-07 12:21:17,549 : INFO : Creating a fresh vocabulary
2024-02-07 12:21:17,552 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 600 unique words (13.57% of original 4423, drops 3823)', 'datetime': '2024-02-07T12:21:17.552062', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.6.3-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2024-02-07 12:21:17,552 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 18730 word corpus (68.16% of original 27478, drops 8748)', 'datetime': '2024-02-07T12:21:17.552739', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (cla

### Similar scores

[ ] Write summary of differences between their model and my model

### PRetrained

In [None]:
my_pretrained_binary_X_train_model = my_binary_X_train_model.wv
my_pretrained_binary_X_train_model

In [346]:
# TODO: Fix with proper exs

result = pretrained_word_two_vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

my_result = my_pretrained_binary_X_train_model.most_similar(positive=['started', 'even'], negative=['plus', 'toner'], topn=10)
print(my_result)


[('queen', 0.7118193507194519)]
[('daughter', 0.02274164743721485), ('pack', 0.020964179188013077), ('terrible', 0.020802440121769905), ('caller', 0.020002959296107292), ('larger', 0.019834857434034348), ('useless', 0.019533362239599228), ('loud', 0.019526176154613495), ('paying', 0.01949603110551834), ('luck', 0.019292298704385757), ('internet', 0.019116980955004692)]


### Our model

In [348]:
def word_embeddings(df: pd.DataFrame, col_name: str, model_to_use):
    """Extract word embeddings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    model_to_use:
        Either the pretrained model or my pretrained model

    Return
    ------
    
    """

    sentence_vectorized = []
    mean_sentences_vectorized = []
    sentences = df[col_name].values

    for sentences_idx in range(len(sentences)):
        vectorized_words = []
        sentence = sentences[sentences_idx]
        # print("Sentence", sentences_idx)
        # print("Sentence", sentences_idx, "Pre-vectorized -- ", sentence)
        for word in sentence.split(" "):
            if word in model_to_use.key_to_index:
                vector_of_word = model_to_use[word]
                vectorized_words.append(vector_of_word)
                # print("--->", word, "is in model with vector lenght of", len(vector_of_word))
            else:
                vector_of_word = np.random.rand(model_to_use.vector_size)
                vectorized_words.append(vector_of_word)
                
        sentence_vectorized.append(vectorized_words)
        # print("Sentence", sentences_idx, "Post-vectorized \n")

        mean_of_sentence = np.mean(sentence_vectorized[sentences_idx], axis=0)
        # print(len(mean_of_sentence), mean_of_sentence)
        mean_sentences_vectorized.append(mean_of_sentence)


    return mean_sentences_vectorized

In [349]:
binary_X_train

Unnamed: 0,lemmed_reviews
936833,know long last I week far good beat price
2099821,far month problem slipped right actually made ...
2005400,print excellent picture like kodak printer pro...
313758,work correctly button make good contact offend...
2345455,I owned printer day far extremely happy family...
...,...
1801078,think fault really read see start july year st...
730657,love idea picture however really disappointing...
2361758,basic functionality needed phone construction ...
819301,ink splotch feather every paper try besides bl...


In [365]:
print("Pretrained Train")
pretrained_binary_train_embeddings = word_embeddings(binary_X_train, 'lemmed_reviews', pretrained_word_two_vec_model)

print("Pretrained Test")
pretrained_binary_test_embeddings = word_embeddings(binary_X_test, 'lemmed_reviews', pretrained_word_two_vec_model)

Pretrained Train
Pretrained Test


In [366]:
pretrained_binary_train_embeddings = np.array(pretrained_binary_train_embeddings)
pretrained_binary_test_embeddings = np.array(pretrained_binary_test_embeddings)

pretrained_binary_train_embeddings.shape, pretrained_binary_test_embeddings.shape


((960, 300), (240, 300))

In [367]:
print("My-pretrained Train")
my_binary_train_embeddings = word_embeddings(binary_X_train, 'lemmed_reviews', my_pretrained_binary_X_train_model)

print("My-pretrained Test")
my_binary_test_embeddings = word_embeddings(binary_X_test, 'lemmed_reviews', my_pretrained_binary_X_train_model)

My-pretrained Train
My-pretrained Test


In [368]:
my_binary_train_embeddings = np.array(my_binary_train_embeddings)
my_binary_test_embeddings = np.array(my_binary_test_embeddings)

my_binary_train_embeddings.shape, my_binary_test_embeddings.shape

((960, 300), (240, 300))

In [369]:
# # embeddings_df = lemmed_df.iloc[:, [5, 1, 12]]
# embeddings_df = binary_embeddings_df
# embeddings_df['pretrained_embeddings'] = pretrained_embeddings
# embeddings_df['my_embeddings'] = my_embeddings
# embeddings_df.head(7)

In [370]:
def tf_idf_feature_extraction(df: pd.DataFrame, col_name: str):
    """Extract the TF-IDF features from the reviews.

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    tf_idf_features:
        A matrix containing the TF-IDF features extracted
        
    """
    
    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(df[col_name])

    return tf_idf_features

In [371]:
# tf_idf_features = tf_idf_feature_extraction(lemmed_df, 'lemmed_reviews')


In [372]:
# tf_idf_features

## Simple models

# Models + Evaluation Metrics

In [374]:
def eval_accuracy(y_true, y_prediction):
    return sklearn.metrics.accuracy_score(y_true, y_prediction)

def eval_precision(y_true, y_prediction):
    return sklearn.metrics.precision_score(y_true, y_prediction)

def eval_recall(y_true, y_prediction):
    return sklearn.metrics.recall_score(y_true, y_prediction)

def eval_f1_score(y_true, y_prediction):
    return sklearn.metrics.f1_score(y_true, y_prediction)

In [360]:
def train_eval_metric(y_train_true, y_train_predictions):
    accuracy = eval_accuracy(y_train_true, y_train_predictions)
    # precision = eval_precision(y_train_true, y_train_predictions)
    # recall = eval_recall(y_train_true, y_train_predictions)
    # f1 = eval_f1_score(y_train_true, y_train_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        # 'Precision': precision,
        # 'Recall': recall,
        # 'F1 Score': f1
    }

    return metrics_dict

def test_eval_metric(y_test_true, y_test_predictions):
    accuracy = eval_accuracy(y_test_true, y_test_predictions)
    # precision = eval_precision(y_test_true, y_test_predictions)
    # recall = eval_recall(y_test_true, y_test_predictions)
    # f1 = eval_f1_score(y_test_true, y_test_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
        # 'Precision': precision,
        # 'Recall': recall,
        # 'F1 Score': f1
    }

    return metrics_dict

# Perceptron

In [375]:
def perceptron_model(X_train, X_test, y_train, y_test): 

    technique = Perceptron(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [376]:
my_binary_train_embeddings.shape, my_binary_test_embeddings.shape, binary_y_train.shape, binary_y_test.shape

((960, 300), (240, 300), (960,), (240,))

In [382]:
# Pretrained model
perceptron_train_metrics, perceptron_test_metrics = perceptron_model(pretrained_binary_train_embeddings, pretrained_binary_test_embeddings, binary_y_train, binary_y_test)

In [383]:
perceptron_train_metrics, perceptron_test_metrics

({'Accuracy': 0.771875}, {'Accuracy': 0.6666666666666666})

In [384]:
# My model
my_perceptron_train_metrics, my_perceptron_test_metrics = perceptron_model(my_binary_train_embeddings, my_binary_test_embeddings, binary_y_train, binary_y_test)
 

In [385]:
my_perceptron_train_metrics, my_perceptron_test_metrics

({'Accuracy': 0.571875}, {'Accuracy': 0.48333333333333334})

# SVM

In [None]:
def svm_model(X_train, X_test, y_train, y_test): 

    technique = LinearSVC(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [386]:
svm_train_metrics, svm_test_metrics = svm_model(pretrained_binary_train_embeddings, pretrained_binary_test_embeddings, binary_y_train, binary_y_test)



In [387]:
svm_train_metrics, svm_test_metrics

({'Accuracy': 0.8666666666666667}, {'Accuracy': 0.7875})

In [388]:
my_svm_train_metrics, my_svm_test_metrics = svm_model(my_binary_train_embeddings, my_binary_test_embeddings, binary_y_train, binary_y_test)



In [389]:
my_svm_train_metrics, my_svm_test_metrics

({'Accuracy': 0.7395833333333334}, {'Accuracy': 0.5708333333333333})

## What do I conclude from comparing performances

## Feedforward Neural Network
- MLP for sentiment analysis classification
- 2 hidden layers each with 50 and 10 nodes, respectively
- Cross entropy loss
- I decide other hyperparameters (ie: nonlinearity, #epochs, etc)

In [None]:
pretrained_embeddings.shape, my_embeddings.shape

### Binary Classification

In [None]:
binary_review_class = binary_embeddings_df['binary_review_class']
binary_review_class.unique()

In [None]:
import torch

import numpy as np
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Define the NN architecture"""

    def __init__(self, hidden_1: int, hidden_2: int, height: int, width: int, dropout_rate: float, output_classes: int):
        super(Net, self).__init__()

        self.height = heightght = height
        self.width = width
        # linear layer (1200 x 300 dot 300 x 50 -> 1200 x 50)
        self.fc1 = nn.Linear(300, 50)
        # linear layer (1200 x 50 dot 50 x 10 -> 1200 x 10)
        self.fc2 = nn.Linear(50, 10)
        # linear layer (1200 x 50 dot 50 x 10 -> 1200 x 10)
        self.fc3 = nn.Linear(10, 2) # change to 3 for ternery
        # dropout to prevent overfitting
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, x):
        
        # add hidden layer, with relu activation function, dropout, relu, dropout, output
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)

        return x
    
    def train_network(self, number_of_epochs: int, optimizer, criterion_function, train_loader, valid_loader):
        # set initial "min" to infinity
        valid_loss_min = np.Inf

        for epoch in range(number_of_epochs):
            train_loss = 0.0
            valid_loss = 0.0

            ###################
            # train the model #
            ###################
            self.train() # prep model for training
            for data, target in train_loader:
                # clear the gradients of all optimized variables
                optimizer.zero_grad()
                # forward pass to compute predictions, loss, backward pass to compute gradient wrt model params
                output = self(data)
                loss = criterion_function(output, target)
                loss.backward()
                optimizer.step()
                # update running training loss
                train_loss += loss.item() * data.size(0)
            
            ######################    
            # validate the model #
            ######################
            self.eval() # prep model for evaluation
            for data, target in valid_loader:
                # forward pass to compute predictions, loss, update running validation loss
                output = self(data)
                loss = criterion_function(output, target)
                valid_loss += loss.item() * data.size(0)
            
            # print training/validation statistics 
            # calculate average loss over an epoch
            train_loss = train_loss / len(train_loader.dataset)
            valid_loss = valid_loss / len(valid_loader.dataset)
            
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
                epoch+1, 
                train_loss,
                valid_loss
                ))
            
            # save model if validation loss has decreased
            if valid_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                valid_loss))
                torch.save(self.state_dict(), 'nn_model.pt')
                valid_loss_min = valid_loss

    def predict(self, data_loader):
        prediction_list = []
        for i, batch in enumerate(data_loader):
            outputs = self(batch)    
            _, predicted = torch.max(outputs.data, 1)
            prediction_list.append(predicted.cpu())
        
        return prediction_list

### Ternary Classification

In [None]:
ternary_review_class = ternary_embeddings_df['ternary_review_class']
ternary_review_class.unique()

In [None]:
# logistic_regression_train_metrics, logistic_regression_test_metrics = logistic_regression_model(X_train, X_test, y_train, y_test)

In [None]:
# logistic_regression_train_metrics, logistic_regression_test_metrics

# Naive Bayes

In [None]:
def naive_bayes_model(X_train, X_test, y_train, y_test): 

    technique = MultinomialNB()
    technique.fit(X_train.toarray(), y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)

    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [None]:
# naive_bayes_train_metrics, naive_bayes_test_metrics = naive_bayes_model(X_train, X_test, y_train, y_test)

In [None]:
# naive_bayes_train_metrics, naive_bayes_test_metrics