# HW2- Binary and Ternary Classification for Sentiment Analysis
- Detravious Jamari Brinkley
- CSCI-544: Applied Natural Language Processing
- python version: 3.11.4

In [1]:
import re
import nltk
import logging
import sklearn
import gensim.models

import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import gensim.downloader as api
import torch.nn.functional as F

from gensim import utils
from bs4 import BeautifulSoup

from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from gensim.test.utils import datapath
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron, LogisticRegression

nltk.download('wordnet')
nltk.download('omw-1.4')
pretrained_word_two_vec_model = api.load('word2vec-google-news-300')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/brinkley97/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# HW Outline

1. Dataset Generation
    1. Read data
    2. Keep reviews and ratings
    3. Create binary and ternary classes
    4. Clean data steps
    4. Clean data function
    5. Split data
    6. Load pretrained model and train my model
        - Get similarity for the pretrained model
        - Get similarity for my trained model
2. Word Embedding
    - Get word embeddings for pretrained model
    - Get word embeddings for my model
3. Simple models
    - Get accuracy for perceptron on pretrained model
    - Get accuracy for svm on pretrained model
    - Get accuracy for perceptron on my model
    - Get accuracy for svm on my model
    - What do I conclude from comparing performances
4. Feedforward Neural Networks (FFNN)
5. Convolutional Neural Networks (CNN)

# 1. Dataset Generation

- [x] Load the Amazon reviews dataset
- [x] Build a balanced dataset of 250K reviews along with their ratings through random selection
    - [x] Rating 1: 50K instances
    - [x] Rating 2: 50K instances
    - [x] Rating 3: 50K instances
    - [x] Rating 4: 50K instances
    - [x] Rating 5: 50K instances
- [x] Create ternary labels using the ratings
    - [x] Class 1: Ratings 4 and 5 (positive sentiment)
    - [x] Class 2: Ratings 1 and 2 (negative sentiment)
    - [x] Class 3: Rating 3 (neutral sentiment)

## Read Data

In [2]:
dataset = "../datasets/amazon_reviews_us_Office_Products_v1_00.tsv"
amazon_reviews_copy_df = pd.read_csv(dataset, sep='\t', on_bad_lines='skip', low_memory=False)

## Keep Reviews and Ratings

In [3]:
reviews_ratings_df = amazon_reviews_copy_df.loc[0:, ['star_rating', 'review_body']]
reviews_ratings_df.reset_index(drop=True)

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [4]:
def generate_sample_reviews(df: pd.DataFrame, review_col_name: str, number_of_reviews: int = 3):
    """Include reviews and ratings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    review_col_name: `str`
        The specific_column to get the reviews and ratings of
    
    number_of_reviews: `int`
        Number of samples to include


    Return
    ------
    Nothing; instead, print the reviews with ratings
    """


    columns_to_include = [review_col_name, 'star_rating']

    # Initialize an empty list to store dictionaries
    list_of_dicts = []

    # Iterate over the specified columns and retrieve the first three rows
    for row in df[columns_to_include].head(3).to_dict(orient='records'):
        list_of_dicts.append({'star_rating': row['star_rating'], review_col_name: row[review_col_name]})

    for dictionary in list_of_dicts:
        print(dictionary)

 ## Create binary and ternary classes


In [5]:
def update_data_type(df: pd.DataFrame, col_name: str):
    """Update the data type of the star ratings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with rating values

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the new sentiment appened

    """

    valid_ratings = ['1','2','3','4','5']
    star_rating_series = df[col_name].copy()

    # Convert type to strings
    star_rating_series.astype('str')

    # Check valid list and see which of our stars match
    rows = star_rating_series.index
    is_rating_in_valid_ratings = rows[star_rating_series.isin(valid_ratings)]

    # Convert to list
    is_rating_in_valid_ratings = is_rating_in_valid_ratings.to_list()

    updated_df = df.iloc[is_rating_in_valid_ratings]
    updated_df[col_name] = updated_df[col_name].astype(int)
    return updated_df

In [6]:
updated_reviews_ratings_df = update_data_type(reviews_ratings_df, 'star_rating')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_df[col_name] = updated_df[col_name].astype(int)


In [7]:
updated_reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [8]:
updated_reviews_ratings_df = updated_reviews_ratings_df.dropna()
updated_reviews_ratings_df

Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."
3,1,Although this was labeled as &#34;new&#34; the...
4,4,Gorgeous colors and easy to use
...,...,...
2640249,4,I can't live anymore whithout my Palm III. But...
2640250,4,Although the Palm Pilot is thin and compact it...
2640251,4,This book had a lot of great content without b...
2640252,5,I am teaching a course in Excel and am using t...


In [9]:
# Check for NaN values
nan_check = updated_reviews_ratings_df.isna()

# Display the DataFrame with True where NaN values exist
print(nan_check)

# Check if any NaN value exists in the DataFrame
if nan_check.any().any():
    print("There are NaN values in the DataFrame.")
else:
    print("There are no NaN values in the DataFrame.")

         star_rating  review_body
0              False        False
1              False        False
2              False        False
3              False        False
4              False        False
...              ...          ...
2640249        False        False
2640250        False        False
2640251        False        False
2640252        False        False
2640253        False        False

[2640080 rows x 2 columns]
There are no NaN values in the DataFrame.


In [10]:
print("# reviews per rating", updated_reviews_ratings_df['star_rating'].value_counts())

# reviews per rating star_rating
5    1582704
4     418348
1     306967
3     193680
2     138381
Name: count, dtype: int64


In [11]:
def sample_star_ratings(df: pd.DataFrame, col_name: str, star_value: int, number_of_reviews: int):
    """Build a subset balanced dataset with reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The dataframe to use
    col_name: `str`
        The name of the column to get reviews from
    star_value: `int`
        The star rating of the review
    number_of_reviews: `int`
        The number of sub reviews to include in sample

    Return
    ------
    rating_df, sampled_rating_df: `tuple`
        All reviews with that rating and the subset reviews with that rating
    """
    
    rating_df = df[df[col_name] == star_value]
    sampled_rating_df = rating_df.sample(n=number_of_reviews)
    return rating_df, sampled_rating_df

## [ ] Change #reviews per class

In [12]:
# subset_reviews = 50000
subset_reviews = 200

one_star = 1
rating_one, rating_one_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', one_star, subset_reviews)
two_stars = 2
rating_two, rating_two_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', two_stars, subset_reviews)
three_stars = 3
rating_three, rating_three_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', three_stars, subset_reviews)
four_stars = 4
rating_four, rating_four_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', four_stars, subset_reviews)
five_stars = 5
rating_five, rating_five_sampled = sample_star_ratings(updated_reviews_ratings_df, 'star_rating', five_stars, subset_reviews)

In [13]:
sampled_reviews_df = pd.concat([rating_one_sampled, rating_two_sampled, rating_three_sampled, rating_four_sampled, rating_five_sampled])

In [14]:
sampled_reviews_df

Unnamed: 0,star_rating,review_body
1179381,1,not a real projector . It is a piece
1190334,1,"When it works, this laser printer gets the job..."
1775668,1,I do not recommend this product at all as it r...
2248304,1,None of the pens worked upon arrival. All six...
2014722,1,The cartridges were empty! I installed the fir...
...,...,...
121749,5,These were exactly what I was looking for to c...
1227060,5,"Have not used the item much yet, but overall I..."
2413043,5,About a year ago we purchased the Panasonic KX...
1611857,5,I use this flash card in my digital frame so t...


In [15]:
def separate_reviews_by_rating(df: pd.DataFrame, rating_col: str, threshold: int, sentiment_type: str):
    """Categorizes reviews by adding a rating

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    rating_col: `str`
        Column with rating values
    
    threshold: `int`
        Where to split the ratings such that categories can be formed

    sentiment_type: `str`
        One of three types of sentiment: positive, negative, or neural

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the new sentiment appened
    """


    if sentiment_type == 'positive_review_class':
        positive_review_threshold = df[rating_col].astype('int32') > threshold
        df = df[positive_review_threshold]
        df[sentiment_type] = 1

    elif sentiment_type == 'negative_review_class':
        negative_review_threshold = df[rating_col].astype('int32') < threshold
        df = df[negative_review_threshold]
        df[sentiment_type] = 2

    elif sentiment_type == 'neutral_review_class':
        neutral_review_threshold = df[rating_col].astype('int32') == threshold
        df = df[neutral_review_threshold]
        df[sentiment_type] = 3
        
    return df

In [16]:
negative_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'negative_review_class')
neutral_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'neutral_review_class')
positive_review_class_df = separate_reviews_by_rating(sampled_reviews_df, 'star_rating', 3, 'positive_review_class')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 3
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[sentiment_type] = 1


In [17]:
sampled_reviews_ratings_df = pd.concat([negative_review_class_df, neutral_review_class_df, positive_review_class_df])
sampled_reviews_ratings_df

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class
1179381,1,not a real projector . It is a piece,2.0,,
1190334,1,"When it works, this laser printer gets the job...",2.0,,
1775668,1,I do not recommend this product at all as it r...,2.0,,
2248304,1,None of the pens worked upon arrival. All six...,2.0,,
2014722,1,The cartridges were empty! I installed the fir...,2.0,,
...,...,...,...,...,...
121749,5,These were exactly what I was looking for to c...,,,1.0
1227060,5,"Have not used the item much yet, but overall I...",,,1.0
2413043,5,About a year ago we purchased the Panasonic KX...,,,1.0
1611857,5,I use this flash card in my digital frame so t...,,,1.0


In [18]:
negative_reviews_df = sampled_reviews_ratings_df['negative_review_class'].dropna()
neutral_reviews_df = sampled_reviews_ratings_df['neutral_review_class'].dropna()
positive_reviews_df = sampled_reviews_ratings_df['positive_review_class'].dropna()

In [19]:
sampled_reviews_ratings_df['binary_review_class'] = pd.concat([negative_reviews_df, positive_reviews_df])
sampled_reviews_ratings_df['ternary_review_class'] = pd.concat([negative_reviews_df, neutral_reviews_df, positive_reviews_df])

In [20]:
print(sampled_reviews_ratings_df['binary_review_class'].unique())
print(sampled_reviews_ratings_df['ternary_review_class'].unique())

[ 2. nan  1.]
[2. 3. 1.]


## Clean data steps

### Lower case

In [21]:
def convert_reviews_to_lower_case(df: pd.DataFrame, col_name: str):
    """Convert all reviews to lower case

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the lower cased reviews
    """
    
    lower_case_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    
    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        # print(text_reviews_idx, type(text_review), text_review)

        # NOT all reviews are strings, thus all can't be converted to lower cased
        if type(text_review) != str:
            print(True, text_review)
            converted_str = str(text_review)
            lower_case_reviews.append(text_review)
         
        else:
            update_text_review = text_review.lower()
            lower_case_reviews.append(update_text_review)

    updated_df['lower_cased'] = lower_case_reviews
    return updated_df

In [22]:
# reviews_lower_cased = convert_reviews_to_lower_case(sampled_reviews_ratings_df, 'review_body')

In [23]:
# reviews_lower_cased

In [24]:
# print("reviews_lower_cased:")
# generate_sample_reviews(reviews_lower_cased, 'lower_cased', 3)

### Remove HTML and URLs

In [25]:
def remove_html_and_urls(df: pd.DataFrame, col_name: str):
    """Remove HTML and URLs from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the html_and_urls removed
    """
    
    # url_pattern = re.compile(r'https?://\S+|www\. \S+')

    cleaned_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
            # Check and remove HTML tags
            has_html = bool(re.search('<.*?>', text_review))
            if has_html == True:
                # print("Review", text_reviews_idx, "has HTML -- ", text_review)
                pass

            no_html_review = re.sub('<.*?>', ' ', text_review)
            # print("Review", text_reviews_idx, "without HTML -- ", no_html_review)
        
            # Check and remove URLs
            has_url = bool(re.search(r'http\S+', no_html_review))
            if has_url == True:
                # print("Review", text_reviews_idx, "has URL --", no_html_review)
                pass

            no_html_url_review = re.sub(r'http\S+', '', no_html_review)
            # print("Review", text_reviews_idx, "without HTML, URL -- ", no_html_url_review)
            # print()
            cleaned_reviews.append(no_html_url_review)
        else:
            # print(text_reviews_idx, text_review)
            cleaned_reviews.append(text_review)
            

    updated_df['without_html_urls'] = cleaned_reviews
    return updated_df

In [26]:
# no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')

In [27]:
# no_html_urls_df

In [28]:
# print("without_html_urls:")
# generate_sample_reviews(no_html_urls_df, 'without_html_urls', 3)

### Remove Contractions

In [29]:
store_contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he's": "he is",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "mustn't": "must not",
    "shan't": "shall not",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they're": "they are",
    "wasn't": "was not",
    "we're": "we are",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    "you're": "you are",
    "you'll": "you will",
    "you'd": "you would",
    "we'll": "we will",
    "we've": "we have",
    "we'd": "we would",
    "I'm": "I am",
    "i've": "I have",
    "I've": "I have",
    "I'd": "I would",
    "it'll": "it will",
    "they'll": "they will",
    "they've": "they have",
    "they'd": "they would",
    "he'll": "he will",
    "he'd": "he would",
    "she'll": "she will",
    "we'd": "we would",
    "we'll": "we will",
    "you've": "you have",
    "you'd": "you would",
    "you'll": "you will",
    "I'll": "I will",
    "I'd": "I would",
    "it's": "it is",
    "it'd": "it would",
    "i'm": "I am",
    "he's": "he is",
    "he'll": "he will",
    "she's": "she is",
    "she'll": "she will",
    "we're": "we are",
    "we've": "we have",
    "we'll": "we will",
    "you're": "you are",
    "you've": "you have",
    "you'll": "you will",
    "they're": "they are",
    "they've": "they have",
    "they'll": "they will",
    "that's": "that is",
    "that'll": "that will",
    "that'd": "that would",
    "who's": "who is",
    "who'll": "who will",
    "who'd": "who would",
    "what's": "what is",
    "what'll": "what will",
    "what'd": "what would",
    "when's": "when is",
    "when'll": "when will",
    "when'd": "when would",
    "where's": "where is",
    "where'll": "where will",
    "where'd": "where would",
    "why's": "why is",
    "why'll": "why will",
    "why'd": "why would",
    "how's": "how is",
    "how'll": "how will",
    "how'd": "how would"
}


In [30]:
def locate_and_replace_contractions(review):
    """Find the contractions to replace from a specific review

    Parameters
    ----------
    review: `str`
        A specific review

    Return
    ------
    non_contraction_review: `str`
        The updated specific review with contractions expanded
    
    """
    if isinstance(review, str):
        get_words = review.split()

        store_non_contraction_words = []

        for word in get_words:
            if word in store_contractions:
                non_contraction_form = store_contractions[word]
                # print(word, "-->", non_contraction_form)

                store_non_contraction_words.append(non_contraction_form)

            else:
                # print(word)
                store_non_contraction_words.append(word)

        non_contraction_review = ' '.join(store_non_contraction_words)
        return non_contraction_review
    else:
        return review


In [31]:
def remove_contractions(df:pd.DataFrame, col_name: str):
    """Remove contractions from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_contractions_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        # print("Review", text_reviews_idx, "with possible contraction(s) -- ", text_review)

        without_contraction = locate_and_replace_contractions(text_review)

        # print("Review", text_reviews_idx, "without contraction -- ", without_contraction)
        # print()

        without_contractions_reviews.append(without_contraction)

    updated_df['without_contractions'] = without_contractions_reviews
    return updated_df

In [32]:
# no_contractions_df = remove_contractions(no_html_urls_df, 'without_html_urls')

In [33]:
# no_contractions_df

In [34]:
# print("without_contractions:")
# generate_sample_reviews(no_contractions_df, 'without_contractions', 3)

### Remove Non-alphabetical characters

In [35]:
def remove_non_alphabetical_characters(df:pd.DataFrame, col_name: str):
    """Remove Non-alphabetical characters from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the non-alphabetical characters removed
    """

    alphabetical_char_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]
        
        if isinstance(text_review, str):

            # Check for non-alphabetical characters
            has_non_alphabetical_char = bool(re.search(r'[^a-zA-Z]', text_review))
            if has_non_alphabetical_char == True:
                # print("Review", text_reviews_idx, "has HTML -- ", text_review)
                pass
            
            # Remove non-alphabetical characters
            with_alphabetical_char = re.sub(r'[^a-zA-Z\s]', ' ', text_review)
            # print("Review", text_reviews_idx, "has HTML -- ", with_alphabetical_char)
            alphabetical_char_reviews.append(with_alphabetical_char)
        else:
            alphabetical_char_reviews.append(text_review)

    updated_df['with_alpha_chars_only'] = alphabetical_char_reviews
    return updated_df

In [36]:
# only_alpha_chars_df = remove_non_alphabetical_characters(no_contractions_df, 'without_contractions')

In [37]:
# only_alpha_chars_df

In [38]:
# print("with_alpha_chars_only:")
# generate_sample_reviews(only_alpha_chars_df, 'with_alpha_chars_only', 3)

### Remove extra spaces

In [39]:
def remove_extra_spaces(df:pd.DataFrame, col_name: str):
    """Remove extra spaces from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    single_spaced_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values
    # print(text_reviews)

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
        # Check if there are any extra spaces
            has_extra_space = bool(re.search(r' +', text_review))
            if has_extra_space == True:
                # print("Review", text_reviews_idx, "has extra space -- ", text_review)
                pass
            
            # Remove extra spaces
            single_spaced_review = re.sub(r' +', ' ', text_review)
            # print("Review", text_reviews_idx, "without extra space -- ", single_spaced_review)
            # print()
            
            single_spaced_reviews.append(single_spaced_review)
        else:
            single_spaced_reviews.append(text_review)

    updated_df['without_extra_space'] = single_spaced_reviews
    return updated_df

In [40]:
# no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'with_alpha_chars_only')

In [41]:
# no_extra_space_df

In [42]:
# print("without_extra_space:")
# generate_sample_reviews(no_extra_space_df, 'without_extra_space', 3)

### Remove the stop words 

In [43]:
def filter_stop_words(df:pd.DataFrame, col_name: str):
    """Filter stop words out from all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    without_stop_words_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    stop_words = set(stopwords.words("english"))

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]

        if isinstance(text_review, str):
            text_review_words = word_tokenize(text_review) 

        

            # print("Before stop word removal", text_reviews_idx, " -- ", text_review)

            filtered_review = []

            for text_review_words_idx in range(len(text_review_words)):
                text_review_word = text_review_words[text_review_words_idx]
                
                # Check if review word is a stop word
                if text_review_word in stop_words:
                    # print("  Stop word -- ", text_review_word)
                    pass
                else:
                    # print(text_review_word, " -- is NOT a stop word in review")
                    filtered_review.append(text_review_word)

            
            filtered_review = " ".join(filtered_review)
            # print("After stop word removal", text_reviews_idx, " -- ", filtered_review)
            # print()
            
            without_stop_words_reviews.append(filtered_review)
        else:
            without_stop_words_reviews.append(text_review)
        

    updated_df['without_stop_words'] = without_stop_words_reviews
    return updated_df

In [44]:
# no_stop_words_df = filter_stop_words(no_extra_space_df, 'without_extra_space')

In [45]:
# no_stop_words_df

In [46]:
# print("without_stop_words:")
# generate_sample_reviews(no_stop_words_df, 'without_stop_words', 3)

### Perform lemmatization  

- "A sentence with many words"
    - "words" -> word

In [47]:
def lemmentize_review(df:pd.DataFrame, col_name: str):
    """Lemmentize all reviews

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    df: `pd.DataFrame`
        An updated DataFrame with the extra spaces removed
    """
    
    lemmed_reviews = []
    updated_df = df.copy()
    text_reviews = df[col_name].values

    lem = WordNetLemmatizer()

    for text_reviews_idx in range(len(text_reviews)):
        text_review = text_reviews[text_reviews_idx]   
        if isinstance(text_review, str):     
            words_in_review = word_tokenize(text_review) 

            # print("Before lem update", text_reviews_idx, " -- ", text_review)
            # print("Lemmed words", words_in_review)
            

            lemmed_sentence = []

            # Split review into words
            for lemmed_words_idx in range(len(words_in_review)):
                word = words_in_review[lemmed_words_idx]
                
                apply_lemmatization = lem.lemmatize(word)
                # print(apply_lemmatization)
                
                lemmed_sentence.append(apply_lemmatization)
                filtered_review = " ".join(lemmed_sentence)
        
            # print("After lem update -- ", filtered_review)
            # print()

            lemmed_reviews.append(filtered_review)
        else:
            lemmed_reviews.append(text_review)

    updated_df['lemmed_reviews'] = lemmed_reviews
    return updated_df

In [48]:
# lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')

In [49]:
# lemmed_df

In [50]:
# print("without_unlemmed_words:")
# generate_sample_reviews(lemmed_df, 'lemmed_reviews', 3)

## Clean data function

In [51]:
def preprocess_data(df, col_name):
    """Perform lower case, remove HTML and URLs, remove contractions, remove non-alphabetical characters, remove extra spaces, remove stop words, and lemmatize"""

    print("original reviews:")
    # generate_sample_reviews(df, col_name, 3)

    reviews_lower_cased = convert_reviews_to_lower_case(df, col_name)
    print("reviews_lower_cased:")
    # generate_sample_reviews(reviews_lower_cased, 'lower_cased', 3)

    no_html_urls_df = remove_html_and_urls(reviews_lower_cased, 'lower_cased')
    print("without_html_urls:")
    # generate_sample_reviews(no_html_urls_df, 'without_html_urls', 3)

    no_contractions_df = remove_contractions(no_html_urls_df, 'without_html_urls')
    print("without_contractions:")
    # generate_sample_reviews(no_contractions_df, 'without_contractions', 3)

    only_alpha_chars_df = remove_non_alphabetical_characters(no_contractions_df, 'without_contractions')
    print("with_alpha_chars_only:")
    # generate_sample_reviews(only_alpha_chars_df, 'with_alpha_chars_only', 3)

    no_extra_space_df = remove_extra_spaces(only_alpha_chars_df, 'with_alpha_chars_only')
    print("without_extra_space:")
    # generate_sample_reviews(no_extra_space_df, 'without_extra_space', 3)

    no_stop_words_df = filter_stop_words(no_extra_space_df, 'without_extra_space')
    print("without_stop_words:")
    # generate_sample_reviews(no_stop_words_df, 'without_stop_words', 3)
    
    lemmed_df = lemmentize_review(no_stop_words_df, 'without_stop_words')
    print("without_unlemmed_words:")
    # print(lemmed_df["ternary_review_class"].unique())
    # generate_sample_reviews(lemmed_df, 'lemmed_reviews', 3)

    return lemmed_df


In [52]:
cleaned_reviews_df = preprocess_data(sampled_reviews_ratings_df, 'review_body')

original reviews:
reviews_lower_cased:
without_html_urls:
without_contractions:
with_alpha_chars_only:
without_extra_space:
without_stop_words:
without_unlemmed_words:


In [53]:
print(sampled_reviews_ratings_df['binary_review_class'].unique())
print(sampled_reviews_ratings_df['ternary_review_class'].unique())

print(cleaned_reviews_df['binary_review_class'].unique())
print(cleaned_reviews_df['ternary_review_class'].unique())

[ 2. nan  1.]
[2. 3. 1.]
[ 2. nan  1.]
[2. 3. 1.]


## Split data

In [54]:
def split_data(df, review_class):
    embeddings_df = df.dropna(subset=[review_class])
    # print(len(embeddings_df), embeddings_df['star_rating'].unique())

    specific_review_class = embeddings_df[review_class]
    # print(specific_review_class.unique())

    text = embeddings_df.loc[:, ['lemmed_reviews', 'star_rating', review_class]]
    # print(text)

    ### Train test split so I can have the same train
    X_train, X_test, y_train, y_test = train_test_split(text, specific_review_class, test_size=0.2, random_state=42)
    # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, X_test, y_train, y_test

In [55]:
cleaned_reviews_df.head(3)

Unnamed: 0,star_rating,review_body,negative_review_class,neutral_review_class,positive_review_class,binary_review_class,ternary_review_class,lower_cased,without_html_urls,without_contractions,with_alpha_chars_only,without_extra_space,without_stop_words,lemmed_reviews
1179381,1,not a real projector . It is a piece,2.0,,,2.0,2.0,not a real projector . it is a piece,not a real projector . it is a piece,not a real projector . it is a piece,not a real projector it is a piece,not a real projector it is a piece,real projector piece,real projector piece
1190334,1,"When it works, this laser printer gets the job...",2.0,,,2.0,2.0,"when it works, this laser printer gets the job...","when it works, this laser printer gets the job...","when it works, this laser printer gets the job...",when it works this laser printer gets the job...,when it works this laser printer gets the job ...,works laser printer gets job done unfortunatel...,work laser printer get job done unfortunately ...
1775668,1,I do not recommend this product at all as it r...,2.0,,,2.0,2.0,i do not recommend this product at all as it r...,i do not recommend this product at all as it r...,i do not recommend this product at all as it r...,i do not recommend this product at all as it r...,i do not recommend this product at all as it r...,recommend product resulted power supplies two ...,recommend product resulted power supply two op...


In [56]:
# print("Binary\n")
binary_X_train, binary_X_test, binary_y_train, binary_y_test = split_data(cleaned_reviews_df, 'binary_review_class')
# print("\nTernary")
ternary_X_train, ternary_X_test, ternary_y_train, ternary_y_test = split_data(cleaned_reviews_df, 'ternary_review_class')

In [57]:
ternary_X_train

Unnamed: 0,lemmed_reviews,star_rating,ternary_review_class
1015252,cheaply made ut broke could use,1,2.0
882960,fit well warm cheaply made velcro keep mitten ...,3,3.0
1034797,great eraser,4,1.0
1891130,refilis yet see refill available would preferr...,3,3.0
1976530,blob ink ever smooth writing every time qualit...,5,1.0
...,...,...,...
2128683,warned two tone died even pinter showed left o...,1,2.0
1912167,basket sit inside bottom basket take half spac...,2,2.0
877745,great picture quality print,5,1.0
1701849,pleased size laminator issue even though light...,3,3.0


In [160]:
ternary_y_train

1015252    2.0
882960     3.0
1034797    1.0
1891130    3.0
1976530    1.0
          ... 
2128683    2.0
1912167    2.0
877745     1.0
1701849    3.0
1088615    2.0
Name: ternary_review_class, Length: 800, dtype: float64

## Load pretrained model and train my model

In [58]:
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __init__(self, df: pd.DataFrame, col_name: str):
        self.df = df
        self.col_name = col_name

    def __iter__(self):
        """

        Parameters
        ----------
        df: `pd.DataFrame`
            The data
        
        col_name: `str`
            Column with reviews

        words_in_model: `list`
            Words in Word2Vec model
        
        """

        text_reviews = self.df[self.col_name].values

        for text_reviews_idx in range(len(text_reviews)):
            text_review = text_reviews[text_reviews_idx]
            # print(text_reviews_idx, "--", text_review)

            yield utils.simple_preprocess(text_review)
        

In [59]:
print("Binary Case")
binary_X_train_sentences = MyCorpus(binary_X_train, 'lemmed_reviews')
my_binary_X_train_model = gensim.models.Word2Vec(sentences=binary_X_train_sentences, vector_size=300, window=11, min_count=10)

# X test - get embeddings from my_binary_X_train_model -- vec_king = my_binary_X_train_model.wv['king']
# binary_X_test_sentences = MyCorpus(binary_X_test, 'lemmed_reviews')
# sentences = MyCorpus(sampled_reviews_ratings_df, 'review_body')
# print("\nSentences", binary_X_test_sentences)

print("\nTernary Case")
ternary_X_train_sentences = MyCorpus(ternary_X_train, 'lemmed_reviews')
my_ternary_X_train_model = gensim.models.Word2Vec(sentences=ternary_X_train_sentences, vector_size=300, window=11, min_count=10)

2024-02-09 15:11:10,605 : INFO : collecting all words and their counts
2024-02-09 15:11:10,606 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-02-09 15:11:10,652 : INFO : collected 3864 word types from a corpus of 20194 raw words and 640 sentences


Binary Case


2024-02-09 15:11:10,653 : INFO : Creating a fresh vocabulary
2024-02-09 15:11:10,657 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 442 unique words (11.44% of original 3864, drops 3422)', 'datetime': '2024-02-09T15:11:10.657063', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.6.3-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2024-02-09 15:11:10,658 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 12580 word corpus (62.30% of original 20194, drops 7614)', 'datetime': '2024-02-09T15:11:10.658107', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.6.3-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2024-02-09 15:11:10,661 : INFO : deleting the raw counts dictionary of 3864 items
2024-02-09 15:11:10,662 : INFO : sample=0.001 downsamples 101 most-common words
2024-02-09 15:11:10,662 


Ternary Case


2024-02-09 15:11:11,110 : INFO : EPOCH 2: training on 25696 raw words (13796 effective words) took 0.0s, 321155 effective words/s
2024-02-09 15:11:11,167 : INFO : EPOCH 3: training on 25696 raw words (13772 effective words) took 0.0s, 319368 effective words/s
2024-02-09 15:11:11,219 : INFO : EPOCH 4: training on 25696 raw words (13799 effective words) took 0.0s, 445750 effective words/s
2024-02-09 15:11:11,220 : INFO : Word2Vec lifecycle event {'msg': 'training on 128480 raw words (68944 effective words) took 0.3s, 247139 effective words/s', 'datetime': '2024-02-09T15:11:11.220213', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]', 'platform': 'macOS-13.6.3-x86_64-i386-64bit', 'event': 'train'}
2024-02-09 15:11:11,220 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=543, vector_size=300, alpha=0.025>', 'datetime': '2024-02-09T15:11:11.220671', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 1

### Similar scores

- [x] Write summary of differences between their model and my model
    - My model doesn't perform as well as the pretrained because the pretrained has trained on more data compared to my model. Thus, my model don't have sufficient training compared to the pretrained model.

In [60]:
result = pretrained_word_two_vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.7118193507194519)]


In [61]:
my_trained_binary_X_train_model = my_binary_X_train_model.wv
my_trained_binary_X_train_model

my_trained_ternary_X_train_model = my_ternary_X_train_model.wv
my_trained_ternary_X_train_model

<gensim.models.keyedvectors.KeyedVectors at 0x1bd4a5c10>

In [62]:
# TODO: Fix with proper exs
# my_result = my_trained_binary_X_train_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)
# print(my_result)

# 2. Word embeddings

- Word embeddings [vector representation of each word]
- TUTORIAL: [Word2Vec Model](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html)
    - Follow for the purpose of using the Gensimm library
---

## (a) Pretrained Word2Vec

- [x] Load the pretrained “word2vec-google-news-300” Word2Vec model
- [x] Extract word embeddings (per word)
- [ ] Check semantic similarities (ie: (1) King − Man + Woman = Queen, (2) excellent ∼ outstanding) of my own

## (b) My trained Word2Vec
- [x] Train a Word2Vec model using my own dataset
    - [ ] Set the embedding size to be 300
    - [ ] Set the window size to be 11
    - [ ] Consider a minimum word count of 10
- [ ] Check semantic similarities
    - [ ] What do you conclude from comparing vectors generated by yourself and the pretrained model? (see answer below)
    - [ ] Which of the Word2Vec models seems to encode semantic similarities between words better? (see answer below)

In [63]:
def word_embeddings(df: pd.DataFrame, col_name: str, model_to_use):
    """Extract word embeddings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    model_to_use:
        Either the pretrained model or my pretrained model

    Return
    ------
    mean_sentences_vectorized
    """

    sentence_vectorized = []
    mean_sentences_vectorized = []
    sentences = df[col_name].values

    for sentences_idx in range(len(sentences)):
        vectorized_words = []
        sentence = sentences[sentences_idx]
        # print("Sentence", sentences_idx)
        # print("Sentence", sentences_idx, "Pre-vectorized -- ", sentence)
        for word_idx, word in enumerate(sentence.split(" ")):
            if word in model_to_use.key_to_index:
                vector_of_word = model_to_use[word]
                vectorized_words.append(vector_of_word)
            else:
                vector_of_word = np.random.rand(model_to_use.vector_size)
                vectorized_words.append(vector_of_word)

        sentence_vectorized.append(vectorized_words)
        # print("Sentence", sentences_idx, "Post-vectorized \n")
        mean_of_sentence = np.mean(sentence_vectorized[sentences_idx], axis=0)
        mean_sentences_vectorized.append(mean_of_sentence)
    print(len(mean_sentences_vectorized))
    return mean_sentences_vectorized

In [64]:
# binary_X_train

### Extract embeddings for pretrained model

In [65]:
print("Binary Pretrained Train")
pretrained_binary_train_embeddings = word_embeddings(binary_X_train, 'lemmed_reviews', pretrained_word_two_vec_model)
pretrained_binary_train_embeddings = np.array(pretrained_binary_train_embeddings)

print("Binary Pretrained Test")
pretrained_binary_test_embeddings = word_embeddings(binary_X_test, 'lemmed_reviews', pretrained_word_two_vec_model)
pretrained_binary_test_embeddings = np.array(pretrained_binary_test_embeddings)

pretrained_binary_train_embeddings.shape, pretrained_binary_test_embeddings.shape

Binary Pretrained Train
640
Binary Pretrained Test
160


((640, 300), (160, 300))

In [66]:
print("Ternary Pretrained Train")
pretrained_ternary_train_embeddings = word_embeddings(ternary_X_train, 'lemmed_reviews', pretrained_word_two_vec_model)
pretrained_ternary_train_embeddings = np.array(pretrained_ternary_train_embeddings)

print("Ternary Pretrained Test")
pretrained_ternary_test_embeddings = word_embeddings(ternary_X_test, 'lemmed_reviews', pretrained_word_two_vec_model)
pretrained_ternary_test_embeddings = np.array(pretrained_ternary_test_embeddings)

pretrained_ternary_train_embeddings.shape, pretrained_ternary_test_embeddings.shape

Ternary Pretrained Train
800
Ternary Pretrained Test
200


((800, 300), (200, 300))

### Extract embeddings for my model

In [67]:
print("Binary My Model Train")
my_trained_binary_train_embeddings = word_embeddings(binary_X_train, 'lemmed_reviews', my_trained_binary_X_train_model)
my_trained_binary_train_embeddings = np.array(my_trained_binary_train_embeddings)

print("Binary My Model Test")
my_trained_binary_test_embeddings = word_embeddings(binary_X_test, 'lemmed_reviews', my_trained_binary_X_train_model)
my_trained_binary_test_embeddings = np.array(my_trained_binary_test_embeddings)

Binary My Model Train
640
Binary My Model Test
160


In [68]:
print("Ternary My Model Train")
my_ternary_train_embeddings = word_embeddings(ternary_X_train, 'lemmed_reviews', my_trained_ternary_X_train_model)
my_ternary_train_embeddings = np.array(my_ternary_train_embeddings)

print("Ternary My Model Test")
my_ternary_test_embeddings = word_embeddings(ternary_X_test, 'lemmed_reviews', my_trained_ternary_X_train_model)
my_ternary_test_embeddings = np.array(my_ternary_test_embeddings)

my_ternary_train_embeddings.shape, my_ternary_test_embeddings.shape

Ternary My Model Train
800
Ternary My Model Test
200


((800, 300), (200, 300))

In [69]:
def concat_word_embeddings(df: pd.DataFrame, col_name: str, model_to_use, about: str):
    """Extract word embeddings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    model_to_use:
        Either the pretrained model or my pretrained model

    aboout: `str`
        Specifics of model, classification, and train/test
        
    Return
    ------
    concatenated_vectors: list
        List of concatenated vectors for each review (first 10 Word2Vec vectors)
    """
    print("About:", about)
    mean_concatenated_vectors = []
    sentences = df[col_name].values

    for sentence in sentences:
        vectorized_words = []
        words = sentence.split(" ")[:10]  # Select the first 10 words
        for word in words:
            if word in model_to_use.key_to_index:
                vector_of_word = model_to_use[word]
            else:
                vector_of_word = np.random.rand(model_to_use.vector_size)
            vectorized_words.append(vector_of_word)
        
        concatenated_features = np.concatenate(vectorized_words, axis=0)
        # Ensure dimensionality of 300 for each sentence
        if concatenated_features.shape[0] < 300:
            # If concatenated_features has less than 300 dimensions, pad it with zeros
            concatenated_features = np.pad(concatenated_features, ((0, 300 - concatenated_features.shape[0]), (0, 0)), mode='constant')
        elif concatenated_features.shape[0] > 300:
            # If concatenated_features has more than 300 dimensions, truncate it
            concatenated_features = concatenated_features[:300 :]
            
        mean_concatenated_vectors.append(concatenated_features)

    mean_concatenated_vectors = np.array(mean_concatenated_vectors)
    print("   Concat embeddings shape --- ", mean_concatenated_vectors.shape)
    print()
    return mean_concatenated_vectors


In [70]:
print("Binary")
pretrained_binary_train_concat_embeddings = concat_word_embeddings(binary_X_train, 'lemmed_reviews', pretrained_word_two_vec_model, "Pretrained --- Binary --- Train")
pretrained_binary_test_concat_embeddings = concat_word_embeddings(binary_X_test, 'lemmed_reviews', pretrained_word_two_vec_model, "Pretrained --- Binary --- Test")

my_binary_train_concat_embeddings = concat_word_embeddings(binary_X_train, 'lemmed_reviews', my_trained_binary_X_train_model, "My trained --- Binary --- Train")
my_binary_test_concat_embeddings = concat_word_embeddings(binary_X_test, 'lemmed_reviews', my_trained_ternary_X_train_model, "My trained --- Binary --- Test")


print("--- Ternary ---")
pretrained_ternary_train_concat_embeddings = concat_word_embeddings(ternary_X_train, 'lemmed_reviews', pretrained_word_two_vec_model, "Pretrained --- Ternary --- Train")
pretrained_ternary_test_concat_embeddings = concat_word_embeddings(ternary_X_test, 'lemmed_reviews', pretrained_word_two_vec_model, "Pretrained --- Ternary --- Test")

my_ternary_train_concat_embeddings = concat_word_embeddings(ternary_X_train, 'lemmed_reviews', my_trained_ternary_X_train_model, "My trained --- Ternary --- Train")
my_ternary_test_concat_embeddings = concat_word_embeddings(ternary_X_test, 'lemmed_reviews', my_trained_ternary_X_train_model, "My trained --- Ternary --- Test")

Binary
About: Pretrained --- Binary --- Train
   Concat embeddings shape ---  (640, 300)

About: Pretrained --- Binary --- Test
   Concat embeddings shape ---  (160, 300)

About: My trained --- Binary --- Train
   Concat embeddings shape ---  (640, 300)

About: My trained --- Binary --- Test
   Concat embeddings shape ---  (160, 300)

--- Ternary ---
About: Pretrained --- Ternary --- Train
   Concat embeddings shape ---  (800, 300)

About: Pretrained --- Ternary --- Test
   Concat embeddings shape ---  (200, 300)

About: My trained --- Ternary --- Train
   Concat embeddings shape ---  (800, 300)

About: My trained --- Ternary --- Test
   Concat embeddings shape ---  (200, 300)



In [71]:
def tf_idf_feature_extraction(df: pd.DataFrame, col_name: str):
    """Extract the TF-IDF features from the reviews.

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    Return
    ------
    tf_idf_features:
        A matrix containing the TF-IDF features extracted
    """
    
    vectorizer = TfidfVectorizer()
    tf_idf_features = vectorizer.fit_transform(df[col_name])

    return tf_idf_features

In [72]:
cleaned_reviews_df = cleaned_reviews_df.dropna(subset=['binary_review_class'])

In [73]:
tf_idf_features = tf_idf_feature_extraction(cleaned_reviews_df, 'lemmed_reviews')

In [74]:
tf_idf_features[0]

<1x4399 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [75]:
binnary_reviews = cleaned_reviews_df['binary_review_class']
binnary_reviews.shape

(800,)

In [76]:
tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test = train_test_split(tf_idf_features, binnary_reviews, test_size=0.2, random_state=42)

# 3. Simple models

**GOAL:** Train simple models below, report the accuracy metric, and understand performances

---

- [ ] Train a perceptron and report accuracy on the testing split for
    - [ ] Pretrained average embeddings
    - [ ] My trained average embeddings
    - [ ] TF-IDF embeddings
- [ ] Train a support vector machine (SVM) and report accuracy on the testing split for
    - [ ] Pretrained average embeddings
    - [ ] My trained average embeddings
    - [ ] TF-IDF embeddings
- [ ] Compare
    - [ ] What do you conclude from comparing performances for the models trained using the three different feature types (TF-IDF, pretrained Word2Vec, your trained Word2Vec)? (see below)

In [77]:
def store_results(results_dict, w2v_type, method, classification, accuracy):
    """
    Store results in a dictionary.

    Parameters:
    results_dict (dict): Dictionary to store the results.
    w2v_type (str): Type of word2vec.
    method (str): Method used.
    classification (str): Type of classification.
    accuracy (float): Accuracy of the classification.

    Returns:
    pandas.DataFrame: DataFrame containing the stored results.
    """
    results_dict['W2V Type'].append(w2v_type)
    results_dict['Method'].append(method)
    results_dict['Classification'].append(classification)
    results_dict['Accuracy'].append(accuracy)

    return pd.DataFrame(results_dict)  # Return DataFrame with a single row

results_dict = {'W2V Type': [], 'Method': [], 'Classification': [], 'Accuracy': []}


In [78]:
def eval_accuracy(y_true, y_prediction):
    return sklearn.metrics.accuracy_score(y_true, y_prediction)

In [79]:
def train_eval_metric(y_train_true, y_train_predictions):
    accuracy = eval_accuracy(y_train_true, y_train_predictions)

    metrics_dict = {
        'Accuracy': accuracy,
    }

    return metrics_dict

def test_eval_metric(y_test_true, y_test_predictions):
    accuracy = eval_accuracy(y_test_true, y_test_predictions)
    
    metrics_dict = {
        'Accuracy': accuracy,
    }

    return metrics_dict

### Get accuracy for perceptron on pretrained model

In [80]:
def perceptron_model(X_train, X_test, y_train, y_test): 

    technique = Perceptron(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [81]:
tfidf_perceptron_train_metrics, tfidf_perceptron_test_metrics = perceptron_model(tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test)

In [82]:
tfidf_perceptron_train_metrics, tfidf_perceptron_test_metrics

({'Accuracy': 0.9953125}, {'Accuracy': 0.76875})

In [83]:
list(tfidf_perceptron_test_metrics.values())[0]

0.76875

In [84]:
# Update the dictionary with new results

results_df = store_results(results_dict, 'TF-IDF-train', 'Perceptron-avg', 'Binary', list(tfidf_perceptron_train_metrics.values())[0])
# results_df = store_results(results_dict, 'TF-IDF-test', 'Perceptron-avg', 'Binary', list(tfidf_perceptron_test_metrics.values())[0])
print(results_df)

       W2V Type          Method Classification  Accuracy
0  TF-IDF-train  Perceptron-avg         Binary  0.995313


In [85]:
# Pretrained model
perceptron_train_metrics, perceptron_test_metrics = perceptron_model(pretrained_binary_train_embeddings, pretrained_binary_test_embeddings, binary_y_train, binary_y_test)

In [86]:
perceptron_train_metrics, perceptron_test_metrics

({'Accuracy': 0.6921875}, {'Accuracy': 0.625})

In [87]:
# Update the dictionary with new results
results_df = store_results(results_dict, 'Pretrained-train', 'Perceptron-avg', 'Binary', list(perceptron_train_metrics.values())[0])
# results_df = store_results(results_dict, 'Pretrained-test', 'Perceptron-avg', 'Binary', list(perceptron_test_metrics.values())[0])
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187


### Get accuracy for perceptron on my model

In [88]:
my_trained_binary_train_embeddings.shape, my_trained_binary_test_embeddings.shape, binary_y_train.shape, binary_y_test.shape

((640, 300), (160, 300), (640,), (160,))

In [89]:
# My model
my_perceptron_train_metrics, my_perceptron_test_metrics = perceptron_model(my_trained_binary_train_embeddings, my_trained_binary_test_embeddings, binary_y_train, binary_y_test)

In [90]:
my_perceptron_train_metrics, my_perceptron_test_metrics

({'Accuracy': 0.61875}, {'Accuracy': 0.45625})

In [91]:
results_df = store_results(results_dict, 'My model-train', 'Perceptron-avg', 'Binary', list(my_perceptron_train_metrics.values())[0])
# results_df = store_results(results_dict, 'My model-test', 'Perceptron-avg', 'Binary', list(my_perceptron_test_metrics.values())[0])
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750


In [92]:
def svm_model(X_train, X_test, y_train, y_test): 

    technique = LinearSVC(tol=1e-3, random_state=0)
    technique.fit(X_train, y_train)
    y_train_predictions = technique.predict(X_train)
    y_test_predictions = technique.predict(X_test)


    train_metrics = train_eval_metric(y_train, y_train_predictions)
    test_metrics = test_eval_metric(y_test, y_test_predictions)

    return train_metrics, test_metrics


In [93]:
tfidf_svm_train_metrics, tfidf_svm_test_metrics = svm_model(tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test)



In [94]:
tfidf_svm_train_metrics, tfidf_svm_test_metrics

({'Accuracy': 0.9921875}, {'Accuracy': 0.80625})

In [95]:
results_df = store_results(results_dict, 'TF-IDF-train', 'SVM-avg', 'Binary', list(tfidf_svm_train_metrics.values())[0])
# results_df = store_results(results_dict, 'TF-IDF-test', 'SVM-avg', 'Binary', list(tfidf_svm_test_metrics.values())[0])
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188


### Get accuracy for svm on pretrained model

In [96]:
svm_train_metrics, svm_test_metrics = svm_model(pretrained_binary_train_embeddings, pretrained_binary_test_embeddings, binary_y_train, binary_y_test)



In [97]:
svm_train_metrics, svm_test_metrics

({'Accuracy': 0.8859375}, {'Accuracy': 0.76875})

In [98]:
results_df = store_results(results_dict, 'Pretrained-train', 'SVM-avg', 'Binary', list(svm_train_metrics.values())[0])
# results_df = store_results(results_dict, 'Pretrained-test', 'SVM-avg', 'Binary', list(svm_test_metrics.values())[0])
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188
4  Pretrained-train         SVM-avg         Binary  0.885938


### Get accuracy for svm on pretrained model

In [99]:
my_svm_train_metrics, my_svm_test_metrics = svm_model(my_trained_binary_train_embeddings, my_trained_binary_test_embeddings, binary_y_train, binary_y_test)



In [100]:
my_svm_train_metrics, my_svm_test_metrics

({'Accuracy': 0.8015625}, {'Accuracy': 0.55625})

In [101]:
results_df = store_results(results_dict, 'My model-train', 'SVM-avg', 'Binary', list(my_svm_train_metrics.values())[0])
# results_df = store_results(results_dict, 'My model-test', 'SVM-avg', 'Binary', list(my_svm_test_metrics.values())[0])
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188
4  Pretrained-train         SVM-avg         Binary  0.885938
5    My model-train         SVM-avg         Binary  0.801562


# 4. Feedforward Neural Networks (FFNN)

**GOAL:** Train a CNN for sentiment analysis classification, report the accuracy metric, and understand performances

---
- [ ] Train a feedforward multilayer perceptron (MLP) network
    - [ ] 2 hidden layers each with 50 and 10 nodes, respectively
    - [ ] Cross entropy loss
    - [ ] Decide other hyperparameters (ie: nonlinearity, #epochs, etc)
- [ ] TUTORIAL: [Pytorch Multi-Layer Perceptron, MNIST](https://www.kaggle.com/code/mishra1993/pytorch-multi-layer-perceptron-mnist/notebook) for image data

---

## (a) Average Embeddings
 
- [ ] Train a FFNN and report accuracy on the testing split for on average embeddings for
    - [ ] Pretrained 
    - [ ] My trained average embeddings
- [ ] Train a FFNN and report accuracy on the testing split for
    - [ ] Pretrained average embeddings
    - [ ] My trained average embeddings

## (b) Concatenate Embeddings

- [ ] Concatenate the first 10 Word2Vec vectors for each review

- [ ] Compare
    - [ ] What do you conclude from comparing performances for the models trained using the three different feature types (TF-IDF, pretrained Word2Vec, your trained Word2Vec)? (see below)

### Binary Classification

In [102]:
# binary_review_class = binary_embeddings_df['binary_review_class']
# binary_review_class.unique()

In [103]:
import torch

import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset

class Net(nn.Module):
    """Define the NN architecture"""

    def __init__(self, num_h1_nodes: int, num_h2_nodes: int, d: int, num_output_classes: int, dropout_rate: float):
        super(Net, self).__init__()

        # self.height = heightght = height
        # self.width = width
        # linear layer (1200 x 300 dot 300 x 50 -> 1200 x 50)
        self.fc1 = nn.Linear(d, num_h1_nodes)
        # linear layer (1200 x 50 dot 50 x 10 -> 1200 x 10)
        self.fc2 = nn.Linear(num_h1_nodes, num_h2_nodes)
        # linear layer (1200 x 50 dot 50 x 10 -> 1200 x 10)
        self.fc3 = nn.Linear(num_h2_nodes, num_output_classes) # change to 3 for ternery
        # dropout to prevent overfitting
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, x):
        
        # add hidden layer, with relu activation function, dropout, relu, dropout, output
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)

        return x
    
    def train_network(self, number_of_epochs: int, optimizer, criterion_function, train_loader):
        # set initial "min" to infinity
        valid_loss_min = np.Inf

        for epoch in range(number_of_epochs):
            train_loss = 0.0
            

            ###################
            # train the model #
            ###################
            self.train() # prep model for training
            for data, target in train_loader:
                # clear the gradients of all optimized variables
                optimizer.zero_grad()
                # forward pass to compute predictions, loss, backward pass to compute gradient wrt model params
                output = self(data)
                loss = criterion_function(output, target)
                loss.backward()
                optimizer.step()
                # update running training loss
                train_loss += loss.item() * data.size(0)
            
            # print training statistics 
            # calculate average loss over an epoch
            train_loss = train_loss / len(train_loader.dataset)
            # print("train_loss:", train_loss)
            
            # print('Epoch: {} \tTraining Loss: {:.6f}'.format(
            #     epoch+1, 
            #     train_loss,
            #     ))
            
            # # save model if validation loss has decreased
            # if train_loss <= valid_loss_min:
            #     print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            #     valid_loss_min,
            #     train_loss))
            #     torch.save(self.state_dict(), 'nn_model.pt')
            #     valid_loss_min = train_loss

    def predict(self, data_loader):
        predictions = []
        ground_truth = []

        for i, (inputs, targets) in enumerate(data_loader):
            outputs = self(inputs)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())
            ground_truth.extend(targets.cpu().numpy())

        # Convert predictions and ground truth lists to numpy arrays
        predictions = np.array(predictions)
        ground_truth = np.array(ground_truth)

        # Calculate accuracy
        accuracy = (predictions == ground_truth).mean()
        print("Accuracy:", accuracy)

        return accuracy

In [104]:
N_binary_embeddings, d_binary_embeddings = my_trained_binary_train_embeddings.shape
print(N_binary_embeddings, d_binary_embeddings)

640 300


In [105]:
num_h1_nodes = 50
num_h2_nodes = 10
num_output_classes = 2
dropout_rate = 0.2

net_model = Net(num_h1_nodes, num_h2_nodes, d_binary_embeddings, num_output_classes, dropout_rate)
print(net_model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [106]:
def train_binary_model(binary_train_embeddings, binary_train_y):
    binary_train_y = binary_train_y.replace(2, 0)
    data_tensor = torch.tensor(binary_train_embeddings, dtype=torch.float32)
    target_tensor = torch.tensor(binary_train_y.values, dtype=torch.long)
    print(data_tensor.size(), target_tensor.size())
    dataset = TensorDataset(data_tensor, target_tensor)
    
    print(len(dataset))

    # how many samples per batch to load
    batch_size = 64

    # as a positive integer will turn on multi-process data loading with the specified number of loader worker processes; otherwise, single-process data loading
    num_workers = 0

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    number_of_epochs = 25
    # number_of_epochs = 50
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net_model.parameters(), lr=0.01)
    output_of_model = net_model.train_network(number_of_epochs, optimizer, criterion, train_loader)
    
    return output_of_model

In [107]:
def predict_binary_model(binary_test_embeddings, binary_test_y):
    binary_test_y = binary_test_y.replace(2, 0)
    target_array = binary_test_y.values
    data_tensor = torch.tensor(binary_test_embeddings, dtype=torch.float32)
    # print(len(data_tensor))

    # Create a PyTorch tensor from the NumPy array
    target_tensor = torch.tensor(target_array, dtype=torch.long)  # Assuming target is of type long/int

    # Create a TensorDataset
    dataset = TensorDataset(data_tensor, target_tensor)

    # Define batch size
    batch_size = 1

    # Define number of DataLoader workers
    num_workers = 0  # Set this to a positive integer to enable multi-process data loading

    # Create DataLoader
    test_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    # Assuming net_model is an instance of your model
    predictions = net_model.predict(test_loader)
    # predictions = np.array(predictions)
    # predictions_flat = predictions.flatten()

    return predictions

In [108]:
train_binary_model(pretrained_binary_train_embeddings, binary_y_train)
acc_avg_pretrained_binary = predict_binary_model(pretrained_binary_test_embeddings, binary_y_test)

torch.Size([640, 300]) torch.Size([640])
640
Accuracy: 0.54375


In [109]:
results_df = store_results(results_dict, 'Pretrained', 'FFN-avg', 'Binary', acc_avg_pretrained_binary)
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188
4  Pretrained-train         SVM-avg         Binary  0.885938
5    My model-train         SVM-avg         Binary  0.801562
6        Pretrained         FFN-avg         Binary  0.543750


In [110]:
train_binary_model(my_trained_binary_train_embeddings, binary_y_train)
acc_avg_my_binary = predict_binary_model(my_trained_binary_test_embeddings, binary_y_test)
results_df = store_results(results_dict, 'My model', 'FFN-avg', 'Binary', acc_avg_my_binary)
print(results_df)

torch.Size([640, 300]) torch.Size([640])
640
Accuracy: 0.525
           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188
4  Pretrained-train         SVM-avg         Binary  0.885938
5    My model-train         SVM-avg         Binary  0.801562
6        Pretrained         FFN-avg         Binary  0.543750
7          My model         FFN-avg         Binary  0.525000


In [111]:
train_binary_model(pretrained_binary_train_concat_embeddings, binary_y_train)
acc_concat_pretrained_binary = predict_binary_model(pretrained_binary_test_concat_embeddings, binary_y_test)

torch.Size([640, 300]) torch.Size([640])
640
Accuracy: 0.575


In [112]:
results_df = store_results(results_dict, 'Pretrained', 'FFN-concat', 'Binary', acc_concat_pretrained_binary)
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188
4  Pretrained-train         SVM-avg         Binary  0.885938
5    My model-train         SVM-avg         Binary  0.801562
6        Pretrained         FFN-avg         Binary  0.543750
7          My model         FFN-avg         Binary  0.525000
8        Pretrained      FFN-concat         Binary  0.575000


In [113]:
train_binary_model(my_binary_train_concat_embeddings, binary_y_train)
acc_concat_my_binary = predict_binary_model(my_binary_test_concat_embeddings, binary_y_test)

torch.Size([640, 300]) torch.Size([640])
640
Accuracy: 0.55


In [114]:
results_df = store_results(results_dict, 'My model', 'FFN-concat', 'Binary', acc_concat_my_binary)
print(results_df)

           W2V Type          Method Classification  Accuracy
0      TF-IDF-train  Perceptron-avg         Binary  0.995313
1  Pretrained-train  Perceptron-avg         Binary  0.692187
2    My model-train  Perceptron-avg         Binary  0.618750
3      TF-IDF-train         SVM-avg         Binary  0.992188
4  Pretrained-train         SVM-avg         Binary  0.885938
5    My model-train         SVM-avg         Binary  0.801562
6        Pretrained         FFN-avg         Binary  0.543750
7          My model         FFN-avg         Binary  0.525000
8        Pretrained      FFN-concat         Binary  0.575000
9          My model      FFN-concat         Binary  0.550000


### Ternary Classification

In [115]:
# N_ternary_embeddings, d_ternary_embeddings = my_ternary_train_embeddings.shape

In [116]:
num_h1_nodes = 50
num_h2_nodes = 10
num_output_classes = 3
dropout_rate = 0.2

net_model = Net(num_h1_nodes, num_h2_nodes, d_binary_embeddings, num_output_classes, dropout_rate)
print(net_model)

Net(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [117]:
def train_ternary_model(ternary_train_embeddings, ternary_train_y):
    ternary_train_y = ternary_train_y.replace(2, 0)
    ternary_train_y = ternary_train_y.replace(3, 2)
    data_tensor = torch.tensor(ternary_train_embeddings, dtype=torch.float32)
    target_tensor = torch.tensor(ternary_train_y.values, dtype=torch.long)
    dataset = TensorDataset(data_tensor, target_tensor)

    # how many samples per batch to load
    batch_size = 64

    # as a positive integer will turn on multi-process data loading with the specified number of loader worker processes; otherwise, single-process data loading
    num_workers = 0

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    number_of_epochs = 3
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net_model.parameters(), lr=0.01)
    output_of_model = net_model.train_network(number_of_epochs, optimizer, criterion, train_loader)
    
    return output_of_model

In [118]:
def predict_ternary_model(ternary_test_embeddings, ternary_test_y):
    ternary_test_y = ternary_test_y.replace(2, 0)
    ternary_test_y = ternary_test_y.replace(3, 2)
    target_array = ternary_test_y.values
    data_tensor = torch.tensor(ternary_test_embeddings, dtype=torch.float32)

    # Create a PyTorch tensor from the NumPy array
    target_tensor = torch.tensor(target_array, dtype=torch.long)  # Assuming target is of type long/int

    # Create a TensorDataset
    dataset = TensorDataset(data_tensor, target_tensor)

    # Define batch size
    batch_size = 1

    # Define number of DataLoader workers
    num_workers = 0  # Set this to a positive integer to enable multi-process data loading

    # Create DataLoader
    test_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    # Assuming net_model is an instance of your model
    predictions = net_model.predict(test_loader)
    # predictions = np.array(predictions)
    # predictions_flat = predictions.flatten()

    return predictions

In [119]:
train_ternary_model(pretrained_ternary_train_embeddings, ternary_y_train)
acc_avg_pretrained_ternary = predict_ternary_model(pretrained_ternary_test_embeddings, ternary_y_test)

Accuracy: 0.185


In [120]:
results_df = store_results(results_dict, 'Pretrained', 'FFN-avg', 'Ternary', acc_avg_pretrained_ternary)
print(results_df)

            W2V Type          Method Classification  Accuracy
0       TF-IDF-train  Perceptron-avg         Binary  0.995313
1   Pretrained-train  Perceptron-avg         Binary  0.692187
2     My model-train  Perceptron-avg         Binary  0.618750
3       TF-IDF-train         SVM-avg         Binary  0.992188
4   Pretrained-train         SVM-avg         Binary  0.885938
5     My model-train         SVM-avg         Binary  0.801562
6         Pretrained         FFN-avg         Binary  0.543750
7           My model         FFN-avg         Binary  0.525000
8         Pretrained      FFN-concat         Binary  0.575000
9           My model      FFN-concat         Binary  0.550000
10        Pretrained         FFN-avg        Ternary  0.185000


In [121]:
train_ternary_model(my_ternary_train_embeddings, ternary_y_train)
acc_avg_my_ternary = predict_ternary_model(my_ternary_test_embeddings, ternary_y_test)

Accuracy: 0.365


In [122]:
results_df = store_results(results_dict, 'My model', 'FFN-avg', 'Ternary', acc_avg_my_ternary)
print(results_df)

            W2V Type          Method Classification  Accuracy
0       TF-IDF-train  Perceptron-avg         Binary  0.995313
1   Pretrained-train  Perceptron-avg         Binary  0.692187
2     My model-train  Perceptron-avg         Binary  0.618750
3       TF-IDF-train         SVM-avg         Binary  0.992188
4   Pretrained-train         SVM-avg         Binary  0.885938
5     My model-train         SVM-avg         Binary  0.801562
6         Pretrained         FFN-avg         Binary  0.543750
7           My model         FFN-avg         Binary  0.525000
8         Pretrained      FFN-concat         Binary  0.575000
9           My model      FFN-concat         Binary  0.550000
10        Pretrained         FFN-avg        Ternary  0.185000
11          My model         FFN-avg        Ternary  0.365000


In [123]:
train_ternary_model(pretrained_ternary_train_concat_embeddings, ternary_y_train)
acc_concat_pretrained_ternary = predict_ternary_model(pretrained_ternary_test_concat_embeddings, ternary_y_test)

Accuracy: 0.39


In [124]:
results_df = store_results(results_dict, 'Pretrained', 'FFN-concat', 'Ternary', acc_concat_pretrained_ternary)
print(results_df)

            W2V Type          Method Classification  Accuracy
0       TF-IDF-train  Perceptron-avg         Binary  0.995313
1   Pretrained-train  Perceptron-avg         Binary  0.692187
2     My model-train  Perceptron-avg         Binary  0.618750
3       TF-IDF-train         SVM-avg         Binary  0.992188
4   Pretrained-train         SVM-avg         Binary  0.885938
5     My model-train         SVM-avg         Binary  0.801562
6         Pretrained         FFN-avg         Binary  0.543750
7           My model         FFN-avg         Binary  0.525000
8         Pretrained      FFN-concat         Binary  0.575000
9           My model      FFN-concat         Binary  0.550000
10        Pretrained         FFN-avg        Ternary  0.185000
11          My model         FFN-avg        Ternary  0.365000
12        Pretrained      FFN-concat        Ternary  0.390000


In [125]:
train_ternary_model(my_ternary_train_concat_embeddings, ternary_y_train)
acc_concat_my_ternary = predict_ternary_model(my_ternary_test_concat_embeddings, ternary_y_test)

Accuracy: 0.39


In [126]:
results_df = store_results(results_dict, 'My model', 'FFN-concat', 'Ternary', acc_concat_my_ternary)
print(results_df)

            W2V Type          Method Classification  Accuracy
0       TF-IDF-train  Perceptron-avg         Binary  0.995313
1   Pretrained-train  Perceptron-avg         Binary  0.692187
2     My model-train  Perceptron-avg         Binary  0.618750
3       TF-IDF-train         SVM-avg         Binary  0.992188
4   Pretrained-train         SVM-avg         Binary  0.885938
5     My model-train         SVM-avg         Binary  0.801562
6         Pretrained         FFN-avg         Binary  0.543750
7           My model         FFN-avg         Binary  0.525000
8         Pretrained      FFN-concat         Binary  0.575000
9           My model      FFN-concat         Binary  0.550000
10        Pretrained         FFN-avg        Ternary  0.185000
11          My model         FFN-avg        Ternary  0.365000
12        Pretrained      FFN-concat        Ternary  0.390000
13          My model      FFN-concat        Ternary  0.390000


# 5. Convolutional Neural Networks (CNN)

**GOAL:** Train a CNN for sentiment analysis classification

---

- [ ] 2 layer CNN with output channel sizes of 50 and 10, respectively
- [ ] Limit each review to 50 words
    - [ ] If more, truncate to 50
    - [ ] If less pad with 0s to make 50
- [ ] Use cross entropy
- [ ] Select hyperparameters (ie: nonlinearity, #epochs, etc.) of my choosing
- [ ] Train for binary classification
- [ ] Train for ternary classification
- [ ] Report for testing split

## Condition word embeddings

In [187]:
def condition_word_embeddings(df: pd.DataFrame, col_name: str, max_sentence_length: int, model_to_use, about):
    """Extract word embeddings

    Parameters
    ----------
    df: `pd.DataFrame`
        The data
    
    col_name: `str`
        Column with reviews

    model_to_use:
        Either the pretrained model or my pretrained model

    Return
    ------
    conditioned_sequences
    """
    print("About: ", about)
    sentence_vectorized = []
    mean_sentences_vectorized = []
    concatenated_features = []
    sentences = df[col_name].values

    for sentences_idx in range(len(sentences)):
        vectorized_words = []
        sentence = sentences[sentences_idx]
        # print("Sentence", sentences_idx, sentence)
        words = sentence.split(" ")
        # print("Before -- ", len(words))
        if len(words) > max_sentence_length:
            words = words[:max_sentence_length]
        elif len(words) < max_sentence_length:
            words += [''] * (max_sentence_length - len(words))

        # print("After -- ", len(words), words)

        for word in words:
            if word in model_to_use.key_to_index:
                vector_of_word = model_to_use[word]
                vectorized_words.append(vector_of_word)
            else:
                vector_of_word = np.random.rand(model_to_use.vector_size)
                vectorized_words.append(vector_of_word)

        sentence_vectorized.append(vectorized_words)
    # print(sentence_vectorized)
        # print()
    
        
    tensors = []
    for words in sentence_vectorized:
        tensors.append(torch.tensor(words))
    padded_sequences = pad_sequence(tensors, batch_first=True, padding_value=0)
    print("Embeddings shape: ", padded_sequences.shape)
    # padded_sequences_updated = np.array(padded_sequences)
    # print("Embeddings shape: ", padded_sequences_updated.shape)
    print()
    return padded_sequences
    # return sentence_vectorized

In [188]:
print("--- Binary ---")
pretrained_binary_train_50_embeddings = condition_word_embeddings(binary_X_train, 'lemmed_reviews', 50, pretrained_word_two_vec_model, "Pretrained --- Binary --- Train")
pretrained_binary_test_50_embeddings = condition_word_embeddings(binary_X_test, 'lemmed_reviews', 50, pretrained_word_two_vec_model, "Pretrained --- Binary --- Test")

my_trained_binary_train_50_embeddings = condition_word_embeddings(binary_X_train, 'lemmed_reviews', 50, my_trained_binary_X_train_model, "My trained --- Binary --- Train")
my_trained_binary_test_50_embeddings = condition_word_embeddings(binary_X_test, 'lemmed_reviews', 50, my_trained_binary_X_train_model, "My trained --- Binary --- Test")


print("--- Ternary ---")
pretrained_ternary_train_50_embeddings = condition_word_embeddings(ternary_X_train, 'lemmed_reviews', 50, pretrained_word_two_vec_model, "Pretrained --- Ternary --- Train")
pretrained_ternary_test_50_embeddings = condition_word_embeddings(ternary_X_test, 'lemmed_reviews', 50, pretrained_word_two_vec_model, "Pretrained --- Ternary --- Test")

my_trained_ternary_train_50_embeddings = condition_word_embeddings(ternary_X_train, 'lemmed_reviews', 50, my_trained_ternary_X_train_model, "My trained --- Ternary --- Train")
my_trained_ternary_test_50_embeddings = condition_word_embeddings(ternary_X_test, 'lemmed_reviews', 50, my_trained_ternary_X_train_model, "My trained --- Ternary --- Test")


--- Binary ---
About:  Pretrained --- Binary --- Train
Embeddings shape:  torch.Size([640, 50, 300])

About:  Pretrained --- Binary --- Test
Embeddings shape:  torch.Size([160, 50, 300])

About:  My trained --- Binary --- Train
Embeddings shape:  torch.Size([640, 50, 300])

About:  My trained --- Binary --- Test
Embeddings shape:  torch.Size([160, 50, 300])

--- Ternary ---
About:  Pretrained --- Ternary --- Train
Embeddings shape:  torch.Size([800, 50, 300])

About:  Pretrained --- Ternary --- Test
Embeddings shape:  torch.Size([200, 50, 300])

About:  My trained --- Ternary --- Train
Embeddings shape:  torch.Size([800, 50, 300])

About:  My trained --- Ternary --- Test
Embeddings shape:  torch.Size([200, 50, 300])



In [195]:
def format_and_load_data(embeddings: list, y_true: list, classification_type: str, about: str):
    """
    
    """
    batch_size = 64
    
    """Mapping
    
    1 --- (positive) --- 0 
    2 --- (negative) --- 1 
    3 --- (neutral)  --- 2 
    
    """
    print("\nAbout: ", about, "\n")
    print("Original mappings --- ", y_true.unique())
    if classification_type == "binary":
        map_y_true_values = y_true.replace(1, 0)
        map_y_true_values = map_y_true_values.replace(2, 1)

    elif classification_type == "ternary":
        map_y_true_values = y_true.replace(1, 0)
        map_y_true_values = map_y_true_values.replace(2, 1)
        map_y_true_values = map_y_true_values.replace(3, 2)
        # print("Remappings for ", classification_type, "---", map_y_true_values.unique())

    else:
        print("Invalid classification type")

    print("Remappings        --- ", map_y_true_values.unique())
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
    y_true_tensor = torch.tensor(map_y_true_values.values, dtype=torch.long)
    dataset = TensorDataset(embeddings_tensor, y_true_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)    
    
    return loader

In [196]:
len(pretrained_ternary_train_50_embeddings)

800

In [197]:
# print("--- Binary ---")
pretrained_binary_train_loader = format_and_load_data(pretrained_binary_train_50_embeddings, binary_y_train, "binary", "Pretrained x Binary x Train")
pretrained_binary_test_loader = format_and_load_data(pretrained_binary_test_50_embeddings, binary_y_test, "binary", "Pretrained x Binary x Test")

my_trained_binary_train_loader = format_and_load_data(my_trained_binary_train_50_embeddings, binary_y_train, "binary", "My trained x Binary x Train")
my_trained_binary_test_loader = format_and_load_data(my_trained_binary_test_50_embeddings, binary_y_test, "binary", "My trained x Binary x Test")

print()
print("--- Ternary ---")
pretrained_ternary_train_loader = format_and_load_data(pretrained_ternary_train_50_embeddings, ternary_y_train, "ternary", "Pretrained x Ternary x Train")
pretrained_ternary_test_loader = format_and_load_data(pretrained_ternary_test_50_embeddings, ternary_y_test, "ternary", "Pretrained x Ternary x Test")

my_trained_ternary_train_loader = format_and_load_data(my_trained_ternary_train_50_embeddings, ternary_y_train, "ternary", "My trained x Ternary x Test")
my_trained_ternary_test_loader = format_and_load_data(my_trained_ternary_test_50_embeddings, ternary_y_test, "ternary", "My trained x Ternary x Test")


About:  Pretrained x Binary x Train 

Original mappings ---  [2. 1.]
Remappings        ---  [1. 0.]

About:  Pretrained x Binary x Test 

Original mappings ---  [1. 2.]
Remappings        ---  [0. 1.]

About:  My trained x Binary x Train 

Original mappings ---  [2. 1.]
Remappings        ---  [1. 0.]

About:  My trained x Binary x Test 

Original mappings ---  [1. 2.]
Remappings        ---  [0. 1.]

--- Ternary ---

About:  Pretrained x Ternary x Train 

Original mappings ---  [2. 3. 1.]
Remappings        ---  [1. 2. 0.]

About:  Pretrained x Ternary x Test 

Original mappings ---  [3. 1. 2.]
Remappings        ---  [2. 0. 1.]

About:  My trained x Ternary x Test 

Original mappings ---  [2. 3. 1.]
Remappings        ---  [1. 2. 0.]

About:  My trained x Ternary x Test 

Original mappings ---  [3. 1. 2.]
Remappings        ---  [2. 0. 1.]


  embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)


In [132]:
# num_channels = 300
# embedding_size = 300
# output_channels1 = 50
# output_channels2 = 10
# kernel_size1 = 3
# kernel_size2 = 3 
# num_classes = 2

# nn.Conv1d(16, 33, 3, stride=2)
# nn.Conv1d(640, 50, 300) --> (num_channels=300, output_channels1=50, kernel_size1=3)
# (num_channels=300, output_channels1=50, kernel_size1=3) --> (num_channels=50, output_channels1=10, kernel_size2=3)

# nn.Linear(2960, num_classes)

In [198]:
class CNN_Net(nn.Module):
    def __init__(self, embedding_size, num_channels, output_channels1, output_channels2, kernel_size1, kernel_size2, num_classes):
        super(CNN_Net, self).__init__()
        
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=num_channels, out_channels=output_channels1, kernel_size=kernel_size1),
            nn.ReLU(),
            nn.Conv1d(in_channels=output_channels1, out_channels=output_channels2, kernel_size=kernel_size2),
            nn.ReLU())

        # to calc 460, refer to docs and write my own utils func
        self.fc_layers = nn.Sequential(
            nn.Linear(460, num_classes),
            nn.ReLU()
        )

    def forward(self, x):
        # print("Before permute ---", x.shape)
        x = x.permute(0, 2, 1) 
        # print("After permute ---", x.shape)
        x = self.conv_layers(x)
        # print("After Conv layers ---", x.shape)
        # print("1st", x.shape)
        # Flatten the output of the convolutional layers
        x = x.view(x.size(0), -1)
        # print("After Flatten ---", x.shape)
        x = self.fc_layers(x)
        # print("After Fully connected layers ---", x.shape)
        # print()
        return x

    def train_network(self, number_of_epochs: int, optimizer, criterion_function, train_loader, test_loader):
        for epoch in range(number_of_epochs):
            print("Epoch --- ", epoch)
            correct_train = 0.0
            total_train_loss = 0.0
            
            ###################
            # train the model #
            ###################
            self.train() # prep model for training
    
            for data, target in train_loader:
                # clear the gradients of all optimized variables
                optimizer.zero_grad()

                # predictions
                output = self(data)
                # print("Output", output.shape)
                
                # target = target.squeeze()
                
                loss = criterion_function(output, target)
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item() * data.size(0)
                _, predicted = torch.max(output.data, 1)
                
                correct_train += (predicted == target).sum().item()
               
            avg_total_train_loss = total_train_loss / len(train_loader.dataset)
            print("Train avg total loss --- ", avg_total_train_loss)
        
            total_evaluation_loss = 0
            self.eval()
            with torch.no_grad():
                for data, target in test_loader:
                    output = self(data)
                    # target = target.squeeze()
                    loss = criterion_function(output, target)
                    total_evaluation_loss += loss.item() * data.size(0)
                    
            avg_total_evaluation_loss = total_evaluation_loss / len(train_loader.dataset)
            print("Evaluation avg total loss --- ", avg_total_evaluation_loss)
            
            train_accuracy = correct_train / len(train_loader.dataset)
            print(f'Epoch: {epoch+1} \tTraining Loss: {loss:.6f} \tTraining Accuracy: {train_accuracy * 100:.2f}%')
            print()
        print('Finished Training')

In [199]:
N, N_words, embedding_size = pretrained_binary_train_50_embeddings.shape
N, N_words, embedding_size

(640, 50, 300)

In [200]:
# channels: for each word has 300 dims, so 300 features
num_channels = 300

output_channels1 = 50
output_channels2 = 10
kernel_size1 = 3
kernel_size2 = 3 
num_classes = 2  # Number of output classes, adjust as needed

cnn_net_model = CNN_Net(embedding_size, num_channels, output_channels1, output_channels2, kernel_size1, kernel_size2, num_classes)
print(cnn_net_model)

CNN_Net(
  (conv_layers): Sequential(
    (0): Conv1d(300, 50, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): Conv1d(50, 10, kernel_size=(3,), stride=(1,))
    (3): ReLU()
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=460, out_features=2, bias=True)
    (1): ReLU()
  )
)


In [201]:
number_of_epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(cnn_net_model.parameters(), lr=1, momentum=0.9)
# print("2-train_binary_cnn_model")
output_of_model = cnn_net_model.train_network(number_of_epochs, optimizer, criterion, pretrained_binary_train_loader, pretrained_binary_test_loader)

2-train_binary_cnn_model
Epoch ---  0
Train avg total loss ---  0.6931009232997895
Evaluation avg total loss ---  0.1732868254184723
Epoch: 1 	Training Loss: 0.693147 	Training Accuracy: 50.00%

Epoch ---  1
Train avg total loss ---  0.6931473016738892
Evaluation avg total loss ---  0.1732868254184723
Epoch: 2 	Training Loss: 0.693147 	Training Accuracy: 50.00%

Epoch ---  2
Train avg total loss ---  0.6931473016738892
Evaluation avg total loss ---  0.1732868254184723
Epoch: 3 	Training Loss: 0.693147 	Training Accuracy: 50.00%

Epoch ---  3
Train avg total loss ---  0.6931473016738892
Evaluation avg total loss ---  0.1732868254184723
Epoch: 4 	Training Loss: 0.693147 	Training Accuracy: 50.00%

Epoch ---  4
Train avg total loss ---  0.6931473016738892
Evaluation avg total loss ---  0.1732868254184723
Epoch: 5 	Training Loss: 0.693147 	Training Accuracy: 50.00%

Epoch ---  5
Train avg total loss ---  0.6931473016738892
Evaluation avg total loss ---  0.1732868254184723
Epoch: 6 	Trainin

#### STOP HERE --- IGNORE BELOW #####

In [None]:
# results_df = store_results(results_dict, 'Pretrained', 'CNN-50', 'Binary', acc_50_pretrained_binary)
# print(results_df)

In [None]:
# train_binary_cnn_model(my_binary_train_embeddings, binary_y_train)
# acc_50_my_binary = predict_binary_cnn_model(my_binary_train_embeddings, binary_y_train)

In [None]:
# results_df = store_results(results_dict, 'My model', 'CNN-500', 'Binary', acc_50_my_binary)
# print(results_df)

### Ternary Classification

In [None]:
N_ternary_embeddings, d_ternary_embeddings = my_ternary_train_embeddings.shape

In [None]:
num_h1_nodes = 50
num_h2_nodes = 10
num_output_classes = 3
dropout_rate = 0.2

net_model = CNN_Net(num_h1_nodes, num_h2_nodes, d_binary_embeddings, num_output_classes, dropout_rate)
print(net_model)

In [None]:
def train_ternary_cnn_model(ternary_train_embeddings, ternary_train_y):
    ternary_train_y = ternary_train_y.replace(2, 0)
    ternary_train_y = ternary_train_y.replace(3, 2)
    data_tensor = torch.tensor(ternary_train_embeddings, dtype=torch.float32)
    target_tensor = torch.tensor(ternary_train_y.values, dtype=torch.long)
    dataset = TensorDataset(data_tensor, target_tensor)

    # how many samples per batch to load
    batch_size = 64

    # as a positive integer will turn on multi-process data loading with the specified number of loader worker processes; otherwise, single-process data loading
    num_workers = 0

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    number_of_epochs = 30
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net_model.parameters(), lr=0.01)
    output_of_model = cnn_net_model.train_network(number_of_epochs, optimizer, criterion, train_loader)
    
    return output_of_model

In [None]:
def predict_ternary_cnn_model(ternary_test_embeddings, ternary_test_y):
    ternary_test_y = ternary_test_y.replace(2, 0)
    ternary_test_y = ternary_test_y.replace(3, 2)
    target_array = ternary_test_y.values
    data_tensor = torch.tensor(ternary_test_embeddings, dtype=torch.float32)


    # Create a PyTorch tensor from the NumPy array
    target_tensor = torch.tensor(target_array, dtype=torch.long)  # Assuming target is of type long/int

    # Create a TensorDataset
    dataset = TensorDataset(data_tensor, target_tensor)

    # Define batch size
    batch_size = 1

    # Define number of DataLoader workers
    num_workers = 0  # Set this to a positive integer to enable multi-process data loading

    # Create DataLoader
    test_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

    # Assuming net_model is an instance of your model
    predictions = cnn_net_model.cnn_predict(test_loader)
    # predictions = np.array(predictions)
    # predictions_flat = predictions.flatten()

    return predictions

In [None]:
train_ternary_cnn_model(pretrained_ternary_train_embeddings, ternary_y_train)
# acc_50_pretrained_ternary = predict_ternary_cnn_model(pretrained_ternary_test_embeddings, ternary_y_test)

In [None]:
results_df = store_results(results_dict, 'Pretrained', 'CNN-50', 'Ternary', acc_50_pretrained_ternary)
print(results_df)

In [None]:
train_ternary_cnn_model(my_ternary_train_embeddings, ternary_y_train)
acc_50_my_ternary = predict_ternary_cnn_model(my_ternary_test_embeddings, ternary_y_test)

In [None]:
results_df = store_results(results_dict, 'My model', 'CNN-50', 'Ternary', acc_50_my_ternary)
print(results_df)

In [None]:
# print(output.shape) # [batch_size = 64 , number_of_class (ternary = 3), (bin = 2) ]
#             print(output[0])