<center><h1>Project 2</h1></center>
<br>
<center><font size="5">Name - Spandan Patil</font></center>

In [None]:
"""
Dependencies version:

pandas - 2.2.3
numpy - 1.26.4
nltk - 3.9.1
bs4 - 0.0.2
gensim - 4.3.3
scikit-learn - 1.6.1
torch - 2.5.1+cu124
python - 3.12.8

"""

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt_tab')
import re
from bs4 import BeautifulSoup
import gensim
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Spandan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Spandan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Spandan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. Dataset Generation

In [None]:
# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

In [2]:
# I am reading the amazon kitchen products review dataset and storing it into a Dataframe object using Pandas

df = pd.read_csv('./amazon_reviews_us_Office_Products_v1_00.tsv', sep='\t', on_bad_lines="skip")

In [3]:
# Out of all the features in the dataset, I am combining the review_headlines with the review_body column as input feature and taking star_rating as my output label.
df["review_body"] = df["review_headline"] + " " + df["review_body"]
df = df[["review_body", "star_rating"]]

In [4]:
# Here I am converting all the values present in the star_rating columns into numeric using the in-built pandas function to_numeric, also by setting the errors parameter to 'coerce' all the values which are not able to be converted to numeric will be set to NaN. In our case these would be mainly the rows having the invalid date values. Hence, we can just drop all the NaN value rows present in the star_ratings column which are be generated using the dropna function our pandas dataframe. After the processing we can see that all the rows present in the dataset are having a star_ratings label from 1 to 5.
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
df = df.dropna(subset=['star_rating'])
df = df.dropna(subset=['review_body'])

In [5]:

# Here I am creating the sampled df, which consists of randomly sample 50,000 rows from each of the star rating from 1 to 5. I am also shuffling the sampled df at the end.
sampled_df = pd.DataFrame(columns=["review_body", "star_rating"])

r_state=34

for r in range(1, 6):
    r_sample = df[df["star_rating"] == r]
    r_sample = r_sample.sample(n=50000, random_state=r_state)
    sampled_df = pd.concat([sampled_df, r_sample], ignore_index=True)

sampled_df = sampled_df.sample(frac=1, random_state=r_state).reset_index(drop=True)

In [6]:
# I am freeing up the memory by removing the uwanted variable for better RAM usage.
del df
del r_sample

In [7]:
# As we can see here the sampled df have 50,000 row for each star rating from 1 to 5.
sampled_df["star_rating"].value_counts()

star_rating
5.0    50000
1.0    50000
4.0    50000
3.0    50000
2.0    50000
Name: count, dtype: int64

In [8]:
#This is the function used to create our sentiment label class - positive(1), negative(2), neutral(3).
def create_label(rating):
    if rating > 3:
        return 1
    elif rating == 3:
        return 3
    else:
        return 2

In [9]:
# I am creating the output label class by applying the create_label function to the star_rating column.
sampled_df["senti_label"] = sampled_df["star_rating"].apply(create_label)

sampled_df = sampled_df[["review_body", "senti_label"]]

In [10]:
# Here we can see we have our output label class created.
sampled_df.head()

Unnamed: 0,review_body,senti_label
0,Energizer ER-P512 NiMH Cordless Phone Battery ...,1
1,I'm very disappointed with this product The pa...,2
2,Just what I needed! Just what I needed! Hard t...,1
3,great! I like this lobby dustpan. It works gre...,1
4,This is NOT a high yield cartridge! Even thoug...,3


## 2. Word Embedding

### a)

In [None]:
# Here i have saved the google news 300 word embedding model for better performance.
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')
wv.save("gensim_model_w2v_gnews_300.model")

In [12]:
# Loading the download and saved google news 300 word embedding model.
wv = gensim.models.KeyedVectors.load("gensim_model_w2v_gnews_300.model")

In [13]:
# Checking the first example for google news model, check the similar score of great and worst. As we can see the similar score is very less.
similarity_score = wv.similarity("great", "worst")
print(f"Similarity Score of great and worst: {similarity_score}")


Similarity Score of great and worst: 0.22344698011875153


In [14]:
# We are check the expression him - male + female ~ her, but the google news model is not giving the correct answer.
similarity_score = wv.most_similar(positive=["him", "male"], negative=["female"], topn=1)
print(f"Most similar to him + female = male: {similarity_score}")


Most similar to him + female = male: [('me', 0.6085236072540283)]


### b)

In [None]:
# Here i am training my own word embedding model, using the review body column in the dataset, and saving it for better efficiency.

my_model = gensim.models.Word2Vec(sentences=sampled_df["review_body"].apply(lambda r: nltk.tokenize.word_tokenize(r)), vector_size=300, window=11, min_count=10)
my_model.save("gensim_model_w2v_my_model.model")

In [16]:
# Here i am loading my trained word embedding model, which i have created using review body column in the data. I am only loading the keyVectors instead of the whole model.
my_model = gensim.models.Word2Vec.load("gensim_model_w2v_my_model.model").wv

In [17]:
# Here i am checking the same example again, we can see that the similarity score is even lower than google news model, which is a better estimate.
similarity_score = my_model.similarity("great", "worst")
print(f"Similarity Score of great and worst: {similarity_score}")

Similarity Score of great and worst: 0.2037562131881714


In [18]:
# Here we can see that the expression him - male + female ~ her is giving correct output. 
similarity_score = my_model.most_similar(positive=["him", "female"], negative=["male"], topn=1)
print(f"Most similar to him + female = male: {similarity_score}")



Most similar to him + female = male: [('her', 0.6315240263938904)]


Comparing the two example of 1) calculating the similarity score between great and worst, 2) Checking the validity of expression him - male + femaler ~ her, we can conclude that the word embedding model which we have trains performs better than the pretrained google news 300 model.

## 3. Simple models

### Data cleaning and preprocessing

In [19]:

# This is the function i am using to clean the reviews and do some preprocessing, as i did in HW1, i am not performing lemmanization since it is reducing the accuracy in my case.
def clean_review(review):
    
    # This is the dictionary having all the commonly used contradictions in the lower case format. These include format with ' or without it. For example didn't and didnt both are included.
    contractions_dict = {
        "i'm": "i am",
        "you're": "you are",
        "he's": "he is",
        "he'll": "he will",
        "she'll": "she will",
        "we'll": "we will",
        "they'll": "they will",
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "hasn't": "has not",
        "haven't": "have not",
        "won't": "will not",
        "wouldn't": "would not",
        "can't": "cannot",
        "couldn't": "could not",
        "shouldn't": "should not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "mustn't": "must not",
        "needn't": "need not",
        "she's": "she is",
        "it's": "it is",
        "we're": "we are",
        "they're": "they are",
        "i've": "i have",
        "you've": "you have",
        "he's": "he has",
        "she's": "she has",
        "it's": "it has",
        "we've": "we have",
        "they've": "they have",
        "i'd": "i would",
        "you'd": "you would",
        "he'd": "he would",
        "she'd": "she would",
        "we'd": "we would",
        "they'd": "they would",
        "i'll": "i will",
        "you'll": "you will",
        "let's": "let us",
        "that's": "that is",
        "who's": "who is",
        "what's": "what is",
        "where's": "where is",
        "how's": "how is",
        "there's": "there is",
        "here's": "here is",
        "didnt": "did not",
        "cant": "cannot",
        "wont": "will not",
        "dont": "do not",
        "doesnt": "does not",
        "shouldnt": "should not",
        "wouldnt": "would not",
        "mustnt": "must not",
        "neednt": "need not",
        "letd": "let us"
    }

    # Here i am converting the review to string and then converting them to lowercase using in-built function 
    review = str(review).lower()
    # Here using the Beautiful Soup HTML.PARSER i am extracting only text data from the reviews discarding the html tag or elements.
    review = BeautifulSoup(review, "html.parser").get_text()
    # I am using a simple regex to remove all the URLs from the text.
    review = re.sub(r'http\S+|www\S+', '', review)
    # Here we are going through the contradiction dictionary and replacing each of the contradiction with their expanded form if they are found in the review. 
    for contraction, expanded in contractions_dict.items():
        review = review.replace(contraction, expanded)
    # Here i am removing all the non-alphabetic characters execpt for the spaces. This step needs to be done after the contradiction expansion as this will result in the ' getting removed also, we can cause issues when detecting the contradictions.
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    # Here i am coverting all the multiple consecutive spaces with single space using a regex.
    review = re.sub(r'\s+', ' ', review)
    # Here i am removing the leading and trailing spaces.
    review = review.strip()
    # I am spliting the review into a list of word based on single space as separator
    words = review.split()
    # I am filtering out all the stop words and only including the ones which are not present in our stopwords list.
    filtered_words = [word for word in words if word not in stop_words]
    # I am joining all the converted word using a single space.
    return ' '.join(filtered_words)


In [20]:
# Here i am applying the clean_review function to each review in the review_body column.
sampled_df["review_body"] = sampled_df["review_body"].apply(clean_review)

In [21]:
# Here i am tokenizing the cleaned reviews to prepare them for converting into word embeddings.
sampled_df['review_body'] = sampled_df['review_body'].apply(lambda r: word_tokenize(r))

In [22]:
# Here i am getting the word embedding for all the words present in the review and taking the mean of each of the dimension to form the sentence embedding.
def review_embedding(review, keyVec):
    w_vec = [keyVec[w] for w in review if w in keyVec]  # Get word vectors
    if len(w_vec) == 0:
        return np.zeros(300)  # Return zero vector if no words found
    return np.mean(w_vec, axis=0)  # Compute the mean of word vectors


In [23]:
# Here i am creating two copys of the sampled df to be used for performing embedding via google news 300 modela and my own trained model.
g_news_df = sampled_df.copy()
my_model_df = sampled_df.copy()

In [24]:
# Here i am performing the sentence emebeding on the review body column with the keyVectors of google news 300 model.
g_news_df["review_embedding"] = g_news_df["review_body"].apply(lambda review: review_embedding(review, wv))
g_news_df = g_news_df[["review_embedding", "senti_label"]] 

In [25]:
# Here i am performing the sentence emebeding on the review body column with the keyVectors of my own trained model.
my_model_df["review_embedding"] = my_model_df["review_body"].apply(lambda review: review_embedding(review, my_model))
my_model_df = my_model_df[["review_embedding", "senti_label"]] 

In [26]:
# Here i am creating the binary classification dataset with google new embeddings.
binary_g_news_df = g_news_df.copy()
binary_g_news_df = binary_g_news_df[binary_g_news_df["senti_label"] != 3] 

In [27]:
# Here i am creating the binary classification dataset with my model embeddings.
binary_my_model_df = my_model_df.copy()
binary_my_model_df = binary_my_model_df[binary_my_model_df["senti_label"] != 3]

In [28]:
# Here i am spliting the dataset into input label X and output label y for binary classification using google news embedding. Also i am converting the 300 dimension to columns, to fit better for the models.
b_g_news_X = pd.DataFrame(binary_g_news_df['review_embedding'].tolist(), columns=[f"d_{i}" for i in range(300)])
b_g_news_y = binary_g_news_df["senti_label"]

In [29]:
# Here i am spliting the dataset into input label X and output label y for binary classification using my model embedding. Also i am converting the 300 dimension to columns, to fit better for the models.
b_my_model_X = pd.DataFrame(binary_my_model_df['review_embedding'].tolist(), columns=[f"d_{i}" for i in range(300)])
b_my_model_y = binary_my_model_df["senti_label"]

In [30]:
# Here i am performing train test split for the binary classification using google news embeddings.
b_g_news_X_train, b_g_news_X_test, b_g_news_y_train, b_g_news_y_test = train_test_split(b_g_news_X, b_g_news_y, test_size=0.2)

In [31]:
# Here i am performing train test split for the binary classification using my model embeddings.
b_my_model_X_train, b_my_model_X_test, b_my_model_y_train, b_my_model_y_test = train_test_split(b_my_model_X, b_my_model_y, test_size=0.2)

### Perceptron Model

In [32]:
from sklearn.linear_model import Perceptron

#### Google News

In [33]:
# Here I am using percepton model with hyperparamter tuning of increasing the number of epoch to 10000 and enabling early stopping if the accuracy doesn't increase.
b_g_news_clf = Perceptron(max_iter=10000, early_stopping=True)
b_g_news_clf.fit(b_g_news_X_train, b_g_news_y_train)

b_g_news_y_pred = b_g_news_clf.predict(b_g_news_X_test)

b_g_news_accuracy_test = accuracy_score(b_g_news_y_test, b_g_news_y_pred)

perceptron_g_news_acc = b_g_news_accuracy_test
print(f'Perceptron Google News 300 Test Accuracy: {perceptron_g_news_acc:.4f}')


Perceptron Google News 300 Test Accuracy: 0.8220


#### My Model

In [34]:
# Here I am using percepton model with hyperparamter tuning of increasing the number of epoch to 10000 and enabling early stopping if the accuracy doesn't increase.
b_my_model_clf = Perceptron(max_iter=10000, early_stopping=True)
b_my_model_clf.fit(b_my_model_X_train, b_my_model_y_train)

b_my_model_y_pred = b_my_model_clf.predict(b_my_model_X_test)

b_my_model_accuracy_test = accuracy_score(b_my_model_y_test, b_my_model_y_pred)

perceptron_my_model_acc = b_my_model_accuracy_test

print(f'Perceptron My Model Test Accuracy: {perceptron_my_model_acc:.4f}')


Perceptron My Model Test Accuracy: 0.8330


For the Perceptron model the Test accuracy - TF-IDF: 0.8864.
Hence, we can say that for perceptron model TF-IDF word embedding worked the best.

### SVM Model

In [35]:
from sklearn.svm import LinearSVC

#### Google News

In [36]:
# Here I am using LinearSVC model with hyperparamter tuning of increasing the number of epoch to 10000. I am using LinearSVC here instead of SVC due to the large size of data. In the Sklearn documentation, it is given that using SVC may be impratical beyond the tens of thousands of samples, and LinearSVC is preferred for larger datasets.
b_g_news_clf = LinearSVC(max_iter=10000)
b_g_news_clf.fit(b_g_news_X_train, b_g_news_y_train)

b_g_news_y_pred = b_g_news_clf.predict(b_g_news_X_test)

b_g_news_accuracy_test = accuracy_score(b_g_news_y_test, b_g_news_y_pred)

svm_g_news_acc = b_g_news_accuracy_test

print(f'SVM Google News 300 Test Accuracy: {svm_g_news_acc:.4f}')

SVM Google News 300 Test Accuracy: 0.8454


#### My Model

In [37]:
# Here I am using LinearSVC model with hyperparamter tuning of increasing the number of epoch to 10000. I am using LinearSVC here instead of SVC due to the large size of data. In the Sklearn documentation, it is given that using SVC may be impratical beyond the tens of thousands of samples, and LinearSVC is preferred for larger datasets.
b_my_model_clf = LinearSVC(max_iter=10000)
b_my_model_clf.fit(b_my_model_X_train, b_my_model_y_train)

b_my_model_y_pred = b_my_model_clf.predict(b_my_model_X_test)

b_my_model_accuracy_test = accuracy_score(b_my_model_y_test, b_my_model_y_pred)

svm_my_model_acc = b_my_model_accuracy_test

print(f'SVM My Model Test Accuracy: {svm_my_model_acc:.4f}')


SVM My Model Test Accuracy: 0.8870


For the SVM model the test accurarcy - TF-IDF : 0.9187.
Hence, we can say that for SVM model TF-IDF word embedding worked the best.

## 4. Feedforward Neural Networks

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [39]:
# Here i am checking if cuda is avaiable and setting the processing device to gpu or cpu.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

CUDA device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [40]:
# Here i am creating a FNN classifier model, based of the instructions given in the pdf, having two hidden layer having 50 and 10 nodes, which are giving in as inputs.
class ClassiferModel(nn.Module):
    def __init__(self, in_size, h1, h2, out_size):
        super(ClassiferModel, self).__init__()
        self.fc1 = nn.Linear(in_size, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, out_size)
        self.relu = nn.ReLU()

    def forward(self, r):
        r = self.relu(self.fc1(r))  
        r = self.relu(self.fc2(r))  
        r = self.fc3(r)
        return r 

### a)

#### Binary Classifier

##### Google News

In [41]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
b_g_news_X_train_tensor = torch.tensor(b_g_news_X_train.to_numpy(), dtype=torch.float32)
b_g_news_y_train_tensor = torch.tensor(b_g_news_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
b_g_news_X_test_tensor = torch.tensor(b_g_news_X_test.to_numpy(), dtype=torch.float32)
b_g_news_y_test_tensor = torch.tensor(b_g_news_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [42]:
# Creating the FNN classification model object and pushing it to the device. 
b_fnnModel = ClassiferModel(300, 50, 10, 2).to(device)

# Here i am using cross entropy loss as the loss function
loss_function = nn.CrossEntropyLoss()
# Here i am using the Adam optimizer with learning rate as 0.001
optimizer = optim.Adam(b_fnnModel.parameters(), lr=0.001)

In [43]:
# Here i am using the predefined tensordataset and dataloader for training the model in batches.
train_dataset = TensorDataset(b_g_news_X_train_tensor, b_g_news_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [44]:
# Here i am traing the model for 10 epoches with each batch of size 32.
epochs = 10
for epoch in range(epochs):
    # Setting the model to training mode
    b_fnnModel.train().to(device)  
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Moving the input and output features to GPU
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad() 
        outputs = b_fnnModel(inputs)
        loss = loss_function(outputs, labels)
        loss.backward() 
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}') 

Epoch 1/10, Loss: 0.3527926102474332
Epoch 2/10, Loss: 0.30803374334722755
Epoch 3/10, Loss: 0.2969717775590718
Epoch 4/10, Loss: 0.2897937513425946
Epoch 5/10, Loss: 0.28395568826943635
Epoch 6/10, Loss: 0.2793238676607609
Epoch 7/10, Loss: 0.2745317488208413
Epoch 8/10, Loss: 0.2705873997010291
Epoch 9/10, Loss: 0.26719293488934637
Epoch 10/10, Loss: 0.2636081786170602


In [45]:
# Setting the model to evaluation mode, and calculating the test accuracy.
b_fnnModel.eval() 
with torch.no_grad():
    # Moving the input and output features to GPU
    X_test_tensor, y_test_tensor = b_g_news_X_test_tensor.to(device), b_g_news_y_test_tensor.to(device)  
    outputs = b_fnnModel(X_test_tensor) 
    _, predicted = torch.max(outputs, 1) 
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu()) 

b_fnn_g_news_avg_acc = accuracy
print(f'FNN Binary Classification Google News Accuracy: {b_fnn_g_news_avg_acc*100:.2f}%')

FNN Binary Classification Google News Accuracy: 87.74%


In [46]:
del b_fnnModel
for inputs, labels in train_loader:
    del inputs
    del labels
del b_g_news_X_test_tensor
del b_g_news_y_test_tensor

##### My Model

In [47]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
b_my_model_X_train_tensor = torch.tensor(b_my_model_X_train.to_numpy(), dtype=torch.float32)
b_my_model_y_train_tensor = torch.tensor(b_my_model_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
b_my_model_X_test_tensor = torch.tensor(b_my_model_X_test.to_numpy(), dtype=torch.float32)
b_my_model_y_test_tensor = torch.tensor(b_my_model_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [48]:
# Creating the FNN classification model object and pushing it to the device. 
b_fnnModel = ClassiferModel(300, 50, 10, 2).to(device)

# Here i am using cross entropy loss as the loss function
loss_function = nn.CrossEntropyLoss()  
# Here i am using the Adam optimizer with learning rate as 0.001
optimizer = optim.Adam(b_fnnModel.parameters(), lr=0.001) 

In [49]:
# Here i am using the predefined tensordataset and dataloader for training the model in batches.
train_dataset = TensorDataset(b_my_model_X_train_tensor, b_my_model_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [50]:
# Here i am traing the model for 10 epoches with each batch of size 32.
epochs = 10
for epoch in range(epochs):
    # Setting the model to training mode
    b_fnnModel.train()  
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Moving the input and output features to GPU
        inputs, labels = inputs.to(device), labels.to(device) 
        optimizer.zero_grad() 
        outputs = b_fnnModel(inputs)  
        loss = loss_function(outputs, labels)  
        loss.backward()  
        optimizer.step() 
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.27263959611430766
Epoch 2/10, Loss: 0.24708786134645344
Epoch 3/10, Loss: 0.23792280987277628
Epoch 4/10, Loss: 0.2316527714509517
Epoch 5/10, Loss: 0.22662251801416278
Epoch 6/10, Loss: 0.22271582175157964
Epoch 7/10, Loss: 0.21905108323842287
Epoch 8/10, Loss: 0.2152104529503733
Epoch 9/10, Loss: 0.21267272356562317
Epoch 10/10, Loss: 0.21000256772860884


In [51]:
# Setting the model to evaluation mode, and calculating the test accuracy.
b_fnnModel.eval() 
with torch.no_grad():
    # Moving the input and output features to GPU
    X_test_tensor, y_test_tensor = b_my_model_X_test_tensor.to(device), b_my_model_y_test_tensor.to(device)  
    outputs = b_fnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1) 
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())  

b_fnn_my_model_avg_acc = accuracy
print(f'FNN Binary Classification My Model Accuracy: {b_fnn_my_model_avg_acc*100:.2f}%')

FNN Binary Classification My Model Accuracy: 89.90%


In [52]:
del b_fnnModel
for inputs, labels in train_loader:
    del inputs
    del labels
del b_my_model_X_test_tensor
del b_my_model_y_test_tensor

#### Memory Deallocation

In [53]:
del binary_g_news_df
del binary_my_model_df
del b_g_news_X
del b_g_news_y
del b_g_news_X_train
del b_g_news_X_test
del b_g_news_y_train
del b_g_news_y_test
del b_my_model_X_train
del b_my_model_X_test
del b_my_model_y_train
del b_my_model_y_test

#### Ternary Classifier

In [54]:
# Here i am spliting the dataset into input label X and output label y for Ternary classification using google news embedding. Also i am converting the 300 dimension to columns, to fit better for the models.
t_g_news_X = pd.DataFrame(g_news_df['review_embedding'].tolist(), columns=[f"d_{i}" for i in range(300)])
t_g_news_y = g_news_df["senti_label"]

In [55]:
# Here i am performing train test split for the tinary classification using google news embeddings.
t_g_news_X_train, t_g_news_X_test, t_g_news_y_train, t_g_news_y_test = train_test_split(t_g_news_X, t_g_news_y, test_size=0.2)

In [56]:
# Here i am spliting the dataset into input label X and output label y for Ternary classification using my model embedding. Also i am converting the 300 dimension to columns, to fit better for the models.
t_my_model_X = pd.DataFrame(my_model_df['review_embedding'].tolist(), columns=[f"d_{i}" for i in range(300)])
t_my_model_y = my_model_df["senti_label"]

In [57]:
# Here i am performing train test split for the tinary classification using my model embeddings.
t_my_model_X_train, t_my_model_X_test, t_my_model_y_train, t_my_model_y_test = train_test_split(t_my_model_X, t_my_model_y, test_size=0.2)

##### Google News

In [58]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
t_g_news_X_train_tensor = torch.tensor(t_g_news_X_train.to_numpy(), dtype=torch.float32)
t_g_news_y_train_tensor = torch.tensor(t_g_news_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
t_g_news_X_test_tensor = torch.tensor(t_g_news_X_test.to_numpy(), dtype=torch.float32)
t_g_news_y_test_tensor = torch.tensor(t_g_news_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long) 

In [59]:
# Creating the FNN classification model object and pushing it to the device. 
t_fnnModel = ClassiferModel(300, 50, 10, 3).to(device)
# Here i am using cross entropy loss as the loss function
loss_function = nn.CrossEntropyLoss()  
# Here i am using the Adam optimizer with learning rate as 0.001
optimizer = optim.Adam(t_fnnModel.parameters(), lr=0.001)  

In [60]:
# Here i am using the predefined tensordataset and dataloader for training the model in batches.
train_dataset = TensorDataset(t_g_news_X_train_tensor, t_g_news_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [61]:
# Here i am traing the model for 10 epoches with each batch of size 32.
epochs = 10
for epoch in range(epochs):
    # Setting the model to training mode
    t_fnnModel.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Moving the input and output features to GPU
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad()  
        outputs = t_fnnModel(inputs)  
        loss = loss_function(outputs, labels)  
        loss.backward()  
        optimizer.step() 
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.7270138023281097
Epoch 2/10, Loss: 0.6616519906377792
Epoch 3/10, Loss: 0.6417183092784882
Epoch 4/10, Loss: 0.6324936809945106
Epoch 5/10, Loss: 0.6248148479747773
Epoch 6/10, Loss: 0.6192641370248795
Epoch 7/10, Loss: 0.6145317017912865
Epoch 8/10, Loss: 0.6104091087841987
Epoch 9/10, Loss: 0.6068663216114044
Epoch 10/10, Loss: 0.6036971897053719


In [62]:
# Setting the model to evaluation mode, and calculating the test accuracy.
t_fnnModel.eval()
with torch.no_grad(): 
     # Moving the input and output features to GPU
    X_test_tensor, y_test_tensor = t_g_news_X_test_tensor.to(device), t_g_news_y_test_tensor.to(device)  
    outputs = t_fnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1) 
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())

t_fnn_g_news_avg_acc = accuracy
print(f'FNN Ternary Classification Google News Accuracy: {t_fnn_g_news_avg_acc*100:.2f}%')

FNN Ternary Classification Google News Accuracy: 73.55%


In [63]:
del t_fnnModel
for inputs, labels in train_loader:
    del inputs  
    del labels
del t_g_news_X_test_tensor
del t_g_news_y_test_tensor

##### My Model

In [64]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
t_my_model_X_train_tensor = torch.tensor(t_my_model_X_train.to_numpy(), dtype=torch.float32)
t_my_model_y_train_tensor = torch.tensor(t_my_model_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
t_my_model_X_test_tensor = torch.tensor(t_my_model_X_test.to_numpy(), dtype=torch.float32)
t_my_model_y_test_tensor = torch.tensor(t_my_model_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long) 

In [65]:
# Creating the FNN classification model object and pushing it to the device. 
t_fnnModel = ClassiferModel(300, 50, 10, 3).to(device)
# Here i am using cross entropy loss as the loss function
loss_function = nn.CrossEntropyLoss()  
# Here i am using the Adam optimizer with learning rate as 0.001
optimizer = optim.Adam(t_fnnModel.parameters(), lr=0.001)

In [66]:
# Here i am using the predefined tensordataset and dataloader for training the model in batches.
train_dataset = TensorDataset(t_my_model_X_train_tensor, t_my_model_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [67]:
# Here i am traing the model for 10 epoches with each batch of size 32.
epochs = 10
for epoch in range(epochs):
    # Setting the model to training mode
    t_fnnModel.train()  
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Moving the input and output features to GPU
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad() 
        outputs = t_fnnModel(inputs)  
        loss = loss_function(outputs, labels)  
        loss.backward() 
        optimizer.step() 
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.6039382109904289
Epoch 2/10, Loss: 0.568785041539669
Epoch 3/10, Loss: 0.5591183296942711
Epoch 4/10, Loss: 0.5520433605527878
Epoch 5/10, Loss: 0.5461914832425118
Epoch 6/10, Loss: 0.5422986579918861
Epoch 7/10, Loss: 0.5388446628618241
Epoch 8/10, Loss: 0.5357346608424187
Epoch 9/10, Loss: 0.5327037195944786
Epoch 10/10, Loss: 0.5298448876214027


In [68]:
# Setting the model to evaluation mode, and calculating the test accuracy.
t_fnnModel.eval()  
with torch.no_grad(): 
    # Moving the input and output features to GPU
    X_test_tensor, y_test_tensor = t_my_model_X_test_tensor.to(device), t_my_model_y_test_tensor.to(device)  
    outputs = t_fnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())  

t_fnn_my_model_avg_acc = accuracy
print(f'FNN Ternary Classification My Model Accuracy: {t_fnn_my_model_avg_acc*100:.2f}%')

FNN Ternary Classification My Model Accuracy: 76.39%


In [69]:
del t_fnnModel
for inputs, labels in train_loader:
    del inputs 
    del labels
del t_my_model_X_test_tensor
del t_my_model_y_test_tensor

#### Memory Deallocation

In [70]:
del g_news_df
del my_model_df
del t_g_news_X
del t_g_news_y
del t_g_news_X_train
del t_g_news_X_test
del t_g_news_y_train
del t_g_news_y_test
del t_my_model_X_train
del t_my_model_X_test
del t_my_model_y_train
del t_my_model_y_test

### b)

In [71]:
# Here i am creating two copies of sampled df for google news embedding and my model embedding. Here instead of average, we are concatenating first 10 word embeddings
g_news_concat_df = sampled_df.copy()
my_model_concat_df = sampled_df.copy()

In [72]:
# This function converts the word to word embedding and creates a flatten word embedding vector of first 10 words.
def review_concat_embedding(review, keyVec): 
    # Initialize the flatten first 10 word embedding vector.
    res = np.zeros(3000)
    
    # Getting the word embedding for the first 10 words.
    w_vec = []
    for w in review:
        if w in keyVec:
            w_vec.append(keyVec[w])
        if len(w_vec) == 10:  # Stop once we have 10 vectors
            break
    
    # Adding them to the final vector, using the start and end indices.
    for ind, vec in enumerate(w_vec):
        start_ind = ind * 300
        res[start_ind:start_ind + 300] = vec

    return res

In [73]:
# Here i am performing the sentence emebeding on the review body column with the keyVectors of google news 300 model.
g_news_concat_df["review_embedding"] = g_news_concat_df["review_body"].apply(lambda review: review_concat_embedding(review, wv))
g_news_concat_df = g_news_concat_df[["review_embedding", "senti_label"]] 

In [74]:
# Here i am performing the sentence emebeding on the review body column with the keyVectors of my model.
my_model_concat_df["review_embedding"] = my_model_concat_df["review_body"].apply(lambda review: review_concat_embedding(review, my_model))
my_model_concat_df = my_model_concat_df[["review_embedding", "senti_label"]] 

In [75]:
# deallocating these model for efficent RAM usage.
del wv
del my_model

#### Binary Classifier

In [76]:
# Here i am creating the binary classification dataset with google news embeddings.
binary_g_news_concat_df = g_news_concat_df.copy()
binary_g_news_concat_df = binary_g_news_concat_df[binary_g_news_concat_df["senti_label"] != 3]

In [77]:
# Here i am creating the binary classification dataset with my model embeddings.
binary_my_model_concat_df = my_model_concat_df.copy()
binary_my_model_concat_df = binary_my_model_concat_df[binary_my_model_concat_df["senti_label"] != 3]

In [78]:
# Creating the input and output features for binary classification using google news embeddding.
b_g_news_concat_X = binary_g_news_concat_df['review_embedding']
b_g_news_concat_y = binary_g_news_concat_df["senti_label"]

In [79]:
# Creating the input and output features for binary classification using my model embeddding.
b_my_model_concat_X = binary_my_model_concat_df['review_embedding']
b_my_model_concat_y = binary_my_model_concat_df["senti_label"]


In [80]:
# Here i am performing the train and test split for the google news dataset.
b_g_news_concat_X_train, b_g_news_concat_X_test, b_g_news_concat_y_train, b_g_news_concat_y_test = train_test_split(b_g_news_concat_X, b_g_news_concat_y, test_size=0.2)

In [81]:
# Here i am performing the train and test split for the my model dataset.
b_my_model_concat_X_train, b_my_model_concat_X_test, b_my_model_concat_y_train, b_my_model_concat_y_test = train_test_split(b_my_model_concat_X, b_my_model_concat_y, test_size=0.2)

##### Google News

In [82]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
b_g_news_X_train_tensor = torch.tensor(np.array(b_g_news_concat_X_train.tolist(), dtype=np.float32), dtype=torch.float32)
b_g_news_y_train_tensor = torch.tensor(b_g_news_concat_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
b_g_news_X_test_tensor = torch.tensor(np.array(b_g_news_concat_X_test.tolist(),  dtype=np.float32) , dtype=torch.float32)
b_g_news_y_test_tensor = torch.tensor(b_g_news_concat_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [83]:
# Creating the FNN classification model object and pushing it to the device. 
b_fnnModel = ClassiferModel(3000, 50, 10, 2).to(device)
# Here i am using cross entropy loss as the loss function
loss_function = nn.CrossEntropyLoss() 
# Here i am using the Adam optimizer with learning rate as 0.001
optimizer = optim.Adam(b_fnnModel.parameters(), lr=0.001)

In [84]:
# Here i am creating the training dataset and data loader for batch processing.
train_dataset = TensorDataset(b_g_news_X_train_tensor, b_g_news_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [85]:
# Here i am training the model for 10 epoches with batch size of 32 
epochs = 10
for epoch in range(epochs):
    b_fnnModel.train().to(device)  
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad()  
        outputs = b_fnnModel(inputs) 
        loss = loss_function(outputs, labels) 
        loss.backward()  
        optimizer.step()  
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}') 

Epoch 1/10, Loss: 0.356680810970068
Epoch 2/10, Loss: 0.30839536718428134
Epoch 3/10, Loss: 0.2734541860073805
Epoch 4/10, Loss: 0.23740782462358476
Epoch 5/10, Loss: 0.20329186789505183
Epoch 6/10, Loss: 0.1728428849829361
Epoch 7/10, Loss: 0.14651444491287693
Epoch 8/10, Loss: 0.12603916580602526
Epoch 9/10, Loss: 0.10781032733311877
Epoch 10/10, Loss: 0.09383634139711503


In [86]:
# Here i am getting the test set accuracy of the model.
b_fnnModel.eval()  
with torch.no_grad():  
    X_test_tensor, y_test_tensor = b_g_news_X_test_tensor.to(device), b_g_news_y_test_tensor.to(device) 
    outputs = b_fnnModel(X_test_tensor) 
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())

b_fnn_g_news_f10_acc = accuracy
print(f'FNN Binary Classification Google News first 10 words Accuracy: {b_fnn_g_news_f10_acc*100:.2f}%')

FNN Binary Classification Google News first 10 words Accuracy: 83.26%


In [87]:
del b_fnnModel
for inputs, labels in train_loader:
    del inputs
    del labels
del b_g_news_X_test_tensor
del b_g_news_y_test_tensor

##### My Model

In [88]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
b_my_model_X_train_tensor = torch.tensor(np.array(b_my_model_concat_X_train.tolist() ,dtype=np.float32), dtype=torch.float32)
b_my_model_y_train_tensor = torch.tensor(b_my_model_concat_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
b_my_model_X_test_tensor = torch.tensor(np.array(b_my_model_concat_X_test.tolist(), dtype=np.float32), dtype=torch.float32)
b_my_model_y_test_tensor = torch.tensor(b_my_model_concat_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [89]:
# Creating the FNN classification model object and pushing it to the device. 
b_fnnModel = ClassiferModel(3000, 50, 10, 2).to(device)
# Here i am using cross entropy loss as the loss function
loss_function = nn.CrossEntropyLoss()  
# Here i am using the Adam optimizer with learning rate as 0.001
optimizer = optim.Adam(b_fnnModel.parameters(), lr=0.001) 

In [90]:
# Here i am creating the training dataset and data loader for batch processing.
train_dataset = TensorDataset(b_my_model_X_train_tensor, b_my_model_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [91]:
# Here i am training the model for 10 epoches with batch size of 32 
epochs = 10
for epoch in range(epochs):
    b_fnnModel.train() 
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device) 
        optimizer.zero_grad() 
        outputs = b_fnnModel(inputs) 
        loss = loss_function(outputs, labels) 
        loss.backward()  
        optimizer.step()  
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.3249033022031188
Epoch 2/10, Loss: 0.2890653444930911
Epoch 3/10, Loss: 0.26353344066217543
Epoch 4/10, Loss: 0.23594838480874897
Epoch 5/10, Loss: 0.20920504307225346
Epoch 6/10, Loss: 0.1859638928456232
Epoch 7/10, Loss: 0.16456441350784154
Epoch 8/10, Loss: 0.14609622634593397
Epoch 9/10, Loss: 0.13207996216518805
Epoch 10/10, Loss: 0.11790294027910568


In [92]:
# Here i am getting the test set accuracy of the model.
b_fnnModel.eval()  
with torch.no_grad(): 
    X_test_tensor, y_test_tensor = b_my_model_X_test_tensor.to(device), b_my_model_y_test_tensor.to(device) 
    outputs = b_fnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())  

b_fnn_my_model_f10_acc = accuracy
print(f'FNN Binary Classification My model first 10 words Accuracy: {b_fnn_my_model_f10_acc*100:.2f}%')

FNN Binary Classification My model first 10 words Accuracy: 85.02%


In [93]:
del b_fnnModel
for inputs, labels in train_loader:
    del inputs
    del labels
del b_my_model_X_test_tensor
del b_my_model_y_test_tensor

#### Memory deallocation

In [94]:
del binary_g_news_concat_df
del binary_my_model_concat_df
del b_g_news_concat_X
del b_g_news_concat_y
del b_my_model_concat_X
del b_my_model_concat_y
del b_g_news_concat_X_train
del b_g_news_concat_X_test
del b_g_news_concat_y_train
del b_g_news_concat_y_test
del b_my_model_concat_X_train
del b_my_model_concat_X_test
del b_my_model_concat_y_train
del b_my_model_concat_y_test


In [95]:
# Here i am spliting the dataset into input label X and output label y for Ternary classification using google news embedding.
t_g_news_concat_X = g_news_concat_df['review_embedding']
t_g_news_concat_y = g_news_concat_df["senti_label"]


In [96]:
# Here i am spliting the dataset into input label X and output label y for Ternary classification using my model embedding.
t_my_model_concat_X = my_model_concat_df['review_embedding']
t_my_model_concat_y = my_model_concat_df["senti_label"]

In [97]:
# Here i am performing the train and test split for the google news dataset
t_g_news_concat_X_train, t_g_news_concat_X_test, t_g_news_concat_y_train, t_g_news_concat_y_test = train_test_split(t_g_news_concat_X, t_g_news_concat_y, test_size=0.2)

In [98]:
# Here i am performing the train and test split for the my model dataset
t_my_model_concat_X_train, t_my_model_concat_X_test, t_my_model_concat_y_train, t_my_model_concat_y_test = train_test_split(t_my_model_concat_X, t_my_model_concat_y, test_size=0.2)

##### Google News

In [99]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
t_g_news_X_train_tensor = torch.tensor(np.array(t_g_news_concat_X_train.tolist(), dtype=np.float32), dtype=torch.float32)
t_g_news_y_train_tensor = torch.tensor(t_g_news_concat_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
t_g_news_X_test_tensor = torch.tensor(np.array(t_g_news_concat_X_test.tolist(), dtype=np.float32), dtype=torch.float32)
t_g_news_y_test_tensor = torch.tensor(t_g_news_concat_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long) 

In [100]:
# Creating the FNN classification model object and pushing it to the device. And, i am using cross entropy loss as the loss function. Also, i am using the Adam optimizer with learning rate as 0.001
t_fnnModel = ClassiferModel(3000, 50, 10, 3).to(device)
loss_function = nn.CrossEntropyLoss()  
optimizer = optim.Adam(t_fnnModel.parameters(), lr=0.001) 

In [101]:
# Here i am creating the training dataset and data loader for batch processing.
train_dataset = TensorDataset(t_g_news_X_train_tensor, t_g_news_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [102]:
# Here i am training the model for 10 epoches with batch size of 32 
epochs = 10
for epoch in range(epochs):
    t_fnnModel.train()  
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad()  
        outputs = t_fnnModel(inputs)  
        loss = loss_function(outputs, labels)  
        loss.backward()  
        optimizer.step() 
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.6859153775024414
Epoch 2/10, Loss: 0.6252996482491493
Epoch 3/10, Loss: 0.588805819041729
Epoch 4/10, Loss: 0.5519414755129814
Epoch 5/10, Loss: 0.5177000136566162
Epoch 6/10, Loss: 0.48498313717126845
Epoch 7/10, Loss: 0.45538891998648645
Epoch 8/10, Loss: 0.4293326005470753
Epoch 9/10, Loss: 0.4053522188127041
Epoch 10/10, Loss: 0.38391411766052247


In [103]:
# Here i am getting the test set accuracy of the model.
t_fnnModel.eval()  
with torch.no_grad(): 
    X_test_tensor, y_test_tensor = t_g_news_X_test_tensor.to(device), t_g_news_y_test_tensor.to(device)
    outputs = t_fnnModel(X_test_tensor) 
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu()) 

t_fnn_g_news_f10_acc = accuracy
print(f'FNN Ternary Classification Google News first 10 words Accuracy: {t_fnn_g_news_f10_acc*100:.2f}%')

FNN Ternary Classification Google News first 10 words Accuracy: 68.42%


In [104]:
del t_fnnModel
for inputs, labels in train_loader:
    del inputs 
    del labels
del t_g_news_X_test_tensor
del t_g_news_y_test_tensor

##### My Model

In [105]:
# Here i am converting the training and testing input features and output features to tensors, by first converting them into a np.array(). Also for the output labels i am subtracting one from the class as the model need the labels to start from 0.
t_my_model_X_train_tensor = torch.tensor(np.array(t_my_model_concat_X_train.tolist(), dtype=np.float32), dtype=torch.float32)
t_my_model_y_train_tensor = torch.tensor(t_my_model_concat_y_train.apply(lambda x: x-1).to_numpy(), dtype=torch.long)
t_my_model_X_test_tensor = torch.tensor(np.array(t_my_model_concat_X_test.tolist(), dtype=np.float32), dtype=torch.float32)
t_my_model_y_test_tensor = torch.tensor(t_my_model_concat_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long) 

In [106]:
# Creating the FNN classification model object and pushing it to the device. And, i am using cross entropy loss as the loss function. Also, i am using the Adam optimizer with learning rate as 0.001
t_fnnModel = ClassiferModel(3000, 50, 10, 3).to(device)
loss_function = nn.CrossEntropyLoss() 
optimizer = optim.Adam(t_fnnModel.parameters(), lr=0.001)

In [107]:
# Here i am creating the training dataset and data loader for batch processing.
train_dataset = TensorDataset(t_my_model_X_train_tensor, t_my_model_y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [108]:
# Here i am training the model for 10 epoches with batch size of 32 
epochs = 10
for epoch in range(epochs):
    t_fnnModel.train() 
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = t_fnnModel(inputs) 
        loss = loss_function(outputs, labels) 
        loss.backward() 
        optimizer.step()  
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.6482063727235794
Epoch 2/10, Loss: 0.605265987098217
Epoch 3/10, Loss: 0.577968397886753
Epoch 4/10, Loss: 0.5511817797732353
Epoch 5/10, Loss: 0.5252816880702973
Epoch 6/10, Loss: 0.5023250477552414
Epoch 7/10, Loss: 0.48019588971972466
Epoch 8/10, Loss: 0.4602250667345524
Epoch 9/10, Loss: 0.44289585554242134
Epoch 10/10, Loss: 0.4272341419303417


In [109]:
# Here i am getting the test set accuracy of the model.
t_fnnModel.eval()  
with torch.no_grad():  
    X_test_tensor, y_test_tensor = t_my_model_X_test_tensor.to(device), t_my_model_y_test_tensor.to(device)  
    outputs = t_fnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1) 
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu()) 

t_fnn_my_model_f10_acc = accuracy
print(f'FNN Ternary Classification My Model first 10 words Accuracy: {t_fnn_my_model_f10_acc*100:.2f}%')

FNN Ternary Classification My Model first 10 words Accuracy: 70.94%


In [110]:
del t_fnnModel
for inputs, labels in train_loader:
    del inputs 
    del labels
del t_my_model_X_test_tensor
del t_my_model_y_test_tensor

#### Ternary Classifier

#### Memory Deallocation

In [111]:
del g_news_concat_df
del my_model_concat_df
del t_g_news_concat_X
del t_g_news_concat_y
del t_my_model_concat_X
del t_my_model_concat_y
del t_g_news_concat_X_train
del t_g_news_concat_X_test
del t_g_news_concat_y_train
del t_g_news_concat_y_test
del t_my_model_concat_X_train
del t_my_model_concat_X_test
del t_my_model_concat_y_train
del t_my_model_concat_y_test


## 5. Convolutional Neural Networks

In [112]:
# Here i am loading the google news and my model keyVectors to use for word embedding.
wv = gensim.models.KeyedVectors.load("gensim_model_w2v_gnews_300.model")
my_model = gensim.models.Word2Vec.load("gensim_model_w2v_my_model.model").wv

In [113]:
# Making a copy of the sampled df, in which the reviews are tokenized.
sampled_cnn_df = sampled_df.copy()

In [114]:
# Here i am limiting the size of the reviews to the first 50 tokens.
sampled_cnn_df["review_body"] = sampled_cnn_df["review_body"].apply(lambda r: r[:50])


In [115]:
# Here i am creating the google news word embedding dataset.
g_news_cnn_df = sampled_cnn_df.copy()
g_news_cnn_df = g_news_cnn_df[["review_body", "senti_label"]]  

In [116]:
# Here i am creating the my model word embedding dataset.
my_model_cnn_df = sampled_cnn_df.copy()
my_model_cnn_df = my_model_cnn_df[["review_body", "senti_label"]] 

In [117]:
del sampled_cnn_df

In [118]:
# Here I am creating the CNN classifier class, with 2 layer of output size 50 and 10
class CNNClassifierModel(nn.Module):
    def __init__(self, embd_d, n_cls):
        super(CNNClassifierModel, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=embd_d, out_channels=50, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=10, kernel_size=3, padding=1)
        
        self.fc = nn.Linear(10 * 50, n_cls)
        
    def forward(self, r):
        r = r.permute(0, 2, 1)
        
        r = F.relu(self.conv1(r)) 
        r = F.relu(self.conv2(r)) 
        
        r = r.view(r.size(0), -1)
        r = self.fc(r)
        
        return r

In [119]:
# Here i am creating the Dataset Class to perform batch processing 
class ReviewDataset(Dataset):
    def __init__(self, X, y, keyVec):
        self.reviews = X
        self.keyVec = keyVec
        self.labels = y
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        embedding = np.zeros((50, 300), dtype=np.float32)
        
        # Here i am creating the [50, 300] sentence embedding matrix for each of the review.
        for ind in range(50):
            if ind < len(review) and review[ind] in self.keyVec:
                embedding[ind] = self.keyVec[review[ind]]
            else:
                embedding[ind] = np.zeros(300, dtype=np.float32)
        
        return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long) 

In [120]:
# This function performs the sentence embedding and create a [50, 300] matrix as a result.
def review_50_embedding(review, keyVec):
    w_vec = np.zeros((50, 300), dtype=np.float32)
    for ind in range(50):
        if ind < len(review) and review[ind] in keyVec:
            w_vec[ind] = keyVec[review[ind]]
        else:
            w_vec[ind] = np.zeros(300)
    return w_vec

### Binary Classifier

In [121]:
# Create the data for binary classification with google news embedding.
binary_g_news_cnn_df = g_news_cnn_df.copy()
binary_g_news_cnn_df = binary_g_news_cnn_df[binary_g_news_cnn_df["senti_label"] != 3]

In [122]:
# Create the data for binary classification with my model embedding.
binary_my_model_cnn_df = my_model_cnn_df.copy()
binary_my_model_cnn_df = binary_my_model_cnn_df[binary_my_model_cnn_df["senti_label"] != 3]

In [123]:
# Creating the input and output features for the binary classification with google news embedding
b_g_news_cnn_X = binary_g_news_cnn_df['review_body']
b_g_news_cnn_y = binary_g_news_cnn_df["senti_label"]

In [124]:
# Creating the input and output features for the binary classification with my model embedding
b_my_model_cnn_X = binary_my_model_cnn_df['review_body']
b_my_model_cnn_y = binary_my_model_cnn_df["senti_label"]


In [125]:
# Here i am performing the train and test split for google news embeddings
b_g_news_cnn_X_train, b_g_news_cnn_X_test, b_g_news_cnn_y_train, b_g_news_cnn_y_test = train_test_split(b_g_news_cnn_X, b_g_news_cnn_y, test_size=0.2)

In [126]:
# Here i am performing the train and test split for my model embeddings
b_my_model_cnn_X_train, b_my_model_cnn_X_test, b_my_model_cnn_y_train, b_my_model_cnn_y_test = train_test_split(b_my_model_cnn_X, b_my_model_cnn_y, test_size=0.2)

#### Google News

In [127]:
# Converting the test input and output features into tensors. For the output label i am subtracing 1, to make them start with 0.
b_g_news_X_test_tensor = torch.tensor(np.array([review_50_embedding(r, wv) for r in b_g_news_cnn_X_test], dtype=np.float32), dtype=torch.float32)
b_g_news_y_test_tensor = torch.tensor(b_g_news_cnn_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [128]:
# Here i am creating the CNN classifier object and using the cross entropy loss function and adam optimizer.
b_cnnModel = CNNClassifierModel(300, 2).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(b_cnnModel.parameters(), lr=0.001)  

In [129]:
# Here i am reseting the indexing for the training input and output features.
b_g_news_cnn_X_train = b_g_news_cnn_X_train.reset_index(drop=True)  
b_g_news_cnn_y_train = b_g_news_cnn_y_train.reset_index(drop=True)

# Here i am subtracting 1 from the training output feature to make the label start from 0.
b_g_news_cnn_y_train = b_g_news_cnn_y_train.apply(lambda x: x-1)

# Creating the training dataset and dataloader for batch processing.
train_dataset = ReviewDataset(b_g_news_cnn_X_train, b_g_news_cnn_y_train, wv)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [130]:
# Here i am training the model for 10 epoches and batch size of 32.
epochs = 10
for epoch in range(epochs):
    b_cnnModel.train().to(device)
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad() 
        outputs = b_cnnModel(inputs) 
        loss = loss_function(outputs, labels)  
        loss.backward() 
        optimizer.step()  
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}') 

Epoch 1/10, Loss: 0.2760045650832355
Epoch 2/10, Loss: 0.2232752208404243
Epoch 3/10, Loss: 0.20167210635878147
Epoch 4/10, Loss: 0.1836438637746498
Epoch 5/10, Loss: 0.16774623184874654
Epoch 6/10, Loss: 0.15336882965639234
Epoch 7/10, Loss: 0.14054114817213267
Epoch 8/10, Loss: 0.1290992124103941
Epoch 9/10, Loss: 0.11856321447109804
Epoch 10/10, Loss: 0.10961677876082249


In [131]:
# Here i am getting the test accuracy of the model.
b_cnnModel.eval()  
with torch.no_grad(): 
    X_test_tensor, y_test_tensor = b_g_news_X_test_tensor.to(device), b_g_news_y_test_tensor.to(device) 
    outputs = b_cnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1) 
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())

b_cnn_g_news_acc = accuracy
print(f'CNN Binary Classification Google News Accuracy: {b_cnn_g_news_acc*100:.2f}%')

CNN Binary Classification Google News Accuracy: 89.89%


In [132]:
del b_cnnModel
for inputs, labels in train_loader:
    del inputs
    del labels
del b_g_news_X_test_tensor
del b_g_news_y_test_tensor

#### My Model

In [133]:
# Converting the test input and output features into tensors. For the output label i am subtracing 1, to make them start with 0.
b_my_model_X_test_tensor = torch.tensor(np.array([review_50_embedding(r, my_model) for r in b_my_model_cnn_X_test], dtype=np.float32), dtype=torch.float32)
b_my_model_y_test_tensor = torch.tensor(b_my_model_cnn_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [134]:
# Here i am reseting the indexing for the training input and output features.
b_my_model_cnn_X_train = b_my_model_cnn_X_train.reset_index(drop=True)  
b_my_model_cnn_y_train = b_my_model_cnn_y_train.reset_index(drop=True)  

# Here i am subtracting 1 from the training output feature to make the label start from 0.
b_my_model_cnn_y_train = b_my_model_cnn_y_train.apply(lambda x: x-1)

# Creating the training dataset and dataloader for batch processing.
train_dataset = ReviewDataset(b_my_model_cnn_X_train, b_my_model_cnn_y_train, my_model)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [135]:
# Here i am creating the CNN classifier object and using the cross entropy loss function and adam optimizer.
b_cnnModel = CNNClassifierModel(300, 2).to(device)
loss_function = nn.CrossEntropyLoss()  
optimizer = optim.Adam(b_cnnModel.parameters(), lr=0.001)  

In [136]:
# Here i am training the model for 10 epoches and batch size of 32.
epochs = 10
for epoch in range(epochs):
    b_cnnModel.train().to(device)  
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad()  
        outputs = b_cnnModel(inputs)  
        loss = loss_function(outputs, labels)  
        loss.backward()  
        optimizer.step()  
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')  

Epoch 1/10, Loss: 0.26135384093225
Epoch 2/10, Loss: 0.23069126994162797
Epoch 3/10, Loss: 0.21776947853192688
Epoch 4/10, Loss: 0.20774835375770925
Epoch 5/10, Loss: 0.1985382195169106
Epoch 6/10, Loss: 0.19087891874536872
Epoch 7/10, Loss: 0.18231498567909002
Epoch 8/10, Loss: 0.1754128668885678
Epoch 9/10, Loss: 0.16834453330524266
Epoch 10/10, Loss: 0.16105076786614955


In [137]:
# Here i am getting the test accuracy of the model
b_cnnModel.eval() 
with torch.no_grad(): 
    X_test_tensor, y_test_tensor = b_my_model_X_test_tensor.to(device), b_my_model_y_test_tensor.to(device)  
    outputs = b_cnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())

b_cnn_my_model_acc = accuracy
print(f'CNN Binary Classification My Model Accuracy: {b_cnn_my_model_acc*100:.2f}%')

CNN Binary Classification My Model Accuracy: 90.40%


In [138]:
del b_cnnModel
for inputs, labels in train_loader:
    del inputs
    del labels
del b_my_model_X_test_tensor
del b_my_model_y_test_tensor

#### Memory Deallocation

In [139]:
del binary_g_news_cnn_df
del binary_my_model_cnn_df
del b_g_news_cnn_X
del b_g_news_cnn_y
del b_my_model_cnn_X
del b_my_model_cnn_y
del b_g_news_cnn_X_train
del b_g_news_cnn_X_test
del b_g_news_cnn_y_train
del b_g_news_cnn_y_test
del b_my_model_cnn_X_train
del b_my_model_cnn_X_test
del b_my_model_cnn_y_train
del b_my_model_cnn_y_test

### Ternary Classifier

In [140]:
# Here i am creating the ternary classificationd dataset for google news.
t_g_news_cnn_X = g_news_cnn_df['review_body']
t_g_news_cnn_y = g_news_cnn_df["senti_label"]

In [141]:
# Here i am creating the ternary classificationd dataset for my model.
t_my_model_cnn_X = my_model_cnn_df['review_body']
t_my_model_cnn_y = my_model_cnn_df["senti_label"]


In [142]:
# Here i am performing train and test split for google news
t_g_news_cnn_X_train, t_g_news_cnn_X_test, t_g_news_cnn_y_train, t_g_news_cnn_y_test = train_test_split(t_g_news_cnn_X, t_g_news_cnn_y, test_size=0.2)

In [143]:
# Here i am performing train and test split for my model
t_my_model_cnn_X_train, t_my_model_cnn_X_test, t_my_model_cnn_y_train, t_my_model_cnn_y_test = train_test_split(t_my_model_cnn_X, t_my_model_cnn_y, test_size=0.2)

##### Google News

In [144]:
# Converting the test input and output features into tensors. For the output label i am subtracing 1, to make them start with 0.
t_g_news_X_test_tensor = torch.tensor(np.array([review_50_embedding(r, wv) for r in t_g_news_cnn_X_test], dtype=np.float32), dtype=torch.float32)
t_g_news_y_test_tensor = torch.tensor(t_g_news_cnn_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [145]:
# Here i am creating the CNN classifier object and using the cross entropy loss function and adam optimizer.
t_cnnModel = CNNClassifierModel(300, 3).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(t_cnnModel.parameters(), lr=0.001) 

In [146]:
# Here i am reseting the indexing for the training input and output features.
t_g_news_cnn_X_train = t_g_news_cnn_X_train.reset_index(drop=True)  
t_g_news_cnn_y_train = t_g_news_cnn_y_train.reset_index(drop=True)

# Here i am subtracting 1 from the training output feature to make the label start from 0.
t_g_news_cnn_y_train = t_g_news_cnn_y_train.apply(lambda x: x-1)

# Creating the training dataset and dataloader for batch processing.
train_dataset = ReviewDataset(t_g_news_cnn_X_train, t_g_news_cnn_y_train, wv)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [147]:
# Here i am training the model.
epochs = 10
for epoch in range(epochs):
    t_cnnModel.train() 
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device) 
        optimizer.zero_grad() 
        outputs = t_cnnModel(inputs)  
        loss = loss_function(outputs, labels) 
        loss.backward()  
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.6003190448737145
Epoch 2/10, Loss: 0.5331146879816056
Epoch 3/10, Loss: 0.5106789461946487
Epoch 4/10, Loss: 0.49393784275650976
Epoch 5/10, Loss: 0.479999182240963
Epoch 6/10, Loss: 0.46754138472795487
Epoch 7/10, Loss: 0.45617883866548536
Epoch 8/10, Loss: 0.4458430671775341
Epoch 9/10, Loss: 0.43673794154882434
Epoch 10/10, Loss: 0.4290642709851265


In [148]:
# Here i am getting the test accuracy of the model
t_cnnModel.eval() 
with torch.no_grad(): 
    X_test_tensor, y_test_tensor = t_g_news_X_test_tensor.to(device), t_g_news_y_test_tensor.to(device)
    outputs = t_cnnModel(X_test_tensor) 
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())

t_cnn_g_news_acc = accuracy
print(f'CNN Ternary Classification Google News Accuracy: {t_cnn_g_news_acc*100:.2f}%')

CNN Ternary Classification Google News Accuracy: 76.48%


In [149]:
del t_cnnModel
for inputs, labels in train_loader:
    del inputs 
    del labels
del t_g_news_X_test_tensor
del t_g_news_y_test_tensor

##### My Model

In [150]:
# Converting the test input and output features into tensors. For the output label i am subtracing 1, to make them start with 0.
t_my_model_X_test_tensor = torch.tensor(np.array([review_50_embedding(r, my_model) for r in t_my_model_cnn_X_test], dtype=np.float32), dtype=torch.float32)
t_my_model_y_test_tensor = torch.tensor(t_my_model_cnn_y_test.apply(lambda x: x-1).to_numpy(), dtype=torch.long)

In [151]:
# Here i am creating the CNN classifier object and using the cross entropy loss function and adam optimizer.
t_cnnModel = CNNClassifierModel(300, 3).to(device)
loss_function = nn.CrossEntropyLoss()  
optimizer = optim.Adam(t_cnnModel.parameters(), lr=0.001)  

In [152]:
# Here i am reseting the indexing for the training input and output features.
t_my_model_cnn_X_train = t_my_model_cnn_X_train.reset_index(drop=True)  
t_my_model_cnn_y_train = t_my_model_cnn_y_train.reset_index(drop=True)  

# Here i am subtracting 1 from the training output feature to make the label start from 0.
t_my_model_cnn_y_train = t_my_model_cnn_y_train.apply(lambda x: x-1)

# Creating the training dataset and dataloader for batch processing.
train_dataset = ReviewDataset(t_my_model_cnn_X_train, t_my_model_cnn_y_train, my_model)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [153]:
# Here i am training the model
epochs = 10
for epoch in range(epochs):
    t_cnnModel.train()  
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device) 
        optimizer.zero_grad()  
        outputs = t_cnnModel(inputs) 
        loss = loss_function(outputs, labels)  
        loss.backward() 
        optimizer.step()  
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}')

Epoch 1/10, Loss: 0.5825681155824661
Epoch 2/10, Loss: 0.5468351379394532
Epoch 3/10, Loss: 0.5334672194004059
Epoch 4/10, Loss: 0.5234786265206337
Epoch 5/10, Loss: 0.5155383927822113
Epoch 6/10, Loss: 0.5081573192715645
Epoch 7/10, Loss: 0.5013403821313381
Epoch 8/10, Loss: 0.49499311629772186
Epoch 9/10, Loss: 0.4889717603087425
Epoch 10/10, Loss: 0.4842364546084404


In [154]:
# Here i am getting the test accuracy of the model
t_cnnModel.eval()  
with torch.no_grad(): 
    X_test_tensor, y_test_tensor = t_my_model_X_test_tensor.to(device), t_my_model_y_test_tensor.to(device)  
    outputs = t_cnnModel(X_test_tensor)  
    _, predicted = torch.max(outputs, 1)  
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())

t_cnn_my_model_acc = accuracy
print(f'CNN Ternary Classification My Model Accuracy: {t_cnn_my_model_acc*100:.2f}%')

CNN Ternary Classification My Model Accuracy: 76.80%


In [155]:
del t_cnnModel
for inputs, labels in train_loader:
    del inputs 
    del labels
del t_my_model_X_test_tensor
del t_my_model_y_test_tensor

## Final Accuracy Values for all the Models

In [156]:
print("Perceptron")
print("------------------------------------------------------------")
print(f'The Binary Classification (Google News 300): {perceptron_g_news_acc}')
print(f'The Binary Classification (My Model): {perceptron_my_model_acc}')
print("")
print("")
print("SVM")
print("------------------------------------------------------------")
print(f'The Binary Classification (Google News 300): {svm_g_news_acc}')
print(f'The Binary Classification (My Model): {svm_my_model_acc}')
print("")
print("")
print("FNN")
print("------------------------------------------------------------")
print(f'The Binary Classification Average (Google News 300): {b_fnn_g_news_avg_acc}')
print(f'The Binary Classification Average (My Model): {b_fnn_my_model_avg_acc}')
print("")
print(f'The Ternary Classification Average (Google News 300): {t_fnn_g_news_avg_acc}')
print(f'The Ternary Classification Average (My Model): {t_fnn_my_model_avg_acc}')
print("")
print(f'The Binary Classification First 10 (Google News 300): {b_fnn_g_news_f10_acc}')
print(f'The Binary Classification First 10 (My Model): {b_fnn_my_model_f10_acc}')
print("")
print(f'The Ternary Classification First 10 (Google News 300): {t_fnn_g_news_f10_acc}')
print(f'The Ternary Classification First 10 (My Model): {t_fnn_my_model_f10_acc}')
print("")
print("")
print("CNN")
print("------------------------------------------------------------")
print(f'The Binary Classification (Google News 300): {b_cnn_g_news_acc}')
print(f'The Binary Classification (My Model): {b_cnn_my_model_acc}')
print("")
print(f'The Ternary Classification (Google News 300): {t_cnn_g_news_acc}')
print(f'The Ternary Classification (My Model): {t_cnn_my_model_acc}')

Perceptron
------------------------------------------------------------
The Binary Classification (Google News 300): 0.822
The Binary Classification (My Model): 0.833025


SVM
------------------------------------------------------------
The Binary Classification (Google News 300): 0.845425
The Binary Classification (My Model): 0.887025


FNN
------------------------------------------------------------
The Binary Classification Average (Google News 300): 0.8774
The Binary Classification Average (My Model): 0.898975

The Ternary Classification Average (Google News 300): 0.73546
The Ternary Classification Average (My Model): 0.76388

The Binary Classification First 10 (Google News 300): 0.832625
The Binary Classification First 10 (My Model): 0.850175

The Ternary Classification First 10 (Google News 300): 0.6842
The Ternary Classification First 10 (My Model): 0.70944


CNN
------------------------------------------------------------
The Binary Classification (Google News 300): 0.89885
The