In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
import re
from bs4 import BeautifulSoup
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. Dataset Generation

In [None]:
# read the data to a pandas dataframe, to save the RAM, only read two columns that we need
df = pd.read_table('amazon_reviews_us_Kitchen_v1_00.tsv',
                   usecols = ['star_rating','review_body'],
                   error_bad_lines=False,warn_bad_lines=False).dropna()

In [None]:
# Select 50K instances for each rating score
df1 = df[df['star_rating'] == 1]
df2 = df[df['star_rating'] == 2]
df3 = df[df['star_rating'] == 3]
df4 = df[df['star_rating'] == 4]
df5 = df[df['star_rating'] == 5]

SAMPLE_NUM = 50000

df1_sample = df1.sample(n = SAMPLE_NUM) 
df2_sample = df2.sample(n = SAMPLE_NUM)
df3_sample = df3.sample(n = SAMPLE_NUM)
df4_sample = df4.sample(n = SAMPLE_NUM)
df5_sample = df5.sample(n = SAMPLE_NUM)

all_data = pd.concat([df1_sample, df2_sample,df3_sample,df4_sample,df5_sample])

In [None]:
# add ternary label 
def convert_label(s):
    if s<3:
        return 0;
    elif s==3:
        return 1;
    else:
        return 2;

all_data['label'] = all_data['star_rating'].apply(convert_label)

#### Data Cleaning

In [None]:
# Perform Data Cleaning

# convert all reviews to lower case
all_data['review_body'] = all_data['review_body'].str.lower()

# Remove HTML
all_data['review_body'] = all_data['review_body'].apply(lambda text: BeautifulSoup(text).get_text())

# Remove URL by remove all word start with 'http:' or 'https:', then remove all word start with 'www.' and end with '.com'
all_data['review_body'] = all_data['review_body'].apply(lambda text: re.sub(r'https?:\S+', '', text)) 
all_data['review_body'] = all_data['review_body'].apply(lambda text: re.sub(r'www.\S+.com', '', text))

# I manually code the contraction function by replace specific expression with their expand version.
def contractionfunction(s):
    # specific
    s = re.sub(r"won\'t", "will not", s)
    s = re.sub(r"can\'t", "can not", s)
    s = re.sub(r'ain\'t', 'are not', s)

    # general
    s = re.sub(r"n\'t", " not", s)
    s = re.sub(r'(\w+)\'re', '\g<1> are', s)
    s = re.sub(r'(\w+)\'s', '\g<1> is', s)
    s = re.sub(r'(\w+)\'d', '\g<1> would', s)
    s = re.sub(r'(\w+)\'ll', '\g<1> will', s)
    s = re.sub(r'(\w+)\'t', '\g<1> not', s)
    s = re.sub(r'(\w+)\'ve', '\g<1> have', s)
    s = re.sub(r'(\w+)\'m', '\g<1> am', s)
    return s

all_data['review_body'] = all_data['review_body'].apply(contractionfunction)

# remove all non-alphabetical characters
regex = re.compile('[^a-zA-Z]')
all_data['review_body'] = all_data['review_body'].apply(lambda text: regex.sub(' ', text))

# Remove all extra spaces
all_data['review_body'] = all_data['review_body'].apply(lambda text: re.sub(' +', ' ', text))

In [None]:
# Clear unnecessary variables to release RAM
del df

## 2. Word Embedding

#### part (a)

In [None]:
# Load 'word2vec-google-news-300' pretrained model
import gensim.downloader as api
wv_g = api.load('word2vec-google-news-300')

In [None]:
# Save the model
# wv_g.save("word2vec.google_model")

In [None]:
w1 = 'good'
w2 = 'nice'
print("Example 1: According to Pretrained Model, the similarity between word '"+w1+"' and '"+w2+"' is "+str(wv_g.similarity(w1, w2)))
print("Similar words have high similarity.")

Example 1: According to Pretrained Model, the similarity between word 'good' and 'nice' is 0.6836092
Similar words have high similarity.


In [None]:
w1 = 'big'
w2 = 'large'
print("Example 2: According to Pretrained Model, the similarity between word '"+w1+"' and '"+w2+"' is "+str(wv_g.similarity(w1, w2)))
print("Similar words have high similarity.")

Example 2: According to Pretrained Model, the similarity between word 'big' and 'large' is 0.5561479
Similar words have high similarity.


In [None]:
vec_king = wv_g['king']
vec_man = wv_g['man']
vec_woman = wv_g['woman']
vec_queen = wv_g['queen']
print("There are 5 most similar words and their similarity with 'King'-'Man'+'Woman':")
print(wv_g.similar_by_vector((vec_king-vec_man+vec_woman), topn=5, restrict_vocab=None))
print("\nThis example shows that 'King'-'Man'+'Woman'= Queen ")

There are 5 most similar words and their similarity with 'King'-'Man'+'Woman':
[('king', 0.8449392318725586), ('queen', 0.7300517559051514), ('monarch', 0.6454660892486572), ('princess', 0.6156251430511475), ('crown_prince', 0.5818676948547363)]

This example shows that 'King'-'Man'+'Woman'= Queen 


#### part (b)

In [None]:
# Train My Model

# Format all reviews into list of list of words for future training
from gensim.models import Word2Vec
raw_sentences = all_data['review_body'].tolist()
all_sentences = []
for each_s in raw_sentences:
  temp = each_s.split()
  all_sentences.append(temp)

In [None]:
# Train My model
mymodel = Word2Vec(size=300, window=11, min_count=10, workers=4)
mymodel.build_vocab(all_sentences)  
mymodel.train(sentences=all_sentences, total_examples=mymodel.corpus_count, epochs=15)  

(169562282, 238679070)

In [None]:
w1 = 'good'
w2 = 'nice'
print("Example 1: According to My Model, the similarity between word '"+w1+"' and '"+w2+"' is "+str(mymodel.similarity(w1, w2)))
print("Similar words have high similarity.")

Example 1: According to My Model, the similarity between word 'good' and 'nice' is 0.6280225
Similar words have high similarity.


In [None]:
w1 = 'big'
w2 = 'large'
print("Example 2: According to My Model, the similarity between word '"+w1+"' and '"+w2+"' is "+str(mymodel.similarity(w1, w2)))
print("Similar words have high similarity.")

Example 2: According to My Model, the similarity between word 'big' and 'large' is 0.62499684
Similar words have high similarity.


In [None]:
vec_king = mymodel.wv['king']
vec_man = mymodel.wv['man']
vec_woman = mymodel.wv['woman']
vec_queen = mymodel.wv['queen']
print("For my model, there are 5 most similar words and their similarity with 'King'-'Man'+'Woman':")
print(mymodel.similar_by_vector((vec_king-vec_man+vec_woman), topn=5, restrict_vocab=None))
print("\nThis example does not show that 'King'-'Man'+'Woman'= 'Queen', ")
print("This is because the those word are not show in reviews frequently, so we do not have enough data to train the model.")

For my model, there are 5 most similar words and their similarity with 'King'-'Man'+'Woman':
[('king', 0.5912432074546814), ('woman', 0.4643744230270386), ('petite', 0.3288615942001343), ('arthur', 0.32821086049079895), ('textured', 0.32040905952453613)]

This example does not show that 'King'-'Man'+'Woman'= 'Queen', 
This is because the those word are not show in reviews frequently, so we do not have enough data to train the model.


In [None]:
mymodel.save("word2vec.mymodel")

In [None]:
print("From the above example result we can see that, the performance of comparing high frequent words between pretrained model and my model are similar.")
print("However, for low frequency word, the pretrained model perform better.")
print("This is resonable because those 'low frequent' words appears more in training data of pretrained model.")

From the above example result we can see that, the performance of comparing high frequent words between pretrained model and my model are similar.
However, for low frequency word, the pretrained model perform better.
This is resonable because those 'low frequent' words appears more in training data of pretrained model.


In [None]:
# Clear unnecessary variables to release RAM
del all_sentences

## 3. Simple Models

#### ------------Pre-processing----------------

In [None]:
# remove the stop words 
clean_data = all_data.copy()
# create a stop word list for english
from nltk.corpus import stopwords
words_list = stopwords.words('english')

# split each review into words and check them one by one, remove the word if it is a stop word, 
# and concate the words finally
def remove_stop(s):
    pieces = s.split()
    result = ''
    for each_word in pieces:
        if each_word not in words_list:
            result = result+' '+each_word
    if len(result)>0:
        result = result[1:]
    return result

clean_data['review_body'] = clean_data['review_body'].apply(remove_stop) 

In [None]:
# create a dataframe for binary data
binary_data = clean_data[clean_data['star_rating']!=3]

####TF-IDF

In [None]:
# TF-IDF Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
review_list = binary_data['review_body'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(review_list)
#print(X.shape)
vector_df = pd.DataFrame.sparse.from_spmatrix(X)

In [None]:
# split train and test, to keep classes distribute evenly, I set the stratify to label list.
from sklearn.model_selection import train_test_split
binary_label = binary_data['label']
x_train, x_test, y_train, y_test = train_test_split(vector_df,binary_label, test_size=0.2,random_state=2,stratify=binary_label)

In [None]:
# Perceptron
from sklearn.linear_model import Perceptron
clf = Perceptron(random_state=2)
clf.fit(x_train, y_train)

# train + test prediction

# train
y_pred = clf.predict(x_train)
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = '\nFor TF-IDF + Perceptron model, the accuracy, precision, recall and f1-score of training dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+". "

# test
y_pred = clf.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = answer_str+'The accuracy, precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+"."

print(answer_str)


For TF-IDF + Perceptron model, the accuracy, precision, recall and f1-score of training dataset are 0.982375, 0.9790218470705064, 0.985875, 0.9824364723467862. The accuracy, precision, recall and f1-score of testing dataset are 0.806, 0.792822966507177, 0.8285, 0.8102689486552567.


In [None]:
# --------------------SVM----------------------------
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state=2)
lsvc.fit(x_train, y_train)

# train + test prediction

# train
y_pred = lsvc.predict(x_train)
tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = '\nFor TF-IDF + SVM model, the accuracy, precision, recall and f1-score of training dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+". "

# test
y_pred = lsvc.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = answer_str+'The accuracy, precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+"."

print(answer_str)


For TF-IDF + SVM model, the accuracy, precision, recall and f1-score of training dataset are 0.9651875, 0.965944660072618, 0.964375, 0.9651591918433727. The accuracy, precision, recall and f1-score of testing dataset are 0.8445, 0.843812375249501, 0.8455, 0.8446553446553448.


#### prepare train and test data

In [None]:
def compute_avg(s,input_model):
  words = s.split()
  num_of_words = len(words)
  if num_of_words==0:
    return [0]*300
  else:
    current_total_vector = [0]*300
    for each_word in words:
      try:
        word_vector = input_model[each_word]
        current_total_vector = [a+b for a,b in zip(current_total_vector,word_vector)]
      except:
        pass
    avg_vector = [x / num_of_words for x in current_total_vector]
    return avg_vector

def get_avg_W2V(input_model1,df1):
  temp_df = df1[['review_body']].copy()
  temp_df['all_vec'] = temp_df.apply(lambda y: compute_avg(y['review_body'],input_model1),axis = 1)
  vec_df = temp_df['all_vec'].apply(pd.Series)
  return vec_df

In [None]:
# Compute average and Split ternary data into train and test part for 2 Word2Vec models
from sklearn.model_selection import train_test_split

ternary_label = clean_data['label']

# my model
my_vec_df_ternary = get_avg_W2V(mymodel.wv,clean_data)
x_train_3_my, x_test_3_my, y_train_3_my, y_test_3_my = train_test_split(my_vec_df_ternary,ternary_label, test_size=0.2,random_state=2,stratify=ternary_label)

# pretrained model
g_vec_df_ternary = get_avg_W2V(wv_g,clean_data)
x_train_3_g, x_test_3_g, y_train_3_g, y_test_3_g = train_test_split(g_vec_df_ternary,ternary_label, test_size=0.2,random_state=2,stratify=ternary_label)

In [None]:
# Compute average and Split Binary data into train and test part for 2 Word2Vec models

# my model
my_data_df_ternary = my_vec_df_ternary.copy()
my_data_df_ternary['label'] = ternary_label
my_data_df_binary = my_data_df_ternary[my_data_df_ternary['label']!=1]
my_vec_df_binary = my_data_df_binary.iloc[:, :-1].values
my_binary_label = my_data_df_binary.iloc[:, 300].values
my_binary_label = np.where(my_binary_label == 2, 1, my_binary_label)
x_train_2_my, x_test_2_my, y_train_2_my, y_test_2_my = train_test_split(my_vec_df_binary,my_binary_label, test_size=0.2,random_state=2,stratify=my_binary_label)

# pretrained model
g_data_df_ternary = g_vec_df_ternary.copy()
g_data_df_ternary['label'] = ternary_label
g_data_df_binary = g_data_df_ternary[g_data_df_ternary['label']!=1]
g_vec_df_binary = g_data_df_binary.iloc[:, :-1].values
g_binary_label = g_data_df_binary.iloc[:, 300].values
g_binary_label = np.where(g_binary_label == 2, 1, g_binary_label)
x_train_2_g, x_test_2_g, y_train_2_g, y_test_2_g = train_test_split(g_vec_df_binary,g_binary_label, test_size=0.2,random_state=2,stratify=g_binary_label)


#### My Word2Vec

In [None]:
# Perceptron
from sklearn.linear_model import Perceptron
clf = Perceptron(random_state=2)
clf.fit(x_train_2_my, y_train_2_my)

# train + test prediction

# train
y_pred = clf.predict(x_train_2_my)
tn, fp, fn, tp = confusion_matrix(y_train_2_my, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = '\nFor my Word2Vec + Perceptron model, the accuracy, precision, recall and f1-score of training dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+". "

# test
y_pred = clf.predict(x_test_2_my)
tn, fp, fn, tp = confusion_matrix(y_test_2_my, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = answer_str+'The accuracy, precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+"."

print(answer_str)


For my Word2Vec + Perceptron model, the accuracy, precision, recall and f1-score of training dataset are 0.73401875, 0.8380003249742729, 0.5802, 0.6856687027749669. The accuracy, precision, recall and f1-score of testing dataset are 0.73325, 0.839618520675597, 0.57665, 0.6837206544937159.


In [None]:
# --------------------SVM---------------------------- 
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state=2)
lsvc.fit(x_train_2_my, y_train_2_my)

# train + test prediction

# train
y_pred = lsvc.predict(x_train_2_my)
tn, fp, fn, tp = confusion_matrix(y_train_2_my, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = '\nFor my Word2Vec + SVM model, the accuracy, precision, recall and f1-score of training dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+". "

# test
y_pred = lsvc.predict(x_test_2_my)
tn, fp, fn, tp = confusion_matrix(y_test_2_my, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = answer_str+'The accuracy, precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+"."

print(answer_str)


For my Word2Vec + SVM model, the accuracy, precision, recall and f1-score of training dataset are 0.84928125, 0.8577189456300488, 0.8374875, 0.8474824966954014. The accuracy, precision, recall and f1-score of testing dataset are 0.849475, 0.859338851472932, 0.83575, 0.8473802945425972.


#### pre_trained Word2Vec

In [None]:
# Perceptron            
from sklearn.linear_model import Perceptron
clf = Perceptron(random_state=2)
clf.fit(x_train_2_g, y_train_2_g)

# train + test prediction

# train
y_pred = clf.predict(x_train_2_g)
tn, fp, fn, tp = confusion_matrix(y_train_2_g, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = '\nFor pretrained Word2Vec + Perceptron model, the accuracy, precision, recall and f1-score of training dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+". "

# test
y_pred = clf.predict(x_test_2_g)
tn, fp, fn, tp = confusion_matrix(y_test_2_g, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = answer_str+'The accuracy, precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+"."

print(answer_str)


For pretrained Word2Vec + Perceptron model, the accuracy, precision, recall and f1-score of training dataset are 0.665025, 0.6025732666190136, 0.96945, 0.743201027272553. The accuracy, precision, recall and f1-score of testing dataset are 0.663975, 0.601705690804776, 0.9701, 0.7427313618528089.


In [None]:
# --------------------SVM---------------------------- 
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state=2)
lsvc.fit(x_train_2_g, y_train_2_g)

# train + test prediction

# train
y_pred = lsvc.predict(x_train_2_g)
tn, fp, fn, tp = confusion_matrix(y_train_2_g, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = '\nFor pretrained Word2Vec + SVM model, the accuracy, precision, recall and f1-score of training dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+". "

# test
y_pred = lsvc.predict(x_test_2_g)
tn, fp, fn, tp = confusion_matrix(y_test_2_g, y_pred).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))

answer_str = answer_str+'The accuracy, precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(accuracy)+", "+str(precision)+", "+str(recall)+", "+str(f1_score)+"."

print(answer_str)


For pretrained Word2Vec + SVM model, the accuracy, precision, recall and f1-score of training dataset are 0.82218125, 0.83827022770523, 0.7984, 0.8178494830180224. The accuracy, precision, recall and f1-score of testing dataset are 0.820325, 0.8358232426482152, 0.79725, 0.8160810707065537.


###### Conclusion for Q uestion 3:
In homework 1, 

## 4. Feedforward Neutral Networks

#### part (a)

In [None]:
# import everything and set device
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
#import torchvision.datasets as datasets # standard datasets
import torchvision.transforms as transforms # data processing
#import torch.utils.data.TensorDataset #as TensorDataset
from torch.utils.data import TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#### Binary

In [None]:
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)



class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        hidden_1 = 50
        hidden_2 = 10
        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 1)
        
        
    def forward(self, x):
        x = torch.relu(self.fc1(x)) #torch.relu()
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        #x = torch.round(x)
        #x = torch.softmax(x)
        return x

#### Pretrained Model

In [None]:
# Split train data into train and valid data
x_puretrain_2_g, x_valid_2_g, y_puretrain_2_g, y_valid_2_g = train_test_split(x_train_2_g,y_train_2_g, test_size=0.2,random_state=2,stratify=y_train_2_g)

# convert dataframe to dataset
## train data     
train_data = trainData(torch.tensor(x_puretrain_2_g), torch.tensor(y_puretrain_2_g))

## validation data
validation_data = trainData(torch.tensor(x_valid_2_g), torch.tensor(y_valid_2_g))

## test data    
test_data = testData(torch.tensor(x_test_2_g))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
b_model = binaryClassification()
#b_model.to(device)

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(b_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 40
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    b_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      y_pred = b_model(X_batch)
      loss = criterion(y_pred.flatten(), y_batch.float())
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    b_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = b_model(data)
        # calculate the loss
        loss = criterion(output.flatten(), target.float())
        # update running validation loss 
        valid_loss += loss.item()

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(b_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.398327).  Saving model ...
Epoch 000: | Train Loss: 0.42731 | Validation Loss: 0.39833
Validation loss decreased (0.398327 --> 0.384852).  Saving model ...
Epoch 001: | Train Loss: 0.39108 | Validation Loss: 0.38485
Validation loss decreased (0.384852 --> 0.374852).  Saving model ...
Epoch 002: | Train Loss: 0.37920 | Validation Loss: 0.37485
Validation loss decreased (0.374852 --> 0.372472).  Saving model ...
Epoch 003: | Train Loss: 0.37076 | Validation Loss: 0.37247
Validation loss decreased (0.372472 --> 0.363599).  Saving model ...
Epoch 004: | Train Loss: 0.36345 | Validation Loss: 0.36360
Validation loss decreased (0.363599 --> 0.359321).  Saving model ...
Epoch 005: | Train Loss: 0.35664 | Validation Loss: 0.35932
Validation loss decreased (0.359321 --> 0.359084).  Saving model ...
Epoch 006: | Train Loss: 0.35110 | Validation Loss: 0.35908
Validation loss decreased (0.359084 --> 0.352811).  Saving model ...
Epoch 007: | Train Loss: 0.34577 

In [None]:
# Load the state of model
b_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# predict test data
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = model(X_batch)
        prediction_list.append(output.item())
    
    return [round(num) for num in prediction_list]

In [None]:
# calculate accuracy
y_pred = predict(b_model, test_loader)  
tn, fp, fn, tp = confusion_matrix(y_test_2_g, np.array(y_pred)).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + Pretrained model, the accuracy for feedforward neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + Pretrained model, the accuracy for feedforward neural network is 0.84775
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.8488663723916533, 0.84615, 0.8475060096153845.


#### My model

In [None]:
# split training data into train and valid 
x_puretrain_2_my, x_valid_2_my, y_puretrain_2_my, y_valid_2_my = train_test_split(x_train_2_my,y_train_2_my, test_size=0.2,random_state=2,stratify=y_train_2_my)

## train data     
train_data = trainData(torch.tensor(x_puretrain_2_my), torch.tensor(y_puretrain_2_my))

## validation data
validation_data = trainData(torch.tensor(x_valid_2_my), torch.tensor(y_valid_2_my))

## test data    
test_data = testData(torch.tensor(x_test_2_my))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
b_model = binaryClassification()
b_model.to(device)

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(b_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 40
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    b_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      
      y_pred = b_model(X_batch)
      loss = criterion(y_pred.flatten(), y_batch.float())
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    b_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        data, target = data.to(device), target.to(device)
        output = b_model(data)
        # calculate the loss
        loss = criterion(output.flatten(), target.float())
        # update running validation loss 
        valid_loss += loss.item()

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(b_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.332100).  Saving model ...
Epoch 000: | Train Loss: 0.36298 | Validation Loss: 0.33210
Validation loss decreased (0.332100 --> 0.320255).  Saving model ...
Epoch 001: | Train Loss: 0.32838 | Validation Loss: 0.32026
Validation loss decreased (0.320255 --> 0.314852).  Saving model ...
Epoch 002: | Train Loss: 0.31541 | Validation Loss: 0.31485
Epoch 003: | Train Loss: 0.30576 | Validation Loss: 0.31497
Validation loss decreased (0.314852 --> 0.307469).  Saving model ...
Epoch 004: | Train Loss: 0.29751 | Validation Loss: 0.30747
Validation loss decreased (0.307469 --> 0.306802).  Saving model ...
Epoch 005: | Train Loss: 0.29061 | Validation Loss: 0.30680
Epoch 006: | Train Loss: 0.28493 | Validation Loss: 0.31264
Validation loss decreased (0.306802 --> 0.306381).  Saving model ...
Epoch 007: | Train Loss: 0.27962 | Validation Loss: 0.30638
Epoch 008: | Train Loss: 0.27470 | Validation Loss: 0.30797
Validation loss decreased (0.306381 --> 0.304983). 

In [None]:
b_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# predict test data
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        X_batch = X_batch.to(device)
        output = model(X_batch)
        prediction_list.append(output.item())
    
    return [round(num) for num in prediction_list]

In [None]:
# calculate accuracy
y_pred = predict(b_model, test_loader)  
tn, fp, fn, tp = confusion_matrix(y_test_2_my, np.array(y_pred)).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + My model, the accuracy for feedforward neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + My model, the accuracy for feedforward neural network is 0.86965
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.8695021991203519, 0.86985, 0.8696760647870425.


#### Ternary

In [None]:
class ternaryClassification(nn.Module):
    def __init__(self):
        super(ternaryClassification,self).__init__()
        hidden_1 = 50
        hidden_2 = 10
        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        
        
    def forward(self, x):
        x = self.fc1(x) #torch.relu()
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        x = torch.relu(x)
        #x = torch.sigmoid(self.fc3(x))
        #x = torch.round(x)
        return x

#### Pretrained Data

In [None]:
# split train into train and valid 
x_puretrain_3_g, x_valid_3_g, y_puretrain_3_g, y_valid_3_g = train_test_split(x_train_3_g,y_train_3_g, test_size=0.2,random_state=2,stratify=y_train_3_g)

## train data     
train_data = trainData(torch.tensor(x_puretrain_3_g.values), torch.tensor(y_puretrain_3_g.values))

## validation data
validation_data = trainData(torch.tensor(x_valid_3_g.values), torch.tensor(y_valid_3_g.values))

## test data    
test_data = testData(torch.tensor(x_test_3_g.values))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
t_model = ternaryClassification()
#b_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(t_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 50
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    t_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      
      y_pred = t_model(X_batch)
      loss = criterion(y_pred, y_batch)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    t_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = t_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()#*data.size(0)

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(t_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.786512).  Saving model ...
Epoch 000: | Train Loss: 0.81122 | Validation Loss: 0.78651
Validation loss decreased (0.786512 --> 0.780143).  Saving model ...
Epoch 001: | Train Loss: 0.77864 | Validation Loss: 0.78014
Validation loss decreased (0.780143 --> 0.779010).  Saving model ...
Epoch 002: | Train Loss: 0.77409 | Validation Loss: 0.77901
Validation loss decreased (0.779010 --> 0.774775).  Saving model ...
Epoch 003: | Train Loss: 0.77079 | Validation Loss: 0.77478
Epoch 004: | Train Loss: 0.76807 | Validation Loss: 0.77729
Epoch 005: | Train Loss: 0.76508 | Validation Loss: 0.77837
Validation loss decreased (0.774775 --> 0.770540).  Saving model ...
Epoch 006: | Train Loss: 0.76183 | Validation Loss: 0.77054
Validation loss decreased (0.770540 --> 0.766265).  Saving model ...
Epoch 007: | Train Loss: 0.75788 | Validation Loss: 0.76626
Validation loss decreased (0.766265 --> 0.765228).  Saving model ...
Epoch 008: | Train Loss: 0.75484 | Validat

In [None]:
t_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# predict test data
from torch import IntTensor
def predict3class(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = torch.argmax(model(X_batch))
        prediction_list.append(IntTensor.item(output))
    
    return prediction_list#[round(num) for num in prediction_list]

def get_accuracy(y_true, y_pred):
  counter = 0
  for i in range(0,len(y_true)):
    if y_true[i]==y_pred[i]:
      counter = counter+1
  return counter/len(y_true)

In [None]:
# calculate accuracy
y_pred = predict3class(t_model, test_loader)  
accuracy = get_accuracy(y_test_3_g.to_list(), y_pred)
print("For Ternary data + Pretrained model, the accuracy for feedforward neural network is "+str(accuracy))

For Ternary data + Pretrained model, the accuracy for feedforward neural network is 0.6782


#### My Model

In [None]:
# split train into train and valid   
x_puretrain_3_my, x_valid_3_my, y_puretrain_3_my, y_valid_3_my = train_test_split(x_train_3_my,y_train_3_my, test_size=0.2,random_state=2,stratify=y_train_3_my)
## train data     
train_data = trainData(torch.tensor(x_puretrain_3_my.values), torch.tensor(y_puretrain_3_my.values))

## validation data
validation_data = trainData(torch.tensor(x_valid_3_my.values), torch.tensor(y_valid_3_my.values))

## test data    
test_data = testData(torch.tensor(x_test_3_my.values))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
t_model = ternaryClassification()
#b_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(t_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 30
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    t_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      
      y_pred = t_model(X_batch)
      loss = criterion(y_pred, y_batch)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    t_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = t_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()#*data.size(0)

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(t_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.734760).  Saving model ...
Epoch 000: | Train Loss: 0.75256 | Validation Loss: 0.73476
Validation loss decreased (0.734760 --> 0.730281).  Saving model ...
Epoch 001: | Train Loss: 0.72555 | Validation Loss: 0.73028
Validation loss decreased (0.730281 --> 0.720353).  Saving model ...
Epoch 002: | Train Loss: 0.71804 | Validation Loss: 0.72035
Validation loss decreased (0.720353 --> 0.717478).  Saving model ...
Epoch 003: | Train Loss: 0.71215 | Validation Loss: 0.71748
Validation loss decreased (0.717478 --> 0.715590).  Saving model ...
Epoch 004: | Train Loss: 0.70773 | Validation Loss: 0.71559
Validation loss decreased (0.715590 --> 0.714481).  Saving model ...
Epoch 005: | Train Loss: 0.70436 | Validation Loss: 0.71448
Validation loss decreased (0.714481 --> 0.712818).  Saving model ...
Epoch 006: | Train Loss: 0.70195 | Validation Loss: 0.71282
Epoch 007: | Train Loss: 0.69960 | Validation Loss: 0.71536
Validation loss decreased (0.712818 --> 0.

In [None]:
t_model.load_state_dict(torch.load('model.pt'))

In [None]:
# calculate accuracy
y_pred = predict3class(t_model, test_loader)  
accuracy = get_accuracy(y_test_3_my.to_list(), y_pred)
print("For Ternary data + My model, the accuracy for feedforward neural network is "+str(accuracy))

0.6628

#### part (b)

#### create f10 data

In [None]:
def compute_avg(s,input_model):
  words = s.split()
  num_of_words = len(words)
  if num_of_words==0:
    return [0]*300
  else:
    counter = 0
    current_total_vector = [0]*300
    for each_word in words:
      try:
        word_vector = input_model[each_word]
        current_total_vector = [a+b for a,b in zip(current_total_vector,word_vector)]
        counter = counter+1
      except:
        pass
      if counter >=10:
        break
    if counter>0:
      avg_vector = [x / counter for x in current_total_vector]
    else:
      avg_vector = [0]*300
    return avg_vector

def get_avg_W2V(input_model1,df1):
  temp_df = df1[['review_body']].copy()
  temp_df['all_vec'] = temp_df.apply(lambda y: compute_avg(y['review_body'],input_model1),axis = 1)
  vec_df = temp_df['all_vec'].apply(pd.Series)
  return vec_df

In [None]:
# For ternary data, compute the average for the first 10 words for both Word2Vec models
from sklearn.model_selection import train_test_split
ternary_label = clean_data['label']

# my model
my_vec_df_ternary_f10 = get_avg_W2V(mymodel.wv,clean_data)
x_train_3_my_f10, x_test_3_my_f10, y_train_3_my_f10, y_test_3_my_f10 = train_test_split(my_vec_df_ternary_f10,ternary_label, test_size=0.2,random_state=2,stratify=ternary_label)

# google model
g_vec_df_ternary_f10 = get_avg_W2V(wv_g,clean_data)
x_train_3_g_f10, x_test_3_g_f10, y_train_3_g_f10, y_test_3_g_f10 = train_test_split(g_vec_df_ternary_f10,ternary_label, test_size=0.2,random_state=2,stratify=ternary_label)

In [None]:
#--------------------binary data-------------------------- 
# my model
my_data_df_ternary_f10 = my_vec_df_ternary_f10.copy()
my_data_df_ternary_f10['label'] = ternary_label
my_data_df_binary_f10 = my_data_df_ternary_f10[my_data_df_ternary_f10['label']!=1]
my_vec_df_binary_f10 = my_data_df_binary_f10.iloc[:, :-1].values
my_binary_label_f10 = my_data_df_binary_f10.iloc[:, 300].values
my_binary_label_f10 = np.where(my_binary_label_f10 == 2, 1, my_binary_label_f10)
x_train_2_my_f10, x_test_2_my_f10, y_train_2_my_f10, y_test_2_my_f10 = train_test_split(my_vec_df_binary_f10,my_binary_label_f10, test_size=0.2,random_state=2,stratify=my_binary_label_f10)

# google model
g_data_df_ternary_f10 = g_vec_df_ternary_f10.copy()
g_data_df_ternary_f10['label'] = ternary_label
g_data_df_binary_f10 = g_data_df_ternary_f10[g_data_df_ternary_f10['label']!=1]
g_vec_df_binary_f10 = g_data_df_binary_f10.iloc[:, :-1].values
g_binary_label_f10 = g_data_df_binary_f10.iloc[:, 300].values
g_binary_label_f10 = np.where(g_binary_label_f10 == 2, 1, g_binary_label_f10)
x_train_2_g_f10, x_test_2_g_f10, y_train_2_g_f10, y_test_2_g_f10 = train_test_split(g_vec_df_binary_f10,g_binary_label_f10, test_size=0.2,random_state=2,stratify=g_binary_label_f10)

#### binary

#### Pretrained model

In [None]:
# split train into train and valid 
x_puretrain_2_g_f10, x_valid_2_g_f10, y_puretrain_2_g_f10, y_valid_2_g_f10 = train_test_split(x_train_2_g,y_train_2_g_f10, test_size=0.2,random_state=2,stratify=y_train_2_g_f10)

## train data     
train_data = trainData(torch.tensor(x_puretrain_2_g_f10), torch.tensor(y_puretrain_2_g_f10))

## validation data
validation_data = trainData(torch.tensor(x_valid_2_g_f10), torch.tensor(y_valid_2_g_f10))

## test data    
test_data = testData(torch.tensor(x_test_2_g_f10))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
b_model = binaryClassification()
#b_model.to(device)

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(b_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 30
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    b_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      
      y_pred = b_model(X_batch)
      loss = criterion(y_pred.flatten(), y_batch.float())
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    b_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = b_model(data)
        # calculate the loss
        loss = criterion(output.flatten(), target.float())
        # update running validation loss 
        valid_loss += loss.item()

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(b_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.449485).  Saving model ...
Epoch 000: | Train Loss: 0.55349 | Validation Loss: 0.44949
Validation loss decreased (0.449485 --> 0.420397).  Saving model ...
Epoch 001: | Train Loss: 0.43723 | Validation Loss: 0.42040
Validation loss decreased (0.420397 --> 0.417392).  Saving model ...
Epoch 002: | Train Loss: 0.41975 | Validation Loss: 0.41739
Validation loss decreased (0.417392 --> 0.408047).  Saving model ...
Epoch 003: | Train Loss: 0.41063 | Validation Loss: 0.40805
Validation loss decreased (0.408047 --> 0.406333).  Saving model ...
Epoch 004: | Train Loss: 0.40370 | Validation Loss: 0.40633
Validation loss decreased (0.406333 --> 0.404096).  Saving model ...
Epoch 005: | Train Loss: 0.39799 | Validation Loss: 0.40410
Validation loss decreased (0.404096 --> 0.401656).  Saving model ...
Epoch 006: | Train Loss: 0.39356 | Validation Loss: 0.40166
Epoch 007: | Train Loss: 0.38993 | Validation Loss: 0.40220
Epoch 008: | Train Loss: 0.38556 | Validat

In [None]:
b_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# predict test data
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = model(X_batch)
        prediction_list.append(output.item())
    
    return [round(num) for num in prediction_list]

In [None]:
# calculate accuracy
y_pred = predict(b_model, test_loader)  
tn, fp, fn, tp = confusion_matrix(y_test_2_g_f10, np.array(y_pred)).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + Pretrained model + first 10 vectors, the accuracy for feedforward neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + Pretrained model + first 10 vectors, the accuracy for feedforward neural network is 0.764
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.7595870206489675, 0.7725, 0.7659890927119484.


#### My model

In [None]:
# split train into train and valid
x_puretrain_2_my_f10, x_valid_2_my_f10, y_puretrain_2_my_f10, y_valid_2_my_f10 = train_test_split(x_train_2_my_f10,y_train_2_my_f10, test_size=0.2,random_state=2,stratify=y_train_2_my_f10)

## train data     
train_data = trainData(torch.tensor(x_puretrain_2_my_f10), torch.tensor(y_puretrain_2_my_f10))

## validation data
validation_data = trainData(torch.tensor(x_valid_2_my_f10), torch.tensor(y_valid_2_my_f10))

## test data    
test_data = testData(torch.tensor(x_test_2_my_f10))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
b_model = binaryClassification()
#b_model.to(device)

criterion = nn.BCELoss()

optimizer = torch.optim.Adam(b_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 30
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    b_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      y_pred = b_model(X_batch)
      loss = criterion(y_pred.flatten(), y_batch.float())
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    b_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = b_model(data)
        # calculate the loss
        loss = criterion(output.flatten(), target.float())
        # update running validation loss 
        valid_loss += loss.item()

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(b_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.485828).  Saving model ...
Epoch 000: | Train Loss: 0.53516 | Validation Loss: 0.48583
Validation loss decreased (0.485828 --> 0.477947).  Saving model ...
Epoch 001: | Train Loss: 0.47854 | Validation Loss: 0.47795
Validation loss decreased (0.477947 --> 0.475256).  Saving model ...
Epoch 002: | Train Loss: 0.46655 | Validation Loss: 0.47526
Validation loss decreased (0.475256 --> 0.473613).  Saving model ...
Epoch 003: | Train Loss: 0.45866 | Validation Loss: 0.47361
Validation loss decreased (0.473613 --> 0.470081).  Saving model ...
Epoch 004: | Train Loss: 0.45048 | Validation Loss: 0.47008
Epoch 005: | Train Loss: 0.44150 | Validation Loss: 0.47333
Epoch 006: | Train Loss: 0.43419 | Validation Loss: 0.47038
Epoch 007: | Train Loss: 0.42693 | Validation Loss: 0.47148
Epoch 008: | Train Loss: 0.41784 | Validation Loss: 0.47633
Epoch 009: | Train Loss: 0.41136 | Validation Loss: 0.47656
Epoch 010: | Train Loss: 0.40389 | Validation Loss: 0.48298


In [None]:
b_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# predict test data
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = model(X_batch)
        prediction_list.append(output.item())
    
    return [round(num) for num in prediction_list]

In [None]:
# calculate accuracy
y_pred = predict(b_model, test_loader)  
tn, fp, fn, tp = confusion_matrix(y_test_2_my_f10, np.array(y_pred)).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + My model + first 10 vectors, the accuracy for feedforward neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + My model + first 10 vectors, the accuracy for feedforward neural network is 0.77575
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.7767185148018063, 0.774, 0.7753568745304283.


#### Ternary

#### Pretrained Model

In [None]:
# split train into train and valid   
x_puretrain_3_g_f10, x_valid_3_g_f10, y_puretrain_3_g_f10, y_valid_3_g_f10 = train_test_split(x_train_3_g_f10,y_train_3_g_f10, test_size=0.2,random_state=2,stratify=y_train_3_g_f10)

## train data     
train_data = trainData(torch.tensor(x_puretrain_3_g_f10.values), torch.tensor(y_puretrain_3_g_f10.values))

## validation data
validation_data = trainData(torch.tensor(x_valid_3_g_f10.values), torch.tensor(y_valid_3_g_f10.values))

## test data    
test_data = testData(torch.tensor(x_test_3_g_f10.values))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
t_model = ternaryClassification()
#b_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(t_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 30
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    t_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      
      y_pred = t_model(X_batch)
      loss = criterion(y_pred, y_batch)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    t_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = t_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(t_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.993760).  Saving model ...
Epoch 000: | Train Loss: 1.01531 | Validation Loss: 0.99376
Validation loss decreased (0.993760 --> 0.991653).  Saving model ...
Epoch 001: | Train Loss: 0.98476 | Validation Loss: 0.99165
Validation loss decreased (0.991653 --> 0.990568).  Saving model ...
Epoch 002: | Train Loss: 0.97987 | Validation Loss: 0.99057
Epoch 003: | Train Loss: 0.97741 | Validation Loss: 0.99184
Epoch 004: | Train Loss: 0.97572 | Validation Loss: 0.99077
Epoch 005: | Train Loss: 0.97346 | Validation Loss: 0.99127
Epoch 006: | Train Loss: 0.97240 | Validation Loss: 0.99186
Epoch 007: | Train Loss: 0.97112 | Validation Loss: 0.99253
Epoch 008: | Train Loss: 0.97020 | Validation Loss: 0.99478
Epoch 009: | Train Loss: 0.96822 | Validation Loss: 0.99236
Epoch 010: | Train Loss: 0.96818 | Validation Loss: 0.99385
Epoch 011: | Train Loss: 0.96797 | Validation Loss: 0.99252
Epoch 012: | Train Loss: 0.96679 | Validation Loss: 0.99524
Epoch 013: | Train

In [None]:
t_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# calculate accuracy
y_pred = predict3class(t_model, test_loader)  
accuracy = get_accuracy(y_test_3_g_f10.to_list(), y_pred)
print("For Ternary data + Pretrained model + first 10 vectors, the accuracy for feedforward neural network is "+str(accuracy))

For Ternary data + Pretrained model + first 10 vectors, the accuracy for feedforward neural network is 0.5988


#### My Model

In [None]:
# split train into train and valid 
x_puretrain_3_my_f10, x_valid_3_my_f10, y_puretrain_3_my_f10, y_valid_3_my_f10 = train_test_split(x_train_3_my_f10,y_train_3_my_f10, test_size=0.2,random_state=2,stratify=y_train_3_my_f10)
## train data     
train_data = trainData(torch.tensor(x_puretrain_3_my_f10.values), torch.tensor(y_puretrain_3_my_f10.values))

## validation data
validation_data = trainData(torch.tensor(x_valid_3_my_f10.values), torch.tensor(y_valid_3_my_f10.values))

## test data    
test_data = testData(torch.tensor(x_test_3_my_f10.values))

In [None]:
BATCH_SIZE=32
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=validation_data, batch_size=1)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
LEARNING_RATE = 0.0005
t_model = ternaryClassification()
#b_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(t_model.parameters(), lr=LEARNING_RATE)

In [None]:
# train model and save the parameters for least validation loss
N_EPOCHS = 30
valid_loss_min = np.Inf
for e in range(0, N_EPOCHS):
    train_loss = 0
    valid_loss = 0
    t_model.train()
    for X_batch, y_batch in train_loader:
      X_batch = X_batch.float()
      #X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()
      y_pred = t_model(X_batch)
      loss = criterion(y_pred, y_batch)
      loss.backward()
      optimizer.step()
      train_loss += loss.item()

    t_model.eval() # prep model for evaluation
    for data, target in valid_loader:
        data = data.float()
        output = t_model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update running validation loss 
        valid_loss += loss.item()#*data.size(0)

    if (valid_loss/len(valid_loader)) <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss/len(valid_loader)))
        torch.save(t_model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss/len(valid_loader)
        

    print(f'Epoch {e+0:03}: | Train Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(valid_loader):.5f}')

Validation loss decreased (inf --> 0.871216).  Saving model ...
Epoch 000: | Train Loss: 0.92821 | Validation Loss: 0.87122
Validation loss decreased (0.871216 --> 0.863455).  Saving model ...
Epoch 001: | Train Loss: 0.86635 | Validation Loss: 0.86345
Epoch 002: | Train Loss: 0.85840 | Validation Loss: 0.86355
Validation loss decreased (0.863455 --> 0.853500).  Saving model ...
Epoch 003: | Train Loss: 0.85217 | Validation Loss: 0.85350
Validation loss decreased (0.853500 --> 0.851163).  Saving model ...
Epoch 004: | Train Loss: 0.84575 | Validation Loss: 0.85116
Epoch 005: | Train Loss: 0.84172 | Validation Loss: 0.85217
Validation loss decreased (0.851163 --> 0.849143).  Saving model ...
Epoch 006: | Train Loss: 0.83733 | Validation Loss: 0.84914
Epoch 007: | Train Loss: 0.83465 | Validation Loss: 0.85494
Epoch 008: | Train Loss: 0.83006 | Validation Loss: 0.85059
Epoch 009: | Train Loss: 0.82777 | Validation Loss: 0.85142
Epoch 010: | Train Loss: 0.82424 | Validation Loss: 0.85637


In [None]:
t_model.load_state_dict(torch.load('model.pt'))

<All keys matched successfully>

In [None]:
# calculate accuracy
y_pred = predict3class(t_model, test_loader)  
accuracy = get_accuracy(y_test_3_my_f10.to_list(), y_pred)
print("For Ternary data + My model + first 10 vectors, the accuracy for feedforward neural network is "+str(accuracy))

For Ternary data + My model + first 10 vectors, the accuracy for feedforward neural network is 0.6196


## 5. Recurrent Neutral Networks

#### part (a)

#### Binary

#### train test split

In [None]:
def convert_label(l):
  if l==2:
    return 1;
  else:
    return l;
binary_data['label']=binary_data['label'].apply(convert_label)
binary_train, binary_test = train_test_split(binary_data, test_size=0.2, random_state=2)

#### My Model

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        #output = self.softmax(output)
        output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn = RNN(300, n_hidden, 1)
LEARNING_RATE = 0.001
criterion = nn.BCELoss()#CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn.initHidden()
  optimizer.zero_grad()
  words = review.split()
  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn(input, hidden)
    
  loss = criterion(output.flatten(), torch.tensor([y_true]).float())
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn.initHidden()
  words = review.split()

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn(input, hidden)
    
  return output

In [None]:
num_train = binary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = binary_train.iloc[i].loc['review_body']
  label = binary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, mymodel)

In [None]:
num_test = binary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = binary_test.iloc[i].loc['review_body']
  label = binary_test.iloc[i].loc['label']
  output = rnn_test(sentence, mymodel)
  output = round(output.item())
  output_list.append(output)
  
tn, fp, fn, tp = confusion_matrix(binary_test['label'], output_list).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + My model, the accuracy for recurrent neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + My model, the accuracy for recurrent neural network is 0.5295
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.5204532891100055, 0.9275862068965517, 0.6667847025495751.


#### Pretrained Model

In [None]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        #output = self.softmax(output)
        output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn = RNN(300, n_hidden, 1)
LEARNING_RATE = 0.001
criterion = nn.BCELoss()#CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn.initHidden()
  optimizer.zero_grad()
  words = review.split()

  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn(input, hidden)
    
  loss = criterion(output.flatten(), torch.tensor([y_true]).float())
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn.initHidden()
  words = review.split()

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn(input, hidden)
    
  return output

In [None]:
num_train = binary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = binary_train.iloc[i].loc['review_body']
  label = binary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, wv_g)

In [None]:
num_test = binary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = binary_test.iloc[i].loc['review_body']
  label = binary_test.iloc[i].loc['label']
  output = rnn_test(sentence, wv_g)
  output = round(output.item())
  output_list.append(output)

  
tn, fp, fn, tp = confusion_matrix(binary_test['label'], output_list).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + Pretrained model, the accuracy for recurrent neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + Pretrained model, the accuracy for recurrent neural network is 0.52775
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.5203816131830009, 0.8866995073891626, 0.655857168883221.


#### ternary

#### train test split

In [None]:
ternary_train, ternary_test = train_test_split(clean_data, test_size=0.2, random_state=2)

#### My Model

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        #output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn = RNN(300, n_hidden, 3)
LEARNING_RATE = 0.002
criterion = nn.CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn.initHidden()
  optimizer.zero_grad()
  words = review.split()
  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn(input, hidden)
    
  loss = criterion(output, torch.tensor([y_true]))
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn.initHidden()
  words = review.split()

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn(input, hidden)
    
  return output

In [None]:
num_train = ternary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = ternary_train.iloc[i].loc['review_body']
  label = ternary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, mymodel)

In [None]:
num_test = ternary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = ternary_test.iloc[i].loc['review_body']
  output = rnn_test(sentence, mymodel)
  output = torch.argmax(output)
  output = IntTensor.item(output)
  #output = round(output.item())
  output_list.append(output)

In [None]:
# predict test data
from torch import IntTensor
def predict3class(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = torch.argmax(model(X_batch))
        prediction_list.append(IntTensor.item(output))
    
    return prediction_list#[round(num) for num in prediction_list]

def get_accuracy(y_true, y_pred):
  counter = 0
  for i in range(0,len(y_true)):
    if y_true[i]==y_pred[i]:
      counter = counter+1
  return counter/len(y_true)

In [None]:
y_true = ternary_test['label'].tolist()
accuracy = get_accuracy(y_true, output_list)
print("For Ternary data + My model, the accuracy for recurrent neural network is "+str(accuracy))

For Ternary data + My model, the accuracy for recurrent neural network is 0.4054


#### Pretrained Model

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        #output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn = RNN(300, n_hidden, 3)
LEARNING_RATE = 0.002
criterion = nn.CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn.initHidden()
  optimizer.zero_grad()
  words = review.split()
  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn(input, hidden)
    
  loss = criterion(output, torch.tensor([y_true]))
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn.initHidden()
  words = review.split()

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn(input, hidden)
    
  return output

In [None]:
num_train = ternary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = ternary_train.iloc[i].loc['review_body']
  label = ternary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, wv_g)

In [None]:
num_test = ternary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = ternary_test.iloc[i].loc['review_body']
  output = rnn_test(sentence, wv_g)
  output = torch.argmax(output)
  output = IntTensor.item(output)
  #output = round(output.item())
  output_list.append(output)

In [None]:
# predict test data
from torch import IntTensor
def predict3class(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = torch.argmax(model(X_batch))
        prediction_list.append(IntTensor.item(output))
    
    return prediction_list#[round(num) for num in prediction_list]

def get_accuracy(y_true, y_pred):
  counter = 0
  for i in range(0,len(y_true)):
    if y_true[i]==y_pred[i]:
      counter = counter+1
  return counter/len(y_true)

In [None]:
y_true = ternary_test['label'].tolist()
accuracy = get_accuracy(y_true, output_list)
print("For Ternary data + Pretrained model, the accuracy for recurrent neural network is "+str(accuracy))

For Ternary data + Pretrained model, the accuracy for recurrent neural network is 0.41


#### part (b)

#### Binary

#### My Model

In [None]:
class RNN_gate(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN_gate, self).__init__()

        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, 1)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        #output = self.softmax(output)
        output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn_gate = RNN_gate(300, n_hidden, 1)
LEARNING_RATE = 0.001
criterion = nn.BCELoss()#CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn_gate.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn_gate.initHidden()

  #rnn.zero_grad()
  optimizer.zero_grad()

  words = review.split()

  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn_gate(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn_gate(input, hidden)
    
  loss = criterion(output.flatten(), torch.tensor([y_true]).float())
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn_gate.initHidden()
  words = review.split()
  #rnn.load_state_dict(torch.load('model.pt'))

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn_gate(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn_gate(input, hidden)
    
  return output

In [None]:
num_train = binary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = binary_train.iloc[i].loc['review_body']
  label = binary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, mymodel)

In [None]:
num_test = binary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = binary_test.iloc[i].loc['review_body']
  label = binary_test.iloc[i].loc['label']
  output = rnn_test(sentence, mymodel)
  output = round(output.item())
  output_list.append(output)

  
tn, fp, fn, tp = confusion_matrix(binary_test['label'], output_list).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + My model, the accuracy for recurrent neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + My model, the accuracy for recurrent neural network is 0.52725
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.5192894809880655, 0.9216748768472907, 0.664299662701935.


#### Pretrained Model

In [None]:
class RNN_gate(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN_gate, self).__init__()

        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, 1)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        #output = self.softmax(output)
        output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn_gate = RNN_gate(300, n_hidden, 1)
LEARNING_RATE = 0.001
criterion = nn.BCELoss()#CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn_gate.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn_gate.initHidden()

  #rnn.zero_grad()
  optimizer.zero_grad()

  words = review.split()

  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn_gate(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn_gate(input, hidden)
    
  loss = criterion(output.flatten(), torch.tensor([y_true]).float())
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn_gate.initHidden()
  words = review.split()
  #rnn.load_state_dict(torch.load('model.pt'))

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn_gate(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn_gate(input, hidden)
    
  return output

In [None]:
num_train = binary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = binary_train.iloc[i].loc['review_body']
  label = binary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, wv_g)

In [None]:
num_test = binary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = binary_test.iloc[i].loc['review_body']
  label = binary_test.iloc[i].loc['label']
  output = rnn_test(sentence, wv_g)
  output = round(output.item())
  output_list.append(output)

  
tn, fp, fn, tp = confusion_matrix(binary_test['label'], output_list).ravel()
accuracy = (tn+tp)/(tn+fp+fn+tp)
print("For Binary data + Pretrained model, the accuracy for recurrent neural network is "+str(accuracy))

precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*((precision*recall)/(precision+recall))
print("Other useful values are shown below:")
answer_str = 'The precision, recall and f1-score of testing dataset are '
answer_str = answer_str+str(precision)+", "+str(recall)+", "+str(f1_score)+"."
print(answer_str)

For Binary data + Pretrained model, the accuracy for recurrent neural network is 0.53
Other useful values are shown below:
The precision, recall and f1-score of testing dataset are 0.5217896571760604, 0.8847290640394089, 0.6564327485380117.


#### Ternary

#### My Model

In [None]:
class RNN_gate(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN_gate, self).__init__()

        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, 1)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        #output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn = RNN(300, n_hidden, 3)
LEARNING_RATE = 0.002
criterion = nn.CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn.initHidden()

  #rnn.zero_grad()
  optimizer.zero_grad()

  words = review.split()

  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn(input, hidden)
    
  loss = criterion(output, torch.tensor([y_true]))
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn.initHidden()
  words = review.split()
  #rnn.load_state_dict(torch.load('model.pt'))

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn(input, hidden)
    
  return output

In [None]:
num_train = ternary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = ternary_train.iloc[i].loc['review_body']
  label = ternary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, mymodel)

In [None]:
num_test = ternary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = ternary_test.iloc[i].loc['review_body']
  output = rnn_test(sentence, mymodel)
  output = torch.argmax(output)
  output = IntTensor.item(output)
  #output = round(output.item())
  output_list.append(output)

In [None]:
# predict test data
from torch import IntTensor
def predict3class(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = torch.argmax(model(X_batch))
        prediction_list.append(IntTensor.item(output))
    
    return prediction_list#[round(num) for num in prediction_list]

def get_accuracy(y_true, y_pred):
  counter = 0
  for i in range(0,len(y_true)):
    if y_true[i]==y_pred[i]:
      counter = counter+1
  return counter/len(y_true)

In [None]:
y_true = ternary_test['label'].tolist()
accuracy = get_accuracy(y_true, output_list)
print("For Ternary data + My model, the accuracy for recurrent neural network is "+str(accuracy))

For Ternary data + My model, the accuracy for recurrent neural network is 0.4054


#### Pretrained Model

In [None]:
class RNN_gate(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN_gate, self).__init__()

        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, 1)
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        #output = torch.sigmoid(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

In [None]:
n_hidden = 50
rnn = RNN(300, n_hidden, 3)
LEARNING_RATE = 0.002
criterion = nn.CrossEntropyLoss() #nn.NLLLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=LEARNING_RATE)

In [None]:
def train(y_true, review, current_model):
  hidden = rnn.initHidden()

  #rnn.zero_grad()
  optimizer.zero_grad()

  words = review.split()

  counter = 0
  for each_word in words:
    try:
      input = torch.tensor([current_model[each_word]])
      output, next_hidden = rnn(input, hidden)
      counter = counter+1
    except:
      pass
    if counter==50:
      break
  
  if counter<50:
    for i in range(counter,50):
      input = torch.tensor([[0]*300])
      output, next_hidden = rnn(input, hidden)
    
  loss = criterion(output, torch.tensor([y_true]))
  loss.backward()
  optimizer.step()

  return output, loss.item()



def rnn_test(review, current_model):
  hidden = rnn.initHidden()
  words = review.split()
  #rnn.load_state_dict(torch.load('model.pt'))

  counter = 0
  with torch.no_grad():
    for each_word in words:
      try:
        input = torch.tensor([current_model[each_word]])
        output, next_hidden = rnn(input, hidden)
        counter = counter+1
      except:
        pass
      if counter==50:
        break
    
    if counter<50:
      for i in range(counter,50):
        input = torch.tensor([[0]*300])
        output, next_hidden = rnn(input, hidden)
    
  return output

In [None]:
num_train = ternary_train.iloc[:,0].size
current_loss = 0
for i in range(0,num_train):
  sentence = ternary_train.iloc[i].loc['review_body']
  label = ternary_train.iloc[i].loc['label']
  output, loss = train(label, sentence, wv_g)

In [None]:
num_test = ternary_test.iloc[:,0].size
output_list = []
for i in range(0,num_test):
  sentence = ternary_test.iloc[i].loc['review_body']
  output = rnn_test(sentence, wv_g)
  output = torch.argmax(output)
  output = IntTensor.item(output)
  #output = round(output.item())
  output_list.append(output)

In [None]:
# predict test data
from torch import IntTensor
def predict3class(model, dataloader):
    prediction_list = []
    with torch.no_grad():
      for X_batch in dataloader:
        X_batch = X_batch.float()
        output = torch.argmax(model(X_batch))
        prediction_list.append(IntTensor.item(output))
    
    return prediction_list#[round(num) for num in prediction_list]

def get_accuracy(y_true, y_pred):
  counter = 0
  for i in range(0,len(y_true)):
    if y_true[i]==y_pred[i]:
      counter = counter+1
  return counter/len(y_true)

In [None]:
y_true = ternary_test['label'].tolist()
accuracy = get_accuracy(y_true, output_list)
print("For Ternary data + Pretrained model, the accuracy for recurrent neural network is "+str(accuracy))

For Ternary data + Pretrained model, the accuracy for recurrent neural network is 0.409
