In [None]:
# Run this cell to mount your drive to this notebook in order to read the datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

## Read Dataset

In [None]:
# Put the folder path where the datasets are located
PATH = "/content/drive/MyDrive/445project1/"

In [None]:
cd /content/drive/MyDrive/445project1/

/content/drive/MyDrive/445project1


In [None]:
# Read the train and test set with read_csv() method of pandas
train = pd.read_csv(PATH + "train.csv")
test = pd.read_csv(PATH + "test.csv")

In [None]:
print(train.shape)
print(test.shape)

(18000, 3)
(2000, 3)


In [None]:
train.head(10)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,I came here and left a review before but last ...,1
1,1,Had a very nice first visit here. The owner Te...,4
2,2,This is a gorgeous and very clean hotel. We h...,4
3,3,The gym is dirty. I have given up. Locker ro...,1
4,4,"The food here is delicious, fast, and consiste...",5
5,5,We stopped in on a Sunday evening. I was surpr...,1
6,6,Porcini is our favorite local dinner place and...,5
7,7,Unacceptable wait for food. After ordering and...,1
8,8,Honestly came here with the folks and was take...,5
9,9,I came here after hearing multiple different p...,4


### Preprocess Dataset

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Define a function to perform preprocessing. This function can perform things like lowercasing, stemming, removing stopwords, etc.

import string
ps = PorterStemmer()
#s_tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
w_tokenizer = nltk.tokenize.WordPunctTokenizer()
stops = set(stopwords.words('english'))

def preprocess(text: str):
    #sentences = nltk.sent_tokenize(text)
    text = text.lower()
    words = w_tokenizer.tokenize(text)
    res = ""
    for w in words:
      if w not in stops and [x for x in w if string.punctuation.find(x) != -1] == []:
        res += ps.stem(w) + " "
    #print(res)
    return res

In [None]:
# Apply your preprocessing function to your text fields.

train.text = train.text.apply(preprocess)
test.text = test.text.apply(preprocess)

train.shape, test.shape
#print(train.text)

((18000, 3), (2000, 3))

In [None]:
test.head(20)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,stay weekend made stay pleasant locat great sp...,5
1,1,forev call upon delici design whenev need tast...,5
2,2,person order homicid boneless regular absolut ...,4
3,3,eat pretti much everytim go tarpon spring staf...,4
4,4,3 time never go want huge chang usual trim get...,3
5,5,great hidden tavern grill wonder old place ear...,5
6,6,first experi sephora pari quit fair experienc ...,2
7,7,live close want tri place other mention dine e...,2
8,8,date spot place go formal event said food grea...,4
9,9,husband stop tri angelo gyro philli chees stea...,2


In [None]:
print(type(train["text"][0]))


<class 'str'>
<class 'str'>


In [None]:
# Create your binary and multiclass datasets

# For binary dataset, get rid of the class 3 in the dataset and map class 1 and 2 to 0, and class 4 and 5 to 1
binary_train = train.copy(deep=True)
binary_test = test.copy(deep=True)

binary_train = binary_train[binary_train.label != 3]
binary_train.loc[binary_train.label <= 2,['label']] = 0
binary_train.loc[binary_train.label >= 4,['label']] = 1

binary_test = binary_test[binary_test.label != 3]
binary_test.loc[binary_test.label <= 2,['label']] = 0
binary_test.loc[binary_test.label >= 4,['label']] = 1


# For multiclass dataset, make sure your classes starts from 0 and goes until 4. (5->4, 4->3, 3->2, 2->1, 1->0)

multi_tr = train.copy(deep=True)
multi_ts = test.copy(deep=True)

multi_tr.label -= 1
multi_ts.label -= 1

#multi_tr.head(5)
multi_tr.head(20)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,came left review last time get food poison unl...,0
1,1,nice first visit owner ted friendli start rest...,3
2,2,gorgeou clean hotel room west wing first chore...,3
3,3,gym dirti given locker room total dirti manag ...,0
4,4,food delici fast consist everi singl time gene...,4
5,5,stop sunday even surpris car park lot sat bar ...,0
6,6,porcini favorit local dinner place hope consid...,4
7,7,unaccept wait food order pay told bread poboy ...,0
8,8,honestli came folk taken away nostalgia gla go...,4
9,9,came hear multipl differ peopl rave place say ...,3


In [None]:
binary_train_text = binary_train["text"].values
binary_train_labels = binary_train["label"].values

multi_train_text = multi_tr["text"].values
multi_train_labels = multi_tr["label"].values

binary_test_text = binary_test["text"].values
binary_test_labels = binary_test["label"].values

multi_test_text = multi_ts["text"].values
multi_test_labels = multi_ts["label"].values

# Models

## Non-Neural Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score

### Naive Bayes

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

# Create a class for converting sparse matrix output of TfidfVectorizer to dense matrix for feeding into GaussianNB
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
      '''
      tfid = TfidfVectorizer(**fit_params)
      vec = tfid.fit(X)
      return vec
      '''
      return self

    def transform(self, X, y=None, **fit_params):
      #sparse = self.fit(X,**fit_params).transform(X)
      return X.todense()


# Initiate the pipeline with required components.You can use Pipeline class of sklearn -> https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# There will be three components; 1) TfidfVectorizer 2) DenseTransformer 3) Naive Bayes classifier.
pipeline = Pipeline([
                     ("vectorizer",TfidfVectorizer()),
                     ("transformer", DenseTransformer()),
                     ("nb_classifier", GaussianNB())
])


# Set the hyperparameter space that will be scanned with GridSearchCV.
search_params = {
    "vectorizer__min_df": (100,500,1000),
    "vectorizer__ngram_range": ((1,1),(1,2),(1,3))
}



### Binary

In [None]:
%%time


# Initialize and run the GridSearchCV to scan the hyperparameter and find the best hyperparameter set that will maximize the scoring option for binary classification.
grid = GridSearchCV(pipeline, search_params, scoring = 'f1_macro', return_train_score= False, verbose = 1)
grid.fit(binary_train_text, binary_train_labels)
# Report the standart deviation of split scores for each hyperparameter group.

for i in range(9):
  print("STD for trial ",i,":\n",grid.cv_results_["params"][i]," - ",grid.cv_results_["std_test_score"][i] ,sep="")

# Show the best parameter set for given dataset and hyperparameter space.
print("Top parameters & score: \n", grid.best_params_, " - ", grid.best_score_ ,sep ="")




Fitting 5 folds for each of 9 candidates, totalling 45 fits
STD for trial 0:
{'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 1)} - 0.005750144597770928
STD for trial 1:
{'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 2)} - 0.008078241081952051
STD for trial 2:
{'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 3)} - 0.008078108055608053
STD for trial 3:
{'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 1)} - 0.004435828457757601
STD for trial 4:
{'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 2)} - 0.005626262939876043
STD for trial 5:
{'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 3)} - 0.005626262939876043
STD for trial 6:
{'vectorizer__min_df': 1000, 'vectorizer__ngram_range': (1, 1)} - 0.008064527431535481
STD for trial 7:
{'vectorizer__min_df': 1000, 'vectorizer__ngram_range': (1, 2)} - 0.008064527431535481
STD for trial 8:
{'vectorizer__min_df': 1000, 'vectorizer__ngram_range': (1, 3)} - 0.008064527431535481
Top parame

In [None]:
# Building the pipeline with the best parameter group and reporting Conf. Mat. and Results on the Test Set #
# Create your Pipeline object with the best parameter set.
pipeline_top = Pipeline([
                     ("vectorizer",TfidfVectorizer(min_df=100, ngram_range = (1,3))),
                     ("transformer", DenseTransformer()),
                     ("nb_classifier", GaussianNB())
])

# Fit your pipeline on training set.
pipeline_top.fit(binary_train_text, binary_train_labels)

# Take prediction and report the F1 and Accuracy scores for binary classification. Then show the confussion table.
pred_res = pipeline_top.predict(binary_test_text)
acc = accuracy_score(binary_test_labels, pred_res)
f1 = f1_score(binary_test_labels, pred_res, average="macro")
conf_matrix = confusion_matrix(binary_test_labels, pred_res)
print("Accuracy score:", acc)
print("F1 score:",f1)

#TODO Seaborn confusion matrix draw


Accuracy score: 0.8706842435655995
F1 score: 0.8706695147712583


### Multi

In [None]:
%%time
# Initialize and run the GridSearchCV to scan the hyperparameter and find the best hyperparameter set that will maximize the scoring option for multiclass classification.


# Initialize and run the GridSearchCV to scan the hyperparameter and find the best hyperparameter set that will maximize the scoring option for binary classification.
grid_m = GridSearchCV(pipeline, search_params, scoring = 'f1_macro', return_train_score= False, verbose = 1)
grid_m.fit(multi_train_text, multi_train_labels)
# Report the standart deviation of split scores for each hyperparameter group.

for i in range(9):
  print("STD for trial ",i,":\n",grid_m.cv_results_["params"][i]," - ",grid_m.cv_results_["std_test_score"][i] ,sep="")

# Show the best parameter set for given dataset and hyperparameter space.
print("Top parameters & score: \n", grid_m.best_params_, " - ", grid_m.best_score_ ,sep ="")


Fitting 5 folds for each of 9 candidates, totalling 45 fits
STD for trial 0:
{'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 1)} - 0.007528039477989049
STD for trial 1:
{'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 2)} - 0.008550563611130407
STD for trial 2:
{'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 3)} - 0.008518967542528911
STD for trial 3:
{'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 1)} - 0.009121338714247074
STD for trial 4:
{'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 2)} - 0.0076089305458114705
STD for trial 5:
{'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 3)} - 0.0076089305458114705
STD for trial 6:
{'vectorizer__min_df': 1000, 'vectorizer__ngram_range': (1, 1)} - 0.003021142001259106
STD for trial 7:
{'vectorizer__min_df': 1000, 'vectorizer__ngram_range': (1, 2)} - 0.003021142001259106
STD for trial 8:
{'vectorizer__min_df': 1000, 'vectorizer__ngram_range': (1, 3)} - 0.003021142001259106
Top para

In [None]:
pipeline_top = Pipeline([
                     ("vectorizer",TfidfVectorizer(min_df=100, ngram_range = (1,2))),
                     ("transformer", DenseTransformer()),
                     ("nb_classifier", GaussianNB())
])

# Fit your pipeline on training set.
pipeline_top.fit(multi_train_text, multi_train_labels)

# Take prediction and report the F1 and Accuracy scores for binary classification. Then show the confussion table.
pred_res = pipeline_top.predict(multi_test_text)
acc = accuracy_score(multi_test_labels, pred_res)
f1 = f1_score(multi_test_labels, pred_res, average="macro")
conf_matrix = confusion_matrix(multi_test_labels, pred_res)
print("Accuracy score:", acc)
print("F1 score:",f1)

#TODO Seaborn confusion matrix draw


Accuracy score: 0.491
F1 score: 0.4757956753612732


### Logistic Regression

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# Initiate the pipeline with required components.You can use Pipeline class of sklearn -> https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# There will be three components; 1) Word weightning 2) Logistic Regression classifier.

pipeline = Pipeline([
                     ("vectorizer",TfidfVectorizer()),
                     ("logistic_classifier", LogisticRegression(random_state=22, penalty="elasticnet", solver = "saga"))
])

# Set the hyperparameter space that will be scanned with GridSearchCV.
search_params = {
    "vectorizer__min_df": (100,500,1000),
    "vectorizer__ngram_range": ((1,1),(1,2),(1,3)),
    "logistic_classifier__l1_ratio": (0.0,0.5,1.0)
}



#### Binary

In [None]:
%%time

# Initialize and run the GridSearchCV to scan the hyperparameter and find the best hyperparameter set that will maximize the scoring option for binary classification.
grid = GridSearchCV(pipeline, search_params, scoring = 'f1_macro', return_train_score= False, verbose = 1)
grid.fit(binary_train_text, binary_train_labels)
# Report the standart deviation of split scores for each hyperparameter group.

for i in range(9):
  print("STD for trial ",i,":\n",grid.cv_results_["params"][i]," - ",grid.cv_results_["std_test_score"][i] ,sep="")

# Show the best parameter set for given dataset and hyperparameter space.
print("Top parameters & score: \n", grid.best_params_, " - ", grid.best_score_ ,sep ="")





Fitting 5 folds for each of 27 candidates, totalling 135 fits
STD for trial 0:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 1)} - 0.004296934336474636
STD for trial 1:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 2)} - 0.005137348665850664
STD for trial 2:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 3)} - 0.005210047560874584
STD for trial 3:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 1)} - 0.005086346731671591
STD for trial 4:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 2)} - 0.004292411243546852
STD for trial 5:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 3)} - 0.004292411243546852
STD for trial 6:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 1000, 'vectori

In [None]:
# Building the pipeline with the best parameter group and reporting Conf. Mat. and Results on the Test Set #
# Create your Pipeline object with the best parameter set.
pipeline_top = Pipeline([
                     ("vectorizer",TfidfVectorizer(min_df=100, ngram_range = (1,2))),
                     ("logistic_classifier", LogisticRegression(random_state=22, penalty="elasticnet", solver = "saga", l1_ratio = 0.5))
])

# Fit your pipeline on training set.
pipeline_top.fit(binary_train_text, binary_train_labels)

# Take prediction and report the F1 and Accuracy scores for binary classification. Then show the confussion table.
pred_res = pipeline_top.predict(binary_test_text)
acc = accuracy_score(binary_test_labels, pred_res)
f1 = f1_score(binary_test_labels, pred_res, average="macro")
conf_matrix = confusion_matrix(binary_test_labels, pred_res)
print("Accuracy score:", acc)
print("F1 score:",f1)

#TODO Seaborn confusion matrix draw

Accuracy score: 0.90646578782172
F1 score: 0.9064445524429137


#### Multiclass

In [None]:
%%time

# Initialize and run the GridSearchCV to scan the hyperparameter and find the best hyperparameter set that will maximize the scoring option for binary classification.
grid_m = GridSearchCV(pipeline, search_params, scoring = 'f1_macro', return_train_score= False, verbose = 1)
grid_m.fit(multi_train_text, multi_train_labels)
# Report the standart deviation of split scores for each hyperparameter group.

for i in range(9):
  print("STD for trial ",i,":\n",grid_m.cv_results_["params"][i]," - ",grid_m.cv_results_["std_test_score"][i] ,sep="")

# Show the best parameter set for given dataset and hyperparameter space.
print("Top parameters & score: \n", grid_m.best_params_, " - ", grid_m.best_score_ ,sep ="")


Fitting 5 folds for each of 27 candidates, totalling 135 fits
STD for trial 0:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 1)} - 0.006143746368883522
STD for trial 1:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 2)} - 0.005230666043487704
STD for trial 2:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 100, 'vectorizer__ngram_range': (1, 3)} - 0.005221509426448944
STD for trial 3:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 1)} - 0.007831118170465258
STD for trial 4:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 2)} - 0.007692002466348389
STD for trial 5:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 500, 'vectorizer__ngram_range': (1, 3)} - 0.007692002466348389
STD for trial 6:
{'logistic_classifier__l1_ratio': 0.0, 'vectorizer__min_df': 1000, 'vectori

In [None]:
pipeline_top = Pipeline([
                     ("vectorizer",TfidfVectorizer(min_df=100, ngram_range = (1,3))),
                     ("logistic_classifier", LogisticRegression(random_state=22, penalty="elasticnet", solver = "saga", l1_ratio=0.5))
])

# Fit your pipeline on training set.
pipeline_top.fit(multi_train_text, multi_train_labels)

# Take prediction and report the F1 and Accuracy scores for binary classification. Then show the confussion table.
pred_res = pipeline_top.predict(multi_test_text)
acc = accuracy_score(multi_test_labels, pred_res)
f1 = f1_score(multi_test_labels, pred_res, average="macro")
conf_matrix = confusion_matrix(multi_test_labels, pred_res)
print("Accuracy score:", acc)
print("F1 score:",f1)

#TODO Seaborn confusion matrix draw

Accuracy score: 0.5645
F1 score: 0.5611534754520854


## Neural Models

### Convolutional Neural Network (CNN) Data Setup



In [None]:
import pandas as pd
import numpy as np
import nltk,re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from numpy import array,asarray,zeros

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers.convolutional import Conv1D,MaxPooling1D
from keras.layers import Dense,Flatten,Embedding,Input,Dropout, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint

from gensim.models import Word2Vec
import gensim.downloader as api

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Create a validation set from train set
# Please use random_state of 22 and test_size of 0.1
cnn_bt_train, cnn_bt_val = train_test_split(binary_train, test_size = 0.1, random_state = 22)
print("bt: ",cnn_bt_train.shape)
cnn_bt_train.head(10)

cnn_multi_train, cnn_multi_val = train_test_split(multi_tr, test_size = 0.1, random_state = 22)
print("multi: ",cnn_multi_train.shape)

print(type(cnn_bt_train["text"][0]))
print(cnn_bt_train.head(10))

bt:  (12966, 3)
multi:  (16200, 3)
<class 'str'>
       Unnamed: 0                                               text  label
17174       17174  thoroughli dissatisfi today experi judg bean b...      0
6381         6381  price intimid worth owner awesom human bartend...      1
14980       14980  far wors starbuck entir citi terribl custom se...      0
4542         4542  start say coffe buff certainli enjoy nice cup ...      0
11771       11771  absolut favourit place greasi chines food best...      1
9330         9330  mani beer choic avail bar serv unclean glass w...      0
5249         5249  spent happi hour depot reno ace game appet sel...      0
3182         3182  great food servic go back soon authent kurdish...      1
5551         5551  need work start put sign restroom menu confus ...      0
13288       13288  made appt noon tuesday get 4 new tire chose pl...      0


### 1. Randomly Initializing Embedding Matrix

In [None]:
#Tokenize the sentences
tokenizer = Tokenizer()

tokenizer.fit_on_texts(list(cnn_bt_train["text"]))
                            
train_seq  = tokenizer.texts_to_sequences(list(cnn_bt_train["text"])) 
val_seq = tokenizer.texts_to_sequences(list(cnn_bt_val["text"]))

#padding to prepare sequences of same length
train_seq_pad  = pad_sequences(train_seq, maxlen=100)
val_seq_pad = pad_sequences(val_seq, maxlen=100)

word_vec_size = 100

words = tokenizer.word_index
print(len(words))

19164


In [None]:
# Create your own word embeddings from scratch and load a pretrained word embeddings
cnn_bt_train_seqs = [x.split(" ") for x in list(cnn_bt_train["text"])]
x = set()
for row in cnn_bt_train_seqs:
  for w in row:
    x.add(w)

n_unique =  len(x)
print("Number of unique words:", n_unique)


embedding_matrix = np.zeros((n_unique, word_vec_size))
for word, i in words.items():
  if i < n_unique:
    embedding_vector = np.random.rand(word_vec_size)*2 - 1
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
#example 
print(embedding_matrix[3])

# You can check https://radimrehurek.com/gensim/models/word2vec.html for training a word embeddings from scratch
num_words = n_unique



Number of unique words: 19165
[-0.48650185  0.57426916 -0.78449297  0.96196246 -0.83038441 -0.70190601
 -0.60714707 -0.0209153   0.41883556  0.68632738 -0.94993838 -0.22719243
 -0.97562921  0.1814011  -0.60807706  0.15537241 -0.03931685  0.24496245
 -0.20579255 -0.80571739 -0.40296165 -0.59244424  0.91127789 -0.15551397
 -0.82291443 -0.43470799 -0.46012219  0.15750555 -0.10794644 -0.4067142
 -0.52482379 -0.14046346 -0.30931678 -0.19192207 -0.09239058  0.11763344
  0.41410979  0.15801172  0.74219211 -0.93560821  0.2769971   0.70279904
  0.22448149 -0.33374728 -0.21356701 -0.14108483 -0.1329077  -0.81692755
 -0.62204202  0.26410006 -0.20232415  0.98377745 -0.76133442  0.98473612
 -0.04682744 -0.68824163  0.21684994  0.30035167 -0.10847498  0.95203447
 -0.61234869  0.32306246  0.77296536  0.42228672 -0.45261033  0.95989011
 -0.23736457  0.11543646  0.05127127 -0.52185373 -0.23354833  0.13651324
  0.89161994 -0.24476714 -0.24213453 -0.14072084 -0.93912928 -0.44135007
 -0.16693311 -0.547035

### Building the Model: Binary

In [None]:
def findWordUnique(tr):
  cnn_bt_train_seqs = [x.split(" ") for x in list(tr["text"])]
  x = set()
  for row in cnn_bt_train_seqs:
    for w in row:
      x.add(w)

  return len(x)

In [None]:


def buildBinaryCNNModel(train_seq, tr_labels, val_seq_pad, val_labels , word_count, embed_matrix, vec_size=100, window = 128, kernel = 4, pool = 3, dense = 64, loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'], epoch = 10):
  max_len = max([len(x) for x in train_seq])
  train_seq_pad = pad_sequences(train_seq, vec_size)
 #print(len(train_seq_pad[0]))
  embedding_layer = Embedding(
    word_count+1,
    vec_size,
    weights=[embed_matrix],
    input_length=max_len,
    trainable=False
  )

  #Set up the network with cnn
  input_shape = Input(shape = (len(train_seq_pad[0]),))

  cnn_bt=embedding_layer(input_shape)

  cnn_bt=Conv1D(window,kernel,activation="relu")(cnn_bt)
  cnn_bt=MaxPooling1D(pool)(cnn_bt)

  cnn_bt=Conv1D(window/2,kernel-1,activation="relu")(cnn_bt)
  cnn_bt=MaxPooling1D(pool)(cnn_bt)

  cnn_bt=Conv1D(window/2,kernel-1,activation="relu")(cnn_bt)
  cnn_bt=GlobalMaxPooling1D()(cnn_bt)

  cnn_bt=Dense(dense,activation="relu")(cnn_bt)
  cnn_bt_out = Dense(1, activation="sigmoid")(cnn_bt)

  cnn_bt_model = tf.keras.Model(input_shape, cnn_bt_out)
  cnn_bt_model.compile(
    loss=loss, optimizer=optimizer, metrics=metrics
  )
  print(cnn_bt_model.summary())

  cnn_bt_res = cnn_bt_model.fit(train_seq_pad, tr_labels, validation_data = (val_seq_pad,val_labels) , batch_size = 40, epochs = epoch, verbose = 1)
  models.append(cnn_bt_model)
  return cnn_bt_res


In [None]:
import itertools

######## HYPERPARAMETER TUNING ########

w = [128, 64]
k = [6,4]
d = [64,32]

models = []
props = []
index = 0
max_accs = []
max_accs_val = []

print("***** For randomly initialized embedding matrix (Binary)")
for l in list(itertools.product(*[w,k,d])):
  epoch = 10
  cnn_bt_r = buildBinaryCNNModel(train_seq, cnn_bt_train["label"].values, val_seq_pad, cnn_bt_val["label"].values , len(words), embedding_matrix, epoch = epoch, window = l[0], kernel = l[1], dense = l[2])
  print("window:",l[0],"kernel:",l[1],"hidden-layer:",l[2],sep=" ")
  props.append(l)
  print("train acc:",cnn_bt_r.history["accuracy"][epoch-1])
  max_accs.append(cnn_bt_r.history["accuracy"][epoch-1])
  print("validation acc:",cnn_bt_r.history["val_accuracy"][epoch-1])
  max_accs_val.append(cnn_bt_r.history["val_accuracy"][epoch-1])
  index+=1

#cnn_bt_r = buildBinaryCNNModel(train_seq, cnn_bt_train["label"].values, val_seq_pad, cnn_bt_val["label"].values , len(words), embedding_matrix, epoch = 8)


***** For randomly initialized embedding matrix (Binary)
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          1916500   
                                                                 
 conv1d (Conv1D)             (None, 95, 128)           76928     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 31, 128)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 27, 64)            41024     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 9, 64)            0         
 1D)

AttributeError: ignored

In [None]:
max_ind=max_accs_val.index(max(max_accs_val))
max_model = models[max_ind]
max_prop = props[max_ind]

print("***********")
print("max train acc:", max(max_accs))
print("validation acc:", max(max_accs_val))
print("max val model properties:", max_prop)
print("***********")

***********
max train acc: 0.9800246953964233
validation acc: 0.8424705266952515
max val model properties: (128, 6, 32)
***********


In [None]:
# Predicting test cases - Randomly Initialized Matrix
test_seq_bt  = tokenizer.texts_to_sequences(list(binary_test["text"])) 

test_seq_bt_pad  = pad_sequences(test_seq_bt, maxlen=100)
bt_test_acc = models[max_ind].evaluate(test_seq_bt_pad,binary_test["label"].values)
print("Test accuracy:",bt_test_acc)

Test accuracy: [0.6174161434173584, 0.8405523896217346]


| **

### Embedding Words with Word2Vec Gimsim

In [None]:
# You can check https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html and https://github.com/RaRe-Technologies/gensim-data for loading pretrained word embeddings. 
model_w2v = Word2Vec(cnn_bt_train_seqs, size = word_vec_size, window = 10, workers = 10, min_count = 2)
vocs = list(model_w2v.wv.vocab)
print(vocs)
print(len(vocs))


['thoroughli', 'dissatisfi', 'today', 'experi', 'judg', 'bean', 'base', 'menu', 'establish', 'seem', 'pride', 'brisket', 'taco', 'wife', 'platter', 'come', 'deep', 'fri', 'corn', 'tortilla', 'two', 'break', 'half', 'order', 'fold', 'effect', 'make', 'sandwich', 'wholli', 'ined', 'flavorless', 'fat', 'overwhelm', 'good', 'melt', 'mouth', 'chewi', 'slice', 'better', 'juic', 'smoke', 'ring', 'flavor', 'left', 'three', 'plate', 'push', 'side', 'waitress', 'came', 'took', 'inquiri', 'eat', 'also', 'best', 'mac', 'chees', 'brentwood', 'smokehous', 'someth', 'kid', 'clearli', 'shell', 'box', 'complet', 'uninterest', '5', '50', 'per', 'serv', 'toast', 'drink', 'includ', 'lemonad', '2', 'although', 'price', 'list', 'check', 'everyth', 'told', 'surpris', 'charg', 'ask', 'explain', 'said', 'ok', 'tell', '10', 'minut', 'noth', 'end', 'pay', '70', 'piti', 'attempt', 'barbecu', 'want', 'real', 'deal', 'find', 'head', '65', 'edley', 'martin', 'peg', 'leg', '', 'intimid', 'worth', 'owner', 'awesom', '

In [None]:
print(model_w2v.wv.most_similar("food"))

[('mediocr', 0.7179396748542786), ('meal', 0.6965236067771912), ('sushi', 0.6805679798126221), ('fast', 0.6784641146659851), ('unmemor', 0.6660317182540894), ('pretti', 0.6659749746322632), ('averag', 0.6612766981124878), ('slow', 0.6560137271881104), ('okinomiyaki', 0.6530337333679199), ('og', 0.6441921591758728)]


In [None]:
num_words = len(vocs)
embedding_matrix_w2v = np.zeros((num_words, word_vec_size))
for word, i in words.items():
  if i < num_words:
    embedding_vector = model_w2v[word]
    if embedding_vector is not None:
      embedding_matrix_w2v[i] = embedding_vector
#example 
print(embedding_matrix_w2v[139])

[ 1.92469075e-01 -1.17801225e+00 -2.47474954e-01 -1.11693478e+00
  6.62574530e-01  1.15619600e+00  2.44506165e-01 -5.71349740e-01
  1.01484120e+00 -3.19186091e-01  5.80986917e-01 -4.40786868e-01
  3.91332626e-01 -3.09438668e-02 -1.03984937e-01 -2.18946409e+00
 -1.52988839e+00  1.62380219e-01 -5.27331717e-02  3.51681799e-01
 -6.33964956e-01  8.19543064e-01  1.36018753e-01 -1.11400420e-02
  1.20924079e+00  1.59854293e+00 -7.18247652e-01  1.25051284e+00
  1.41187131e-01 -8.16629469e-01 -2.04071864e-01 -2.98759639e-01
 -4.91380960e-01 -5.22248387e-01  2.53374241e-02 -3.43104787e-02
  9.40915048e-01 -2.83278435e-01 -1.52462542e+00 -6.29243195e-01
  3.83757472e-01  6.30444705e-01  9.34543967e-01  6.92725360e-01
 -4.69506544e-04  3.82283181e-01  1.83480811e+00 -6.43104672e-01
  1.81330174e-01  1.42926618e-01  9.57469046e-01  5.82436800e-01
  7.25558341e-01  4.34256136e-01  6.02196395e-01 -6.19040318e-02
 -6.60597384e-01  3.66046667e-01  4.31912690e-02  6.47805810e-01
 -6.17813528e-01 -2.02132

In [None]:
# find max sequence length
print(cnn_bt_train_seqs[3])
max_seq_len = max([len(x) for x in train_seq])
print(max_seq_len)

['start', 'say', 'coffe', 'buff', 'certainli', 'enjoy', 'nice', 'cup', 'coffe', 'say', 'wow', 'cafe', 'mocha', 'americano', 'still', 'go', 'back', 'grab', 'go', 'neighborhood', 'grab', 'go', 'way', 'go', 'opinion', 'last', 'time', 'stop', 'least', 'two', 'dog', 'sit', 'obedi', 'owner', 'laptop', 'pretti', 'full', 'never', 'realli', 'type', 'linger', 'coffe', 'shop', 'alon', 'though', 'notic', 'set', 'rule', 'post', 'milk', 'sugar', 'realli', 'made', 'feel', 'welcom', 'invit', 'set', 'rule', 'appreci', 'custom', '']
510


In [None]:
# Predicting test cases - Word2Vec
cnn_bt_r_w2v = buildBinaryCNNModel(train_seq, cnn_bt_train["label"].values, val_seq_pad, cnn_bt_val["label"].values , num_words-1, embedding_matrix_w2v, epoch = 12, window = max_prop[0], kernel = max_prop[1], dense = max_prop[2])
print("train acc:",cnn_bt_r_w2v.history["accuracy"][11])
print("validation acc:",cnn_bt_r_w2v.history["val_accuracy"][11])
bt_test_acc_w2v = models[-1].evaluate(test_seq_bt_pad,binary_test["label"].values)
print("Test accuracy:",bt_test_acc_w2v)

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_13 (Embedding)    (None, 100, 100)          1162400   
                                                                 
 conv1d_30 (Conv1D)          (None, 95, 128)           76928     
                                                                 
 max_pooling1d_20 (MaxPoolin  (None, 31, 128)          0         
 g1D)                                                            
                                                                 
 conv1d_31 (Conv1D)          (None, 27, 64)            41024     
                                                                 
 max_pooling1d_21 (MaxPoolin  (None, 9, 64)            0         
 g1D)                                                     

### Vector Weights from Gimsim Api

In [None]:
# Gensim Model Embedding Matrix
import gensim.downloader as api
model_gensim = api.load("glove-wiki-gigaword-100")
embedding_matrix_gensim = np.zeros((len(words), word_vec_size))
for word, i in words.items():
  if i < len(words):
    if word in model_gensim:
      embedding_matrix_gensim[i] = model_gensim[word]

In [None]:
# Predicting test cases - Gensim API
cnn_bt_r_gensim = buildBinaryCNNModel(train_seq, cnn_bt_train["label"].values, val_seq_pad, cnn_bt_val["label"].values , len(words)-1, embedding_matrix_gensim, epoch = 12, window = max_prop[0], kernel = max_prop[1], dense = max_prop[2])
print("train acc:",cnn_bt_r_gensim.history["accuracy"][11])
print("validation acc:",cnn_bt_r_gensim.history["val_accuracy"][11])
bt_test_acc_gensim = models[-1].evaluate(test_seq_bt_pad,binary_test["label"].values)
print("Test accuracy:",bt_test_acc_gensim)

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_12 (Embedding)    (None, 100, 100)          1916400   
                                                                 
 conv1d_27 (Conv1D)          (None, 95, 128)           76928     
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 31, 128)          0         
 g1D)                                                            
                                                                 
 conv1d_28 (Conv1D)          (None, 27, 64)            41024     
                                                                 
 max_pooling1d_19 (MaxPoolin  (None, 9, 64)            0         
 g1D)                                                      

In [None]:
# Prepare your dataset for CNN classifier

In [None]:
# Create Embedding Matrices and Layers


### Building the Model: Multi-class

In [None]:
#Tokenize the sentences
mTokenizer = Tokenizer()

mTokenizer.fit_on_texts(list(cnn_multi_train["text"]))
                            
train_seq_m  = mTokenizer.texts_to_sequences(list(cnn_multi_train["text"])) 
val_seq_m = mTokenizer.texts_to_sequences(list(cnn_multi_val["text"]))

#find max sequence length
max_seq_len_m = max([len(x) for x in train_seq_m])
word_vec_size = 100
#padding to prepare sequences of same length
train_seq_mpad  = pad_sequences(train_seq_m, maxlen=word_vec_size)
val_seq_mpad = pad_sequences(val_seq_m, maxlen=word_vec_size)



words = mTokenizer.word_index
print(len(words))

20967


In [None]:
#One hot encoding of labels
cnn_multi_tr_labels = cnn_multi_train["label"].values
cnn_multi_tr_labels = tf.keras.utils.to_categorical(cnn_multi_tr_labels, num_classes = 5)

cnn_multi_val_labels = cnn_multi_val["label"].values
cnn_multi_val_labels = tf.keras.utils.to_categorical(cnn_multi_val_labels, num_classes = 5)

cnn_multi_ts_labels = multi_ts["label"].values
cnn_multi_ts_labels = tf.keras.utils.to_categorical(cnn_multi_ts_labels, num_classes = 5)
print(cnn_multi_tr_labels)

[[0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]]


In [None]:

def buildMultiCNNModel(train_seq, tr_labels, val_seq_pad, val_labels , word_count, embed_matrix, vec_size=100, window = 128, kernel = 4, pool = 3, dense = 64, loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'], epoch = 10):
  max_len = max([len(x) for x in train_seq])
  train_seq_pad = pad_sequences(train_seq, vec_size)
 #print(len(train_seq_pad[0]))
  embedding_layer = Embedding(
    word_count,
    vec_size,
    weights=[embed_matrix],
    input_length=max_len,
    trainable=False
  )

  #Set up the network with cnn
  input_shape = Input(shape = (len(train_seq_pad[0]),))

  
  cnn_bt=embedding_layer(input_shape)

  cnn_bt=Conv1D(window,kernel,activation="relu")(cnn_bt)
  cnn_bt=MaxPooling1D(pool)(cnn_bt)

  cnn_bt=Conv1D(window/2,kernel-1,activation="relu")(cnn_bt)
  cnn_bt=MaxPooling1D(pool)(cnn_bt)

  cnn_bt=Conv1D(window/2,kernel-1,activation="relu")(cnn_bt)
  cnn_bt=GlobalMaxPooling1D()(cnn_bt)

  cnn_bt=Dense(dense,activation="relu")(cnn_bt)
  cnn_bt_out = Dense(5, activation="softmax")(cnn_bt)

  cnn_bt_model = tf.keras.Model(input_shape, cnn_bt_out)
  cnn_bt_model.compile(
    loss=loss, optimizer=optimizer, metrics=metrics
  )
  [print(i.shape, i.dtype) for i in cnn_bt_model.inputs]
  [print(o.shape, o.dtype) for o in cnn_bt_model.outputs]
  print(cnn_bt_model.output.shape)
  print(type(cnn_bt_model.output))
  print(cnn_bt_model.summary())

  #train_seq_m = np.asarray(train_seq_m)
  #val_seq_mpad = np.asarray(val_seq_mpad)

  cnn_bt_res = cnn_bt_model.fit(train_seq_pad, cnn_multi_tr_labels , validation_data = (val_seq_mpad, cnn_multi_val_labels ), batch_size = 40, epochs = epoch, verbose = 1) 
  models_m.append(cnn_bt_model)
  return cnn_bt_res


In [None]:
# Randomly initialized embedding matrix 
embedding_matrix = np.zeros((len(words), word_vec_size))
for word, i in words.items():
  if i < len(words):
    embedding_vector = np.random.rand(word_vec_size)*2 - 1
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
#example 
print(embedding_matrix[3])

[ 0.84034495  0.38261029 -0.82369958  0.62283821 -0.59603242 -0.81288771
  0.98782068 -0.38523925  0.70136272 -0.33212833 -0.41839452 -0.93071217
  0.55807187 -0.04418994 -0.09514495 -0.55350952 -0.51330159 -0.28370699
 -0.31993593 -0.71021001 -0.45393319 -0.75479415  0.44023108  0.83273806
 -0.42714801 -0.50195097  0.01563235  0.26604552  0.93165244 -0.31284359
 -0.37907894 -0.95748224 -0.13726384 -0.45917752 -0.8567974   0.79342736
 -0.71253576 -0.13164182 -0.48004965 -0.24270618 -0.40409183 -0.36621945
 -0.85105712 -0.57944425 -0.39036817  0.13317692  0.34324354  0.52589927
 -0.17070688 -0.68090873 -0.02527034 -0.69002858  0.99993002 -0.5894791
  0.13343411  0.97035647 -0.57915687  0.5409738   0.78891478 -0.72688473
  0.21705982  0.94826743 -0.28868773  0.15016659  0.48927734  0.37278889
 -0.90582353  0.79586885 -0.18194034  0.1880293  -0.81738349  0.89916038
 -0.06919044 -0.96572878  0.84570551  0.28209523 -0.76579173 -0.3813534
  0.38020938 -0.68271396 -0.86872649 -0.88561393 -0.5

In [None]:
import itertools

######## HYPERPARAMETER TUNING ########
# with w2v matrix

w = [256, 128]
k = [6,4]
d = [64,32]

models_m = []
props_m = []
index = 0
max_accs_m = []
max_accs_val_m = []

print("***** For randomly initialized embedding matrix (Binary)")
for l in list(itertools.product(*[w,k,d])):
  epoch = 10
  cnn_bt_r = buildMultiCNNModel(train_seq_m, cnn_multi_tr_labels, val_seq_mpad, cnn_multi_val_labels, num_words_m, embedding_matrix_w2v_m, vec_size=word_vec_size, epoch = epoch, window = l[0], kernel = l[1], dense = l[2])
  print("window:",l[0],"kernel:",l[1],"hidden-layer:",l[2],sep=" ")
  props_m.append(l)
  print("train acc:",cnn_bt_r.history["accuracy"][epoch-1])
  max_accs_m.append(cnn_bt_r.history["accuracy"][epoch-1])
  print("validation acc:",cnn_bt_r.history["val_accuracy"][epoch-1])
  max_accs_val_m.append(cnn_bt_r.history["val_accuracy"][epoch-1])
  index+=1


***** For randomly initialized embedding matrix (Binary)
(None, 100) <dtype: 'float32'>
(None, 5) <dtype: 'float32'>
(None, 5)
<class 'keras.engine.keras_tensor.KerasTensor'>
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 100)          1263200   
                                                                 
 conv1d (Conv1D)             (None, 95, 256)           153856    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 31, 256)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 27, 128)           163968    
                  

In [None]:
max_ind_m=max_accs_val_m.index(max(max_accs_val_m))
max_model_m = models_m[max_ind_m]
max_prop_m = props_m[max_ind_m]

print("***********")
print("max train acc:", max(max_accs_m))
print("validation acc:", max(max_accs_val_m))
print("max val model properties:", max_prop_m)
print("***********")

***********
max train acc: 0.8749382495880127
validation acc: 0.4972222149372101
max val model properties: (128, 4, 32)
***********


In [None]:
cnn_multi_train_seqs = [x.split(" ") for x in list(cnn_multi_train["text"])]

model_w2v_m = Word2Vec(cnn_multi_train_seqs, size = word_vec_size, window = 10, workers = 10, min_count = 2)
vocs_m = list(model_w2v_m.wv.vocab)
print(vocs_m)
print(len(vocs_m))

['think', 'drink', 'water', 'everi', 'night', 'club', 'go', 'expect', 'guess', 'unfortun', 'might', 'suggest', 'dive', 'bar', 'littl', 'nugget', 'hook', 'haha', 'like', 'music', 'play', 'time', 'get', 'crowd', 'throw', 'elbow', 'herd', 'move', 'long', 'dancer', 'semi', 'trashi', 'next', 'step', 'strip', 'reno', '', 'pedicur', 'excel', 'problem', 'servic', 'offer', 'howev', 'walk', 'differ', 'day', 'fill', 'staff', 'basic', 'ignor', 'guy', 'actual', 'seem', 'annoy', 'told', 'wait', 'never', 'got', 'back', 'sat', 'least', '10', 'minut', 'without', 'word', 'left', 'good', 'somewher', 'els', 'someon', 'valu', 'busi', 'tri', 'quit', 'strang', 'donut', 'delici', 'interest', 'favorit', 'bacon', 'fritter', 'typic', 'shop', 'bother', 'uniqu', 'ingredi', 'friendli', 'face', 'color', 'decor', 'reviv', 'great', 'neighborhood', 'lectur', 'class', 'hit', 'miss', 'depend', 'instructor', 'style', 'read', 'along', 'scribbl', 'board', 'type', 'lot', 'review', 'rave', 'julio', 'wonder', 'anoth', 'much', 

In [None]:
num_words_m = len(vocs_m)
embedding_matrix_w2v_m = np.zeros((num_words_m, word_vec_size))
for word, i in words.items():
  if i < num_words_m:
    embedding_vector = model_w2v_m[word]
    if embedding_vector is not None:
      embedding_matrix_w2v_m[i] = embedding_vector
#example 
print(embedding_matrix_w2v_m.shape)

(12632, 100)


In [None]:
# Predicting test cases - Word2Vec Matrix

test_seq_multi  = mTokenizer.texts_to_sequences(list(multi_ts["text"])) 

test_seq_multi_pad  = pad_sequences(test_seq_multi, maxlen=word_vec_size)
multi_test_acc = models_m[max_ind_m].evaluate(test_seq_multi_pad,cnn_multi_ts_labels)
print("Test accuracy:",multi_test_acc)

Test accuracy: [1.4348455667495728, 0.49050000309944153]


In [None]:
# Predicting test cases - Random Matrix
cnn_multi_r = buildMultiCNNModel(train_seq_m, cnn_multi_tr_labels, val_seq_mpad, cnn_multi_val_labels, num_words_m, embedding_matrix_w2v_m, vec_size=word_vec_size, epoch = 16, window = max_prop_m[0], kernel = max_prop_m[1], dense = max_prop_m[2])
print("train acc:",cnn_multi_r.history["accuracy"][15])
print("validation acc:",cnn_multi_r.history["val_accuracy"][15])
multi_test_acc_r = models_m[-1].evaluate(test_seq_multi_pad,cnn_multi_ts_labels)
print("Test accuracy:",multi_test_acc_r)


(None, 100) <dtype: 'float32'>
(None, 5) <dtype: 'float32'>
(None, 5)
<class 'keras.engine.keras_tensor.KerasTensor'>
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_10 (Embedding)    (None, 100, 100)          1263200   
                                                                 
 conv1d_27 (Conv1D)          (None, 97, 128)           51328     
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 32, 128)          0         
 g1D)                                                            
                                                                 
 conv1d_28 (Conv1D)          (None, 30, 64)            24640     
                                                                 
 max_po

In [None]:
# Gensim Model Embedding Matrix
import gensim.downloader as api
model_gensim = api.load("glove-wiki-gigaword-100")
embedding_matrix_gensim = np.zeros((num_words_m, word_vec_size))
for word, i in words.items():
  if i < num_words_m:
    if word in model_gensim:
      embedding_matrix_gensim[i] = model_gensim[word]

In [None]:

# Predicting test cases - Gensim API Matrix
cnn_multi_gen = buildMultiCNNModel(train_seq_m, cnn_multi_tr_labels, val_seq_mpad, cnn_multi_val_labels, num_words_m, embedding_matrix_gensim, vec_size=word_vec_size, epoch = 16, window = max_prop_m[0], kernel = max_prop_m[1], dense = max_prop_m[2])
print("train acc:",cnn_multi_gen.history["accuracy"][15])
print("validation acc:",cnn_multi_gen.history["val_accuracy"][15])
multi_test_acc_gen = models_m[-1].evaluate(test_seq_multi_pad,cnn_multi_ts_labels)
print("Test accuracy:",multi_test_acc_gen)

(None, 100) <dtype: 'float32'>
(None, 5) <dtype: 'float32'>
(None, 5)
<class 'keras.engine.keras_tensor.KerasTensor'>
Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 100)]             0         
                                                                 
 embedding_12 (Embedding)    (None, 100, 100)          1263200   
                                                                 
 conv1d_30 (Conv1D)          (None, 97, 128)           51328     
                                                                 
 max_pooling1d_20 (MaxPoolin  (None, 32, 128)          0         
 g1D)                                                            
                                                                 
 conv1d_31 (Conv1D)          (None, 30, 64)            24640     
                                                                 
 max_p

## My Report

In this project we had to perform sentiment analysis and categorize customer reviews based on their positivity towards the place & service, using natural language processing techniques. The data at hand was in the form of texts and the labels were numbered from 1 to 5, based on increasing positivity. 

The training data has a size of 18000 and the test had 2000 reviews. To train our models, we have dropped the reviews with label "3" and merged 1 with 2 and 4 with 5 to downsample the population to 2 classes, labelling them 0 and 1. Basically negative and positive, without neutral comments. For multi classes we have only ordered our labels from 0 to 4 for one hot encoding arrangements and architectural purposes.

The preprocessing phase had 4 main steps: lowering inputs, tokenizing into words, removing stop words and unnecesarry tokens, and then stemming. The same process was applied to both training and test texts. For non-neural models (which are Naive Bayes & Logistic Regression), stratified k-fold validation was used. For CNN models, ten percent (10%) of the training data was splitted and reserved for validation. 

For the non-neural models, grid searching technique was used to find the best accuracy producing hyperparameters. After individual runs for each model type and hyperparameter combination, we have acquired the following results: 


| Label Type | Model Type  | Hyperparameters                               | Accuracy | F1   |
|------------|-------------|-----------------------------------------------|----------|------|
| Binary     | Naive Bayes | min_df=100, ngram_range=(1,3)                 | 0.87     | 0.87 |
| Binary     | Logistic    | min_df=100, ngram_range = (1,2), l1_ratio=0.5 | 0.91     | 0.91 |
| Multi      | Naive Bayes | min_df=100, ngram_range=(1,2)                 | 0.49     | 0.47 |
| Multi      | Logistic    | min_df=100, ngram_range = (1,3), l1_ratio=0.5 | 0.56     | 0.56 |


When compared, logistic regression regression surpasses naive bayes in terms of accuracy for both labeling types. This could be the result of the evaluation algorithm, which utilizes l1 & l2 regularization together.

For the convolutional models, we have utilized 3 different embedding matrices to fill our word embedding layer, to ease the identificaiton of relation between words. First one was randomly initialized with values between -1 and 1. The second one utilized a word corpus that was made from scratch using our own training data with the help of Word2Vec function. The last one was using pretrained models from the Gensim library to match and extract embedding vectors with our tokens in our corpus. These weights were then connected to a convolutional network and a fully connected layer before outputting. The convolutional part was performing a convolution followed by a max pooling layer 3 times, the final one being a global max pooling layer. Then it was connected to a dense network of a varying kernel size. 

The hyperparameter tuning was for the binary labels performed using the randomly initialized embedding matrix and for w2v matrices with multiclass labeling, for equally giving a chance to both methods rather than one. Hyperparameters tried were as follows: window size = [128, 64]
kernel size = [6,4]
dense layer node size = [64,32].

The best model networks were for binary labels: [64, 4, 32]
The best model networks were for multiclass labels: [128, 4, 32]

The best models were run for each type of embedding matrix and it was found that for the binary labelling, the best model was word2vec, followed by random initialization, followed by gensim with accuracies 0.86, 0.84, 0.80 respectively. For multiclass labeling, the order was the same, with 0.49, 0.45, 0.41 respectively. 

These results were heavily influenced by the architecture of our model. Repeated convolution and max pooling of my model perhaps diminished the effects of some certain tokens perhaps because they were coupled with more repeating/significant words based on context. The dense layer size also affected performance heavily because it controlled the amount the model learned and sometimes it could have lead to overfitting to training data. In such a case, dropout layer could have been used.

Binary segmentation was accurate enough for general purpose usage. However, the same cannot be said for multiclass labelling since almost half of the time the test labels were misrepresented. This could be due to the fact that for even a human reader, the labelling of mediocrely positive or negative comments is ambigious in essence. One might not be so sure to label a review as neutral or negative/positive when the comment includes sentiments of both. Since the labels of the test data are also human-error prone, there might not always be obvious mathematical patterns to be learnt by our model. Or perhaps this type of learning isn't well suited for such a multilabelling task. 