# **Neural Networks and Transformers:**

## **Convolutional Neural Networks:**

In [12]:
# dataframe and series 
import pandas as pd
import numpy as np

# sklearn imports for modeling part
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from sklearn.model_selection import train_test_split

from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.plotting import plot_decision_regions

from sklearn.metrics import confusion_matrix

# To plot
import matplotlib.pyplot as plt  
%matplotlib inline    
import matplotlib as mpl
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

In [13]:
import tensorflow as tf
tf.__version__
import keras
from keras import models, layers, optimizers
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import pad_sequences
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Input, Flatten, BatchNormalization
from keras.layers import RandomRotation, RandomTranslation, RandomFlip
from keras.models import Model
from keras.optimizers import RMSprop

In [14]:
# Load the review training set from the Kaggle input directory.
df_cnn = pd.read_csv(
    filepath_or_buffer='/kaggle/input/kindle-store-reviews-train-cnn/train.csv',
)

In [15]:
# Preview the loaded frame; the notebook renders only the first and last rows.
df_cnn

Unnamed: 0,overall,verified,reviewerID,asin,reviewText,summary,style.Format:,sentiment,review_clean
0,4.0,True,A25XR29KUK69PJ,B009QJMMKM,this is the first time i read the series and i...,Great Story,Kindle Edition,2,this is the first time i read the series and i...
1,5.0,True,A32L1LC9TOB1YI,B00F53T9D6,who do you turn to when you are being threaten...,Great mystery!,Kindle Edition,2,who do you turn to when you are being threaten...
2,5.0,False,A7KP9BA1UJGJ5,B00YQNI0RC,i received a copy of this book from the author...,Absolutely wonderful!!,Kindle Edition,2,i received a copy of this book from the author...
3,5.0,True,AW3TEBNYS6SFY,B001V9KG4E,stan morris - did a great job with this book: ...,Surviving the Fog Review,Kindle Edition,2,stan morris did a great job with this book go...
4,5.0,False,AODTVO069QC8T,B018WHN5EU,we find smoke just where he flourishes best......,Smoke Never Disappoints,Kindle Edition,2,we find smoke just where he flourishes bestaga...
...,...,...,...,...,...,...,...,...,...
1778053,4.0,True,A4K85P27BHGST,B01BJBXCVM,it started lik a mysterie and became a romance...,Kiss is Fun,Kindle Edition,2,it started lik a mysterie and became a romance...
1778054,5.0,True,AY0LUGVU3EAKG,B00GX8U90E,"i've read both books, and i can say they were ...",Really Good Read,Kindle Edition,2,ive read both books and i can say they were ve...
1778055,2.0,False,A1JEFUO52AB8XH,B017WE2YUU,turkey and miracle's storyline was the worst.....,Not what I expected,Kindle Edition,0,turkey and miracles storyline was the worst th...
1778056,4.0,False,A2TQTPT4S772SD,B01GF3I5XY,this second volume of rave soup for the writer...,Rave Soup for the Writer's Soul: Volume Two,Kindle Edition,2,this second volume of rave soup for the writer...


In [16]:
# Drop every row that has a missing value in any column. Reassigning the
# result (instead of inplace=True) avoids mutating the frame in place.
df_cnn = df_cnn.dropna()

In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffled split reproducible across runs.
train_data, test_data = train_test_split(
    df_cnn,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)

In [18]:
# Labels and cleaned review text for the training split.
train_target, train_texts = train_data['sentiment'], train_data['review_clean']

# The same two columns for the held-out split.
test_target, test_texts = test_data['sentiment'], test_data['review_clean']

### **Preparing data for Keras:**

In [19]:
def converting_texts(texts):
    """Return the items of ``texts`` as a new plain Python list.

    Args:
        texts: Any iterable of documents (for example a pandas Series of
            review strings).

    Returns:
        list: The items of ``texts`` in iteration order.
    """
    # list() copies any iterable in one pass; no manual append loop is needed.
    return list(texts)
        
# Materialise both text splits as plain Python lists.
train_texts, test_texts = (
    converting_texts(split) for split in (train_texts, test_texts)
)

I need to tokenize my text and pad the resulting sequences before modeling my data. I will use the Keras preprocessing tools for this.

In [20]:
# Vocabulary cap handed to the tokenizer as num_words.
max_feat = 12_000

# Build the tokenizer's internal vocabulary from the training reviews only.
# Fitting is required before texts_to_sequences can be used.
tokenizer = Tokenizer(num_words=max_feat)
tokenizer.fit_on_texts(train_texts)

# Turn every review into a sequence of integer word ids. Only the top
# num_words-1 most frequent words are kept, and words the tokenizer has not
# seen are dropped.
train_texts, test_texts = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

In [None]:
# Every review is padded to the length of the longest tokenised training review.
max_len = max(map(len, train_texts))

# pad_sequences converts the ragged lists of token ids into 2D NumPy arrays
# with max_len columns; the test split reuses the training max_len.
train_texts = pad_sequences(train_texts, maxlen=max_len)
test_texts = pad_sequences(test_texts, maxlen=max_len)

# Sanity check: number of examples in each split, one per line.
print(len(train_texts), len(test_texts), sep='\n')

To use batches efficiently, all sequences need to have the same length. I chose to pad everything to the length of the longest review in the training data.

### Building a Model:
This simple model uses a convolutional neural network on top of a 64-dimensional embedding. Two convolutional layers are used: the first is followed by batch normalization and max pooling, and the second by global max pooling. The result is passed to a dense layer and then to the output layer for prediction.

Batch normalization normalizes and scales inputs or activations, reducing the amount by which the hidden unit values shift around. Max pooling downsamples the input representation by taking the maximum value over the window defined by the pool size.

In [None]:
def build_model():
    """Build and compile the 1D-convolutional sentiment classifier.

    Reads the notebook-level ``max_len`` (input sequence length) and
    ``max_feat`` (embedding vocabulary size).

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output unit, trained
        with RMSprop on binary cross-entropy and reporting binary accuracy.
    """
    # NOTE(review): a sigmoid output with binary_crossentropy expects 0/1
    # labels, but the `sentiment` preview shows the value 2 -- confirm the
    # label encoding before training.
    sequences = layers.Input(shape=(max_len,))
    embedded = layers.Embedding(max_feat, 64)(sequences)

    # First conv stage: 3-wide filters, then batch norm and 3x downsampling.
    x = layers.Conv1D(64, 3, activation='relu')(embedded)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool1D(3)(x)

    # Second conv stage: 5-wide filters. Global max pooling already collapses
    # the time axis to one 64-value vector per example, so no Flatten layer
    # is needed before the dense head.
    x = layers.Conv1D(64, 5, activation='relu')(x)
    x = layers.GlobalMaxPool1D()(x)

    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    return model


model = build_model()

In [None]:
# Train on the padded training sequences and their labels, validating on the
# held-out split. The arrays are passed directly (as in the RNN training
# cell): `train_dataset` / `test_dataset` are not defined in this notebook,
# and `batch_size` must not be combined with dataset-style inputs.
model.fit(
    train_texts,
    train_target,
    batch_size=128,
    epochs=2,
    validation_data=(test_texts, test_target),
)

I ran the code in the data science lab because my laptop does not have a GPU, printing the outputs here: 
* loss_value = 0.28
* binary accuracy = 0.91

## **Recurrent Neural Networks:**

In [None]:
def build_rnn_model():
    """Build and compile the stacked-GRU sentiment classifier.

    Reads the notebook-level ``max_len`` (input sequence length) and
    ``max_feat`` (embedding vocabulary size).

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output unit.
    """
    # NOTE(review): a sigmoid output with binary_crossentropy expects 0/1
    # labels, but the `sentiment` preview shows the value 2 -- confirm the
    # label encoding before training.
    token_ids = layers.Input(shape=(max_len,))
    token_vectors = layers.Embedding(max_feat, 64)(token_ids)

    # Two stacked GRUs: the first returns the full sequence so the second
    # can consume it; the second returns only its final state.
    hidden = layers.GRU(128, return_sequences=True)(token_vectors)
    hidden = layers.GRU(128)(hidden)

    # Dense head feeding the single sigmoid unit.
    hidden = layers.Dense(32, activation='relu')(hidden)
    hidden = layers.Dense(100, activation='relu')(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    network = models.Model(inputs=token_ids, outputs=probability)
    network.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )
    return network


rnn_model = build_rnn_model()

In [None]:
# One pass over the padded training sequences, validated on the held-out split.
rnn_model.fit(
    x=train_texts,
    y=train_target,
    validation_data=(test_texts, test_target),
    epochs=1,
    batch_size=128,
)

I ran the code in the Data Science lab, and here are the outputs: 
* loss_value = 0.20
* binary_accuracy = 0.92