# Imports

In [33]:
# Jupyter notebook autoreload
%load_ext autoreload
%autoreload 2

# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import plotly.express as px
import geopandas as gpd
import matplotlib.pyplot as plt
from IPython.display import Image

import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer



import seaborn as sns
import matplotlib as plt
from statsmodels.graphics.gofplots import qqplot


from sklearn.preprocessing import Binarizer # For transforming target into a binary target.
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split # Split data into train and test data

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

from gensim.models import Word2Vec # Vectorizing the sentences

import gensim.downloader as api # Transfer Learning other Word2Vec

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1 Explore data

In [2]:
train = pd.read_csv("data/train.tsv", sep='\t')
train

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [3]:
train[train["SentenceId"]==1]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
58,59,1,much,2
59,60,1,of a story,2
60,61,1,a story,2
61,62,1,story,2


In [4]:
test = pd.read_csv("data/test.tsv", sep='\t')
test

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine
...,...,...,...
66287,222348,11855,"A long-winded , predictable scenario ."
66288,222349,11855,"A long-winded , predictable scenario"
66289,222350,11855,"A long-winded ,"
66290,222351,11855,A long-winded


In [5]:
# Check how many rows and columns in the dataframe -> Enough data to perform ML
f"Shape of train:{train.shape} and test:{test.shape}"


'Shape of train:(156060, 4) and test:(66292, 3)'

In [6]:
# Display additional info about each columns such as data types and number of non-null values -> Correct type per column
f"Shape of train:{train.info()} and test:{test.info()}"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


'Shape of train:None and test:None'

In [7]:
# Compute the number of duplicated rows -> there are no duplicated rows
f"duplicated rows in train : {train.duplicated().sum()}, duplicated rows in train :  {test.duplicated().sum()}"


'duplicated rows in train : 0, duplicated rows in train :  0'

Missing data

In [8]:
# Counting the percentage of NaN for each column
train.isnull().sum().sort_values(ascending=False)/len(train)


PhraseId      0.0
SentenceId    0.0
Phrase        0.0
Sentiment     0.0
dtype: float64

In [9]:
# Counting the percentage of NaN for each column
test.isnull().sum().sort_values(ascending=False)/len(test)


PhraseId      0.0
SentenceId    0.0
Phrase        0.0
dtype: float64

# Scaling

In [10]:
# There is no need to scale as numerical values are : id.

# Fist Method: Sentiment Analysis. predicting if positive or negative. 

# y need to be simplified to 0 : negative and 1 : positive. 
Knowing that, the sentiment labels are: \
0 - negative \
1 - somewhat negative \
2 - neutral \
3 - somewhat positive \
4 - positive

Then, from 0 to 3 included -> it is negative so 0. the rest is positive so 1. 

In [11]:
threshold_value = 2

binarizer = Binarizer(threshold=threshold_value)

train['Sentiment_binary'] = binarizer.transform(train[['Sentiment']])

train



Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Sentiment_binary
0,1,1,A series of escapades demonstrating the adage ...,1,0
1,2,1,A series of escapades demonstrating the adage ...,2,0
2,3,1,A series,2,0
3,4,1,A,2,0
4,5,1,series,2,0
...,...,...,...,...,...
156055,156056,8544,Hearst 's,2,0
156056,156057,8544,forced avuncular chortles,1,0
156057,156058,8544,avuncular chortles,3,1
156058,156059,8544,avuncular,2,0


Remove Duplicates

In [12]:
# Dataset is unbalanced
train.groupby("Sentiment_binary").count()

Unnamed: 0_level_0,PhraseId,SentenceId,Phrase,Sentiment
Sentiment_binary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,113927,113927,113927,113927
1,42133,42133,42133,42133


In [13]:
sentiment_perc = round(train.Sentiment_binary.value_counts()/train.Sentiment_binary.sum()*100,1)

In [14]:
# Defining x are variables, on which the model will have to guess the target: the sentiment stored in y.
y = train.Sentiment_binary
X = train.drop(columns=["Sentiment_binary","Sentiment"])

In [15]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [16]:
X_train

Unnamed: 0,PhraseId,SentenceId,Phrase
121244,121245,6493,"While not for every taste , this often very fu..."
9149,9150,382,Just a bunch of good actors flailing around in...
48116,48117,2347,of a Vietnam picture
57526,57527,2902,Fincher 's
40904,40905,1960,most pitiful directing
...,...,...,...
119879,119880,6412,its trademark villain
103694,103695,5469,"earn her share of the holiday box office pie ,..."
131932,131933,7112,A moving tale of love and destruction in unexp...
146867,146868,7991,"If you love reading and\/or poetry , then by a..."


# Preprocessing text

In [18]:
def preprocessing(sentence):
    # Removing whitespaces
    sentence = sentence.strip()
    # Lowercasing
    sentence = sentence.lower()
    # Removing numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    # Removing punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')
    # Tokenizing
    tokenized = word_tokenize(sentence)
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized]
    return lemmatized


In [19]:

X_train['clean_reviews'] = X_train.Phrase.apply(preprocessing)

X_test['clean_reviews'] = X_test.Phrase.apply(preprocessing)


In [20]:
X_test

Unnamed: 0,PhraseId,SentenceId,Phrase,clean_reviews
95722,95723,5000,'s as sorry,"[s, a, sorry]"
147312,147313,8016,Romantic comedy and Dogme 95 filmmaking may se...,"[romantic, comedy, and, dogme, filmmaking, may..."
36991,36992,1752,of these days,"[of, these, day]"
150211,150212,8182,flinch from its unsettling prognosis,"[flinch, from, it, unsettling, prognosis]"
140655,140656,7631,are clinically depressed,"[are, clinically, depressed]"
...,...,...,...,...
39479,39480,1885,"as the film grows to its finale , his little c...","[a, the, film, grows, to, it, finale, his, lit..."
136980,136981,7408,acted out,"[acted, out]"
50777,50778,2498,'s no denying the potency of Miller 's strange...,"[s, no, denying, the, potency, of, miller, s, ..."
75758,75759,3883,natural-seeming actors,"[naturalseeming, actor]"


In [21]:
word2vec = Word2Vec(sentences= X_train["clean_reviews"], vector_size= 100 , min_count=10)
wv = word2vec.wv


In [22]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])

    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []

    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)

    return embed

# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train["clean_reviews"])
X_test_embed = embedding(word2vec, X_test["clean_reviews"])


# Pad the training and test embedded sentences
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=200)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=200)


In [25]:
# Checking the shapes:
for X in [X_train_pad, X_test_pad]:
    assert type(X) == np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size


assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)


## Baseline model

In [26]:
from sklearn.metrics import accuracy_score

unique, counts = np.unique(y_train, return_counts=True)
counts = dict(zip(unique, counts))
print('Number of labels in train set', counts)

y_pred = 0 if counts[0] > counts[1] else 1

print('Baseline accuracy: ', accuracy_score(y_test, [y_pred]*len(y_test)))


Number of labels in train set {0: 79998, 1: 29244}
Baseline accuracy:  0.7246999017471912


## Model

In [28]:
def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    return model

model = init_model()


In [29]:
es = EarlyStopping(patience=5, restore_best_weights=True)

model.fit(X_train_pad, y_train,
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )


Epoch 1/100


2024-02-13 15:56:55.297780: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


<keras.callbacks.History at 0x5d9933160>

In [30]:
res = model.evaluate(X_test_pad, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')


The accuracy evaluated on the test set is of 80.584%


### Improving embedding with another Word2Vec: glove-wiki-gigaword-50


In [34]:
print(list(api.info()['models'].keys()))
word2vec_transfer = api.load("glove-wiki-gigaword-50")


['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [46]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])

    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []

    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)

    return embed

# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, X_train["clean_reviews"])
X_test_embed_2 = embedding(word2vec_transfer, X_test["clean_reviews"])


In [47]:
# Pad the training and test embedded sentences
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=200)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=200)


In [48]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model = init_model()

model.fit(X_train_pad_2, y_train,
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
 365/2390 [===>..........................] - ETA: 11:50 - loss: 0.3823 - accuracy: 0.8368

KeyboardInterrupt: 

In [None]:
X_test_pad.shape

(46818, 200, 100)

In [None]:
X_test_pad_2.shape

(4, 200, 50)

In [None]:
y_test.shape

(46818,)

In [None]:
res = model.evaluate(X_test_pad_2, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')


ValueError: Data cardinality is ambiguous:
  x sizes: 4
  y sizes: 46818
Make sure all arrays contain the same number of samples.