In [None]:
# Mount Google Drive at /content/drive; later cells read their inputs from,
# and write their artefacts to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
import warnings
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import re
#TQDM is a progress bar library
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
import random
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Dropout, Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the tab-separated train and test splits from Drive (train first, then test).
train, test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "/content/drive/MyDrive/Colab/train.tsv",
        "/content/drive/MyDrive/Colab/test.tsv",
    )
)

In [None]:
# Shared NLTK helpers used by the preprocessing functions defined in this cell.
stemmer = PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Lower-case every phrase in BOTH splits so train and test go through identical
# preprocessing. Previously only `train` was lower-cased, which left `test` to be
# tokenized and lemmatized with its original casing.
# `apply(str.lower)` is used on purpose: it raises if a value is not a string
# (e.g. the cell is re-run after phrases were already turned into token lists)
# instead of silently producing missing values.
train['Phrase'] = train['Phrase'].apply(str.lower)
test['Phrase'] = test['Phrase'].apply(str.lower)

#lemmatization
def lemmatize_text(text):
    """Tokenize ``text`` with the module-level ``token`` tokenizer and lemmatize each token.

    Returns a list holding one lemmatized string per token, in token order.
    """
    lemmas = []
    for word in token.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

#stemming
def stem_text(text):
    """Tokenize ``text`` with the module-level ``token`` tokenizer and stem each token.

    Returns a list holding one stemmed string per token, in token order.
    """
    pieces = token.tokenize(text)
    return list(map(stemmer.stem, pieces))

#stop word removal
# Lazily-filled cache of the English stop-word set (populated on first use by
# stop_word_removing so the corpus is not re-read for every phrase).
_ENGLISH_STOPWORDS = None


def stop_word_removing(text):
    """Tokenize ``text`` and drop every token that is an English stop word.

    The stop-word list is loaded from the NLTK corpus once and kept as a
    frozenset, so membership tests are constant-time and the corpus is not
    reloaded on each call (this function is meant to be applied per phrase).

    Parameters
    ----------
    text : str
        Raw phrase to filter.

    Returns
    -------
    list of str
        The tokens produced by the module-level ``token`` tokenizer, in order,
        minus those found in NLTK's English stop-word list.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return [w for w in token.tokenize(text) if w not in _ENGLISH_STOPWORDS]

#detokenization
def detokenize(text):
    """Join a sequence of tokens into a single string using NLTK's Treebank detokenizer."""
    treebank_detokenizer = TreebankWordDetokenizer()
    joined = treebank_detokenizer.detokenize(text)
    return joined

# Active preprocessing: each phrase is replaced by its list of lemmatized tokens,
# first for the training split and then for the test split.
#
# Disabled experiments (apply the same step to both splits if re-enabled):
#   - before lemmatizing: stop_word_removing followed by detokenize
#   - after lemmatizing:  stem_text followed by detokenize
train["Phrase"] = train["Phrase"].apply(lemmatize_text)
test["Phrase"] = test["Phrase"].apply(lemmatize_text)

In [None]:
# Seed for the train/validation split so that re-running the notebook yields the
# same partition (and therefore the same vocabulary and sequence length below).
SPLIT_SEED = 42

# One-hot encode the integer sentiment labels; the column count is the class count.
target = train.Sentiment.values
y_target = to_categorical(target)
num_classes = y_target.shape[1]
print(num_classes)
print(train.Sentiment.values)

# Hold out 20% of the phrases for validation, stratified on the integer labels
# so both parts keep the same class proportions.
X_train,X_val,y_train,y_val = train_test_split(train["Phrase"], y_target,
                                               test_size=0.2, stratify=target,
                                               random_state=SPLIT_SEED)

# Vocabulary and the longest phrase length are measured on the training part only.
unique_words = set()
len_max = 0

for sent in tqdm(X_train):
    unique_words.update(sent)
    len_max = max(len_max, len(sent))

# NOTE(review): per the Keras docs only the `num_words - 1` most frequent words are
# kept, so with num_words == vocabulary size the rarest word is presumably dropped.
# The Embedding layer in the model cell is sized with the same len(unique_words);
# the two must be changed together if this is ever adjusted.
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(list(X_train))

# Map token lists to integer index sequences using the training vocabulary.
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test["Phrase"])

# Pad/truncate every sequence to the longest training phrase.
X_train = sequence.pad_sequences(X_train, maxlen=len_max)
X_val = sequence.pad_sequences(X_val, maxlen=len_max)
X_test = sequence.pad_sequences(X_test, maxlen=len_max)

print(X_train.shape,X_val.shape,X_test.shape)

5
[1 2 2 ... 3 2 2]


100%|██████████| 124848/124848 [00:00<00:00, 288255.37it/s]


(124848, 48) (31212, 48) (66292, 48)


In [None]:
# Stop training once validation loss has not improved by at least 0.001 for
# 2 consecutive epochs. The monitored quantity is a loss, so "improvement" means
# a decrease: mode must be 'min'. (It was 'max', which treats a rising loss as
# progress and stops training while the model is still getting better.)
early_stopping = EarlyStopping(min_delta = 0.001, mode = 'min', monitor='val_loss', patience = 2)
callback = [early_stopping]

#Model using Keras LSTM
# Architecture: 300-dim embedding -> LSTM(128, full sequence) -> LSTM(64, last state)
# -> Dense(100, relu) -> Dropout(0.5) -> softmax over the sentiment classes.
model = Sequential()
# Embedding input size matches the `num_words` given to the tokenizer.
model.add(Embedding(len(unique_words),300,input_length=len_max))
model.add(LSTM(128,dropout=0.5, recurrent_dropout=0.5, return_sequences=True))
model.add(LSTM(64,dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
model.add(Dense(100,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer=Adam(learning_rate=0.005),metrics=['accuracy'])
model.summary()

#model fitting
# Up to 6 epochs; early stopping may end training sooner.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),epochs=6, 
                  batch_size=256, verbose=1, callbacks=callback)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 48, 300)           4161300   
                                                                 
 lstm (LSTM)                 (None, 48, 128)           219648    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 100)               6500      
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 505       
                                                                 
Total params: 4,437,361
Trainable params: 4,437,361
Non-

In [None]:
# Score the trained model on the validation split. The model was compiled with a
# loss and metrics=['accuracy'], so despite the name this holds evaluation
# scores rather than predictions.
pred_validation = model.evaluate(x=X_val, y=y_val)



In [None]:
# Persist the model in two parts on Drive: the architecture as JSON ...
model_json = model.to_json()
with open("/content/drive/MyDrive/Colab/modelLSTM.json", "w") as architecture_file:
    architecture_file.write(model_json)

# ... and the trained weights as HDF5.
model.save_weights("/content/drive/MyDrive/Colab/modelLSTM.h5")
print("Ho salvato il modello!")

In [None]:
# Run the trained model on the padded test sequences; the final layer is a
# softmax over num_classes, so each row holds one score per sentiment class.
test_prediction = model.predict(x=X_test)

In [None]:
# Predicted class per test phrase = index of the highest score in each row.
# Vectorized over the whole prediction matrix instead of a Python-level loop;
# .tolist() keeps `label_pred_max` a plain list, as before.
label_pred_max = np.argmax(test_prediction, axis=1).tolist()

# Show only a short preview: printing every prediction floods the notebook output.
print(label_pred_max[:20])
print(f"... {len(label_pred_max)} predictions in total")

[2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 3, 3, 2, 3, 3, 2, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 3, 4, 3, 2, 3, 3, 3, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 1, 1, 1, 1, 1, 2, 3, 3, 2, 1, 3, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [None]:
test_id = test['PhraseId']

# Build the submission table column-wise. Unlike zip(), which silently truncates
# to the shorter input, the DataFrame constructor raises if the number of ids and
# the number of predictions differ.
submission = pd.DataFrame({'PhraseId': test_id.to_numpy(),
                           'Sentiment': label_pred_max},
                          columns=['PhraseId', 'Sentiment'])

submission.to_csv('/content/drive/MyDrive/Colab/submission_LSTM.csv', index=False)

# Last expression of the cell so the preview is actually rendered
# (a bare expression in the middle of a cell displays nothing).
submission.head(20)