In [None]:
import pandas as pd
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import Accuracy, Recall, Precision, MeanSquaredError
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Colab-only step: mount Google Drive so the dataset path used later in this
# notebook resolves. This import is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the pre-cleaned reviews CSV on the mounted Drive.
reviews_cleaned = "/content/drive/MyDrive/NLP_FINAL/data/cleanedDataset.csv"

# Read the whole file into a DataFrame; the "text" and "sentiment" columns
# are the ones consumed further down.
dataset = pd.read_csv(filepath_or_buffer=reviews_cleaned)

In [None]:
# Inputs are the review texts, targets are the sentiment labels.
X = dataset["text"].values
y = dataset["sentiment"].values

# Hold out 20% of the rows as the test split. The fixed seed makes the
# shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

In [None]:
# Every review is padded or truncated to this many tokens.
sequences_length = 50

# Build the vocabulary from the training texts only, so nothing from the
# test split influences the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Map texts to integer sequences and bring them to a fixed length in one step.
sequences_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train),
    maxlen=sequences_length,
)
sequences_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=sequences_length,
)

In [None]:
# One more than the number of known words: Keras word indices start at 1 and
# pad_sequences fills with 0, so the embedding needs a row for index 0 too.
vocabulary_size = 1 + len(tokenizer.word_index)

In [None]:
# 4-fold cross-validation of a small LSTM sentiment classifier, run on the
# training split only; the held-out test split is not used in this cell.
folds = 4

# Seed the shuffle so fold membership is identical on every run, using the
# same seed value as the train/test split. Weight initialisation and batch
# order are still unseeded, so scores can differ slightly between runs.
kfold = KFold(n_splits=folds, shuffle=True, random_state=1)

# Hyperparameters do not change between folds, so they are defined once.
embedding_dim = 16
lstm_units = 32

# One entry per fold: the list returned by model.evaluate, ordered like
# model.metrics_names (loss first, then the compiled metrics).
s = []

# `train` and `test` are index arrays into the training split; `test` is the
# fold's validation part, not the held-out test set.
for train, test in kfold.split(X_train, y_train):
  # Build a fresh model for every fold so no weights carry over.
  model = Sequential()
  model.add(Embedding(vocabulary_size, embedding_dim))
  model.add(LSTM(lstm_units))
  model.add(Dropout(.5))
  # Single sigmoid unit: binary sentiment probability.
  model.add(Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall(), MeanSquaredError()])
  model.summary()

  # Train on this fold's training indices only.
  model.fit(sequences_train[train], y_train[train], epochs = 5, batch_size=10, verbose = 1)

  # Score on the part of the training split left out of this fold.
  scores = model.evaluate(sequences_train[test], y_train[test], verbose=1)
  print("\n\n")
  print(model.metrics_names)
  print(scores)

  s.append(scores)

# After the loop, `model` refers to the model trained in the last fold.

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 16)          1106944   
                                                                 
 lstm_9 (LSTM)               (None, 32)                6272      
                                                                 
 dropout_9 (Dropout)         (None, 32)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,113,249
Trainable params: 1,113,249
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5



['loss', 'accuracy', 'precision_6', 'recall_6', 'mean_squared_error']
[0.404997855424881, 0.8493186831474304, 0.90033501386

In [None]:
import numpy as np

# Position 1 of each fold's score list is the accuracy (position 0 is the
# loss), so this is the mean cross-validation accuracy.
print(
    "avg acc",
    np.mean([fold_scores[1] for fold_scores in s]),
)
print()

avg acc 0.8490315079689026


In [None]:
# Score the held-out test split. Despite the name, `acc` holds the full list
# returned by evaluate (loss followed by every compiled metric).
# NOTE(review): `model` here is whatever the cross-validation loop left
# behind, i.e. the last fold's model, which was fitted on only part of the
# training split. Consider refitting on all of it before this step.
acc = model.evaluate(x=sequences_test, y=y_test)



In [None]:
acc

[0.45100492238998413,
 0.8460866808891296,
 0.8665032386779785,
 0.932374119758606,
 0.11887399852275848]