In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files from it by path.
drive.mount('/content/drive')
# Silence every warning raised from this point on.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings emitted by the
# libraries used in later cells — consider filtering by category instead.
warnings.filterwarnings('ignore')

Mounted at /content/drive


In [None]:
# Read the tab-separated training file from the mounted Drive
# (read_table uses a tab delimiter by default) and preview the first rows.
df = pd.read_table('/content/drive/MyDrive/Notebooks/train.tsv')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1.0
1,2,1,A series of escapades demonstrating the adage ...,2.0
2,3,1,A series,2.0
3,4,1,A,2.0
4,5,1,series,2.0


In [None]:
import re
import spacy

# spaCy pipeline used by clean_text. That function reads only each token's lemma
# and its stop-word / punctuation flags, so the dependency parser and the
# named-entity recognizer are disabled to avoid running them on every phrase.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def clean_text(text):
  """Normalize a raw phrase into a space-joined string of lemmas.

  Steps, in order: lowercase; remove URLs; expand "what's" to "what is";
  drop the possessive "'s"; replace every remaining non-alphanumeric
  character with a space; run the module-level spaCy pipeline ``nlp`` and
  keep the lemma of each token that is neither a stop word nor punctuation.

  Args:
    text: The phrase to clean. Non-string values are converted with ``str``;
      missing values (``None`` / NaN) are treated as having no content.

  Returns:
    The cleaned phrase as a single string, or ``None`` when the input is
    missing or no token survives the filtering.
  """
  if not isinstance(text, str):
    # A missing phrase must not be stringified into the literal word "nan"/"none".
    if pd.isna(text):
      return None
    text = str(text)

  text = text.lower()
  # URLs are removed before punctuation is stripped: once ':' '/' '.' have been
  # turned into spaces the pattern can only match the bare scheme, leaving the
  # host and path fragments behind as ordinary words.
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"\'s", " ", text)
  text = re.sub(r"[^a-zA-Z0-9]", " ", text)

  doc = nlp(text)
  tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

  # None (rather than "") marks phrases with no usable content as missing values.
  return " ".join(tokens) if tokens else None

# Run clean_text on every phrase and store the result in a new column;
# clean_text returns None for phrases that end up with no tokens.
df['cleaned_text'] = df['Phrase'].apply(clean_text)

In [None]:
# Count missing values per column before any rows are dropped.
df.isna().sum()

Unnamed: 0,0
PhraseId,0
SentenceId,0
Phrase,1
Sentiment,66292
cleaned_text,0


In [None]:
# Keep only rows in which every column has a value.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

Unnamed: 0,0
PhraseId,0
SentenceId,0
Phrase,0
Sentiment,0
cleaned_text,0


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the preview of the data shows several phrases sharing one SentenceId
# (sub-phrases of the same sentence), so a row-level random split may place
# overlapping text in both splits — confirm whether a group-wise split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['Sentiment'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Collect every distinct whitespace-separated token across the cleaned phrases;
# the value displayed by this cell is the size of that vocabulary.
num_of_words = {
    word
    for txt in df['cleaned_text']
    for word in txt.split()
}

len(num_of_words)

In [None]:
# Rebinds num_of_words from the vocabulary set to a plain integer.
# NOTE(review): no later cell shown here reads this value — the tokenizer and the
# embedding layer both hard-code 5000 — confirm whether it is still needed.
num_of_words = 10_000

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the word index on the training phrases only (vocabulary capped through
# num_words=5000), then encode both splits as sequences of integer word ids.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train, X_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every encoded phrase in both splits to a fixed length of 100.
X_train, X_test = [pad_sequences(seqs, maxlen=100) for seqs in (X_train, X_test)]

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, SpatialDropout1D

# Sequence classifier: embedding -> GRU -> dense head with a 5-way softmax output.
model = Sequential([
    # input_dim=5000 matches the num_words cap given to the tokenizer.
    # No input_length is passed: the argument is optional and the sequence length
    # is taken from the data. len(X_train) would be the number of training rows,
    # not the padded sequence length, so it must not be used for that purpose.
    Embedding(input_dim=5000, output_dim=128),
    SpatialDropout1D(0.2),  # drops whole feature channels across the sequence

    GRU(units=128),
    Dropout(0.2),

    Dense(units=64, activation='relu'),
    Dropout(0.2),

    # One probability per class; paired with a sparse categorical loss at compile time.
    Dense(units=5, activation='softmax')
])

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy reported as the metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in batches of 512, evaluating on the held-out split after
# each epoch; the returned History object is the cell's displayed value.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=512,
    epochs=10,
)

Epoch 1/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 1s/step - accuracy: 0.5053 - loss: 1.4467 - val_accuracy: 0.5797 - val_loss: 1.0886
Epoch 2/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 1s/step - accuracy: 0.6002 - loss: 1.0152 - val_accuracy: 0.6139 - val_loss: 0.9924
Epoch 3/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 1s/step - accuracy: 0.6314 - loss: 0.9372 - val_accuracy: 0.6215 - val_loss: 0.9633
Epoch 4/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 1s/step - accuracy: 0.6434 - loss: 0.9030 - val_accuracy: 0.6243 - val_loss: 0.9470
Epoch 5/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m404s[0m 1s/step - accuracy: 0.6519 - loss: 0.8827 - val_accuracy: 0.6260 - val_loss: 0.9368
Epoch 6/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 1s/step - accuracy: 0.6600 - loss: 0.8569 - val_accuracy: 0.6306 - val_loss: 0.9293
Epoch 7/10
[1m244/244

<keras.src.callbacks.history.History at 0x781dfbeec2e0>