In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google.colab import drive
drive.mount('/content/drive')
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
df = pd.read_csv('/content/drive/MyDrive/Notebooks/train.tsv',sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [32]:
import re
import spacy

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
  text = str(text) if type(text) != str else text
  text = text.lower()
  text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"\'s", " ", text)
  text = re.sub(r"[^a-zA-Z0-9]", " ", text)
  text = re.sub(r'http\S+', '', text)

  doc = nlp(text)
  tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

  return None if len(tokens) == 0 else " ".join(tokens)

df['cleaned_text'] = df['Phrase'].apply(clean_text)

In [33]:
df.isna().sum()

Unnamed: 0,0
PhraseId,0
SentenceId,0
Phrase,0
Sentiment,0
cleaned_text,2078


In [34]:
df.dropna(inplace=True)

In [35]:
df.isna().sum()

Unnamed: 0,0
PhraseId,0
SentenceId,0
Phrase,0
Sentiment,0
cleaned_text,0


In [36]:
df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
2,77725
3,32808
1,27185
4,9192
0,7072


In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['Sentiment'], test_size=0.2, random_state=42)

In [46]:
num_of_words = set()

for txt in df['cleaned_text']:
  for word in txt.split():
    num_of_words.add(word)

len(num_of_words)

12769

In [47]:
num_words = 10000

In [48]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,SpatialDropout1D

Token = Tokenizer(num_words=num_words, oov_token='<OOV>')
Token.fit_on_texts(X_train)

X_train_tok = Token.texts_to_sequences(X_train)
X_test_tok = Token.texts_to_sequences(X_test)

In [49]:
X_train_tok[:3]

[[1222], [260], [39, 480, 4290, 2741, 859]]

In [50]:
X_train_pad = pad_sequences(X_train_tok, maxlen=100)
X_test_pad = pad_sequences(X_test_tok, maxlen=100)

In [51]:
X_train_pad[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        1222],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [55]:
model = Sequential([
    Embedding(num_words, 256, input_length=len(X_train_pad)),
    SpatialDropout1D(0.2),
    LSTM(256, dropout=0.2),

    Dense(256, activation='relu'),
    Dropout(0.2),

    Dense(5, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [57]:
model.fit(X_train_pad,y_train,epochs=8,validation_data=(X_test_pad,y_test),batch_size=512)

Epoch 1/8
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 75ms/step - accuracy: 0.5252 - loss: 1.1846 - val_accuracy: 0.6336 - val_loss: 0.9149
Epoch 2/8
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 72ms/step - accuracy: 0.6499 - loss: 0.8609 - val_accuracy: 0.6426 - val_loss: 0.8824
Epoch 3/8
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 72ms/step - accuracy: 0.6734 - loss: 0.7979 - val_accuracy: 0.6494 - val_loss: 0.8721
Epoch 4/8
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 71ms/step - accuracy: 0.6891 - loss: 0.7523 - val_accuracy: 0.6511 - val_loss: 0.8722
Epoch 5/8
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 72ms/step - accuracy: 0.7028 - loss: 0.7123 - val_accuracy: 0.6493 - val_loss: 0.8816
Epoch 6/8
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 72ms/step - accuracy: 0.7129 - loss: 0.6878 - val_accuracy: 0.6503 - val_loss: 0.8872
Epoch 7/8
[1m241/241

<keras.src.callbacks.history.History at 0x7afe5b7736d0>