In [2]:
import nltk
import numpy as np
import matplotlib as pyplot
import seaborn as sns
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [3]:
# Read the review dump; later cells rely on its 'review_text' column.
data = pd.read_csv(filepath_or_buffer='Terraria.csv')

In [4]:
# Keep only the rows that actually have review text.
data = data[data['review_text'].notna()]

In [5]:
# Force every review to a Python string so the text-processing steps
# further down never receive a non-string value.
data = data.assign(review_text=data['review_text'].astype(str))

In [6]:
# Derive the binary target from TextBlob: 1 when the polarity is strictly
# positive, 0 otherwise (a polarity of exactly 0 is therefore labelled 0).
data['sentiment'] = data['review_text'].map(
    lambda review: int(TextBlob(review).sentiment.polarity > 0)
)

In [7]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once.

    The set is cached on the function object after the first call, so later
    calls never go back to the corpus reader.
    """
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess_text(text):
    """Normalise a review into a space-separated string of content tokens.

    Steps: lower-case the text, strip every character that is neither a word
    character nor whitespace, tokenize with ``word_tokenize`` and drop English
    stop words.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when no
        token survives.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Look the stop words up in a cached set. Calling
    # stopwords.words('english') inside the comprehension condition would
    # rebuild the list and scan it linearly for every single token.
    stop_words = _english_stopwords()
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(tokens)

In [8]:
# Run the cleaning function over every review and store the result
# alongside the original text.
data['processed_text'] = data['review_text'].map(preprocess_text)

In [9]:
# Features are the cleaned reviews, targets the TextBlob-derived labels.
X, y = data['processed_text'], data['sentiment']

# 80/20 hold-out split; the fixed random_state keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Length every encoded review is padded or truncated to.
max_length = 100

# The vocabulary is fitted on the training split only, so the test split
# does not influence the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Training split: words -> integer ids -> fixed-length, post-padded rows.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(
    X_train_seq, maxlen=max_length, padding='post', truncating='post'
)

# Test split: encoded with the same fitted tokenizer and padding settings.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(
    X_test_seq, maxlen=max_length, padding='post', truncating='post'
)

In [11]:
# Binary sentiment classifier: token embedding -> LSTM -> sigmoid output.
#
# The Embedding layer is built without `input_length`: Keras 3 deprecates
# that argument and warns when it is passed. The sequence length is taken
# from the padded arrays when the model is first fitted.
#
# The vocabulary size (5000) has to stay in sync with the `num_words` given
# to the Tokenizer that produces the input ids.
#
# NOTE(review): the inputs are zero-padded at the end of each sequence, so
# the LSTM's final state is computed after reading the padding. Passing
# `mask_zero=True` to Embedding would likely help -- TODO confirm by
# retraining, since it changes the reported metrics.
model = Sequential([
    Embedding(5000, 128),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [12]:
# Train on the padded training sequences. The call is left as the cell's
# last expression, so the returned History object is echoed as cell output.
model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=20,
)

Epoch 1/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 90s 40ms/step - accuracy: 0.6904 - loss: 0.6210
Epoch 2/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 81s 38ms/step - accuracy: 0.6915 - loss: 0.6155
Epoch 3/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 77s 37ms/step - accuracy: 0.6919 - loss: 0.6120
Epoch 4/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 85s 40ms/step - accuracy: 0.8660 - loss: 0.3207
Epoch 5/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 86s 41ms/step - accuracy: 0.9279 - loss: 0.1917
Epoch 6/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 84s 40ms/step - accuracy: 0.9420 - loss: 0.1647
Epoch 7/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 81s 38ms/step - accuracy: 0.9510 - loss: 0.1403
Epoch 8/20
2118/2118 ━━━━━━━━━━━━━━━━━━━━ 81s 38ms/step - accuracy: 0.9575 - loss: 0.1214
Epoch 9/

<keras.src.callbacks.history.History at 0x1e70aa12720>

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [14]:
# Score the held-out reviews and turn the sigmoid outputs into hard 0/1
# labels using a 0.5 decision threshold.
y_pred = (model.predict(X_test_pad) > 0.5).astype(int)

530/530 ━━━━━━━━━━━━━━━━━━━━ 8s 15ms/step


In [15]:
# Evaluate the thresholded predictions against the held-out labels.
# The four scalar metrics share the same (y_true, y_pred) call signature,
# so they are computed in one pass over the scoring functions.
accuracy, precision, recall, f1 = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
]
confusion = confusion_matrix(y_test, y_pred)

# Report each score to four decimals, followed by the raw confusion matrix.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:\n", confusion)

Accuracy: 0.8948
Precision: 0.9216
Recall: 0.9263
F1 Score: 0.9240
Confusion Matrix:
 [[ 4331   921]
 [  861 10828]]
