In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load train and test datasets
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")


In [22]:
# Ensure text is string type and fill NaN values
train_df['review'] = train_df['review'].astype(str).fillna("")
test_df['review'] = test_df['review'].astype(str).fillna("")

In [20]:
#Converted Already
# Convert sentiment labels to numeric (1 = positive, 0 = negative)
#train_df["sentiment"] = train_df["sentiment"].map({"positive": 1, "negative": 0})
#test_df["sentiment"] = test_df["sentiment"].map({"positive": 1, "negative": 0})

In [23]:
# Tokenization (Convert words to numbers)
MAX_VOCAB_SIZE = 10000  # Number of unique words to keep
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["review"])

In [24]:
# Convert reviews to sequences
X_train = tokenizer.texts_to_sequences(train_df["review"])
X_test = tokenizer.texts_to_sequences(test_df["review"])

In [25]:
# Padding sequences to ensure fixed input size
MAX_SEQUENCE_LENGTH = 200  # Adjust based on average review length
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

In [26]:
# Convert labels to numpy arrays
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])

In [29]:
# Debugging
print("Vocabulary Size:", len(tokenizer.word_index))
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Number of training samples:", len(y_train))
print("Number of testing samples:", len(y_test))

Vocabulary Size: 10299
Shape of X_train: (8538, 200)
Shape of X_test: (2135, 200)
Number of training samples: 8538
Number of testing samples: 2135


In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define model architecture
EMBEDDING_DIM = 100  # Dimensionality of word embeddings
LSTM_UNITS = 128     # Number of LSTM units

In [31]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=EMBEDDING_DIM, input_length=200),
    LSTM(LSTM_UNITS, return_sequences=False),
    Dropout(0.3),  # Prevent overfitting
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()



In [32]:
# Train the model
EPOCHS = 5
BATCH_SIZE = 32

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

Epoch 1/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 105ms/step - accuracy: 0.8369 - loss: 0.4409 - val_accuracy: 0.8618 - val_loss: 0.4272
Epoch 2/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 104ms/step - accuracy: 0.8625 - loss: 0.4115 - val_accuracy: 0.8618 - val_loss: 0.4030
Epoch 3/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 118ms/step - accuracy: 0.8662 - loss: 0.4029 - val_accuracy: 0.8618 - val_loss: 0.4017
Epoch 4/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 104ms/step - accuracy: 0.8657 - loss: 0.4025 - val_accuracy: 0.8614 - val_loss: 0.4021
Epoch 5/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 100ms/step - accuracy: 0.8656 - loss: 0.4023 - val_accuracy: 0.8614 - val_loss: 0.4019


In [33]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 LSTM Model Accuracy (Without Jargon Handling):", accuracy)

# Detailed classification report
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step

🔹 LSTM Model Accuracy (Without Jargon Handling): 0.8613583138173302

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       295
           1       0.86      1.00      0.93      1840

    accuracy                           0.86      2135
   macro avg       0.43      0.50      0.46      2135
weighted avg       0.74      0.86      0.80      2135

