In [1]:
import pandas as pd
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load datasets
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Define game jargon dictionary
game_jargon = {
    "gg": "good game",
    "nerf": "reduce power",
    "buff": "increase power",
    "noob": "new player",
    "camping": "hiding and waiting",
    "frag": "kill",
    "smurf": "experienced player using a new account",
    "meta": "most effective tactics",
}

In [2]:
# Function to replace game jargon
def replace_jargon(text):
    for jargon, replacement in game_jargon.items():
        text = re.sub(rf"\b{jargon}\b", replacement, text, flags=re.IGNORECASE)
    return text

# Preprocessing function
def preprocess_text(text):
    if isinstance(text, float):  # Handle NaN values
        return ""
    text = replace_jargon(text)  # Replace jargon
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [3]:
# Apply preprocessing
train_df["cleaned_review"] = train_df["review"].apply(preprocess_text)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text)

# Save cleaned data (optional)
#train_df.to_csv("train_data_cleaned.csv", index=False)
#test_df.to_csv("test_data_cleaned.csv", index=False)

# Check data
print(train_df.head())
print("\nNumber of jargon replacements done:", sum(train_df["review"] != train_df["cleaned_review"]))

                                              review  sentiment  \
0                                                top          1   
1                                          haxorzone          1   
2  played this game so many years ago now we have...          1   
3                                    much laaaaaaaag          0   
4                                     very good game          1   

                                      cleaned_review  
0                                                top  
1                                          haxorzone  
2  played this game so many years ago now we have...  
3                                    much laaaaaaaag  
4                                     very good game  

Number of jargon replacements done: 78


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters
MAX_VOCAB_SIZE = 5000  # Maximum number of words to keep
MAX_SEQUENCE_LENGTH = 200  # Maximum review length

# Initialize Tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["cleaned_review"])  # Fit on training data

# Convert reviews to sequences
X_train = tokenizer.texts_to_sequences(train_df["cleaned_review"])
X_test = tokenizer.texts_to_sequences(test_df["cleaned_review"])

# Pad sequences
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

# Convert labels to NumPy arrays
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])

# Debugging checks
print("Vocabulary Size:", len(tokenizer.word_index))  # Should be <= MAX_VOCAB_SIZE
print("Shape of X_train:", X_train.shape)  # Should be (num_samples, MAX_SEQUENCE_LENGTH)
print("Shape of X_test:", X_test.shape)  # Should be (num_samples, MAX_SEQUENCE_LENGTH)


Vocabulary Size: 10295
Shape of X_train: (8538, 200)
Shape of X_test: (2135, 200)


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Hyperparameters
EMBEDDING_DIM = 100  # Dimension of word embeddings
LSTM_UNITS = 128  # Number of LSTM units
DROPOUT_RATE = 0.3  # Dropout to prevent overfitting

# Build LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=EMBEDDING_DIM, input_length=200),
    LSTM(LSTM_UNITS, return_sequences=True),
    Dropout(DROPOUT_RATE),
    LSTM(LSTM_UNITS),
    Dropout(DROPOUT_RATE),
    Dense(1, activation="sigmoid")  # Binary classification (positive/negative)
])

# Compile the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Model summary
model.summary()

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=5,  # You can increase epochs if needed
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1
)

# Save model
model.save("lstm_model_with_jargon.h5")




Epoch 1/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 196ms/step - accuracy: 0.8598 - loss: 0.4327 - val_accuracy: 0.8618 - val_loss: 0.4027
Epoch 2/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 201ms/step - accuracy: 0.8575 - loss: 0.4139 - val_accuracy: 0.8618 - val_loss: 0.4020
Epoch 3/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 203ms/step - accuracy: 0.8602 - loss: 0.4100 - val_accuracy: 0.8618 - val_loss: 0.4015
Epoch 4/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 198ms/step - accuracy: 0.8615 - loss: 0.4036 - val_accuracy: 0.8600 - val_loss: 0.4123
Epoch 5/5
[1m267/267[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 198ms/step - accuracy: 0.8639 - loss: 0.3984 - val_accuracy: 0.8623 - val_loss: 0.4014




In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on test data
y_pred_probs = model.predict(X_test)  # Probabilities output by the model
y_pred = (y_pred_probs > 0.5).astype(int)  # Convert to binary labels

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 Model Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 59ms/step

🔹 Model Accuracy: 0.8622950819672132

🔹 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.00      0.01       295
           1       0.86      1.00      0.93      1840

    accuracy                           0.86      2135
   macro avg       0.93      0.50      0.47      2135
weighted avg       0.88      0.86      0.80      2135



In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Define maximum vocabulary size and sequence length
MAX_VOCAB_SIZE = 5000  # Adjust based on dataset
MAX_SEQUENCE_LENGTH = 100  # Max length of a review after padding

# Tokenization
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["cleaned_review"])  # Use training text data

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(train_df["cleaned_review"])
X_test_seq = tokenizer.texts_to_sequences(test_df["cleaned_review"])

# Padding sequences
X_train = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post")
X_test = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

# Define vocab_size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

print(f"Vocabulary Size: {vocab_size}")  # Debugging check
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")


Vocabulary Size: 10296
Shape of X_train: (8538, 100)
Shape of X_test: (2135, 100)


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define LSTM model
embedding_dim = 100  # Dimension of word embeddings
lstm_units = 128  # Number of LSTM units

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=MAX_SEQUENCE_LENGTH),
    LSTM(units=lstm_units, return_sequences=False),
    Dropout(0.3),  # Prevent overfitting
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Display model summary
model.summary()




In [16]:
# Train LSTM model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,  # Can be increased based on performance
    batch_size=64,  # Adjust based on hardware
    verbose=1
)


Epoch 1/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 101ms/step - accuracy: 0.8342 - loss: 0.4530 - val_accuracy: 0.8618 - val_loss: 0.4005
Epoch 2/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 107ms/step - accuracy: 0.8604 - loss: 0.4089 - val_accuracy: 0.8618 - val_loss: 0.3997
Epoch 3/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 95ms/step - accuracy: 0.8633 - loss: 0.4015 - val_accuracy: 0.8628 - val_loss: 0.4003
Epoch 4/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 99ms/step - accuracy: 0.8636 - loss: 0.4057 - val_accuracy: 0.8628 - val_loss: 0.3983
Epoch 5/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 103ms/step - accuracy: 0.8636 - loss: 0.4048 - val_accuracy: 0.8609 - val_loss: 0.4051
Epoch 6/10
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 94ms/step - accuracy: 0.8718 - loss: 0.3895 - val_accuracy: 0.8604 - val_loss: 0.4012
Epoch 7/10
[

In [17]:
# Evaluate model on test data
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)

# Print accuracy
print("\n🔹 LSTM Model Accuracy:", accuracy)


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8664 - loss: 0.3918

🔹 LSTM Model Accuracy: 0.8618267178535461
