In [1]:
# ----------------------------
# Step 0: Install Dependencies
# ----------------------------
# !pip install nltk tensorflow

# ----------------------------
# Step 1: Import Libraries
# ----------------------------
import pandas as pd
import numpy as np
import re
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# Step 2: Gaming Jargon Dictionary
# ----------------------------
gaming_jargon_dict = {
    "gg": "good game",
    "op": "overpowered",
    "nerf": "reduce power",
    "buff": "increase power",
    "noob": "beginner",
    "afk": "away from keyboard",
    "camp": "staying in one place",
    "clutch": "critical moment win",
    "dps": "damage per second",
    "hp": "health points",
    "mp": "mana points",
    "respawn": "reappear",
    "lag": "network delay",
    "meta": "current best strategy",
    "tilt": "frustrated play",
    "p2w": "pay to win",
    "smurf": "high-level player using new account",
    "wallbang": "shooting through walls",
    "frag": "kill",
    "wallhack": "seeing through walls cheat",
    "rng": "random outcome",
    "xp": "experience points",
    "hitbox": "damage detection area",
    "strafe": "side movement",
    "rush": "fast attack",
    "one-tap": "single shot kill"
}

# ----------------------------
# Step 3: Text Preprocessing
# ----------------------------
jargon_replacement_counts = Counter()

def preprocess_text(text):
    if isinstance(text, float):
        return ""
    
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    
    replacement_count = 0
    for slang, replacement in gaming_jargon_dict.items():
        if slang in text:
            replacement_count += text.count(slang)
            text = text.replace(slang, replacement)

    jargon_replacement_counts[replacement_count] += 1
    return text.strip()

# ----------------------------
# Step 4: Load Data
# ----------------------------
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

train_df["cleaned_review"] = train_df["review"].apply(preprocess_text)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text)

print("🔹 Jargon Replacement Count per Review:", jargon_replacement_counts)

# ----------------------------
# Step 5: Prepare Labels
# ----------------------------
y_train = train_df["sentiment"].astype(int)
y_test = test_df["sentiment"].astype(int)

# ----------------------------
# Step 6: Tokenize & Pad Sequences
# ----------------------------
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["cleaned_review"])

X_train_seq = tokenizer.texts_to_sequences(train_df["cleaned_review"])
X_test_seq = tokenizer.texts_to_sequences(test_df["cleaned_review"])

maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# ----------------------------
# Step 7: Build CNN Model
# ----------------------------
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=maxlen),
    Conv1D(128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# ----------------------------
# Step 8: Train Model
# ----------------------------
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.1)

# ----------------------------
# Step 9: Evaluate Model
# ----------------------------
y_pred_prob = model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

accuracy = accuracy_score(y_test, y_pred)
print(f"\n🔹 CNN Model Accuracy: {accuracy:.4f}")
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to C:\Users\CHRISTIN
[nltk_data]     SANTHOSH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔹 Jargon Replacement Count per Review: Counter({0: 9449, 1: 852, 2: 169, 3: 61, 4: 31, 5: 20, 6: 17, 7: 16, 8: 12, 9: 10, 10: 8, 12: 5, 13: 4, 21: 3, 11: 3, 22: 2, 19: 2, 14: 2, 29: 1, 45: 1, 48: 1, 15: 1, 106: 1, 32: 1, 17: 1})




Epoch 1/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 30ms/step - accuracy: 0.8400 - loss: 0.4342 - val_accuracy: 0.8876 - val_loss: 0.2786
Epoch 2/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.8893 - loss: 0.2758 - val_accuracy: 0.9016 - val_loss: 0.2615
Epoch 3/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.9249 - loss: 0.2000 - val_accuracy: 0.9040 - val_loss: 0.2605
Epoch 4/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.9515 - loss: 0.1518 - val_accuracy: 0.8958 - val_loss: 0.2775
Epoch 5/5
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.9582 - loss: 0.1319 - val_accuracy: 0.8888 - val_loss: 0.3615
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step  

🔹 CNN Model Accuracy: 0.8871

🔹 Classification Report:
               precision    recall  f1-score   support

 