In [2]:
import pandas as pd
import re
from collections import Counter
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Gaming jargon dictionary
gaming_jargon_dict = {
    "gg": "good game", "op": "overpowered", "nerf": "reduce power",
    "buff": "increase power", "noob": "beginner", "afk": "away from keyboard",
    "camp": "staying in one place", "clutch": "critical moment win",
    "dps": "damage per second", "hp": "health points", "mp": "mana points",
    "respawn": "reappear", "lag": "network delay", "meta": "current best strategy",
    "tilt": "frustrated play", "p2w": "pay to win", "smurf": "high-level player using new account",
    "wallbang": "shooting through walls", "frag": "kill", "wallhack": "seeing through walls cheat",
    "rng": "random outcome", "xp": "experience points", "hitbox": "damage detection area",
    "strafe": "side movement", "rush": "fast attack", "one-tap": "single shot kill"
}

# Load datasets
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Dictionary to store replacement counts
jargon_replacement_counts = Counter()

def preprocess_text(text):
    if isinstance(text, float):  # Handling missing values
        return ""
    
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    replacement_count = 0  # Count replacements per review
    
    for slang, replacement in gaming_jargon_dict.items():
        if slang in text:
            replacement_count += text.count(slang)  # Count occurrences
            text = text.replace(slang, replacement)
    
    jargon_replacement_counts[replacement_count] += 1  # Store count
    return text.strip()

# Apply preprocessing
train_df["cleaned_review"] = train_df["review"].apply(preprocess_text)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text)

# Display jargon replacement statistics
print("🔹 Jargon Replacement Count per Review:", jargon_replacement_counts)
num_reviews_with_replacements = sum(1 for count in jargon_replacement_counts if count > 0)
print(f"🔹 Total Reviews with Jargon Replacements: {num_reviews_with_replacements}")

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_df["cleaned_review"])
X_test_tfidf = vectorizer.transform(test_df["cleaned_review"])

# Convert labels to integer
y_train = train_df["sentiment"].astype(int)
y_test = test_df["sentiment"].astype(int)

# Train Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict and Evaluate
y_pred = nb_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 Model Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


🔹 Jargon Replacement Count per Review: Counter({0: 9449, 1: 852, 2: 169, 3: 61, 4: 31, 5: 20, 6: 17, 7: 16, 8: 12, 9: 10, 10: 8, 12: 5, 13: 4, 21: 3, 11: 3, 22: 2, 19: 2, 14: 2, 29: 1, 45: 1, 48: 1, 15: 1, 106: 1, 32: 1, 17: 1})
🔹 Total Reviews with Jargon Replacements: 24

🔹 Model Accuracy: 0.8702576112412178

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.06      0.12       295
           1       0.87      1.00      0.93      1840

    accuracy                           0.87      2135
   macro avg       0.91      0.53      0.53      2135
weighted avg       0.88      0.87      0.82      2135

