In [1]:
# ----------------------------
# Gaming Jargon Replacement Dictionary
# ----------------------------
gaming_jargon_dict = {
    "gg": "good game", "op": "overpowered", "nerf": "reduce power", "buff": "increase power",
    "noob": "beginner", "afk": "away from keyboard", "camp": "staying in one place", "clutch": "critical moment win",
    "dps": "damage per second", "hp": "health points", "mp": "mana points", "respawn": "reappear",
    "lag": "network delay", "meta": "current best strategy", "tilt": "frustrated play", "p2w": "pay to win",
    "smurf": "high-level player using new account", "wallbang": "shooting through walls", "frag": "kill",
    "wallhack": "seeing through walls cheat", "rng": "random outcome", "xp": "experience points",
    "hitbox": "damage detection area", "strafe": "side movement", "rush": "fast attack", "one-tap": "single shot kill"
}

# ----------------------------
# Preprocessing Function
# ----------------------------
import re
from collections import Counter

jargon_replacement_counts = Counter()

def preprocess_text(text):
    if isinstance(text, float):  # Handle NaNs
        return ""
    
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    
    for slang, replacement in gaming_jargon_dict.items():
        if slang in text:
            jargon_replacement_counts[slang] += text.count(slang)
            text = text.replace(slang, replacement)
    
    return text.strip()

# ----------------------------
# Load Data
# ----------------------------
import pandas as pd

train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Apply preprocessing
train_df["cleaned_review"] = train_df["review"].apply(preprocess_text)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text)

# ----------------------------
# TF-IDF Vectorization
# ----------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df["cleaned_review"])
X_test = vectorizer.transform(test_df["cleaned_review"])

y_train = train_df["sentiment"].astype(int)
y_test = test_df["sentiment"].astype(int)

# ----------------------------
# K-Nearest Neighbors Model
# ----------------------------
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train KNN
knn_model = KNeighborsClassifier(n_neighbors=5, metric='cosine')  # cosine is better for text similarity
knn_model.fit(X_train, y_train)

# Predict and Evaluate
y_pred = knn_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n🔹 Model Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔹 Top Jargon Replacements:", jargon_replacement_counts.most_common(10))



🔹 Model Accuracy: 0.8936768149882904

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.38      0.50       295
           1       0.91      0.98      0.94      1840

    accuracy                           0.89      2135
   macro avg       0.81      0.68      0.72      2135
weighted avg       0.88      0.89      0.88      2135


🔹 Top Jargon Replacements: [('op', 978), ('mp', 870), ('gg', 356), ('xp', 279), ('lag', 50), ('rush', 31), ('camp', 27), ('noob', 17), ('hp', 13), ('respawn', 12)]
