In [1]:
# ----------------------------
# Step 1: Gaming Jargon Dictionary
# ----------------------------
gaming_jargon_dict = {
    "gg": "good game",
    "op": "overpowered",
    "nerf": "reduce power",
    "buff": "increase power",
    "noob": "beginner",
    "afk": "away from keyboard",
    "camp": "staying in one place",
    "clutch": "critical moment win",
    "dps": "damage per second",
    "hp": "health points",
    "mp": "mana points",
    "respawn": "reappear",
    "lag": "network delay",
    "meta": "current best strategy",
    "tilt": "frustrated play",
    "p2w": "pay to win",
    "smurf": "high-level player using new account",
    "wallbang": "shooting through walls",
    "frag": "kill",
    "wallhack": "seeing through walls cheat",
    "rng": "random outcome",
    "xp": "experience points",
    "hitbox": "damage detection area",
    "strafe": "side movement",
    "rush": "fast attack",
    "one-tap": "single shot kill"
}

# ----------------------------
# Step 2: Preprocessing Function
# ----------------------------
import re
from collections import Counter

jargon_replacement_counts = Counter()

def preprocess_text_with_count(text):
    if isinstance(text, float):  # Handle missing values
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation

    replacement_count = 0
    for slang, replacement in gaming_jargon_dict.items():
        if slang in text:
            replacement_count += text.count(slang)
            text = text.replace(slang, replacement)

    jargon_replacement_counts[replacement_count] += 1
    return text.strip()

# ----------------------------
# Step 3: Load and Clean Data
# ----------------------------
import pandas as pd

train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

train_df["cleaned_review"] = train_df["review"].apply(preprocess_text_with_count)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text_with_count)

# ----------------------------
# Step 4: Jargon Stats (Optional Logs)
# ----------------------------
print("🔹 Jargon Replacement Count per Review:", jargon_replacement_counts)

num_reviews_with_replacements = sum(1 for count in jargon_replacement_counts if count > 0)
print(f"🔹 Total Reviews with Jargon Replacements: {num_reviews_with_replacements}")

jargon_usage = Counter()
for review in train_df["review"].dropna().tolist() + test_df["review"].dropna().tolist():
    review = review.lower()
    for slang in gaming_jargon_dict:
        if slang in review:
            jargon_usage[slang] += review.count(slang)

print("🔹 Most Frequently Replaced Jargon Terms:", jargon_usage.most_common(10))

# ----------------------------
# Step 5: TF-IDF Vectorization
# ----------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_df["cleaned_review"])
X_test_tfidf = vectorizer.transform(test_df["cleaned_review"])

y_train = train_df["sentiment"].astype(int)
y_test = test_df["sentiment"].astype(int)

print("🔹 Shape of X_train:", X_train_tfidf.shape)
print("🔹 Shape of X_test:", X_test_tfidf.shape)
print("🔹 Length of y_train:", len(y_train))
print("🔹 Length of y_test:", len(y_test))

# ----------------------------
# ✅ Step 6: Logistic Regression Model
# ----------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train_tfidf, y_train)

# ----------------------------
# Step 7: Evaluate Logistic Regression
# ----------------------------
y_pred = logreg_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 Logistic Regression Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))


🔹 Jargon Replacement Count per Review: Counter({0: 9449, 1: 852, 2: 169, 3: 61, 4: 31, 5: 20, 6: 17, 7: 16, 8: 12, 9: 10, 10: 8, 12: 5, 13: 4, 21: 3, 11: 3, 22: 2, 19: 2, 14: 2, 29: 1, 45: 1, 48: 1, 15: 1, 106: 1, 32: 1, 17: 1})
🔹 Total Reviews with Jargon Replacements: 24
🔹 Most Frequently Replaced Jargon Terms: [('op', 978), ('mp', 897), ('gg', 356), ('xp', 279), ('lag', 50), ('rush', 31), ('camp', 27), ('noob', 17), ('hp', 13), ('respawn', 12)]
🔹 Shape of X_train: (8538, 5000)
🔹 Shape of X_test: (2135, 5000)
🔹 Length of y_train: 8538
🔹 Length of y_test: 2135

🔹 Logistic Regression Accuracy: 0.8964871194379391

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.33      0.46       295
           1       0.90      0.99      0.94      1840

    accuracy                           0.90      2135
   macro avg       0.86      0.66      0.70      2135
weighted avg       0.89      0.90      0.88      2135

