In [1]:
gaming_jargon_dict = {
    "gg": "good game",
    "op": "overpowered",
    "nerf": "reduce power",
    "buff": "increase power",
    "noob": "beginner",
    "afk": "away from keyboard",
    "camp": "staying in one place",
    "clutch": "critical moment win",
    "dps": "damage per second",
    "hp": "health points",
    "mp": "mana points",
    "respawn": "reappear",
    "lag": "network delay",
    "meta": "current best strategy",
    "tilt": "frustrated play",
    "p2w": "pay to win",
    "smurf": "high-level player using new account",
    "wallbang": "shooting through walls",
    "frag": "kill",
    "wallhack": "seeing through walls cheat",
    "rng": "random outcome",
    "xp": "experience points",
    "hitbox": "damage detection area",
    "strafe": "side movement",
    "rush": "fast attack",
    "one-tap": "single shot kill"
}


In [2]:
import re

def preprocess_text(text):
    if isinstance(text, float):  # Handling missing values
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation

    # Replace gaming jargon
    for slang, replacement in gaming_jargon_dict.items():
        text = text.replace(slang, replacement)

    return text.strip()


In [5]:
import pandas as pd

In [7]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")


In [8]:
# Apply preprocessing to reviews in both train and test sets
train_df["cleaned_review"] = train_df["review"].apply(preprocess_text)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text)

In [10]:
from collections import Counter
import re

# Dictionary to store replacement counts
jargon_replacement_counts = Counter()

def preprocess_text_with_count(text):
    if isinstance(text, float):  # Handling missing values
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    
    replacement_count = 0  # Count replacements per review

    for slang, replacement in gaming_jargon_dict.items():
        if slang in text:
            replacement_count += text.count(slang)  # Count occurrences
            text = text.replace(slang, replacement)

    jargon_replacement_counts[replacement_count] += 1  # Store count

    return text.strip()

# Apply function to both datasets
train_df["cleaned_review"] = train_df["review"].apply(preprocess_text_with_count)
test_df["cleaned_review"] = test_df["review"].apply(preprocess_text_with_count)

# Display the jargon replacement statistics
print("🔹 Jargon Replacement Count per Review:", jargon_replacement_counts)


🔹 Jargon Replacement Count per Review: Counter({0: 9449, 1: 852, 2: 169, 3: 61, 4: 31, 5: 20, 6: 17, 7: 16, 8: 12, 9: 10, 10: 8, 12: 5, 13: 4, 21: 3, 11: 3, 22: 2, 19: 2, 14: 2, 29: 1, 45: 1, 48: 1, 15: 1, 106: 1, 32: 1, 17: 1})


In [11]:
num_reviews_with_replacements = sum(1 for count in jargon_replacement_counts if count > 0)
print(f"🔹 Total Reviews with Jargon Replacements: {num_reviews_with_replacements}")


🔹 Total Reviews with Jargon Replacements: 24


In [12]:
jargon_usage = Counter()

for review in train_df["review"].dropna().tolist() + test_df["review"].dropna().tolist():
    review = review.lower()
    for slang in gaming_jargon_dict.keys():
        if slang in review:
            jargon_usage[slang] += review.count(slang)

print("🔹 Most Frequently Replaced Jargon Terms:", jargon_usage.most_common(10))


🔹 Most Frequently Replaced Jargon Terms: [('op', 978), ('mp', 897), ('gg', 356), ('xp', 279), ('lag', 50), ('rush', 31), ('camp', 27), ('noob', 17), ('hp', 13), ('respawn', 12)]


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Limit vocabulary size to 5000 words

# Fit on training data and transform both train & test sets
X_train_tfidf = vectorizer.fit_transform(train_df["cleaned_review"])  
X_test_tfidf = vectorizer.transform(test_df["cleaned_review"])

# Convert labels to integer format
y_train = train_df["sentiment"].astype(int)
y_test = test_df["sentiment"].astype(int)

# Debugging checks
print("🔹 Shape of X_train:", X_train_tfidf.shape)  # Should match number of rows in train_df
print("🔹 Shape of X_test:", X_test_tfidf.shape)    # Should match number of rows in test_df
print("🔹 Length of y_train:", len(y_train))
print("🔹 Length of y_test:", len(y_test))


🔹 Shape of X_train: (8538, 5000)
🔹 Shape of X_test: (2135, 5000)
🔹 Length of y_train: 8538
🔹 Length of y_test: 2135


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=42)

# Train the model
rf_model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\n🔹 Model Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))



🔹 Model Accuracy: 0.8637002341920375

🔹 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.01      0.03       295
           1       0.86      1.00      0.93      1840

    accuracy                           0.86      2135
   macro avg       0.93      0.51      0.48      2135
weighted avg       0.88      0.86      0.80      2135

