In [93]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer, PolynomialFeatures, KBinsDiscretizer


In [94]:
# Load the raw dataset. Forward slashes keep this relative path valid on
# Windows, macOS and Linux alike; a backslash is not a path separator on POSIX,
# so the previous r"..\data\..." form only resolved on Windows.
data = pd.read_csv("../data/online_gaming_behavior_dataset.csv")

In [95]:
# Summarize column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40034 entries, 0 to 40033
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PlayerID                   40034 non-null  int64  
 1   Age                        40034 non-null  int64  
 2   Gender                     40034 non-null  object 
 3   Location                   40034 non-null  object 
 4   GameGenre                  40034 non-null  object 
 5   PlayTimeHours              40034 non-null  float64
 6   InGamePurchases            40034 non-null  int64  
 7   GameDifficulty             40034 non-null  object 
 8   SessionsPerWeek            40034 non-null  int64  
 9   AvgSessionDurationMinutes  40034 non-null  int64  
 10  PlayerLevel                40034 non-null  int64  
 11  AchievementsUnlocked       40034 non-null  int64  
 12  EngagementLevel            40034 non-null  object 
dtypes: float64(1), int64(7), object(5)
memory usag

In [96]:
# String-typed feature columns whose category frequencies are inspected below.
categorical_cols = ["Gender", "Location", "GameGenre", "GameDifficulty"]

# Print one frequency table per categorical column.
for col in categorical_cols:
    frequency_table = data[col].value_counts()
    print(f"\n{col} unique values:")
    print(frequency_table)


Gender unique values:
Gender
Male      23959
Female    16075
Name: count, dtype: int64

Location unique values:
Location
USA       16000
Europe    12004
Asia       8095
Other      3935
Name: count, dtype: int64

GameGenre unique values:
GameGenre
Sports        8048
Action        8039
Strategy      8012
Simulation    7983
RPG           7952
Name: count, dtype: int64

GameDifficulty unique values:
GameDifficulty
Easy      20015
Medium    12011
Hard       8008
Name: count, dtype: int64


In [97]:
# Target label to predict.
y = data["EngagementLevel"]
# Feature matrix: everything except the target and the PlayerID column.
X = data.drop(columns=["PlayerID", "EngagementLevel"])

In [98]:
# Relative frequency of each target class (proportions rather than raw counts).
y.value_counts(normalize=True)

EngagementLevel
Medium    0.483939
High      0.258181
Low       0.257881
Name: proportion, dtype: float64

In [99]:
# Stage 1: carve off the final test set (10% of all rows), stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

# Stage 2: split the remaining rows into train and validation sets.
# 0.2222 of the remaining 90% is roughly 20% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2222,
    stratify=y_temp,
    random_state=42,
)

print(X_train.shape, X_val.shape, X_test.shape)

(28024, 11) (8006, 11) (4004, 11)


In [100]:
# Re-attach the target column to each feature split so every CSV is self-contained.
train = pd.concat([X_train, y_train], axis=1)
val = pd.concat([X_val, y_val], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Persist the splits without the pandas index. Forward slashes keep these
# relative paths valid on Windows, macOS and Linux alike; the previous
# backslash-separated paths only resolved to ../data on Windows.
train.to_csv("../data/train.csv", index=False)
val.to_csv("../data/val.csv", index=False)
test.to_csv("../data/test.csv", index=False)

In [101]:
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends four derived columns to a DataFrame.

    Added columns:
        WeeklyPlayMinutes    = SessionsPerWeek * AvgSessionDurationMinutes
        ProgressionIntensity = PlayerLevel * AchievementsUnlocked
        Efficiency           = AchievementsUnlocked / (PlayerLevel + 1)
        SpendPerHour         = InGamePurchases / (PlayTimeHours + 1)
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns appended.

        The input frame is not modified.
        """
        level = X["PlayerLevel"]
        achievements = X["AchievementsUnlocked"]
        return X.assign(
            # Interaction features
            WeeklyPlayMinutes=X["SessionsPerWeek"] * X["AvgSessionDurationMinutes"],
            ProgressionIntensity=level * achievements,
            # Ratios; the +1 in each denominator avoids dividing by zero.
            Efficiency=achievements / (level + 1),
            SpendPerHour=X["InGamePurchases"] / (X["PlayTimeHours"] + 1),
        )


In [102]:
# Columns routed to the numeric pipeline, in order.
numeric_features = [
    # columns present in the raw dataset
    "Age",
    "PlayTimeHours",
    "InGamePurchases",
    "SessionsPerWeek",
    "AvgSessionDurationMinutes",
    "PlayerLevel",
    "AchievementsUnlocked",
    # columns appended by FeatureEngineer.transform
    "WeeklyPlayMinutes",
    "ProgressionIntensity",
    "Efficiency",
    "SpendPerHour",
]

# Categorical columns, grouped by the encoder each group is sent to.
nominal_features = ["Location", "GameGenre"]
ordinal_features = ["GameDifficulty"]
binary_features = ["Gender"]

In [103]:
# Categorical encoders.
# - binary: one-hot, dropping one column when the feature has exactly two categories
# - nominal: one-hot, dropping the first category of each feature
# - ordinal: integer codes following the explicit Easy < Medium < Hard order
binary_pipeline = OneHotEncoder(handle_unknown="ignore", drop="if_binary")
nominal_pipeline = OneHotEncoder(handle_unknown="ignore", drop="first")
ordinal_pipeline = OrdinalEncoder(categories=[["Easy", "Medium", "Hard"]])

# Element-wise log(1 + x); output feature names mirror the input names.
log_transformer = FunctionTransformer(func=np.log1p, feature_names_out="one-to-one")

# Numeric branch: log1p -> degree-2 polynomial expansion (no bias column) -> standardize.
numeric_pipeline = Pipeline(
    [
        ("log", log_transformer),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)


In [104]:
# Quantile-based discretizer: 5 bins, emitted as a dense one-hot block.
age_binner = KBinsDiscretizer(
    strategy="quantile",
    n_bins=5,
    encode="onehot-dense",
)

In [105]:
# Route each column group to its transformer. "Age" is used twice: once in the
# numeric branch and once through the binning branch. Columns not listed in any
# branch are dropped.
preprocessor = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("bin", binary_pipeline, binary_features),
        ("ord", ordinal_pipeline, ordinal_features),
        ("nom", nominal_pipeline, nominal_features),
        ("age_bin", age_binner, ["Age"]),
    ],
)



# Logistic Regression

In [106]:
from sklearn.linear_model import LogisticRegression

In [107]:
# Logistic-regression pipeline: feature engineering -> preprocessing -> classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object directly would let any other pipeline built from it refit (and
# overwrite) the transformer this model relies on.
model = Pipeline(steps=[
    ("engineer", FeatureEngineer()),
    ("preprocess", clone(preprocessor)),
    ("classifier", LogisticRegression(
        max_iter=1000,
        solver="lbfgs",          # supports multinomial automatically
        class_weight="balanced"  # handle imbalance
    ))
])

In [108]:
# Fit on the training split only, then report accuracy on the held-out splits.
model.fit(X_train, y_train)

val_accuracy = model.score(X_val, y_val)
test_accuracy = model.score(X_test, y_test)

print("Validation Accuracy:", val_accuracy)
print("Test Accuracy:", test_accuracy)



Validation Accuracy: 0.8963277541843617
Test Accuracy: 0.8951048951048951


In [110]:
# Inspect what the fitted logistic-regression pipeline feeds its classifier.
engineer_step = model.named_steps["engineer"]
preprocess_step = model.named_steps["preprocess"]

# Run the full feature matrix through both fitted transformation stages.
X_engineered = engineer_step.transform(X)
X_transformed = preprocess_step.transform(X_engineered)

# Column names produced by the fitted ColumnTransformer.
feature_names = preprocess_step.get_feature_names_out()
print("Transformed feature names:\n", feature_names)

# Wrap the transformed matrix in a labelled frame and append the target
# positionally (via .values, so no index alignment takes place).
X_transformed_df = pd.DataFrame(X_transformed, columns=feature_names)
X_transformed_df["EngagementLevel"] = y.values

# Preview the first 10 rows.
X_transformed_df.head(10)

Transformed feature names:
 ['num__Age' 'num__PlayTimeHours' 'num__InGamePurchases'
 'num__SessionsPerWeek' 'num__AvgSessionDurationMinutes'
 'num__PlayerLevel' 'num__AchievementsUnlocked' 'num__WeeklyPlayMinutes'
 'num__ProgressionIntensity' 'num__Efficiency' 'num__SpendPerHour'
 'num__Age^2' 'num__Age PlayTimeHours' 'num__Age InGamePurchases'
 'num__Age SessionsPerWeek' 'num__Age AvgSessionDurationMinutes'
 'num__Age PlayerLevel' 'num__Age AchievementsUnlocked'
 'num__Age WeeklyPlayMinutes' 'num__Age ProgressionIntensity'
 'num__Age Efficiency' 'num__Age SpendPerHour' 'num__PlayTimeHours^2'
 'num__PlayTimeHours InGamePurchases' 'num__PlayTimeHours SessionsPerWeek'
 'num__PlayTimeHours AvgSessionDurationMinutes'
 'num__PlayTimeHours PlayerLevel'
 'num__PlayTimeHours AchievementsUnlocked'
 'num__PlayTimeHours WeeklyPlayMinutes'
 'num__PlayTimeHours ProgressionIntensity' 'num__PlayTimeHours Efficiency'
 'num__PlayTimeHours SpendPerHour' 'num__InGamePurchases^2'
 'num__InGamePurchases Se

Unnamed: 0,num__Age,num__PlayTimeHours,num__InGamePurchases,num__SessionsPerWeek,num__AvgSessionDurationMinutes,num__PlayerLevel,num__AchievementsUnlocked,num__WeeklyPlayMinutes,num__ProgressionIntensity,num__Efficiency,...,nom__GameGenre_RPG,nom__GameGenre_Simulation,nom__GameGenre_Sports,nom__GameGenre_Strategy,age_bin__Age_0.0,age_bin__Age_1.0,age_bin__Age_2.0,age_bin__Age_3.0,age_bin__Age_4.0,EngagementLevel
0,1.024963,0.666899,-0.502296,-0.212689,0.458426,0.842138,0.32917,0.209186,0.738879,-0.52497,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,Medium
1,-0.143287,-0.651336,-0.502296,-0.40782,0.878029,-1.381652,-0.648693,0.270307,-1.114596,0.151556,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,Medium
2,-0.953767,-0.182697,-0.502296,0.910503,0.857608,-0.093867,0.874341,0.937382,0.533393,0.463603,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,High
3,0.412853,-0.706535,1.990858,0.238808,0.109964,0.46518,1.026138,0.305481,0.935049,0.126012,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Medium
4,0.238501,0.607653,-0.502296,-1.285239,0.739921,1.055853,0.760568,-0.315589,1.109824,-0.415521,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Medium
5,0.577775,0.967442,-0.502296,-1.285239,0.039936,0.766486,0.189798,-0.593525,0.61457,-0.554749,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Low
6,-0.57979,0.025046,-0.502296,-1.798497,-0.658301,-1.200958,-2.125698,-1.268564,-2.024625,-0.805138,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Low
7,-0.57979,-0.907424,-0.502296,0.359456,-0.717121,-0.388456,0.238179,0.035148,-0.005196,0.138368,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Medium
8,0.657009,0.806964,-0.502296,-0.40782,0.360835,-0.56915,0.874341,0.064586,0.263357,0.941433,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Medium
9,0.657009,1.164732,-0.502296,0.664731,0.271698,1.103705,0.730252,0.583446,1.118734,-0.453004,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,High


# Decision Tree

In [111]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree pipeline: feature engineering -> preprocessing -> classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object directly would refit the transformer held by every other pipeline
# built from it.
dt_model = Pipeline(steps=[
    ("engineer", FeatureEngineer()),
    ("preprocess", clone(preprocessor)),
    ("classifier", DecisionTreeClassifier(
        max_depth=None, random_state=42, class_weight="balanced"
    ))
])

# Fit on the training split, then report accuracy on the held-out splits.
dt_model.fit(X_train, y_train)
print("Decision Tree Validation:", dt_model.score(X_val, y_val))
print("Decision Tree Test:", dt_model.score(X_test, y_test))



Decision Tree Validation: 0.8528603547339495
Decision Tree Test: 0.8528971028971029


# k-Nearest Neighbors (kNN)

In [112]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours pipeline: feature engineering -> preprocessing -> classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object directly would refit the transformer held by every other pipeline
# built from it.
knn_model = Pipeline(steps=[
    ("engineer", FeatureEngineer()),
    ("preprocess", clone(preprocessor)),
    ("classifier", KNeighborsClassifier(
        n_neighbors=5, weights="distance"
    ))
])

# Fit on the training split, then report accuracy on the held-out splits.
knn_model.fit(X_train, y_train)
print("kNN Validation:", knn_model.score(X_val, y_val))
print("kNN Test:", knn_model.score(X_test, y_test))



kNN Validation: 0.8426180364726455
kNN Test: 0.8496503496503497


# Naive Bayes

In [113]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes pipeline: feature engineering -> preprocessing -> classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object directly would refit the transformer held by every other pipeline
# built from it.
nb_model = Pipeline(steps=[
    ("engineer", FeatureEngineer()),
    ("preprocess", clone(preprocessor)),
    ("classifier", GaussianNB())
])

# Fit on the training split, then report accuracy on the held-out splits.
nb_model.fit(X_train, y_train)
print("Naive Bayes Validation:", nb_model.score(X_val, y_val))
print("Naive Bayes Test:", nb_model.score(X_test, y_test))



Naive Bayes Validation: 0.7867849113165126
Naive Bayes Test: 0.788961038961039


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest pipeline: feature engineering -> preprocessing -> classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object directly would refit the transformer held by every other pipeline
# built from it.
rf_model = Pipeline(steps=[
    ("engineer", FeatureEngineer()),
    ("preprocess", clone(preprocessor)),
    ("classifier", RandomForestClassifier(
        n_estimators=200, max_depth=None,
        random_state=42, class_weight="balanced"
    ))
])

# Fit on the training split, then report accuracy on the held-out splits.
rf_model.fit(X_train, y_train)
print("Random Forest Validation:", rf_model.score(X_val, y_val))
print("Random Forest Test:", rf_model.score(X_test, y_test))



# XGB Classifier

In [None]:
from xgboost import XGBClassifier

# XGBoost pipeline: feature engineering -> preprocessing -> classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing the shared `preprocessor`
# object directly would refit the transformer held by every other pipeline
# built from it.
xgb_model = Pipeline(steps=[
    ("engineer", FeatureEngineer()),
    ("preprocess", clone(preprocessor)),
    ("classifier", XGBClassifier(
        n_estimators=300, learning_rate=0.1,
        max_depth=6, subsample=0.8, colsample_bytree=0.8,
        random_state=42, eval_metric="mlogloss"
    ))
])

# XGBClassifier expects integer class labels 0..n_classes-1; recent xgboost
# releases raise "Invalid classes inferred from unique values of `y`" when
# given the raw string labels. Encode the target with an encoder fitted on the
# training labels only, and reuse it for the validation and test labels.
# Predictions from xgb_model are integer codes; map them back to the original
# labels with xgb_label_encoder.inverse_transform(...).
xgb_label_encoder = LabelEncoder()
y_train_encoded = xgb_label_encoder.fit_transform(y_train)
y_val_encoded = xgb_label_encoder.transform(y_val)
y_test_encoded = xgb_label_encoder.transform(y_test)

xgb_model.fit(X_train, y_train_encoded)
print("XGBoost Validation:", xgb_model.score(X_val, y_val_encoded))
print("XGBoost Test:", xgb_model.score(X_test, y_test_encoded))