# **Imports**


In [250]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder,StandardScaler,MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score,classification_report

# **Data Inspecting**

### Read Data 

In [251]:
info_df = pd.read_csv('Classification_Dataset/info_base_games.csv')
gamalytic_df = pd.read_csv('Classification_Dataset/ms2_gamalytic_steam_games.csv')
dlcs_df = pd.read_csv('Classification_Dataset/dlcs.csv')
demos_df = pd.read_csv('Classification_Dataset/demos.csv')

  info_df = pd.read_csv('Classification_Dataset/info_base_games.csv')


### Renaming the id columns (so i can easliy merge the columns)


In [252]:
info_df.rename(columns={'appid': 'id'}, inplace=True)
gamalytic_df.rename(columns={'steamId': 'id'}, inplace=True)
dlcs_df.rename(columns={'base_appid': 'id'}, inplace=True)
demos_df.rename(columns={'full_game_appid': 'id'}, inplace=True)

### adjusting the id data type

In [253]:
info_df['id'] = info_df['id'].astype(str)
gamalytic_df['id'] = gamalytic_df['id'].astype(str)
dlcs_df['id'] = dlcs_df['id'].astype(str)
demos_df['id'] = demos_df['id'].astype(str)

### Merging the datasets on the id column


In [254]:
# Merge info_df and gamalytic_df
merged_df = pd.merge(info_df, gamalytic_df, on='id', how='inner')

# Aggregate DLCs (count per game)
dlc_count = dlcs_df.groupby('id').size().reset_index(name='dlc_count')

merged_df = pd.merge(merged_df, dlc_count, on='id', how='left')
merged_df['dlc_count'] = merged_df['dlc_count'].fillna(0)

# Add demo presence
merged_df['hasDemo'] = merged_df['id'].isin(demos_df['id']).astype(int)

In [255]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69428 entries, 0 to 69427
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   69428 non-null  object 
 1   name                 69428 non-null  object 
 2   metacritic           2933 non-null   object 
 3   steam_achievements   69428 non-null  bool   
 4   steam_trading_cards  69428 non-null  bool   
 5   workshop_support     69428 non-null  bool   
 6   genres               69324 non-null  object 
 7   achievements_total   37295 non-null  object 
 8   release_date         69426 non-null  object 
 9   supported_platforms  69428 non-null  object 
 10  price                69428 non-null  float64
 11  copiesSold           69428 non-null  int64  
 12  publisherClass       69428 non-null  object 
 13  reviewScore          69428 non-null  object 
 14  aiContent            0 non-null      float64
 15  dlc_count            69428 non-null 

In [256]:
merged_df.isnull().sum()

id                         0
name                       0
metacritic             66495
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                   104
achievements_total     32133
release_date               2
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              69428
dlc_count                  0
hasDemo                    0
dtype: int64

In [257]:
df = merged_df.copy()

# Splitting the data into training and testing sets

In [258]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save to CSV files
# train_df.to_csv('train.csv', index=False)
# test_df.to_csv('test.csv', index=False)

In [259]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55542 entries, 33136 to 15795
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   55542 non-null  object 
 1   name                 55542 non-null  object 
 2   metacritic           2332 non-null   object 
 3   steam_achievements   55542 non-null  bool   
 4   steam_trading_cards  55542 non-null  bool   
 5   workshop_support     55542 non-null  bool   
 6   genres               55456 non-null  object 
 7   achievements_total   29878 non-null  object 
 8   release_date         55541 non-null  object 
 9   supported_platforms  55542 non-null  object 
 10  price                55542 non-null  float64
 11  copiesSold           55542 non-null  int64  
 12  publisherClass       55542 non-null  object 
 13  reviewScore          55542 non-null  object 
 14  aiContent            0 non-null      float64
 15  dlc_count            55542 non-null  

# Data wrangling 


In [260]:
train_df['metacritic'] = pd.to_numeric(train_df['metacritic'], errors='coerce').fillna(0)
train_df['achievements_total'] = pd.to_numeric(train_df['achievements_total'], errors='coerce').fillna(0)
train_df['genres'] = train_df['genres'].fillna('Unknown')
train_df['release_date'] = train_df['release_date'].replace('Coming soon', pd.NA)
train_df['release_date'] = pd.to_datetime(train_df['release_date'], errors='coerce')
train_df['release_year'] = train_df['release_date'].dt.year.fillna(train_df['release_date'].dt.year.mode()[0])

In [261]:
train_df.isnull().sum()

id                         0
name                       0
metacritic                 0
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                     0
achievements_total         0
release_date             967
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              55542
dlc_count                  0
hasDemo                    0
release_year               0
dtype: int64

In [262]:
train_df.drop(columns=['aiContent'], inplace=True)

In [263]:
train_df.isnull().sum()

id                       0
name                     0
metacritic               0
steam_achievements       0
steam_trading_cards      0
workshop_support         0
genres                   0
achievements_total       0
release_date           967
supported_platforms      0
price                    0
copiesSold               0
publisherClass           0
reviewScore              0
dlc_count                0
hasDemo                  0
release_year             0
dtype: int64

In [264]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55542 entries, 33136 to 15795
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   55542 non-null  object        
 1   name                 55542 non-null  object        
 2   metacritic           55542 non-null  float64       
 3   steam_achievements   55542 non-null  bool          
 4   steam_trading_cards  55542 non-null  bool          
 5   workshop_support     55542 non-null  bool          
 6   genres               55542 non-null  object        
 7   achievements_total   55542 non-null  float64       
 8   release_date         54575 non-null  datetime64[ns]
 9   supported_platforms  55542 non-null  object        
 10  price                55542 non-null  float64       
 11  copiesSold           55542 non-null  int64         
 12  publisherClass       55542 non-null  object        
 13  reviewScore          55542 non-n

# Preprocessing

In [265]:
train_df = pd.get_dummies(train_df, columns=['publisherClass'], dtype=int)

In [266]:
df['isWindows'] = df['supported_platforms'].apply(lambda x: 1 if 'windows' in str(x).lower() else 0)
df['isMac'] = df['supported_platforms'].apply(lambda x: 1 if 'mac' in str(x).lower() else 0)
df['isLinux'] = df['supported_platforms'].apply(lambda x: 1 if 'linux' in str(x).lower() else 0)

In [267]:
train_df.reviewScore.value_counts()

reviewScore
Positive                   21501
Mixed                      11976
Mostly Positive             7481
Very Positive               6710
Negative                    4877
Mostly Negative             2348
Overwhelmingly Positive      621
Very Negative                 21
Overwhelmingly Negative        7
Name: count, dtype: int64

In [268]:
categorical_columns = ['steam_achievements', 'steam_trading_cards', 'workshop_support','reviewScore']
encoder = LabelEncoder()
for col in categorical_columns:
    train_df[col] = encoder.fit_transform(train_df[col])

In [269]:
train_df['copiesSold'].describe()

count    5.554200e+04
mean     8.402942e+04
std      1.733042e+06
min      0.000000e+00
25%      6.000000e+01
50%      4.540000e+02
75%      3.956000e+03
max      3.021580e+08
Name: copiesSold, dtype: float64

In [270]:
Q1 = train_df['copiesSold'].quantile(0.25)
Q3 = train_df['copiesSold'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = train_df[(train_df['copiesSold'] < lower_bound) | (train_df['copiesSold'] > upper_bound)]
print(lower_bound)
print(upper_bound)

-5784.0
9800.0


In [271]:
train_df.loc[train_df['copiesSold'] <= 0, 'copiesSold'] *= -1

In [272]:
#train_df = (train_df[train_df['copiesSold'] <= 1])

## Price outliers

In [273]:
Q1 = train_df['price'].quantile(0.25)
Q3 = train_df['price'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = train_df[(train_df['price'] < lower_bound) | (train_df['price'] > upper_bound)]
print(lower_bound)
print(upper_bound)

-12.51
23.490000000000002


In [274]:
train_df = train_df[(train_df['price'] <= 70) & (train_df['price'] > 0)]

In [275]:
train_df['genres_split'] = train_df['genres'].str.split(',')

exploded_genres = train_df.explode('genres_split')
exploded_genres['genres_split'] = exploded_genres['genres_split'].str.strip()
genre_counts = exploded_genres['genres_split'].value_counts()


rare_genres = genre_counts[genre_counts < 5000].index
exploded_genres['genres_split'] = exploded_genres['genres_split'].apply(lambda x: 'Other_genres' if x in rare_genres else x)

train_df['genres'] = (
    exploded_genres
    .groupby(exploded_genres.index)['genres_split']
    .apply(lambda x: ', '.join(sorted(set(x))))
)

train_df.drop(columns=['genres_split'],inplace=True)
train_df['genre_diversity'] = train_df['genres'].str.split(',').apply(len)

In [276]:
train_df['genres_list'] = train_df['genres'].str.split(', ')

mlb = MultiLabelBinarizer()

genres_encoded = mlb.fit_transform(list(train_df['genres_list']))
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_,index=train_df.index)

train_df = pd.concat([train_df, genres_df], axis=1)
train_df.drop(columns=['genres_list','genres'], inplace=True)

In [277]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47068 entries, 33136 to 860
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   id                       47068 non-null  object        
 1   name                     47068 non-null  object        
 2   metacritic               47068 non-null  float64       
 3   steam_achievements       47068 non-null  int64         
 4   steam_trading_cards      47068 non-null  int64         
 5   workshop_support         47068 non-null  int64         
 6   achievements_total       47068 non-null  float64       
 7   release_date             46235 non-null  datetime64[ns]
 8   supported_platforms      47068 non-null  object        
 9   price                    47068 non-null  float64       
 10  copiesSold               47068 non-null  int64         
 11  reviewScore              47068 non-null  int32         
 12  dlc_count                47068 non-

In [278]:
# scaler = StandardScaler()
# train_df['copiesSold'] = scaler.fit_transform(train_df[['copiesSold']])

In [279]:
columns = [
    'steam_achievements', 'steam_trading_cards', 
    'workshop_support', 'achievements_total', 
    'dlc_count', 'hasDemo', 'release_year', 
    'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist', 
    'publisherClass_Indie', 'genre_diversity', 'Action', 'Adventure', 'Casual', 
    'copiesSold','Indie', 'Other_genres', 'RPG', 'Simulation', 
    'Strategy'
]

In [280]:
X = train_df[columns]
y = train_df['reviewScore']

# Apply Chi-Squared
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X, y)
feature_scores = pd.DataFrame({'Feature': columns, 'Chi2 Score': chi2_selector.scores_})
print(feature_scores.sort_values(by='Chi2 Score', ascending=False))

                    Feature    Chi2 Score
15               copiesSold  3.475336e+10
3        achievements_total  3.336902e+05
9   publisherClass_Hobbyist  5.499679e+03
10     publisherClass_Indie  4.078569e+03
7         publisherClass_AA  2.740379e+03
1       steam_trading_cards  2.061800e+03
8        publisherClass_AAA  1.328631e+03
4                 dlc_count  9.627238e+02
0        steam_achievements  8.966288e+02
19               Simulation  6.135459e+02
5                   hasDemo  4.776628e+02
2          workshop_support  4.579380e+02
14                   Casual  3.447322e+02
18                      RPG  2.737686e+02
17             Other_genres  1.758857e+02
11          genre_diversity  1.079242e+02
16                    Indie  8.222606e+01
13                Adventure  7.816500e+01
20                 Strategy  6.504052e+01
6              release_year  2.877341e+01
12                   Action  1.172273e+01


In [281]:
top_features = feature_scores.sort_values(by='Chi2 Score', ascending=False).head(10)['Feature'].tolist()
X_selected = X[top_features]

In [282]:
top_features

['copiesSold',
 'achievements_total',
 'publisherClass_Hobbyist',
 'publisherClass_Indie',
 'publisherClass_AA',
 'steam_trading_cards',
 'publisherClass_AAA',
 'dlc_count',
 'steam_achievements',
 'Simulation']

In [283]:
X_chi2 = X.copy()
for col in X_chi2.columns:
    min_val = X_chi2[col].min()
    if min_val < 0:
        X_chi2[col] = X_chi2[col] - min_val

selector = SelectKBest(score_func=chi2, k=10)
X_selected = selector.fit_transform(X_chi2, y)

In [284]:
from catboost import CatBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

catboost_model = CatBoostClassifier(verbose=0)  # verbose=0 to suppress output
catboost_model.fit(X_train, y_train)

# Evaluation
y_pred = catboost_model.predict(X_test)
print("CatBoost Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

CatBoost Accuracy: 0.5813681750584236
              precision    recall  f1-score   support

           0       0.47      0.27      0.34      1957
           1       0.36      0.01      0.02       420
           2       0.50      0.05      0.09      1223
           3       0.69      0.66      0.68       982
           4       0.00      0.00      0.00         2
           5       0.46      0.12      0.19       111
           6       0.60      0.88      0.71      3670
           7       0.00      0.00      0.00         6
           8       0.56      0.95      0.70      1043

    accuracy                           0.58      9414
   macro avg       0.40      0.33      0.30      9414
weighted avg       0.55      0.58      0.51      9414



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
