# **Imports**


In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder,StandardScaler,MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score,classification_report,mean_squared_error
import pickle 

# **Data Inspecting**

### Read Data 

In [105]:
info_df = pd.read_csv('Classification_Dataset/info_base_games.csv')
gamalytic_df = pd.read_csv('Classification_Dataset/ms2_gamalytic_steam_games.csv')
dlcs_df = pd.read_csv('Classification_Dataset/dlcs.csv')
demos_df = pd.read_csv('Classification_Dataset/demos.csv')

  info_df = pd.read_csv('Classification_Dataset/info_base_games.csv')


### Renaming the id columns (so i can easliy merge the columns)


In [106]:
info_df.rename(columns={'appid': 'id'}, inplace=True)
gamalytic_df.rename(columns={'steamId': 'id'}, inplace=True)
dlcs_df.rename(columns={'base_appid': 'id'}, inplace=True)
demos_df.rename(columns={'full_game_appid': 'id'}, inplace=True)

### adjusting the id data type

In [107]:
info_df['id'] = info_df['id'].astype(str)
gamalytic_df['id'] = gamalytic_df['id'].astype(str)
dlcs_df['id'] = dlcs_df['id'].astype(str)
demos_df['id'] = demos_df['id'].astype(str)

### Merging the datasets on the id column


In [108]:
# Merge info_df and gamalytic_df
merged_df = pd.merge(info_df, gamalytic_df, on='id', how='inner')

# Aggregate DLCs (count per game)
dlc_count = dlcs_df.groupby('id').size().reset_index(name='dlc_count')

merged_df = pd.merge(merged_df, dlc_count, on='id', how='left')
merged_df['dlc_count'] = merged_df['dlc_count'].fillna(0)

# Add demo presence
merged_df['hasDemo'] = merged_df['id'].isin(demos_df['id']).astype(int)

In [109]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69428 entries, 0 to 69427
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   69428 non-null  object 
 1   name                 69428 non-null  object 
 2   metacritic           2933 non-null   object 
 3   steam_achievements   69428 non-null  bool   
 4   steam_trading_cards  69428 non-null  bool   
 5   workshop_support     69428 non-null  bool   
 6   genres               69324 non-null  object 
 7   achievements_total   37295 non-null  object 
 8   release_date         69426 non-null  object 
 9   supported_platforms  69428 non-null  object 
 10  price                69428 non-null  float64
 11  copiesSold           69428 non-null  int64  
 12  publisherClass       69428 non-null  object 
 13  reviewScore          69428 non-null  object 
 14  aiContent            0 non-null      float64
 15  dlc_count            69428 non-null 

In [110]:
merged_df.isnull().sum()

id                         0
name                       0
metacritic             66495
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                   104
achievements_total     32133
release_date               2
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              69428
dlc_count                  0
hasDemo                    0
dtype: int64

In [111]:
df = merged_df.copy()

# Splitting the data into training and testing sets

In [112]:
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# # Save to CSV files
# # train_df.to_csv('train.csv', index=False)
# # test_df.to_csv('test.csv', index=False)

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69428 entries, 0 to 69427
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   69428 non-null  object 
 1   name                 69428 non-null  object 
 2   metacritic           2933 non-null   object 
 3   steam_achievements   69428 non-null  bool   
 4   steam_trading_cards  69428 non-null  bool   
 5   workshop_support     69428 non-null  bool   
 6   genres               69324 non-null  object 
 7   achievements_total   37295 non-null  object 
 8   release_date         69426 non-null  object 
 9   supported_platforms  69428 non-null  object 
 10  price                69428 non-null  float64
 11  copiesSold           69428 non-null  int64  
 12  publisherClass       69428 non-null  object 
 13  reviewScore          69428 non-null  object 
 14  aiContent            0 non-null      float64
 15  dlc_count            69428 non-null 

# Data wrangling 


In [114]:
df['metacritic'] = pd.to_numeric(df['metacritic'], errors='coerce').fillna(0)
df['achievements_total'] = pd.to_numeric(df['achievements_total'], errors='coerce').fillna(0)
df['genres'] = df['genres'].fillna('Unknown')
df['release_date'] = df['release_date'].replace('Coming soon', pd.NA)
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year.fillna(df['release_date'].dt.year.mode()[0])

In [115]:
df.isnull().sum()

id                         0
name                       0
metacritic                 0
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                     0
achievements_total         0
release_date            1223
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              69428
dlc_count                  0
hasDemo                    0
release_year               0
dtype: int64

In [116]:
df.isnull().sum()

id                         0
name                       0
metacritic                 0
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                     0
achievements_total         0
release_date            1223
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              69428
dlc_count                  0
hasDemo                    0
release_year               0
dtype: int64

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69428 entries, 0 to 69427
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   69428 non-null  object        
 1   name                 69428 non-null  object        
 2   metacritic           69428 non-null  float64       
 3   steam_achievements   69428 non-null  bool          
 4   steam_trading_cards  69428 non-null  bool          
 5   workshop_support     69428 non-null  bool          
 6   genres               69428 non-null  object        
 7   achievements_total   69428 non-null  float64       
 8   release_date         68205 non-null  datetime64[ns]
 9   supported_platforms  69428 non-null  object        
 10  price                69428 non-null  float64       
 11  copiesSold           69428 non-null  int64         
 12  publisherClass       69428 non-null  object        
 13  reviewScore          69428 non-

# Preprocessing

In [118]:
from datetime import datetime
current_year = datetime.now().year
df['game_age'] = current_year - df['release_year']

# 2. Price Categories
df['price_category'] = pd.cut(
    df['price'],
    bins=[-1, 0, 5, 15, 30, 60, float('inf')],
    labels=['Free', 'Low', 'Medium', 'High', 'Very High', 'Premium']
)

In [119]:
df['isWindows'] = df['supported_platforms'].apply(lambda x: 1 if 'windows' in str(x).lower() else 0)
df['isMac'] = df['supported_platforms'].apply(lambda x: 1 if 'mac' in str(x).lower() else 0)
df['isLinux'] = df['supported_platforms'].apply(lambda x: 1 if 'linux' in str(x).lower() else 0)
df['num_platforms'] = df[['isWindows', 'isMac', 'isLinux']].sum(axis=1)

In [120]:
df['price_per_dlc'] = df['price'] / (df['dlc_count'] + 1)
df['achievements_per_dlc'] = df['achievements_total'] / (df['dlc_count'] + 1)

# 5. Log Transformation
df['log_copiesSold'] = np.log1p(df['copiesSold'].clip(lower=0))
df['log_price'] = np.log1p(df['price'].clip(lower=0))


In [121]:
def cap_outliers(series, lower_quantile=0.01, upper_quantile=0.99):
    lower = series.quantile(lower_quantile)
    upper = series.quantile(upper_quantile)
    return series.clip(lower=lower, upper=upper)

df['copiesSold'] = cap_outliers(df['copiesSold'])
df['price'] = cap_outliers(df['price'])
df['achievements_total'] = cap_outliers(df['achievements_total'])
df['dlc_count'] = cap_outliers(df['dlc_count'])

In [122]:
categorical_columns = ['steam_achievements', 'steam_trading_cards', 'workshop_support','reviewScore']
encoder = LabelEncoder()
for col in categorical_columns:
    df[col] = encoder.fit_transform(df[col])

### Copiessold outliers 

In [123]:
df['copiesSold'].describe()

count    6.942800e+04
mean     3.494762e+04
std      1.654204e+05
min      1.000000e+00
25%      6.000000e+01
50%      4.570000e+02
75%      3.955000e+03
max      1.352608e+06
Name: copiesSold, dtype: float64

In [124]:
Q1 = df['copiesSold'].quantile(0.25)
Q3 = df['copiesSold'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = df[(df['copiesSold'] < lower_bound) | (df['copiesSold'] > upper_bound)]
print(lower_bound)
print(upper_bound)

-5782.5
9797.5


In [125]:
df.loc[df['copiesSold'] <= 0, 'copiesSold'] *= -1

In [126]:
#train_df = (train_df[train_df['copiesSold'] <= 1])

## Price outliers

In [127]:
df.price.describe()

count    69428.000000
mean         7.342479
std          8.303669
min          0.000000
25%          0.990000
50%          4.990000
75%          9.990000
max         44.990000
Name: price, dtype: float64

In [128]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
print(lower_bound)
print(upper_bound)

-12.51
23.490000000000002


In [129]:
df[df['price'] > 200 ]

Unnamed: 0,id,name,metacritic,steam_achievements,steam_trading_cards,workshop_support,genres,achievements_total,release_date,supported_platforms,...,game_age,price_category,isWindows,isMac,isLinux,num_platforms,price_per_dlc,achievements_per_dlc,log_copiesSold,log_price


In [130]:
df = df[(df['price'] <= 200)]

In [131]:
df['genres_split'] = df['genres'].str.split(',')

exploded_genres = df.explode('genres_split')
exploded_genres['genres_split'] = exploded_genres['genres_split'].str.strip()
genre_counts = exploded_genres['genres_split'].value_counts()


rare_genres = genre_counts[genre_counts < 5000].index
exploded_genres['genres_split'] = exploded_genres['genres_split'].apply(lambda x: 'Other_genres' if x in rare_genres else x)

df['genres'] = (
    exploded_genres
    .groupby(exploded_genres.index)['genres_split']
    .apply(lambda x: ', '.join(sorted(set(x))))
)

df.drop(columns=['genres_split'],inplace=True)
df['genre_diversity'] = df['genres'].str.split(',').apply(len)

In [132]:
df['genres_list'] = df['genres'].str.split(', ')

mlb = MultiLabelBinarizer()

genres_encoded = mlb.fit_transform(list(df['genres_list']))
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_,index=df.index)

df = pd.concat([df, genres_df], axis=1)
df.drop(columns=['genres_list','genres'], inplace=True)

In [133]:
temp_encoder = LabelEncoder()
df['temp_reviewScore'] = temp_encoder.fit_transform(df['reviewScore'])
publisher_success = df.groupby('publisherClass')['temp_reviewScore'].mean().to_dict()
df['publisher_success_score'] = df['publisherClass'].map(publisher_success)
df.drop(columns=['temp_reviewScore'], inplace=True)

# 7. NEW: Platform Exclusivity
df['is_exclusive'] = (df['num_platforms'] == 1).astype(int)

# 8. NEW: Release Seasonality
df['release_month'] = df['release_date'].dt.month.fillna(df['release_date'].dt.month.mode()[0])
df['release_quarter'] = df['release_date'].dt.quarter.fillna(df['release_date'].dt.quarter.mode()[0])

# 9. NEW: Price-to-CopiesSold Ratio
df['price_to_copies_ratio'] = df['price'] / (df['log_copiesSold'] + 1e-6)

# 10. NEW: Game Complexity Proxy
df['game_complexity'] = (
    df['achievements_total'] / (df['achievements_total'].max() + 1e-6) +
    df['dlc_count'] / (df['dlc_count'].max() + 1e-6) +
    df['genre_diversity'] / (df['genre_diversity'].max() + 1e-6)
)

# 11. NEW: Metacritic Binary
df['has_metacritic'] = (df['metacritic'] > 0).astype(int)

In [134]:
df = pd.get_dummies(df, columns=['publisherClass', 'price_category'], dtype=int)

In [135]:
genre_cols = mlb.classes_
genre_popularity = {}
for genre in genre_cols:
    mask = df[genre] == 1
    genre_popularity[genre] = df[mask]['copiesSold'].mean() if mask.sum() > 0 else 0
df['genre_popularity'] = df[genre_cols].dot(list(genre_popularity.values())) / df[genre_cols].sum(axis=1)

# NEW: Free-to-Play Interaction
df['free_to_play_interaction'] = df['price_category_Free'] * df['Free To Play']

In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69428 entries, 0 to 69427
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        69428 non-null  object        
 1   name                      69428 non-null  object        
 2   metacritic                69428 non-null  float64       
 3   steam_achievements        69428 non-null  int64         
 4   steam_trading_cards       69428 non-null  int64         
 5   workshop_support          69428 non-null  int64         
 6   achievements_total        69428 non-null  float64       
 7   release_date              68205 non-null  datetime64[ns]
 8   supported_platforms       69428 non-null  object        
 9   price                     69428 non-null  float64       
 10  copiesSold                69428 non-null  float64       
 11  reviewScore               69428 non-null  int64         
 12  aiContent         

In [137]:
features = [
    'metacritic', 'steam_achievements', 'steam_trading_cards', 'workshop_support',
    'achievements_total', 'dlc_count', 'hasDemo', 'release_year', 'game_age',
    'isWindows', 'isMac', 'isLinux', 'num_platforms', 'price_per_dlc',
    'achievements_per_dlc', 'log_copiesSold', 'log_price', 'genre_diversity',
    'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist',
    'publisherClass_Indie', 'price_category_Free', 'price_category_Low',
    'price_category_Medium', 'price_category_High', 'price_category_Very High',
    'price_category_Premium', 'Action', 'Adventure', 'Casual', 'Early Access',
    'Free To Play', 'Indie', 'Other_genres', 'RPG', 'Simulation', 'Strategy',
    'publisher_success_score', 'is_exclusive', 'release_month', 'release_quarter',
    'price_to_copies_ratio', 'game_complexity', 'has_metacritic',
    'genre_popularity', 'free_to_play_interaction'
]

In [138]:
len(features)

47

### Chi_squared 

In [139]:
X = df[features]
y = df['reviewScore']
scaler = StandardScaler()
# numerical_cols = [
#     'metacritic', 'achievements_total', 'dlc_count', 'release_year', 'game_age',
#     'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold',
#     'log_price', 'genre_diversity'
# ]
numerical_cols = [
    'metacritic', 'achievements_total', 'dlc_count', 'release_year', 'game_age',
    'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold',
    'log_price', 'genre_diversity', 'publisher_success_score', 'release_month',
    'release_quarter', 'price_to_copies_ratio', 'game_complexity',
    'genre_popularity'
]
X_scaled = X.copy()
X_scaled[numerical_cols] = scaler.fit_transform(X[numerical_cols])
# Apply Chi-Squared
# chi2_selector = SelectKBest(score_func=chi2, k='all')
# chi2_selector.fit(X, y)
# feature_scores = pd.DataFrame({'Feature': columns, 'Chi2 Score': chi2_selector.scores_})
# print(feature_scores.sort_values(by='Chi2 Score', ascending=False))

In [140]:
for col in numerical_cols:
    min_val = X_scaled[col].min()
    if min_val < 0:
        X_scaled[col] = X_scaled[col] - min_val + 1e-6  # Small constant to avoid zero
    X_scaled[col] = X_scaled[col].clip(lower=0)  # Ensure no negatives

# Handle any infinities or NaNs
X_scaled = X_scaled.replace([np.inf, -np.inf], 0).fillna(0)

# Apply SMOTE to the top 5 classes
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Feature Selection with Chi-Squared
chi2_selector = SelectKBest(score_func=chi2, k = 30)
X_selected = chi2_selector.fit_transform(X_scaled, y)
selected_features = [features[i] for i in chi2_selector.get_support(indices=True)]
print("Selected Features:", selected_features)
print(len(selected_features))

Selected Features: ['metacritic', 'steam_achievements', 'steam_trading_cards', 'workshop_support', 'achievements_total', 'dlc_count', 'hasDemo', 'release_year', 'game_age', 'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold', 'log_price', 'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist', 'publisherClass_Indie', 'price_category_Free', 'price_category_Low', 'price_category_High', 'price_category_Very High', 'Free To Play', 'Simulation', 'publisher_success_score', 'price_to_copies_ratio', 'game_complexity', 'has_metacritic', 'genre_popularity', 'free_to_play_interaction']
30


### get top features score

In [141]:
feature_scores = pd.DataFrame({
    'Feature': features,
    'Chi2 Score': chi2_selector.scores_
})

# Sort features by Chi2 Score in descending order
top_features = feature_scores.sort_values(by='Chi2 Score', ascending=False)

# Print the top features
print("Top Features Ordered by Chi-Squared Score:\n", top_features)

# Optionally, get the top N features (e.g., top 20)
N = 20
print(f"\nTop {N} Features:\n", top_features.head(N))

Top Features Ordered by Chi-Squared Score:
                      Feature    Chi2 Score
0                 metacritic  34354.791387
38   publisher_success_score  20234.418942
15            log_copiesSold  19157.982918
4         achievements_total   6911.607805
42     price_to_copies_ratio   6599.498451
44            has_metacritic   6136.655316
5                  dlc_count   6076.778831
8                   game_age   5703.320383
20   publisherClass_Hobbyist   5538.923640
21      publisherClass_Indie   4731.702782
43           game_complexity   3772.311629
18         publisherClass_AA   3275.535518
23        price_category_Low   2567.286635
25       price_category_High   2421.790984
14      achievements_per_dlc   2076.886110
2        steam_trading_cards   1954.491054
19        publisherClass_AAA   1726.283271
22       price_category_Free   1477.489794
13             price_per_dlc   1372.487180
46  free_to_play_interaction   1158.722866
12             num_platforms   1157.630766
32        

In [142]:
len(top_features)

47

In [143]:
# top_features = feature_scores.sort_values(by='Chi2 Score', ascending=False).head(47)['Feature'].tolist()
# X_selected = X[top_features] 

In [144]:
# top_features

In [145]:
# X_chi2 = X.copy()
# for col in X_chi2.columns:
#     min_val = X_chi2[col].min()
#     if min_val < 0:
#         X_chi2[col] = X_chi2[col] - min_val

# selector = SelectKBest(score_func=chi2, k=10)
# X_selected = selector.fit_transform(X_chi2, y)

### Hypertuning 1

In [None]:
# catboost = CatBoostClassifier(verbose=0, random_state=42)
# param_grid = {
#     'depth': [6, 8],
#     'learning_rate': [0.01, 0.05],
#     'iterations': [500, 1000]
# }
# grid_search = GridSearchCV(
#     estimator=catboost,
#     param_grid=param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# grid_search.fit(X_train, y_train)

# # Best Model
# best_model = grid_search.best_estimator_
# print("Best Parameters:", grid_search.best_params_)

In [None]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

- random state 53 ==> 59.82% 
- random state 57 ==> 59.98%
- random state 55 ==> 60.04%
- random state 54 1400 iter ==> 60.35%
- random state 54 1500 iter ==> 60.52%
- random state 54 test size 15% 1400 iter ==> 60.60 %
- random state 54 test size 20% 1500 iter 0.08 lr ==> 60.67%

In [None]:
# from catboost import CatBoostClassifier
# X_train, X_test, y_train, y_test = train_test_split(train_df[selected_features], y, test_size=0.05, random_state=54)

# catboost_model = CatBoostClassifier(verbose=0,bagging_temperature=0, depth= 6, iterations= 1500, l2_leaf_reg= 5, learning_rate=0.08)  
# catboost_model.fit(X_train, y_train)

# # Evaluation
# y_pred = catboost_model.predict(X_test)
# print("CatBoost Accuracy:", accuracy_score(y_test, y_pred)*100)
# print(classification_report(y_test, y_pred))

In [None]:
# xgboost_model =xgb.XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.09, max_depth= 5, n_estimators= 500, subsample= 0.8)  
# xgboost_model.fit(X_train, y_train)

# # Evaluation
# y_pred = xgboost_model.predict(X_test)
# print("xgb Accuracy:", accuracy_score(y_test, y_pred)*100)
# print(classification_report(y_test, y_pred))

In [None]:
# from lightgbm import LGBMClassifier

# lightgbm_model = LGBMClassifier(
#     colsample_bytree=0.8,
#     learning_rate=0.08,
#     num_leaves=80,
#     n_estimators=500,
#     subsample=0.8,
#     random_state=54
# )

# # Fit the model
# lightgbm_model.fit(X_train, y_train)

# # Evaluation
# y_pred = lightgbm_model.predict(X_test)
# print("LightGBM Accuracy:", accuracy_score(y_test, y_pred) * 100)
# print(classification_report(y_test, y_pred))

In [None]:
# #saving the best model file 
# with open('catboost_model.pkl', 'wb') as file:
#     pickle.dump(catboost_model, file)

# print("CatBoost model saved to 'catboost_model.pkl'")

In [None]:
test_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 13886 entries, 58447 to 48106
Data columns (total 38 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    13886 non-null  object        
 1   name                  13886 non-null  object        
 2   metacritic            13886 non-null  float64       
 3   steam_achievements    13886 non-null  bool          
 4   steam_trading_cards   13886 non-null  bool          
 5   workshop_support      13886 non-null  bool          
 6   achievements_total    13886 non-null  float64       
 7   release_date          13631 non-null  datetime64[ns]
 8   supported_platforms   13886 non-null  object        
 9   price                 13886 non-null  float64       
 10  copiesSold            13886 non-null  int64         
 11  publisherClass        13886 non-null  object        
 12  reviewScore           13886 non-null  object        
 13  aiContent        

In [None]:
# Save training set's mode for release_year
train_release_year_mode = train_df['release_date'].dt.year.mode()[0]

# Handle test set
test_df['metacritic'] = pd.to_numeric(test_df['metacritic'], errors='coerce').fillna(0)
test_df['achievements_total'] = pd.to_numeric(test_df['achievements_total'], errors='coerce').fillna(0)
test_df['genres'] = test_df['genres'].fillna('Unknown')
test_df['release_date'] = test_df['release_date'].replace('Coming soon', pd.NA)
test_df['release_date'] = pd.to_datetime(test_df['release_date'], errors='coerce')
test_df['release_year'] = test_df['release_date'].dt.year.fillna(train_release_year_mode)

KeyError: 'genres'

In [None]:
from datetime import datetime
e=LabelEncoder()
current_year = datetime.now().year
test_df['game_age'] = current_year - test_df['release_year']

test_df['price_category'] = pd.cut(
    test_df['price'],
    bins=[-1, 0, 5, 15, 30, 60, float('inf')],
    labels=['Free', 'Low', 'Medium', 'High', 'Very High', 'Premium']
)

test_df['isWindows'] = test_df['supported_platforms'].apply(lambda x: 1 if 'windows' in str(x).lower() else 0)
test_df['isMac'] = test_df['supported_platforms'].apply(lambda x: 1 if 'mac' in str(x).lower() else 0)
test_df['isLinux'] = test_df['supported_platforms'].apply(lambda x: 1 if 'linux' in str(x).lower() else 0)
test_df['num_platforms'] = test_df[['isWindows', 'isMac', 'isLinux']].sum(axis=1)

test_df['price_per_dlc'] = test_df['price'] / (test_df['dlc_count'] + 1)
test_df['achievements_per_dlc'] = test_df['achievements_total'] / (test_df['dlc_count'] + 1)
test_df['log_copiesSold'] = np.log1p(test_df['copiesSold'].clip(lower=0))
test_df['log_price'] = np.log1p(test_df['price'].clip(lower=0))

test_df['genres_split'] = test_df['genres'].str.split(',')
test_df['genres_split'] = test_df['genres_split'].apply(lambda x: [g.strip() for g in x])

rare_genres = set(train_df.columns) - set(['genres_split', 'genres_list']) - set(test_df.columns)
test_df['genres_split'] = test_df['genres_split'].apply(
    lambda x: ['Other_genres' if g in rare_genres else g for g in x]
)
test_df['genres'] = test_df['genres_split'].apply(lambda x: ', '.join(sorted(set(x))))
test_df['genre_diversity'] = test_df['genres'].str.split(',').apply(len)

test_df['genres_list'] = test_df['genres'].str.split(', ')
genres_encoded = mlb.transform(list(test_df['genres_list']))
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes, index=test_df.index)
test_df = pd.concat([test_df, genres_df], axis=1)
test_df.drop(columns=['genres_list', 'genres', 'genres_split'], inplace=True)

for col in ['steam_achievements', 'steam_trading_cards', 'workshop_support', 'reviewScore']:
    test_df[col] = e.transform(test_df[col])

test_df['publisher_success_score'] = test_df['publisherClass'].map(publisher_success)

test_df['is_exclusive'] = (test_df['num_platforms'] == 1).astype(int)


test_df['release_month'] = test_df['release_date'].dt.month.fillna(test_df['release_date'].dt.month.mode()[0])
test_df['release_quarter'] = test_df['release_date'].dt.quarter.fillna(test_df['release_date'].dt.quarter.mode()[0])

test_df['price_to_copies_ratio'] = test_df['price'] / (test_df['log_copiesSold'] + 1e-6)


test_df['game_complexity'] = (
    test_df['achievements_total'] / (train_df['achievements_total'].max() + 1e-6) +
    test_df['dlc_count'] / (train_df['dlc_count'].max() + 1e-6) +
    test_df['genre_diversity'] / (train_df['genre_diversity'].max() + 1e-6)
)


test_df['has_metacritic'] = (test_df['metacritic'] > 0).astype(int)


test_df = pd.get_dummies(test_df, columns=['publisherClass', 'price_category'], dtype=int)

test_df['genre_popularity'] = test_df[genre_cols].dot(list(genre_popularity.values())) / test_df[genre_cols].sum(axis=1)

if 'price_category_Free' in test_df.columns and 'Free To Play' in test_df.columns:
    test_df['free_to_play_interaction'] = test_df['price_category_Free'] * test_df['Free To Play']
else:
    test_df['free_to_play_interaction'] = 0


test_df = test_df[train_df.columns]



NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
features = [
    'metacritic', 'steam_achievements', 'steam_trading_cards', 'workshop_support',
    'achievements_total', 'dlc_count', 'hasDemo', 'release_year', 'game_age',
    'isWindows', 'isMac', 'isLinux', 'num_platforms', 'price_per_dlc',
    'achievements_per_dlc', 'log_copiesSold', 'log_price', 'genre_diversity',
    'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist',
    'publisherClass_Indie', 'price_category_Free', 'price_category_Low',
    'price_category_Medium', 'price_category_High', 'price_category_Very High',
    'price_category_Premium', 'Action', 'Adventure', 'Casual', 'Early Access',
    'Free To Play', 'Indie', 'Other_genres', 'RPG', 'Simulation', 'Strategy',
    'publisher_success_score', 'is_exclusive', 'release_month', 'release_quarter',
    'price_to_copies_ratio', 'game_complexity', 'has_metacritic',
    'genre_popularity', 'free_to_play_interaction'
]


X = train_df[features]


numerical_cols = [
    'metacritic', 'achievements_total', 'dlc_count', 'release_year', 'game_age',
    'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold',
    'log_price', 'genre_diversity', 'publisher_success_score', 'release_month',
    'release_quarter', 'price_to_copies_ratio', 'game_complexity',
    'genre_popularity'
]


X_scaled = X.copy()
X_scaled[numerical_cols] = scaler.fit_transform(X[numerical_cols])

for col in numerical_cols:
    min_val = X_scaled[col].min()
    if min_val < 0:
        X_scaled[col] = X_scaled[col] - min_val + 1e-6  # Small constant to avoid zero
    X_scaled[col] = X_scaled[col].clip(lower=0)  # Ensure no negatives


# Feature Selection with Chi-Squared
chi2_selector = SelectKBest(score_func=chi2, k=30)
X_selected = chi2_selector.fit_transform(X_scaled, y)
selected_features = [features[i] for i in chi2_selector.get_support(indices=True)]
print("Selected Features:", selected_features)

KeyError: "['release_year', 'game_age', 'isWindows', 'isMac', 'isLinux', 'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold', 'log_price', 'genre_diversity', 'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist', 'publisherClass_Indie', 'price_category_Free', 'price_category_Low', 'price_category_Medium', 'price_category_High', 'price_category_Very High', 'price_category_Premium', 'Action', 'Adventure', 'Casual', 'Early Access', 'Free To Play', 'Indie', 'Other_genres', 'RPG', 'Simulation', 'Strategy', 'publisher_success_score', 'is_exclusive', 'release_month', 'release_quarter', 'price_to_copies_ratio', 'game_complexity', 'has_metacritic', 'genre_popularity', 'free_to_play_interaction'] not in index"

In [None]:
print(len(train_df[selected_features]))

55542


In [None]:
x_test =  train_df[selected_features] 
y_test = test_df['reviewScore']

In [None]:
with open('catboost_model.pkl', 'rb') as file:
    catboost_model = pickle.load(file)

# 3. Make predictions  
predictions = catboost_model.predict(x_test)

In [None]:
if y_test is not None:
    if catboost_model.__class__.__name__ == 'CatBoostClassifier':
        accuracy = accuracy_score(y_test, predictions)
        print(f"Accuracy: {accuracy:.4f}")
    elif catboost_model.__class__.__name__ == 'CatBoostRegressor':
        rmse = mean_squared_error(y_test, predictions, squared=False)
        print(f"RMSE: {rmse:.4f}")
    else:
        print("Unknown model type.")


ValueError: Found input variables with inconsistent numbers of samples: [13886, 55542]


- Best CatBoost Parameters: {'bagging_temperature': 0, 'depth': 6, 'iterations': 1500, 'l2_leaf_reg': 5, 'learning_rate': 0.05}

- Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.8}

### Hypertuning 2 

In [None]:
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import VotingClassifier
# catboost = CatBoostClassifier(verbose=0, random_state=42)
# catboost_param_grid = {
#     'depth': [4, 6, 8, 10],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'iterations': [500, 1000, 1500],
#     'l2_leaf_reg': [1, 3, 5],
#     'bagging_temperature': [0, 0.5, 1]
# }
# catboost_grid = GridSearchCV(
#     estimator=catboost,
#     param_grid=catboost_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# catboost_grid.fit(X_train, y_train)
# best_catboost = catboost_grid.best_estimator_
# print("Best CatBoost Parameters:", catboost_grid.best_params_)

# # Hyperparameter Tuning for XGBoost
# xgboost = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
# xgboost_param_grid = {
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }
# xgboost_grid = GridSearchCV(
#     estimator=xgboost,
#     param_grid=xgboost_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# xgboost_grid.fit(X_train, y_train)
# best_xgboost = xgboost_grid.best_estimator_
# print("Best XGBoost Parameters:", xgboost_grid.best_params_)

# # Hyperparameter Tuning for LightGBM
# lightgbm = LGBMClassifier(random_state=42)
# lightgbm_param_grid = {
#     'num_leaves': [31, 50, 70],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }
# lightgbm_grid = GridSearchCV(
#     estimator=lightgbm,
#     param_grid=lightgbm_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# lightgbm_grid.fit(X_train, y_train)
# best_lightgbm = lightgbm_grid.best_estimator_
# print("Best LightGBM Parameters:", lightgbm_grid.best_params_)

# # Ensemble: Voting Classifier
# voting_clf = VotingClassifier(
#     estimators=[
#         ('catboost', best_catboost),
#         ('xgboost', best_xgboost),
#         ('lightgbm', best_lightgbm)
#     ],
#     voting='soft'
# )
# voting_clf.fit(X_train, y_train)

# # Evaluate All Models
# models = {
#     'CatBoost': best_catboost,
#     'XGBoost': best_xgboost,
#     'LightGBM': best_lightgbm,
#     'Voting Classifier': voting_clf
# }

# for name, model in models.items():
#     y_pred = model.predict(X_test)
#     print(f"\n{name} Evaluation:")
#     print("Accuracy:", accuracy_score(y_test, y_pred))