# **Imports**


In [1300]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder,StandardScaler,MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score,classification_report

# **Data Inspecting**

### Read Data 

In [1301]:
info_df = pd.read_csv('Classification_Dataset/info_base_games.csv')
gamalytic_df = pd.read_csv('Classification_Dataset/ms2_gamalytic_steam_games.csv')
dlcs_df = pd.read_csv('Classification_Dataset/dlcs.csv')
demos_df = pd.read_csv('Classification_Dataset/demos.csv')

  info_df = pd.read_csv('Classification_Dataset/info_base_games.csv')


### Renaming the id columns (so i can easliy merge the columns)


In [1302]:
info_df.rename(columns={'appid': 'id'}, inplace=True)
gamalytic_df.rename(columns={'steamId': 'id'}, inplace=True)
dlcs_df.rename(columns={'base_appid': 'id'}, inplace=True)
demos_df.rename(columns={'full_game_appid': 'id'}, inplace=True)

### adjusting the id data type

In [1303]:
info_df['id'] = info_df['id'].astype(str)
gamalytic_df['id'] = gamalytic_df['id'].astype(str)
dlcs_df['id'] = dlcs_df['id'].astype(str)
demos_df['id'] = demos_df['id'].astype(str)

### Merging the datasets on the id column


In [1304]:
# Merge info_df and gamalytic_df
merged_df = pd.merge(info_df, gamalytic_df, on='id', how='inner')

# Aggregate DLCs (count per game)
dlc_count = dlcs_df.groupby('id').size().reset_index(name='dlc_count')

merged_df = pd.merge(merged_df, dlc_count, on='id', how='left')
merged_df['dlc_count'] = merged_df['dlc_count'].fillna(0)

# Add demo presence
merged_df['hasDemo'] = merged_df['id'].isin(demos_df['id']).astype(int)

In [1305]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69428 entries, 0 to 69427
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   69428 non-null  object 
 1   name                 69428 non-null  object 
 2   metacritic           2933 non-null   object 
 3   steam_achievements   69428 non-null  bool   
 4   steam_trading_cards  69428 non-null  bool   
 5   workshop_support     69428 non-null  bool   
 6   genres               69324 non-null  object 
 7   achievements_total   37295 non-null  object 
 8   release_date         69426 non-null  object 
 9   supported_platforms  69428 non-null  object 
 10  price                69428 non-null  float64
 11  copiesSold           69428 non-null  int64  
 12  publisherClass       69428 non-null  object 
 13  reviewScore          69428 non-null  object 
 14  aiContent            0 non-null      float64
 15  dlc_count            69428 non-null 

In [1306]:
merged_df.isnull().sum()

id                         0
name                       0
metacritic             66495
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                   104
achievements_total     32133
release_date               2
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              69428
dlc_count                  0
hasDemo                    0
dtype: int64

In [1307]:
df = merged_df.copy()

# Splitting the data into training and testing sets

In [1308]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save to CSV files
# train_df.to_csv('train.csv', index=False)
# test_df.to_csv('test.csv', index=False)

In [1309]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55542 entries, 33136 to 15795
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   55542 non-null  object 
 1   name                 55542 non-null  object 
 2   metacritic           2332 non-null   object 
 3   steam_achievements   55542 non-null  bool   
 4   steam_trading_cards  55542 non-null  bool   
 5   workshop_support     55542 non-null  bool   
 6   genres               55456 non-null  object 
 7   achievements_total   29878 non-null  object 
 8   release_date         55541 non-null  object 
 9   supported_platforms  55542 non-null  object 
 10  price                55542 non-null  float64
 11  copiesSold           55542 non-null  int64  
 12  publisherClass       55542 non-null  object 
 13  reviewScore          55542 non-null  object 
 14  aiContent            0 non-null      float64
 15  dlc_count            55542 non-null  

# Data wrangling 


In [1310]:
train_df['metacritic'] = pd.to_numeric(train_df['metacritic'], errors='coerce').fillna(0)
train_df['achievements_total'] = pd.to_numeric(train_df['achievements_total'], errors='coerce').fillna(0)
train_df['genres'] = train_df['genres'].fillna('Unknown')
train_df['release_date'] = train_df['release_date'].replace('Coming soon', pd.NA)
train_df['release_date'] = pd.to_datetime(train_df['release_date'], errors='coerce')
train_df['release_year'] = train_df['release_date'].dt.year.fillna(train_df['release_date'].dt.year.mode()[0])

In [1311]:
train_df.isnull().sum()

id                         0
name                       0
metacritic                 0
steam_achievements         0
steam_trading_cards        0
workshop_support           0
genres                     0
achievements_total         0
release_date             967
supported_platforms        0
price                      0
copiesSold                 0
publisherClass             0
reviewScore                0
aiContent              55542
dlc_count                  0
hasDemo                    0
release_year               0
dtype: int64

In [1312]:
train_df.drop(columns=['aiContent'], inplace=True)

In [1313]:
train_df.isnull().sum()

id                       0
name                     0
metacritic               0
steam_achievements       0
steam_trading_cards      0
workshop_support         0
genres                   0
achievements_total       0
release_date           967
supported_platforms      0
price                    0
copiesSold               0
publisherClass           0
reviewScore              0
dlc_count                0
hasDemo                  0
release_year             0
dtype: int64

In [1314]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55542 entries, 33136 to 15795
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   55542 non-null  object        
 1   name                 55542 non-null  object        
 2   metacritic           55542 non-null  float64       
 3   steam_achievements   55542 non-null  bool          
 4   steam_trading_cards  55542 non-null  bool          
 5   workshop_support     55542 non-null  bool          
 6   genres               55542 non-null  object        
 7   achievements_total   55542 non-null  float64       
 8   release_date         54575 non-null  datetime64[ns]
 9   supported_platforms  55542 non-null  object        
 10  price                55542 non-null  float64       
 11  copiesSold           55542 non-null  int64         
 12  publisherClass       55542 non-null  object        
 13  reviewScore          55542 non-n

# Preprocessing

In [1315]:
from datetime import datetime
current_year = datetime.now().year
train_df['game_age'] = current_year - train_df['release_year']

# 2. Price Categories
train_df['price_category'] = pd.cut(
    train_df['price'],
    bins=[-1, 0, 5, 15, 30, 60, float('inf')],
    labels=['Free', 'Low', 'Medium', 'High', 'Very High', 'Premium']
)

In [1316]:
train_df['isWindows'] = train_df['supported_platforms'].apply(lambda x: 1 if 'windows' in str(x).lower() else 0)
train_df['isMac'] = train_df['supported_platforms'].apply(lambda x: 1 if 'mac' in str(x).lower() else 0)
train_df['isLinux'] = train_df['supported_platforms'].apply(lambda x: 1 if 'linux' in str(x).lower() else 0)
train_df['num_platforms'] = train_df[['isWindows', 'isMac', 'isLinux']].sum(axis=1)

In [1317]:
train_df['price_per_dlc'] = train_df['price'] / (train_df['dlc_count'] + 1)
train_df['achievements_per_dlc'] = train_df['achievements_total'] / (train_df['dlc_count'] + 1)

# 5. Log Transformation
train_df['log_copiesSold'] = np.log1p(train_df['copiesSold'].clip(lower=0))
train_df['log_price'] = np.log1p(train_df['price'].clip(lower=0))


In [1318]:
def cap_outliers(series, lower_quantile=0.01, upper_quantile=0.99):
    lower = series.quantile(lower_quantile)
    upper = series.quantile(upper_quantile)
    return series.clip(lower=lower, upper=upper)

train_df['copiesSold'] = cap_outliers(train_df['copiesSold'])
train_df['price'] = cap_outliers(train_df['price'])
train_df['achievements_total'] = cap_outliers(train_df['achievements_total'])
train_df['dlc_count'] = cap_outliers(train_df['dlc_count'])

In [1319]:
#train_df = pd.get_dummies(train_df, columns=['publisherClass', 'price_category'], dtype=int)

In [1320]:
train_df.reviewScore.value_counts()

reviewScore
Positive                   21501
Mixed                      11976
Mostly Positive             7481
Very Positive               6710
Negative                    4877
Mostly Negative             2348
Overwhelmingly Positive      621
Very Negative                 21
Overwhelmingly Negative        7
Name: count, dtype: int64

In [1321]:
categorical_columns = ['steam_achievements', 'steam_trading_cards', 'workshop_support','reviewScore']
encoder = LabelEncoder()
for col in categorical_columns:
    train_df[col] = encoder.fit_transform(train_df[col])

### Copiessold outliers 

In [1322]:
train_df['copiesSold'].describe()

count    5.554200e+04
mean     3.517690e+04
std      1.669331e+05
min      1.000000e+00
25%      6.000000e+01
50%      4.540000e+02
75%      3.956000e+03
max      1.360106e+06
Name: copiesSold, dtype: float64

In [1323]:
Q1 = train_df['copiesSold'].quantile(0.25)
Q3 = train_df['copiesSold'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = train_df[(train_df['copiesSold'] < lower_bound) | (train_df['copiesSold'] > upper_bound)]
print(lower_bound)
print(upper_bound)

-5784.0
9800.0


In [1324]:
train_df.loc[train_df['copiesSold'] <= 0, 'copiesSold'] *= -1

In [1325]:
#train_df = (train_df[train_df['copiesSold'] <= 1])

## Price outliers

In [1326]:
train_df.price.describe()

count    55542.000000
mean         7.344236
std          8.303910
min          0.000000
25%          0.990000
50%          4.990000
75%          9.990000
max         44.990000
Name: price, dtype: float64

In [1327]:
Q1 = train_df['price'].quantile(0.25)
Q3 = train_df['price'].quantile(0.75)
IQR = Q3 - Q1

# Define outlier thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = train_df[(train_df['price'] < lower_bound) | (train_df['price'] > upper_bound)]
print(lower_bound)
print(upper_bound)

-12.51
23.490000000000002


In [1328]:
train_df[train_df['price'] > 200 ]

Unnamed: 0,id,name,metacritic,steam_achievements,steam_trading_cards,workshop_support,genres,achievements_total,release_date,supported_platforms,...,game_age,price_category,isWindows,isMac,isLinux,num_platforms,price_per_dlc,achievements_per_dlc,log_copiesSold,log_price


In [1329]:
train_df = train_df[(train_df['price'] <= 200)]

In [1330]:
train_df['genres_split'] = train_df['genres'].str.split(',')

exploded_genres = train_df.explode('genres_split')
exploded_genres['genres_split'] = exploded_genres['genres_split'].str.strip()
genre_counts = exploded_genres['genres_split'].value_counts()


rare_genres = genre_counts[genre_counts < 5000].index
exploded_genres['genres_split'] = exploded_genres['genres_split'].apply(lambda x: 'Other_genres' if x in rare_genres else x)

train_df['genres'] = (
    exploded_genres
    .groupby(exploded_genres.index)['genres_split']
    .apply(lambda x: ', '.join(sorted(set(x))))
)

train_df.drop(columns=['genres_split'],inplace=True)
train_df['genre_diversity'] = train_df['genres'].str.split(',').apply(len)

In [1331]:
train_df['genres_list'] = train_df['genres'].str.split(', ')

mlb = MultiLabelBinarizer()

genres_encoded = mlb.fit_transform(list(train_df['genres_list']))
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_,index=train_df.index)

train_df = pd.concat([train_df, genres_df], axis=1)
train_df.drop(columns=['genres_list','genres'], inplace=True)

In [1332]:
temp_encoder = LabelEncoder()
train_df['temp_reviewScore'] = temp_encoder.fit_transform(train_df['reviewScore'])
publisher_success = train_df.groupby('publisherClass')['temp_reviewScore'].mean().to_dict()
train_df['publisher_success_score'] = train_df['publisherClass'].map(publisher_success)
train_df.drop(columns=['temp_reviewScore'], inplace=True)

# 7. NEW: Platform Exclusivity
train_df['is_exclusive'] = (train_df['num_platforms'] == 1).astype(int)

# 8. NEW: Release Seasonality
train_df['release_month'] = train_df['release_date'].dt.month.fillna(train_df['release_date'].dt.month.mode()[0])
train_df['release_quarter'] = train_df['release_date'].dt.quarter.fillna(train_df['release_date'].dt.quarter.mode()[0])

# 9. NEW: Price-to-CopiesSold Ratio
train_df['price_to_copies_ratio'] = train_df['price'] / (train_df['log_copiesSold'] + 1e-6)

# 10. NEW: Game Complexity Proxy
train_df['game_complexity'] = (
    train_df['achievements_total'] / (train_df['achievements_total'].max() + 1e-6) +
    train_df['dlc_count'] / (train_df['dlc_count'].max() + 1e-6) +
    train_df['genre_diversity'] / (train_df['genre_diversity'].max() + 1e-6)
)

# 11. NEW: Metacritic Binary
train_df['has_metacritic'] = (train_df['metacritic'] > 0).astype(int)

In [1333]:
train_df = pd.get_dummies(train_df, columns=['publisherClass', 'price_category'], dtype=int)

In [1334]:
genre_cols = mlb.classes_
genre_popularity = {}
for genre in genre_cols:
    mask = train_df[genre] == 1
    genre_popularity[genre] = train_df[mask]['copiesSold'].mean() if mask.sum() > 0 else 0
train_df['genre_popularity'] = train_df[genre_cols].dot(list(genre_popularity.values())) / train_df[genre_cols].sum(axis=1)

# NEW: Free-to-Play Interaction
train_df['free_to_play_interaction'] = train_df['price_category_Free'] * train_df['Free To Play']

In [1335]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55542 entries, 33136 to 15795
Data columns (total 54 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        55542 non-null  object        
 1   name                      55542 non-null  object        
 2   metacritic                55542 non-null  float64       
 3   steam_achievements        55542 non-null  int64         
 4   steam_trading_cards       55542 non-null  int64         
 5   workshop_support          55542 non-null  int64         
 6   achievements_total        55542 non-null  float64       
 7   release_date              54575 non-null  datetime64[ns]
 8   supported_platforms       55542 non-null  object        
 9   price                     55542 non-null  float64       
 10  copiesSold                55542 non-null  float64       
 11  reviewScore               55542 non-null  int32         
 12  dlc_count          

In [1336]:
features = [
    'metacritic', 'steam_achievements', 'steam_trading_cards', 'workshop_support',
    'achievements_total', 'dlc_count', 'hasDemo', 'release_year', 'game_age',
    'isWindows', 'isMac', 'isLinux', 'num_platforms', 'price_per_dlc',
    'achievements_per_dlc', 'log_copiesSold', 'log_price', 'genre_diversity',
    'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist',
    'publisherClass_Indie', 'price_category_Free', 'price_category_Low',
    'price_category_Medium', 'price_category_High', 'price_category_Very High',
    'price_category_Premium', 'Action', 'Adventure', 'Casual', 'Early Access',
    'Free To Play', 'Indie', 'Other_genres', 'RPG', 'Simulation', 'Strategy',
    'publisher_success_score', 'is_exclusive', 'release_month', 'release_quarter',
    'price_to_copies_ratio', 'game_complexity', 'has_metacritic',
    'genre_popularity', 'free_to_play_interaction'
]

In [1337]:
len(features)

47

### Chi_squared 

In [1338]:
X = train_df[features]
y = train_df['reviewScore']
scaler = StandardScaler()
# numerical_cols = [
#     'metacritic', 'achievements_total', 'dlc_count', 'release_year', 'game_age',
#     'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold',
#     'log_price', 'genre_diversity'
# ]
numerical_cols = [
    'metacritic', 'achievements_total', 'dlc_count', 'release_year', 'game_age',
    'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold',
    'log_price', 'genre_diversity', 'publisher_success_score', 'release_month',
    'release_quarter', 'price_to_copies_ratio', 'game_complexity',
    'genre_popularity'
]
X_scaled = X.copy()
X_scaled[numerical_cols] = scaler.fit_transform(X[numerical_cols])
# Apply Chi-Squared
# chi2_selector = SelectKBest(score_func=chi2, k='all')
# chi2_selector.fit(X, y)
# feature_scores = pd.DataFrame({'Feature': columns, 'Chi2 Score': chi2_selector.scores_})
# print(feature_scores.sort_values(by='Chi2 Score', ascending=False))

In [1339]:
for col in numerical_cols:
    min_val = X_scaled[col].min()
    if min_val < 0:
        X_scaled[col] = X_scaled[col] - min_val + 1e-6  # Small constant to avoid zero
    X_scaled[col] = X_scaled[col].clip(lower=0)  # Ensure no negatives

# Handle any infinities or NaNs
X_scaled = X_scaled.replace([np.inf, -np.inf], 0).fillna(0)

# Apply SMOTE to the top 5 classes
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Feature Selection with Chi-Squared
chi2_selector = SelectKBest(score_func=chi2, k=30)
X_selected = chi2_selector.fit_transform(X_scaled, y)
selected_features = [features[i] for i in chi2_selector.get_support(indices=True)]
print("Selected Features:", selected_features)

Selected Features: ['metacritic', 'steam_achievements', 'steam_trading_cards', 'workshop_support', 'achievements_total', 'dlc_count', 'hasDemo', 'release_year', 'game_age', 'num_platforms', 'price_per_dlc', 'achievements_per_dlc', 'log_copiesSold', 'log_price', 'publisherClass_AA', 'publisherClass_AAA', 'publisherClass_Hobbyist', 'publisherClass_Indie', 'price_category_Free', 'price_category_Low', 'price_category_High', 'price_category_Very High', 'Free To Play', 'Simulation', 'publisher_success_score', 'price_to_copies_ratio', 'game_complexity', 'has_metacritic', 'genre_popularity', 'free_to_play_interaction']


### get top features score

In [1340]:
feature_scores = pd.DataFrame({
    'Feature': features,
    'Chi2 Score': chi2_selector.scores_
})

# Sort features by Chi2 Score in descending order
top_features = feature_scores.sort_values(by='Chi2 Score', ascending=False)

# Print the top features
print("Top Features Ordered by Chi-Squared Score:\n", top_features)

# Optionally, get the top N features (e.g., top 20)
N = 20
print(f"\nTop {N} Features:\n", top_features.head(N))

Top Features Ordered by Chi-Squared Score:
                      Feature    Chi2 Score
0                 metacritic  27808.930478
38   publisher_success_score  16001.681917
15            log_copiesSold  15342.621271
4         achievements_total   5616.988137
44            has_metacritic   4933.186908
5                  dlc_count   4736.782179
8                   game_age   4523.969636
20   publisherClass_Hobbyist   4460.084024
42     price_to_copies_ratio   4399.603424
21      publisherClass_Indie   3809.683373
43           game_complexity   3007.157650
18         publisherClass_AA   2594.328764
23        price_category_Low   2047.202061
25       price_category_High   1897.108046
14      achievements_per_dlc   1615.372718
2        steam_trading_cards   1564.706097
19        publisherClass_AAA   1381.236134
22       price_category_Free   1212.630220
13             price_per_dlc   1019.828692
12             num_platforms    960.083787
46  free_to_play_interaction    948.780210
32        

In [1341]:
len(top_features)

47

In [1342]:
# top_features = feature_scores.sort_values(by='Chi2 Score', ascending=False).head(47)['Feature'].tolist()
# X_selected = X[top_features] 

In [1343]:
# top_features

In [1344]:
# X_chi2 = X.copy()
# for col in X_chi2.columns:
#     min_val = X_chi2[col].min()
#     if min_val < 0:
#         X_chi2[col] = X_chi2[col] - min_val

# selector = SelectKBest(score_func=chi2, k=10)
# X_selected = selector.fit_transform(X_chi2, y)

### Hypertuning 1

In [1345]:
# catboost = CatBoostClassifier(verbose=0, random_state=42)
# param_grid = {
#     'depth': [6, 8],
#     'learning_rate': [0.01, 0.05],
#     'iterations': [500, 1000]
# }
# grid_search = GridSearchCV(
#     estimator=catboost,
#     param_grid=param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# grid_search.fit(X_train, y_train)

# # Best Model
# best_model = grid_search.best_estimator_
# print("Best Parameters:", grid_search.best_params_)

- random state 53 ==> 59.82% 
- random state 57 ==> 59.98%
- random state 55 ==> 60.04%
- random state 54 1400 iter ==> 60.35%
- random state 54 1500 iter ==> 60.52%
- random state 54 test size 15% 1400 iter ==> 60.60 %
- random state 54 test size 20% 1500 iter 0.08 lr ==> 60.67%

In [1346]:
from catboost import CatBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=54)

catboost_model = CatBoostClassifier(verbose=0,bagging_temperature=0, depth= 6, iterations= 1500, l2_leaf_reg= 5, learning_rate=0.08)  
catboost_model.fit(X_train, y_train)

# Evaluation
y_pred = catboost_model.predict(X_test)
print("CatBoost Accuracy:", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

CatBoost Accuracy: 60.67152759024215
              precision    recall  f1-score   support

           0       0.50      0.42      0.46      2400
           1       0.38      0.04      0.08       427
           2       0.49      0.07      0.12      1514
           3       0.69      0.67      0.68       965
           5       0.47      0.22      0.30       129
           6       0.64      0.87      0.74      4271
           7       0.00      0.00      0.00         4
           8       0.61      0.87      0.72      1399

    accuracy                           0.61     11109
   macro avg       0.47      0.40      0.39     11109
weighted avg       0.58      0.61      0.55     11109



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1357]:
xgboost_model =xgb.XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.09, max_depth= 5, n_estimators= 500, subsample= 0.8)  
xgboost_model.fit(X_train, y_train)

# Evaluation
y_pred = xgboost_model.predict(X_test)
print("xgb Accuracy:", accuracy_score(y_test, y_pred)*100)
print(classification_report(y_test, y_pred))

xgb Accuracy: 60.1404266810694
              precision    recall  f1-score   support

           0       0.49      0.41      0.45      2400
           1       0.38      0.05      0.09       427
           2       0.42      0.07      0.12      1514
           3       0.69      0.67      0.68       965
           5       0.39      0.19      0.25       129
           6       0.64      0.86      0.73      4271
           7       0.00      0.00      0.00         4
           8       0.60      0.86      0.71      1399

    accuracy                           0.60     11109
   macro avg       0.45      0.39      0.38     11109
weighted avg       0.56      0.60      0.55     11109



In [1367]:
from lightgbm import LGBMClassifier

lightgbm_model = LGBMClassifier(
    colsample_bytree=0.8,
    learning_rate=0.08,
    num_leaves=80,
    n_estimators=500,
    subsample=0.8,
    random_state=54
)

# Fit the model
lightgbm_model.fit(X_train, y_train)

# Evaluation
y_pred = lightgbm_model.predict(X_test)
print("LightGBM Accuracy:", accuracy_score(y_test, y_pred) * 100)
print(classification_report(y_test, y_pred))

LightGBM Accuracy: 56.71077504725898
              precision    recall  f1-score   support

           0       0.45      0.40      0.43      2400
           1       0.22      0.10      0.14       427
           2       0.30      0.13      0.19      1514
           3       0.64      0.64      0.64       965
           4       0.00      0.00      0.00         0
           5       0.23      0.29      0.25       129
           6       0.65      0.80      0.71      4271
           7       0.05      0.25      0.08         4
           8       0.62      0.73      0.67      1399

    accuracy                           0.57     11109
   macro avg       0.35      0.37      0.34     11109
weighted avg       0.53      0.57      0.54     11109



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



- Best CatBoost Parameters: {'bagging_temperature': 0, 'depth': 6, 'iterations': 1500, 'l2_leaf_reg': 5, 'learning_rate': 0.05}

- Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.8}

### Hypertuning 2 

In [1348]:
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import VotingClassifier
# catboost = CatBoostClassifier(verbose=0, random_state=42)
# catboost_param_grid = {
#     'depth': [4, 6, 8, 10],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'iterations': [500, 1000, 1500],
#     'l2_leaf_reg': [1, 3, 5],
#     'bagging_temperature': [0, 0.5, 1]
# }
# catboost_grid = GridSearchCV(
#     estimator=catboost,
#     param_grid=catboost_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# catboost_grid.fit(X_train, y_train)
# best_catboost = catboost_grid.best_estimator_
# print("Best CatBoost Parameters:", catboost_grid.best_params_)

# # Hyperparameter Tuning for XGBoost
# xgboost = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')
# xgboost_param_grid = {
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }
# xgboost_grid = GridSearchCV(
#     estimator=xgboost,
#     param_grid=xgboost_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# xgboost_grid.fit(X_train, y_train)
# best_xgboost = xgboost_grid.best_estimator_
# print("Best XGBoost Parameters:", xgboost_grid.best_params_)

# # Hyperparameter Tuning for LightGBM
# lightgbm = LGBMClassifier(random_state=42)
# lightgbm_param_grid = {
#     'num_leaves': [31, 50, 70],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'n_estimators': [100, 200, 500],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }
# lightgbm_grid = GridSearchCV(
#     estimator=lightgbm,
#     param_grid=lightgbm_param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1
# )
# lightgbm_grid.fit(X_train, y_train)
# best_lightgbm = lightgbm_grid.best_estimator_
# print("Best LightGBM Parameters:", lightgbm_grid.best_params_)

# # Ensemble: Voting Classifier
# voting_clf = VotingClassifier(
#     estimators=[
#         ('catboost', best_catboost),
#         ('xgboost', best_xgboost),
#         ('lightgbm', best_lightgbm)
#     ],
#     voting='soft'
# )
# voting_clf.fit(X_train, y_train)

# # Evaluate All Models
# models = {
#     'CatBoost': best_catboost,
#     'XGBoost': best_xgboost,
#     'LightGBM': best_lightgbm,
#     'Voting Classifier': voting_clf
# }

# for name, model in models.items():
#     y_pred = model.predict(X_test)
#     print(f"\n{name} Evaluation:")
#     print("Accuracy:", accuracy_score(y_test, y_pred))