In [211]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 

In [212]:
######################################################################################################################################
######################################################################################################################################
######################################################################################################################################

In [213]:

# Entering and Checking on the data


In [214]:
data = pd.read_csv(r"original_data/ml_phase_1_extracted_data.csv")

In [215]:
print(data.info() , data.shape  , sep="\n\n\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39200 entries, 0 to 39199
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   app_id                  39200 non-null  float64
 1   game_title              39200 non-null  object 
 2   release_date            39200 non-null  object 
 3   win                     39200 non-null  bool   
 4   mac                     39200 non-null  bool   
 5   linux                   39200 non-null  bool   
 6   steam_ranking           39200 non-null  object 
 7   positive_ratio          39200 non-null  float64
 8   num_of_user_reviews     39200 non-null  float64
 9   final_price             39200 non-null  float64
 10  original_price          39200 non-null  float64
 11  discount                39200 non-null  float64
 12  steam_deck              39200 non-null  bool   
 13  num_of_played_hours     39200 non-null  float64
 14  num_of_recommendations  39200 non-null

In [216]:
data.sample(5)

Unnamed: 0,app_id,game_title,release_date,win,mac,linux,steam_ranking,positive_ratio,num_of_user_reviews,final_price,original_price,discount,steam_deck,num_of_played_hours,num_of_recommendations,price_difference,game_age,platforms_count,recommendation_ratio
13539,0.908894,The secret pyramid VR,2023-01-17,True,False,False,Positive,1.0,0.147048,0.577561,0.576973,0.0,True,0.0,0.0,0.813097,0.0,0.578998,0.0
25962,0.799928,Space Tower Defense,2021-09-09,True,False,False,Mixed,0.224275,0.168319,0.472586,0.471956,0.0,True,0.283173,0.34014,0.813097,0.286724,0.578998,0.510913
20424,0.269792,Terrian Saga KR 17,2014-07-09,True,True,True,Mostly Positive,0.477404,0.706511,0.529278,0.528664,0.0,True,0.707102,0.774565,0.813097,0.814343,1.0,0.669291
16375,0.740793,Drunk Soccer is the Best Soccer,2021-07-15,True,False,False,Positive,0.656726,0.365924,0.195518,0.195051,0.0,True,0.294237,0.502881,0.813097,0.286724,0.578998,0.606095
18051,0.453424,Ominous Tales The Forsaken Isle,2017-06-19,True,False,False,Mixed,0.180262,0.187808,0.619634,0.619077,0.0,True,0.44801,0.467024,0.813097,0.635094,0.578998,0.682549


In [217]:
print(data.isnull().sum() ,end="\n\n\n")

app_id                    0
game_title                0
release_date              0
win                       0
mac                       0
linux                     0
steam_ranking             0
positive_ratio            0
num_of_user_reviews       0
final_price               0
original_price            0
discount                  0
steam_deck                0
num_of_played_hours       0
num_of_recommendations    0
price_difference          0
game_age                  0
platforms_count           0
recommendation_ratio      0
dtype: int64




In [218]:
######################################################################################################################################
######################################################################################################################################
######################################################################################################################################

In [219]:

# Splitting 


In [220]:
x = data.drop(["steam_ranking"], axis=1)
y = data["steam_ranking"]

In [221]:
from sklearn.model_selection import train_test_split

x_train , x_temp , y_train , y_temp = train_test_split(x,
                                                       y ,
                                                       train_size=0.75 ,
                                                       shuffle=True , 
                                                       random_state=42 , 
                                                       stratify=y)

x_val , x_test , y_val , y_test = train_test_split(x_temp,
                                                  y_temp,
                                                  train_size=0.65 , 
                                                  shuffle=True , 
                                                  random_state=42 , 
                                                  stratify=y_temp)

In [222]:
######################################################################################################################################
######################################################################################################################################
######################################################################################################################################

In [223]:

# Encoding


In [224]:
columns_mapping = {"Mostly Negative" : 2 , 
                  "Mixed" : 5 , 
                  "Mostly Positive" : 7 , 
                  "Positive" : 8 , 
                  "Very Positive" : 9 , 
                  "Overwhelmingly Positive" : 10 }

y_train = y_train.map(columns_mapping)
y_test = y_test.map(columns_mapping)
y_val = y_val.map(columns_mapping)

In [225]:
col_freq = x_train["game_title"].value_counts().to_dict()
x_train["game_title"] = x_train["game_title"].map(col_freq)
x_test["game_title"] = x_test["game_title"].map(col_freq).fillna(0)
x_val["game_title"] = x_val["game_title"].map(col_freq).fillna(0)

# scalling  data
max = np.max(x_train["game_title"])
min = np.min(x_train["game_title"])
x_train["game_title"] = (x_train["game_title"] - min) / (max - min)
x_test["game_title"] = (x_test["game_title"] - min) / (max - min)
x_val["game_title"] = (x_val["game_title"] - min) / (max - min)

In [226]:
x_train["release_date"] = pd.to_datetime(x_train["release_date"])
reference_date  = x_train["release_date"].max()
x_train["release_date_days_since"] = ( reference_date  - x_train["release_date"] ).dt.days 
x_train.drop(["release_date"] , axis=1 , inplace=True)

x_test["release_date"] = pd.to_datetime(x_test["release_date"])
reference_date  = x_test["release_date"].max()
x_test["release_date_days_since"] = ( reference_date  - x_test["release_date"] ).dt.days 
x_test.drop(["release_date"] , axis=1 , inplace=True)

x_val["release_date"] = pd.to_datetime(x_val["release_date"])
reference_date  = x_val["release_date"].max()
x_val["release_date_days_since"] = ( reference_date  - x_val["release_date"] ).dt.days 
x_val.drop(["release_date"] , axis=1 , inplace=True)

# scalling  data
max = np.max(x_train["release_date_days_since"])
min = np.min(x_train["release_date_days_since"])
x_train["release_date_days_since"] = (x_train["release_date_days_since"] - min) / (max - min)
x_test["release_date_days_since"] = (x_test["release_date_days_since"] - min) / (max - min)
x_val["release_date_days_since"] = (x_val["release_date_days_since"] - min) / (max - min)

In [227]:
######################################################################################################################################
######################################################################################################################################
######################################################################################################################################

In [228]:

# Hyper Parameters Tunning 


In [229]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=20 , shuffle=True , random_state=42)

params_grid = {
    "criterion" : ["gini" , "entropy"] , 
    "max_depth" : [None , 50 , 100 , 200 , 300] , 
    "min_samples_split" : [2,3,4]
}

gs = GridSearchCV(estimator = DecisionTreeClassifier(splitter="best") , 
                 cv = skf , 
                 return_train_score = False , 
                 n_jobs = -1 , 
                 scoring = "roc_auc" , 
                 param_grid = params_grid)


gs.fit(x_val , y_val)

gs_results = gs.cv_results_
gs_best_score = gs.best_score_
gs_best_params = gs.best_params_

gs_data = pd.DataFrame(gs_results)

Traceback (most recent call last):
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/studio-lab-user/.conda/envs/default/lib/python3.9/site-packages/sklearn/metrics/_ranking.py", line 633, in roc_auc_score
    raise ValueError("multi_class must be i

In [230]:
gs_data

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,...,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,mean_test_score,std_test_score,rank_test_score
0,0.021589,0.002736,0.00366,0.00077,gini,,2,"{'criterion': 'gini', 'max_depth': None, 'min_...",,,...,,,,,,,,,,1
1,0.021239,0.003092,0.003626,0.00149,gini,,3,"{'criterion': 'gini', 'max_depth': None, 'min_...",,,...,,,,,,,,,,1
2,0.020721,0.001779,0.003212,0.000278,gini,,4,"{'criterion': 'gini', 'max_depth': None, 'min_...",,,...,,,,,,,,,,1
3,0.020536,0.001372,0.003328,0.000846,gini,50.0,2,"{'criterion': 'gini', 'max_depth': 50, 'min_sa...",,,...,,,,,,,,,,1
4,0.021359,0.002244,0.003592,0.001109,gini,50.0,3,"{'criterion': 'gini', 'max_depth': 50, 'min_sa...",,,...,,,,,,,,,,1
5,0.021231,0.002611,0.003248,0.000351,gini,50.0,4,"{'criterion': 'gini', 'max_depth': 50, 'min_sa...",,,...,,,,,,,,,,1
6,0.021239,0.002644,0.003148,0.000327,gini,100.0,2,"{'criterion': 'gini', 'max_depth': 100, 'min_s...",,,...,,,,,,,,,,1
7,0.022632,0.004453,0.004441,0.004203,gini,100.0,3,"{'criterion': 'gini', 'max_depth': 100, 'min_s...",,,...,,,,,,,,,,1
8,0.020807,0.002699,0.00322,0.000449,gini,100.0,4,"{'criterion': 'gini', 'max_depth': 100, 'min_s...",,,...,,,,,,,,,,1
9,0.021678,0.003008,0.003236,0.000335,gini,200.0,2,"{'criterion': 'gini', 'max_depth': 200, 'min_s...",,,...,,,,,,,,,,1


In [231]:
print(gs_best_score , gs_best_params , sep="\n\n")

nan

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}


In [232]:
######################################################################################################################################
######################################################################################################################################
######################################################################################################################################

In [233]:

## Craeating the final model


In [234]:
from sklearn.tree import DecisionTreeClassifier

final_model = DecisionTreeClassifier(
    criterion="entropy",
    splitter="best",
    max_depth=50,
    random_state=42 ,
    min_samples_split = 3 )

In [235]:
final_model.fit(x_train , y_train)

In [236]:
y_pred = final_model.predict(x_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

accuracy = accuracy_score(y_test , y_pred)
precision = precision_score(y_test , y_pred ,average = "macro")
recall= recall_score(y_test , y_pred , average = "macro")
f1 = f1_score(y_test , y_pred , average = "macro")

result = f"accuracy :{accuracy}\nprecision : {precision}\nrecall : {recall}\nf1 score : {f1}"
print(result)

accuracy :1.0
precision : 1.0
recall : 1.0
f1 score : 1.0


In [237]:
######################################################################################################################################
######################################################################################################################################
######################################################################################################################################