# Интерпретируемое машинное обучение для диссертации

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as wg

from matplotlib.colors import ListedColormap
%matplotlib inline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
import math

In [None]:
from sklearn.linear_model import LassoCV, Ridge, RidgeCV, ElasticNet, LassoLarsCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_squared_log_error, make_scorer, r2_score, mean_absolute_error
def rmse(true, pred):
    """Return the root mean squared error between ``true`` and ``pred``.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned.
    """
    mse = mean_squared_error(true, pred)
    return np.sqrt(mse)

def rmsle(true, pred):
    """Return the root mean squared logarithmic error of ``pred`` against ``true``.

    Both arguments are forwarded unchanged to ``mean_squared_log_error``; the
    square root of that value is returned.
    """
    msle = mean_squared_log_error(true, pred)
    return np.sqrt(msle)

# Wrap both error functions as scorers; greater_is_better=False marks them as
# losses (lower is better) for model selection.
rmse_score, rmsle_score = (
    make_scorer(loss_fn, greater_is_better=False) for loss_fn in (rmse, rmsle)
)

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## Подготовка данных

In [None]:
# Load the two semicolon-separated CSV inputs; the first row of each file is
# used as the header.
df = pd.read_csv("../input/df-thesis/DF2.csv", sep=';', header=0)
log = pd.read_csv("../input/df-thesis/log.csv", sep=';', header=0)

In [None]:
# Preview the first row by position. `df[0]` is a column lookup by label and
# raises KeyError here because the CSV header yields string column names.
df.iloc[0]

In [None]:
# Element-wise square of every value in `log`.
# NOTE(review): `log_squared` is not referenced again in the visible notebook.
log_squared = log**2

In [None]:
# Bar chart of the row count for each value of the '5%' column.
CLASSES = sns.countplot(x = '5%', data = df, palette=["#FF0000", "#32cd32"])
CLASSES.set(xlabel='Входит в верхние 5%')  # x label: "Is in the top 5%"
CLASSES.set(ylabel='Количество игр')  # y label: "Number of games"

In [None]:
# Bar chart of the row count for each value of the '10%' column.
CLASSES = sns.countplot(x = '10%', data = df, palette=["#FF0000", "#32cd32"])
CLASSES.set(xlabel='Входит в верхние 10%')  # x label: "Is in the top 10%"
CLASSES.set(ylabel='Количество игр')  # y label: "Number of games"

In [None]:
# Display the whole DataFrame as the cell output.
df

In [None]:
# Inspect the row at position 10 (all columns).
df.iloc[10, :]

In [None]:
# Inspect the column at position 10 (all rows); the same column is selected by
# position when the feature matrix is built.
df.iloc[:,10]

In [None]:
# Ratio of the 'positive' column to the 'negative' column, stored as a new feature.
# NOTE(review): rows with negative == 0 would produce inf/NaN — confirm none exist
# before the column is fed to the models.
df['Rating'] = df['positive'] / df['negative']

In [None]:
# Target: the '10%' column.
Y = df['10%']
# Features: selected columns of `df` (by name, plus the columns at positions 10
# and 16), the derived 'Rating' column, and every column of `log`, joined side by
# side. The leading spaces in the column names are part of the labels.
X = pd.concat([df[' Simulation'], df[' First-Person Shooter'], df.iloc[:,10], df[' Changing The Game'], df[' Expansive Expansions'], df[' Horse Armor'], df[' Infinite Money Hole'], df.iloc[:, 16], df[' Time Is Money'], df['Rating'], log], axis = 1)

In [None]:
# The complete feature matrix and target are used as the training data below
# (these are aliases of X and Y, not copies).
X_train = X
y_train = Y

In [None]:
# 80/20 split with a fixed seed.
# NOTE(review): X_fit / X_tst / y_fit / y_tst are only printed here; no model in
# the visible notebook is fitted or evaluated on them.
X_fit, X_tst, y_fit, y_tst = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print('Fit:', y_fit.shape)
print('Test:', y_tst.shape)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer, get_scorer
from sklearn.model_selection import cross_val_score
# Scorers shared by every cross-validation run below.
accuracy = make_scorer(accuracy_score)
# ROC AUC has to be computed from continuous scores (decision_function /
# predict_proba). make_scorer(roc_auc_score) would feed it hard class labels from
# predict(), which collapses the ROC curve to a single point; the built-in
# 'roc_auc' scorer uses the continuous outputs instead.
roc_auc = get_scorer('roc_auc')
f1 = make_scorer(f1_score)
#accuracy_best = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=-1, scoring=accuracy, verbose=True)
#roc_auc_best = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=-1, scoring=roc_auc, verbose=True)
#print('Лучшие точности:', accuracy_best)
#print('Лучшие AUC-ROC', roc_auc_best)

# Логистическая регрессия

In [None]:
from sklearn.linear_model import LogisticRegression
# Search grid: regularisation strength C over 1e-3 .. 1e3 and both penalty types.
grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
# The default 'lbfgs' solver does not support the l1 penalty, so every l1
# candidate in the grid would fail to fit; 'liblinear' supports both l1 and l2.
logreg = LogisticRegression(solver="liblinear")

In [None]:
# Grid-search `logreg` over `grid` using all available workers, then show the
# best estimator found.
model_s = GridSearchCV(logreg, grid, verbose=1, n_jobs=-1)
model_s.fit(X_train, y_train)
display(model_s.best_estimator_)

In [None]:
# Logistic regression with fixed hyper-parameters: fit on the full data, then
# score it with 5-fold shuffled cross-validation on accuracy, ROC AUC and F1.
logreg = LogisticRegression(C=0.01, penalty='l2')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_train)  # in-sample predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_best_1, roc_auc_best_1, f1_best_1 = (
    cross_val_score(logreg, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorer, verbose=True)
    for scorer in (accuracy, roc_auc, f1)
)
print('Точность:', accuracy_best_1, '\nAUC ROC:', roc_auc_best_1, '\nF-мера:', f1_best_1)

## SVM

In [None]:
# Baseline SVC with default hyper-parameters, fitted on the full training data.
model_2 = SVC()
model_2.fit(X_train, y_train)

In [None]:
# Tune C over five log-spaced values between 10**0 and 10**1.
# The original call np.logspace(0, 1, 5, 10) passed 10 as the fourth positional
# argument, which is `endpoint`, not `base`. Keywords state the intent explicitly;
# the generated values are unchanged.
model_s = GridSearchCV(model_2, {
    'C': np.logspace(0, 1, num=5, base=10),
}, verbose=1, n_jobs=-1)

model_s.fit(X_train, y_train)
display(model_s.best_estimator_)

In [None]:
# SVC with fixed hyper-parameters: fit on the full data, then score it with
# 5-fold shuffled cross-validation on accuracy, ROC AUC and F1.
model_svm = SVC(C=1.7782794100389228, gamma=0.12451970847350328)
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_train)  # in-sample predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_best_3, roc_auc_best_3, f1_best_3 = (
    cross_val_score(model_svm, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorer, verbose=True)
    for scorer in (accuracy, roc_auc, f1)
)
print('Точность:', accuracy_best_3, '\nAUC ROC:', roc_auc_best_3, '\nF-мера:', f1_best_3)

In [None]:
# Print the mean of the per-fold SVM scores (accuracy, ROC AUC, F1).
print('Средние метрики моделей:', '\nSVM:', '\nТочность:', np.mean(accuracy_best_3), '\nAUC ROC:', np.mean(roc_auc_best_3), '\nF-мера:', np.mean(f1_best_3))

## XGBoost + LGBM classification problem

First, we start with XGBClassifier and select its hyperparameters with GridSearchCV.

In [None]:
%%time

# Hyper-parameter search for the XGBoost classifier: 2*2*2*2*2*3 = 96 parameter
# combinations, each evaluated with 5-fold shuffled CV (seed 42). The search is
# parallelised across all cores while each model runs single-threaded.
xgbmodel = XGBClassifier(
    n_jobs=1,
    random_state=42
)

cv_xgbmodel = GridSearchCV(xgbmodel, {
        'min_child_weight': [5, 10],
        'gamma': [0.5, 5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6, 1.0],
        'max_depth': [3, 5],
        'booster': ['gbtree', 'gblinear', 'dart'],
    }, n_jobs=-1, verbose=10,
    cv=KFold(5, shuffle=True, random_state=42)
)
cv_xgbmodel.fit(X_train, y_train)
display(cv_xgbmodel.best_estimator_)


In [None]:
# XGBoost classifier with fixed hyper-parameters: fit on the full data, then
# score it with 5-fold shuffled cross-validation on accuracy, ROC AUC and F1.
model_xgb = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1.0,
    enable_categorical=False,
    gamma=0.5,
    gpu_id=-1,
    importance_type=None,
    interaction_constraints='',
    learning_rate=0.300000012,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=5,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=1,
    num_parallel_tree=1,
    predictor='auto',
    random_state=42,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.6,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_train)  # in-sample predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_best_4, roc_auc_best_4, f1_best_4 = (
    cross_val_score(model_xgb, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorer, verbose=True)
    for scorer in (accuracy, roc_auc, f1)
)
print('Точность:', accuracy_best_4, '\nAUC ROC:', roc_auc_best_4, '\nF-мера:', f1_best_4)

Initialize the model.

Fit it with train data.

In [None]:
# Rebind X_train to a plain NumPy array.
# NOTE(review): the array has no `.columns`; code that needs feature names after
# this point must take them from the DataFrame `X`, which is left untouched.
X_train = X_train.to_numpy()

In [None]:
%%time

# Hyper-parameter search for the LightGBM classifier: 2**7 = 128 parameter
# combinations, each evaluated with 5-fold shuffled CV (seed 42). The search is
# parallelised across all cores while each model runs single-threaded.
lgbmodel = LGBMClassifier(
    n_jobs=1,
    random_state=42
)

cv_lgbmodel = GridSearchCV(lgbmodel, {
        'lambda_l1': [0.001, 0.1],
        'lambda_l2': [0.001, 0.1],
        'num_leaves': [10, 50],
        'feature_fraction': [0.2, 0.8],
        'subsample': [0.2, 0.8],
        'learning_rate': [0.01, 0.05],
        'num_iterations': [500, 1000]
    }, n_jobs=-1, verbose=10,
    cv=KFold(5, shuffle=True, random_state=42)
)
cv_lgbmodel.fit(X_train, y_train)
display(cv_lgbmodel.best_estimator_)


In [None]:
# LightGBM classifier with fixed hyper-parameters: fit once on the full data
# (the original fitted the same model twice in a row), then score it with 5-fold
# shuffled cross-validation on accuracy, ROC AUC and F1.
lgbmodel = LGBMClassifier(feature_fraction=0.8, lambda_l1=0.001, lambda_l2=0.1,
               learning_rate=0.01, n_jobs=1, num_iterations=500, num_leaves=10,
               random_state=42, subsample=0.2)
lgbmodel.fit(X_train, y_train)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred = lgbmodel.predict(X_train)  # in-sample predictions
accuracy_best_5 = cross_val_score(lgbmodel, X_train, y_train, cv=cv, n_jobs=-1, scoring=accuracy, verbose=True)
roc_auc_best_5 = cross_val_score(lgbmodel, X_train, y_train, cv=cv, n_jobs=-1, scoring=roc_auc, verbose=True)
f1_best_5 = cross_val_score(lgbmodel, X_train, y_train, cv=cv, n_jobs = -1, scoring=f1, verbose = True)
print('Точность:', accuracy_best_5, '\nAUC ROC:', roc_auc_best_5, '\nF-мера:', f1_best_5)

# Decision Tree

In [None]:
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Three-step pipeline: standardise features -> PCA -> decision-tree classifier.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()
pipe = Pipeline(steps=[('std_slc', std_slc),
                           ('pca', pca),
                           ('dec_tree', dec_tree)])

In [None]:
# Candidate values for the grid search: split criterion, tree depth, and every
# possible number of PCA components (1 .. number of features).
criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]
n_components = list(range(1,X_train.shape[1]+1,1))
# NOTE(review): this rebinds `dec_tree` to a new object; a Pipeline that was built
# earlier keeps the estimator instance it was given at construction time.
dec_tree = tree.DecisionTreeClassifier()

In [None]:
# Parameter grid keyed by '<pipeline step>__<parameter>'.
parameters = dict(pca__n_components=n_components,
                      dec_tree__criterion=criterion,
                      dec_tree__max_depth=max_depth)

In [None]:
# Run the grid search over the pipeline and report the winning settings.
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train)
best_params = clf_GS.best_estimator_.get_params()
print('Best Criterion:', best_params['dec_tree__criterion'])
print('Best max_depth:', best_params['dec_tree__max_depth'])
print('Best Number Of Components:', best_params['pca__n_components'])
print()
print(best_params['dec_tree'])

In [None]:
# Decision tree with fixed hyper-parameters: fit on the full data, then score it
# with 5-fold shuffled cross-validation on accuracy, ROC AUC and F1.
dcmodel = DecisionTreeClassifier(criterion='gini', max_depth= 8)
dcmodel.fit(X_train, y_train)
y_pred = dcmodel.predict(X_train)  # in-sample predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_best_6, roc_auc_best_6, f1_best_6 = (
    cross_val_score(dcmodel, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorer, verbose=True)
    for scorer in (accuracy, roc_auc, f1)
)
print('Точность:', accuracy_best_6, '\nAUC ROC:', roc_auc_best_6, '\nF-мера:', f1_best_6)

In [None]:
# Feature names for plots. X_train has been converted to a NumPy array by this
# point and has no `.columns`, so the names come from the DataFrame `X`, which
# holds the same columns in the same order.
fn = list(X.columns)
# Class labels: "Not in the top decile", "In the top decile".
cn=['Не верхний дециль', 'Верхний дециль']

In [None]:
# Draw the fitted decision tree on a very wide canvas (400 x 100 inches) with
# the feature and class names defined above; `fig` is kept for saving.
fig = plt.figure(figsize=(400, 100)) 
tree.plot_tree(dcmodel, feature_names=fn,  class_names=cn, filled=True)

In [None]:
# Write the tree figure to a PNG file in the working directory.
fig.savefig("decistion_tree.png")

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest classifier and its search grid (2 * 3 * 5 * 2 = 60 combinations).
# NOTE(review): max_features='auto' is not accepted by recent scikit-learn
# releases — confirm against the installed version.
rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


In [None]:
# 5-fold grid search over `param_grid`; the cell output is the best parameter set.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, verbose=10)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

In [None]:
# Random forest with fixed hyper-parameters: fit on the full data, then score it
# with 5-fold shuffled cross-validation on accuracy, ROC AUC and F1.
rfcmodel = RandomForestClassifier(criterion='entropy', max_depth= 5, max_features = 'auto', n_estimators = 500)
rfcmodel.fit(X_train, y_train)
y_pred = rfcmodel.predict(X_train)  # in-sample predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_best_7, roc_auc_best_7, f1_best_7 = (
    cross_val_score(rfcmodel, X_train, y_train, cv=cv, n_jobs=-1, scoring=scorer, verbose=True)
    for scorer in (accuracy, roc_auc, f1)
)
print('Точность:', accuracy_best_7, '\nAUC ROC:', roc_auc_best_7, '\nF-мера:', f1_best_7)

In [None]:
from sklearn.inspection import permutation_importance
# Permutation importance of the fitted random forest, computed on the same data
# it was trained on (10 shuffles per feature, fixed seed).
result = permutation_importance(
    rfcmodel, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
# Mean importance per feature, labelled with the feature names in `fn`.
forest_importances = pd.Series(result.importances_mean, index=fn)

In [None]:
# Bar chart of the mean permutation importances, with the standard deviation
# across repeats as error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Важность признаков")  # title: "Feature importance"
ax.set_ylabel("Среднее убывание точности")  # y label: "Mean accuracy decrease"
fig.tight_layout()
plt.show()

In [None]:
# Summary: mean cross-validated accuracy, ROC AUC and F1 for every classifier
# (logistic regression, SVM, decision tree, random forest, XGBoost, LGBM).
print('Средние метрики моделей:','\nЛогистическая регрессия:', '\nТочность:', np.mean(accuracy_best_1), '\nAUC ROC:', np.mean(roc_auc_best_1), '\nF-мера:', np.mean(f1_best_1), '\nSVM:', '\nТочность:', np.mean(accuracy_best_3), '\nAUC ROC:', np.mean(roc_auc_best_3), '\nF-мера:', np.mean(f1_best_3), '\nДрево решений:', '\nТочность:', np.mean(accuracy_best_6), '\nAUC ROC:', np.mean(roc_auc_best_6), '\nF-мера:', np.mean(f1_best_6), '\nСлучайный лес:', '\nТочность:', np.mean(accuracy_best_7), '\nAUC ROC:', np.mean(roc_auc_best_7), '\nF-мера:', np.mean(f1_best_7),'\nXGBoost:', '\nТочность:', np.mean(accuracy_best_4), '\nAUC ROC:', np.mean(roc_auc_best_4), '\nF-мера:', np.mean(f1_best_4), '\nLGBM:', '\nТочность:', np.mean(accuracy_best_5), '\nAUC ROC:', np.mean(roc_auc_best_5), '\nF-мера:', np.mean(f1_best_5))

# Интерпретация с помощью SHAP

In [None]:
import shap
# SHAP values of the fitted XGBoost classifier for every training row.
shap_test = shap.TreeExplainer(model_xgb).shap_values(X_train)

# X_train is a NumPy array at this point, so the plot would label features as
# "Feature 0", "Feature 1", ...; the names are taken from the DataFrame `X`,
# which holds the same columns in the same order.
shap.summary_plot(shap_test, X_train,
                  feature_names=list(X.columns),
                  max_display=50)

In [None]:
# Explain the DataFrame `X` (same rows and columns as X_train, which is a bare
# NumPy array by now) so that the resulting Explanation carries feature names for
# the bar and waterfall plots.
explainer = shap.Explainer(model_xgb)
shap_values = explainer(X)

In [None]:
# Bar plot summarising the SHAP values per feature.
shap.plots.bar(shap_values)

In [None]:
### War Thunder
# Waterfall plot for row 187 (row-to-game mapping per the author's label above;
# not verifiable from this notebook alone).
shap.plots.waterfall(shap_values[187])

In [None]:
### Team Fortress 2
# Waterfall plot for row 164 (row-to-game mapping per the author's label above;
# not verifiable from this notebook alone).
shap.plots.waterfall(shap_values[164])

In [None]:
### Apex Legends
# Waterfall plot for row 10 (row-to-game mapping per the author's label above;
# not verifiable from this notebook alone).
shap.plots.waterfall(shap_values[10])

In [None]:
###  CS: GO
# Waterfall plot for row 35 (row-to-game mapping per the author's label above;
# not verifiable from this notebook alone).
shap.plots.waterfall(shap_values[35])

# Permutation Importance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the fitted XGBoost classifier on the training data.
perm = PermutationImportance(model_xgb, random_state=1).fit(X_train, y_train)
# X_train is a NumPy array at this point and has no `.columns`; the feature names
# come from the DataFrame `X`, which holds the same columns in the same order.
eli5.show_weights(perm, feature_names = X.columns.tolist())


# Мобильные игры

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
def metrics(true, pred, verbose=False):
    """Compute regression quality metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        pred: Predicted target values.
        verbose: When true, print both metrics followed by an empty line.

    Returns:
        A ``(mse, r2)`` tuple as computed by ``mean_squared_error`` and
        ``r2_score``.
    """
    scores = (mean_squared_error(true, pred), r2_score(true, pred))

    if verbose:
        print('MSE: ', scores[0])
        print('R2:  ', scores[1])
        print()

    return scores

In [None]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Load the mobile-games data set (semicolon-separated, header in the first row).
# This rebinds `df`, replacing the DataFrame used in the classification part.
df = pd.read_csv("../input/df-thesis/APPMagic.csv", sep=';', header=0)


In [None]:
# Keep the first 390 rows. The explicit copy makes `df` an independent frame, so
# adding columns to it later does not trigger pandas' chained-assignment warning.
df = df.iloc[0:390,].copy()

In [None]:
# Natural logarithm of the 'est_DAU' column, stored as a new feature.
# NOTE(review): a zero in est_DAU would give -inf here — confirm the minimum is
# positive.
df['log DAU'] = np.log(df['est_DAU'])


In [None]:
# Feature matrix for the regression task: the listed columns of `df`, joined side
# by side in this order.
feature_columns = [
    'in_app_purchases',
    'sell_ad',
    'log DAU',
    'sports',
    'strategy',
    'action',
    'rpg',
    'casino',
    'casual',
    'asian_share',
]
X_train = pd.concat([df[column] for column in feature_columns], axis = 1)

In [None]:
# Regression target: the 'ROI' column.
y_train = df['ROI']

In [None]:
# Smallest value in 'est_DAU' (presumably a sanity check for the log transform).
min(df['est_DAU'])

# SVR

In [None]:
# Baseline SVR with default hyper-parameters, fitted on the full training data.
model_2 = SVR()
model_2.fit(X_train, y_train)

In [None]:
# Tune C and gamma over five log-spaced values between 10**0 and 10**1 each.
# The original call np.logspace(0, 1, 5, 10) passed 10 as the fourth positional
# argument, which is `endpoint`, not `base`. Keywords state the intent explicitly;
# the generated values are unchanged.
model_s = GridSearchCV(model_2, {
    'C': np.logspace(0, 1, num=5, base=10),
    'gamma': np.logspace(0, 1, num=5, base=10)
}, verbose=1, n_jobs=-1)

model_s.fit(X_train, y_train)
display(model_s.best_estimator_)

In [None]:
# SVR with fixed hyper-parameters, fitted and evaluated on the same data.
# NOTE(review): the MSE / R2 printed here are in-sample; `cv` is created but not
# used in this cell.
model_svm = SVR(C=10, gamma=1)
model_svm.fit(X_train, y_train)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred = model_svm.predict(X_train)
metrics(y_train, y_pred, verbose=True)

# Random Forest Regressor

In [None]:
# Random-forest regressor and its search grid (2 * 3 * 5 * 3 = 90 combinations).
# NOTE(review): max_features='auto' is not accepted by recent scikit-learn
# releases — confirm against the installed version.
rfr=RandomForestRegressor(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['squared_error', 'absolute_error', 'poisson']
}

In [None]:
# 5-fold grid search over `param_grid` for the regressor; the cell output is the
# best parameter set. (The name CV_rfc is reused from the classification part.)
CV_rfc = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5, verbose=10)
CV_rfc.fit(X_train, y_train)
CV_rfc.best_params_

In [None]:
# Random-forest regressor with fixed hyper-parameters, fitted and evaluated on
# the same data (the printed MSE / R2 are in-sample).
# NOTE(review): max_depth=20 and n_estimators=400 are not among the values in the
# grid searched above.
rfrmodel = RandomForestRegressor(criterion='absolute_error', max_depth= 20, max_features = 'sqrt', n_estimators = 400)
rfrmodel.fit(X_train, y_train)
y_pred1 = rfrmodel.predict(X_train)
metrics(y_train, y_pred1, verbose=True)

# Decision Tree

In [None]:
# Regression pipeline: standardise features -> PCA -> decision-tree regressor.
# The regressor is created here, before the pipeline is built. Otherwise the
# pipeline would capture the DecisionTreeClassifier still bound to `dec_tree`
# from the classification part, and the regression grid search would fail.
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeRegressor()
pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('pca', pca),
                       ('dec_tree', dec_tree)])

In [None]:
# Candidate values for the regression grid search: split criterion, tree depth,
# and every possible number of PCA components (1 .. number of features).
criterion = ['squared_error', 'absolute_error', 'poisson']
max_depth = [2,4,6,8,10,12, 20]
n_components = list(range(1,X_train.shape[1]+1,1))
# NOTE(review): this (re)binds `dec_tree`; a Pipeline that was built earlier keeps
# the estimator instance it was given at construction time.
dec_tree = tree.DecisionTreeRegressor()

In [None]:
# Parameter grid keyed by '<pipeline step>__<parameter>'.
parameters = dict(pca__n_components=n_components,
                      dec_tree__criterion=criterion,
                      dec_tree__max_depth=max_depth)

In [None]:
# Run the grid search over the pipeline and report the winning settings.
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train)
best_params = clf_GS.best_estimator_.get_params()
print('Best Criterion:', best_params['dec_tree__criterion'])
print('Best max_depth:', best_params['dec_tree__max_depth'])
print('Best Number Of Components:', best_params['pca__n_components'])
print()
print(best_params['dec_tree'])

In [None]:
# Decision-tree regressor with fixed hyper-parameters, fitted and evaluated on
# the same data (the printed MSE / R2 are in-sample).
# NOTE(review): `cv` is created but not used in this cell.
dcmodel = DecisionTreeRegressor(criterion='absolute_error', max_depth= 20)
dcmodel.fit(X_train, y_train)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred2 = dcmodel.predict(X_train)
metrics(y_train, y_pred2, verbose=True)

# XGBoost/LGBM

In [None]:
%%time

# Hyper-parameter search for the XGBoost regressor: 2*2*2*2*2*3 = 96 parameter
# combinations, each evaluated with 5-fold shuffled CV (seed 42). The search is
# parallelised across all cores while each model runs single-threaded.
xgbmodel = XGBRegressor(
    n_jobs=1,
    random_state=42
)

cv_xgbmodel = GridSearchCV(xgbmodel, {
        'min_child_weight': [5, 10],
        'gamma': [0.5, 5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6, 1.0],
        'max_depth': [3, 5],
        'booster': ['gbtree', 'gblinear', 'dart'],
    }, n_jobs=-1, verbose=10,
    cv=KFold(5, shuffle=True, random_state=42)
)
cv_xgbmodel.fit(X_train, y_train)
display(cv_xgbmodel.best_estimator_)


In [None]:
# XGBoost regressor with fixed hyper-parameters, fitted and evaluated on the same
# data (the printed MSE / R2 are in-sample). This rebinds `model_xgb`, replacing
# the classifier from the first part of the notebook.
# NOTE(review): max_depth=15, min_child_weight=1 and learning_rate=0.5 are not
# among the values in the grid searched above.
model_xgb = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.6,
             enable_categorical=False, gamma=0.5, gpu_id=-1,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.5, max_delta_step=None, max_depth=15,
             min_child_weight=1, monotone_constraints=None,
             n_estimators=100, n_jobs=1, num_parallel_tree=None, predictor=None,
             random_state=42, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
             subsample=0.6, tree_method=None, validate_parameters=1,
             verbosity=None)
model_xgb.fit(X_train, y_train)
y_pred3 = model_xgb.predict(X_train)
metrics(y_train, y_pred3, verbose=True)

In [None]:
# Rebind X_train to a plain NumPy array.
# NOTE(review): the array has no `.columns`; any later code that needs feature
# names cannot read them from X_train.
X_train = X_train.to_numpy()

In [None]:
# Hyper-parameter search for the LightGBM regressor: 2**7 = 128 parameter
# combinations, each evaluated with 5-fold shuffled CV (seed 42). The search is
# parallelised across all cores while each model runs single-threaded.
lgbmodel = LGBMRegressor(
    n_jobs=1,
    random_state=42
)

cv_lgbmodel = GridSearchCV(lgbmodel, {
        'lambda_l1': [0.001, 0.1],
        'lambda_l2': [0.001, 0.1],
        'num_leaves': [10, 50],
        'feature_fraction': [0.2, 0.8],
        'subsample': [0.2, 0.8],
        'learning_rate': [0.01, 0.05],
        'num_iterations': [500, 1000]
    }, n_jobs=-1, verbose=10,
    cv=KFold(5, shuffle=True, random_state=42)
)
cv_lgbmodel.fit(X_train, y_train)
display(cv_lgbmodel.best_estimator_)


In [None]:
# LightGBM regressor with fixed hyper-parameters, fitted and evaluated on the
# same data (the printed MSE / R2 are in-sample).
# NOTE(review): learning_rate=0.8, num_iterations=5000, num_leaves=100 and
# subsample=0.5 are not among the values in the grid searched above.
model_lgb = LGBMRegressor(feature_fraction=0.2, lambda_l1=0.1, lambda_l2=0.001,
              learning_rate=0.8, n_jobs=1, num_iterations=5000, num_leaves=100,
              random_state=42, subsample=0.5)
model_lgb.fit(X_train, y_train)
y_pred4 = model_lgb.predict(X_train)
metrics(y_train, y_pred4, verbose=True)


# Интерпретация

In [None]:
import shap
# SHAP values of the fitted XGBoost regressor for every training row.
shap_test = shap.TreeExplainer(model_xgb).shap_values(X_train)

# shap.summary_plot returns None, so `fig = shap.summary_plot(...)` left `fig`
# unusable for savefig. Draw without showing, then take the current matplotlib
# figure so it can be saved afterwards.
shap.summary_plot(shap_test, X_train,
                  max_display=50, show=False)
fig = plt.gcf()

In [None]:
# Save the SHAP summary figure to a PNG file in the working directory.
# NOTE(review): `fig` must be a matplotlib Figure here; shap.summary_plot itself
# returns None by default.
fig.savefig("shap_regression.png")

In [None]:
# SHAP explanation of the XGBoost regressor over the training rows, summarised
# as a bar plot.
explainer = shap.Explainer(model_xgb)
shap_values = explainer(X_train)
shap.plots.bar(shap_values)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# X_train has been converted to a NumPy array by this point and has no
# `.columns`, so the feature names are listed explicitly, in the same order as
# the columns used to build X_train.
feature_names = [
    'in_app_purchases', 'sell_ad', 'log DAU', 'sports', 'strategy',
    'action', 'rpg', 'casino', 'casual', 'asian_share',
]

# Permutation importance of the fitted XGBoost regressor on the training data.
perm = PermutationImportance(model_xgb, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm, feature_names = feature_names)