In [57]:
import pickle
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
from sklearn.compose import ColumnTransformer
from math import sqrt
# models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression

In [58]:
# Load the pre-processed dataset (the file name encodes rows x columns).
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# The context manager closes the file handle, which the bare open() never did.
with open('../data/dataset_{}_{}.pickle'.format(25951,36),'rb') as fw:
    df = pickle.load(fw)
df.head()

Unnamed: 0,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,total_ratings,ratings_ratio,...,dexion_games,laush_dmitriy_sergeevich,"nikita_""ghost_rus""",ripknot_systems,big_fish_games,sekai_project,square_enix,strategy_first,thq_nordic,ubisoft
0,0,0,124534,3339,17612,317,>5000000,7.19,127873,0.973888,...,0,0,0,0,0,0,0,0,0,0
1,0,0,3318,633,277,62,>5000000,3.99,3951,0.839787,...,0,0,0,0,0,0,0,0,0,0
2,0,0,3416,398,187,34,>5000000,3.99,3814,0.895648,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1273,267,258,184,>5000000,3.99,1540,0.826623,...,0,0,0,0,0,0,0,0,0,0
4,0,0,5250,288,624,415,>5000000,3.99,5538,0.947996,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Baseline accuracy: the share of the most frequent `owners` class, i.e. the
# score obtained by always predicting the majority class. It is derived from
# the data instead of hand-copied counts so it stays correct if the dataset changes.
owners_counts = df['owners'].value_counts()
owners_counts.max() / owners_counts.sum()

0.6828638588108358

In [60]:
# Feature groups and encoders used to build the preprocessing ColumnTransformer.
# NOTE(review): `ordinal_ftrs` is not referenced by any other visible cell.
ordinal_ftrs = ['owners', 'required_age']
# Category order handed to the OrdinalEncoder for `required_age`.
ordinal_required_age = [[0, 3, 7, 12, 16, 18]]
# Category order for the `owners` buckets (the prediction target).
ordinal_owners = [['0-20000', '20000-50000', '50000-100000', '100000-200000', 
                   '200000-500000', '500000-1000000', '1000000-2000000', '2000000-5000000', '>5000000']]
# NOTE(review): 'free' sits between 'moderate' and 'expensive' -- confirm this
# is the intended ordinal order for `price_categories`.
ordinal_price_cats = [['very cheap', 'cheap', 'moderate', 'free', 'expensive', 'very expensive']]
# Numeric columns that get standardized.
std_ftrs = ['positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime', 'price', 'total_ratings', 'ratings_ratio', 'achievements']
# Despite its name, `clf` is the preprocessing transformer, not a classifier.
# Columns that are not listed below are passed through unchanged.
# NOTE(review): `onehot_ftrs` is not defined in any visible cell; it must exist
# before this cell runs on a fresh kernel.
# NOTE(review): recent scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- confirm against the installed version.
clf = ColumnTransformer(
    transformers=[
        ('std', StandardScaler(), std_ftrs),
        ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_ftrs),
        ('ord_age', OrdinalEncoder(categories=ordinal_required_age), ['required_age']),
        ('ord_price_cats', OrdinalEncoder(categories=ordinal_price_cats), ['price_categories'])], remainder='passthrough')
# Encoder for the target column; the explicit category list fixes the mapping.
target_transformer = OrdinalEncoder(categories=ordinal_owners)
print(target_transformer.categories)

[['0-20000', '20000-50000', '50000-100000', '100000-200000', '200000-500000', '500000-1000000', '1000000-2000000', '2000000-5000000', '>5000000']]


In [83]:
def MLpipe_KFold(X,y,preprocessor,ML_algo,param_grid):
    '''
    Tune and evaluate a classifier over 10 random states.

    For each random state the data is split into other/test (90/10, stratified
    on y); a 5-fold StratifiedKFold grid search over `param_grid` is run on the
    "other" part, maximizing the weighted F1 score; the refitted best pipeline
    is then scored on the held-out test part.

    Parameters:
        X: feature DataFrame.
        y: single-column DataFrame holding the target. It is encoded with the
           module-level `target_transformer`.
        preprocessor: transformer placed in front of the model in the pipeline.
        ML_algo: unfitted scikit-learn classifier.
        param_grid: GridSearchCV grid; keys use the make_pipeline step prefix
           (e.g. 'svc__C').

    Returns:
        (best_models, test_scores): per random state, the best parameter dict
        and the weighted F1 score on the test set.
    '''

    test_scores = []
    best_models = []
    for i in range(10):
        random_state = 42*i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state = random_state)

        # Encode the target as ordinal codes. The encoder is fitted on the
        # training labels only and merely applied to the test labels.
        y_other_prep = target_transformer.fit_transform(y_other)
        y_test_prep = target_transformer.transform(y_test)

        kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
        # Preprocessing lives inside the pipeline so it is re-fitted per CV fold.
        pipe = make_pipeline(preprocessor, ML_algo)
        grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1_weighted', cv=kf, return_train_score=True)

        # Evaluate every hyperparameter combination, then refit the best one.
        grid.fit(X_other, y_other_prep.ravel())

        # grid.score uses the same weighted-F1 scorer on the held-out test set.
        test_scores.append(grid.score(X_test, y_test_prep.ravel()))
        best_models.append(grid.best_params_)

    return best_models, test_scores

In [86]:
## for debug use
# Quick smoke test of MLpipe_KFold with a small logistic-regression grid.
owners_mask = df.columns == 'owners'
y = df.loc[:, owners_mask]
X = df.loc[:, ~owners_mask]

ML_algo = LogisticRegression()
param_grid = {
    'logisticregression__C': [1e-2, 1e-1, 1, 1e1],
    'logisticregression__multi_class': ['multinomial'],
    'logisticregression__max_iter': [10000],
}

models, scores = MLpipe_KFold(X, y, clf, ML_algo, param_grid)
print(scores)

KeyboardInterrupt: 

In [87]:
# Train every scikit-learn classifier with MLpipe_KFold and collect, per
# algorithm, the best parameters and the (mean, std) of the test scores.
owners_mask = df.columns == 'owners'
y = df.loc[:, owners_mask]
X = df.loc[:, ~owners_mask]

algos = {
    'SVC': SVC(),
    'KNC': KNeighborsClassifier(),
    'RFC': RandomForestClassifier(),
    'l2C': RidgeClassifier(),
    'SGDC': SGDClassifier(),
    'LOGR': LogisticRegression(),
}
# Hyperparameter grids; keys carry the step name that make_pipeline generates.
params = {
    'SVC': {'svc__C': [1e-2, 1e-1, 1, 1e1, 1e2, 1e3]},
    'KNC': {'kneighborsclassifier__n_neighbors': [1, 10, 50, 100]},
    'RFC': {
        'randomforestclassifier__max_depth': [5, 10, 30, 50],
        'randomforestclassifier__max_features': [0.5, 0.75, 1.0],
    },
    'l2C': {'ridgeclassifier__alpha': [1e-3, 1e-2, 1, 1e1, 1e2, 1e3]},
    'SGDC': {
        'sgdclassifier__alpha': [1e-3, 1e-2, 1, 1e1, 1e2, 1e3],
        'sgdclassifier__l1_ratio': np.linspace(1e-1, 1, 4),
        'sgdclassifier__penalty': ['elasticnet'],
    },
    'LOGR': {
        'logisticregression__C': [1e-2, 1e-1, 1, 1e1, 1e2, 1e3],
        'logisticregression__multi_class': ['ovr', 'multinomial'],
        'logisticregression__max_iter': [10000],
    },
}

models_dict = {}
scores_dict = {}
for algo, estimator in algos.items():
    print("{} start".format(algo))
    start = time.time()
    models, scores = MLpipe_KFold(X, y, clf, estimator, params[algo])
    print("{} ends: {} seconds".format(algo,time.time()-start))
    models_dict[algo] = models
    scores_dict[algo] = (np.mean(scores), np.std(scores))

# Ascending rankings of (name, (mean, std)) pairs.
rank_by_mean = sorted(scores_dict.items(), key=lambda entry: entry[1][0])
rank_by_std = sorted(scores_dict.items(), key=lambda entry: entry[1][1])

NameError: name 'error' is not defined

In [85]:
# Report the algorithms in ascending order of mean test score and of score
# spread. The parameters shown are the best ones from the first random state.
rank_by_mean = sorted(scores_dict.items(), key=lambda entry: entry[1][0])
rank_by_std = sorted(scores_dict.items(), key=lambda entry: entry[1][1])
for name, (mean_score, _) in rank_by_mean:
    print(name, ": ", mean_score, "Parameters: ", models_dict[name][0])
print("-----------------rank std--------------------")
for name, (_, std_score) in rank_by_std:
    print(name, ": ", std_score, "Parameters: ", models_dict[name][0])

SGDC :  0.47522658419286695 Parameters:  {'sgdclassifier__alpha': 0.001, 'sgdclassifier__l1_ratio': 0.4, 'sgdclassifier__penalty': 'elasticnet'}
SVC :  0.5583755482383392 Parameters:  {'svc__C': 1000.0}
l2C :  0.5888060847437351 Parameters:  {'ridgeclassifier__alpha': 0.001}
KNC :  0.645610426859771 Parameters:  {'kneighborsclassifier__n_neighbors': 10}
RFC :  0.7972586984669783 Parameters:  {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_features': 1.0}
-----------------rank std--------------------
SVC :  0.0009534800761049822 Parameters:  {'svc__C': 1000.0}
l2C :  0.0024069844877016105 Parameters:  {'ridgeclassifier__alpha': 0.001}
RFC :  0.002908451940099242 Parameters:  {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_features': 1.0}
KNC :  0.005723153720786376 Parameters:  {'kneighborsclassifier__n_neighbors': 10}
SGDC :  0.15182797772193152 Parameters:  {'sgdclassifier__alpha': 0.001, 'sgdclassifier__l1_ratio': 0.4, 'sgdclassifier__pen

## Resample the Dataset to Balance Classes

In [None]:
def balance_dataset(df, col, ratio=0.5, balance_method='avg', random_state = 42):
    """Return a resampled copy of ``df`` with more balanced classes in ``col``.

    Parameters:
        df: input DataFrame; it is not modified.
        col: name of the class column to balance on.
        ratio: threshold used by 'upward' and 'downward' (ignored by 'avg').
        balance_method: one of
            'avg'      - every class is sampled to int(len(df) / n_classes)
                         rows, with replacement only when the class is smaller.
            'upward'   - every class whose size is below ``ratio`` times the
                         largest class gets extra rows drawn with replacement
                         until it reaches roughly that size; other classes
                         are kept unchanged.
            'downward' - every class for which smallest/size < ``ratio`` is
                         sampled down (without replacement) to the size of the
                         smallest class; other classes are kept unchanged.
        random_state: seed passed to every ``DataFrame.sample`` call.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, or None when
        ``balance_method`` is not one of the supported names.
    """
    if balance_method not in ('avg', 'upward', 'downward'):
        # Unknown strategy: keep the historical contract of returning None.
        return None

    vc = df[col].value_counts()  # sorted from largest to smallest class
    if vc.empty:
        # Nothing to balance; avoids dividing by zero / indexing an empty index.
        return df.iloc[0:0].copy()

    # Collect the pieces and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame inside a loop is quadratic.
    pieces = []
    if balance_method == 'avg':
        sample_size = int(df.shape[0] / len(vc))
        for category, count in vc.items():
            rows = df[df[col] == category]
            pieces.append(rows.sample(n=sample_size, replace=bool(count < sample_size), random_state=random_state))
    elif balance_method == 'upward':
        highest_num = vc.iloc[0]
        pieces.append(df[df[col] == vc.index[0]])
        for category, count in vc.iloc[1:].items():
            rows = df[df[col] == category]
            pieces.append(rows)
            sample_ratio = count / highest_num
            if sample_ratio < ratio:
                extra = int((ratio - sample_ratio) * highest_num)
                pieces.append(rows.sample(n=extra, replace=True, random_state=random_state))
    else:  # 'downward'
        lowest_num = int(vc.iloc[-1])
        pieces.append(df[df[col] == vc.index[-1]])
        for category, count in vc.iloc[:-1].items():
            rows = df[df[col] == category]
            if lowest_num / count < ratio:
                # Use the exact smallest-class size; int(lowest/count*count)
                # could truncate to one row fewer through float rounding.
                pieces.append(rows.sample(n=lowest_num, replace=False, random_state=random_state))
            else:
                # Already balanced enough: keep the class instead of dropping it.
                pieces.append(rows)
    return pd.concat(pieces, ignore_index=True)

In [None]:
def MLpipe_KFold_with_resample(X,y,preprocessor,ML_algo,param_grid, resample='avg'):
    '''
    Tune and evaluate a classifier over 10 random states on a class-balanced
    training set.

    For each random state the data is split into other/test (90/10, stratified
    on y); the "other" part is re-balanced with `balance_dataset` using the
    `resample` strategy; a 5-fold StratifiedKFold grid search over `param_grid`
    is run on the balanced data, maximizing the weighted F1 score; the
    refitted best pipeline is then scored on the untouched test part.

    Parameters:
        X: feature DataFrame.
        y: single-column DataFrame with the 'owners' target. It is encoded
           with the module-level `target_transformer`.
        preprocessor: transformer placed in front of the model in the pipeline.
        ML_algo: unfitted scikit-learn classifier.
        param_grid: GridSearchCV grid; keys use the make_pipeline step prefix.
        resample: `balance_method` forwarded to `balance_dataset`.

    Returns:
        (best_models, test_scores): per random state, the best parameter dict
        and the weighted F1 score on the test set.

    Raises:
        ValueError: if `balance_dataset` does not recognize `resample`.

    NOTE(review): resampling happens before cross-validation, so rows drawn
    with replacement can land in both the training and validation folds and
    make validation scores optimistic.
    '''
    target_col = 'owners'
    test_scores = []
    best_models = []
    for i in range(10):
        random_state = 42*i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state = random_state)

        # Re-attach the target so features and labels are resampled together.
        # assign() builds a new frame instead of writing into the split slice.
        train_df = X_other.assign(**{target_col: y_other[target_col]})
        balanced = balance_dataset(train_df, target_col, balance_method=resample)
        if balanced is None:
            raise ValueError("unknown resample method: {!r}".format(resample))
        y_other = balanced.loc[:, balanced.columns == target_col]
        X_other = balanced.loc[:, balanced.columns != target_col]

        # The encoder is fitted on the training labels only and merely applied
        # to the test labels.
        y_other_prep = target_transformer.fit_transform(y_other)
        y_test_prep = target_transformer.transform(y_test)

        kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
        # Preprocessing lives inside the pipeline so it is re-fitted per CV fold.
        pipe = make_pipeline(preprocessor, ML_algo)
        grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1_weighted', cv=kf, return_train_score=True)

        # Evaluate every hyperparameter combination, then refit the best one.
        grid.fit(X_other, y_other_prep.ravel())

        # grid.score uses the same weighted-F1 scorer on the held-out test set.
        test_scores.append(grid.score(X_test, y_test_prep.ravel()))
        best_models.append(grid.best_params_)

    return best_models, test_scores

In [None]:
## for debug use
# Quick smoke test of MLpipe_KFold_with_resample with a tiny grid.
owners_mask = df.columns == 'owners'
y = df.loc[:, owners_mask]
X = df.loc[:, ~owners_mask]

ML_algo = LogisticRegression()
param_grid = {
    'logisticregression__C': [1e1],
    'logisticregression__multi_class': ['ovr', 'multinomial'],
    'logisticregression__max_iter': [10000],
}

models, scores = MLpipe_KFold_with_resample(X, y, clf, ML_algo, param_grid)
print(scores)

In [None]:
# Train every scikit-learn classifier on the resampled training data and
# collect, per algorithm, the best parameters and the (mean, std) test score.
y = df.loc[:, df.columns == 'owners']
X = df.loc[:, df.columns != 'owners']
algos = {
    'SVC': SVC(),
    'KNC': KNeighborsClassifier(),
    'RFC': RandomForestClassifier(),
    'l2C': RidgeClassifier(),
    'SGDC': SGDClassifier(),
    'LOGR': LogisticRegression()
}
# Hyperparameter grids; keys carry the step name that make_pipeline generates.
params = {
    'SVC': {'svc__C': [1e-2,1e-1,1,1e1,1e2,1e3]},
    'KNC': {'kneighborsclassifier__n_neighbors': [1,10,50,100]},
    'RFC': {'randomforestclassifier__max_depth': [5,10,30,50], 'randomforestclassifier__max_features': [0.5,0.75,1.0]},
    'l2C': {'ridgeclassifier__alpha': [1e-3,1e-2,1,1e1,1e2,1e3]},
    'SGDC': {'sgdclassifier__alpha': [1e-3,1e-2,1,1e1,1e2,1e3], 'sgdclassifier__l1_ratio': np.linspace(1e-1, 1, 4) ,'sgdclassifier__penalty': ['elasticnet']},
    'LOGR':  {'logisticregression__C': [1e-2,1e-1,1,1e1,1e2,1e3], 'logisticregression__multi_class': ['ovr', 'multinomial'], 'logisticregression__max_iter': [10000]}
}
models_dict_resample = {}
scores_dict_resample = {}
for algo in algos:
    print("{} start".format(algo))
    start = time.time()
    # Use the resampling variant; calling the plain MLpipe_KFold here would
    # just repeat the un-resampled benchmark under a "resample" label.
    models, scores = MLpipe_KFold_with_resample(X,y,clf,algos[algo],params[algo])
    print("{} ends: {} seconds".format(algo, time.time()-start))
    models_dict_resample[algo] = models
    scores_dict_resample[algo] = (np.mean(scores), np.std(scores))
rank_by_mean_resample = list((k, v) for k, v in sorted(scores_dict_resample.items(), key=lambda item: item[1][0], reverse=False))
rank_by_std_resample = list((k, v) for k, v in sorted(scores_dict_resample.items(), key=lambda item: item[1][1], reverse=False))

# Print the parameters found in THIS run (models_dict_resample), not those of
# the earlier un-resampled benchmark.
for i in rank_by_mean_resample:
    print(i[0], ": ", i[1][0], "Parameters: ", models_dict_resample[i[0]][0])
print("-----------------rank std--------------------")
for i in rank_by_std_resample:
    print(i[0], ": ", i[1][1], "Parameters: ", models_dict_resample[i[0]][0])

## XGBoost

In [70]:
def XGB_KFold(X, y, preprocessor, params, algo):
    """Cross-validate an XGBoost classifier over a parameter grid.

    The data is split into other/test (90/10, stratified on y) and a 5-fold
    StratifiedKFold is applied to the "other" part. Within each fold the
    preprocessor is fitted on the training rows only, and every combination
    in ``params`` is trained with early stopping on the validation fold.

    Parameters:
        X: feature DataFrame.
        y: class labels (Series or single-column DataFrame).
        preprocessor: transformer fitted on each training fold.
        params: parameter grid accepted by ``ParameterGrid``. It must contain
            'max_depth'; results are keyed by that value, so the grid is
            expected to vary only ``max_depth``.
        algo: template estimator that is cloned for every fit; ``None`` falls
            back to a default ``xgboost.XGBClassifier``.

    Returns:
        (train_scores, validation_scores, test_scores): three lists with one
        ``{max_depth: accuracy}`` dict per fold. ``test_scores`` is also
        printed.
    """
    from sklearn.base import clone

    # Tolerate the legacy positional call order (..., estimator, grid).
    if isinstance(algo, (dict, list)) and not isinstance(params, (dict, list)):
        params, algo = algo, params
    base_model = xgboost.XGBClassifier() if algo is None else algo

    # Fit the label encoder once on all labels so the class -> integer mapping
    # is identical for the train, validation and test parts.
    le = LabelEncoder().fit(np.ravel(y))

    validation_scores = []
    test_scores = []
    train_scores = []
    for i in range(1):
        random_state = 42 * i
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state = random_state)
        y_test_prep = le.transform(np.ravel(y_test))
        kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
        for train_index, val_index in kf.split(X_other, y_other):
            X_train = X_other.iloc[train_index]
            y_train = y_other.iloc[train_index]
            X_cv = X_other.iloc[val_index]
            y_cv = y_other.iloc[val_index]

            # Fit the preprocessing on the training fold only.
            X_train_prep = preprocessor.fit_transform(X_train)
            X_cv_prep = preprocessor.transform(X_cv)
            X_test_prep = preprocessor.transform(X_test)

            y_train_prep = le.transform(np.ravel(y_train))
            y_cv_prep = le.transform(np.ravel(y_cv))

            test_dict = {}
            train_dict = {}
            vali_dict = {}
            for combo in ParameterGrid(params):
                model = clone(base_model)
                model.set_params(**combo)
                model.fit(X_train_prep,y_train_prep,early_stopping_rounds=50,eval_set=[(X_cv_prep, y_cv_prep)], verbose=False)
                md = combo['max_depth']
                train_dict[md] = accuracy_score(y_train_prep, model.predict(X_train_prep))
                vali_dict[md] = accuracy_score(y_cv_prep, model.predict(X_cv_prep))
                test_dict[md] = accuracy_score(y_test_prep, model.predict(X_test_prep))
            # One entry per fold (not one per parameter combination).
            validation_scores.append(vali_dict)
            test_scores.append(test_dict)
            train_scores.append(train_dict)
    print(test_scores)
    return train_scores, validation_scores, test_scores

In [71]:
# xgboost training
y = df['owners']
X = df.loc[:, df.columns != 'owners']
XGB = xgboost.XGBClassifier()
param_grid = {"learning_rate": [0.03],
              "n_estimators": [10000],
              "seed": [0],
#               "reg_alpha": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
#               "reg_lambda": [0e0, 1e-2, 1e-1, 1e0, 1e1, 1e2],
              "missing": [np.nan], 
              "max_depth": [1,3,10,30,100],
              "colsample_bytree": [0.9],              
              "subsample": [0.66]}
# Pass the grid and the estimator by keyword: the signature is
# XGB_KFold(X, y, preprocessor, params, algo), so the former positional call
# handed them over in the wrong order.
xgb_scores = XGB_KFold(X, y, clf, params=param_grid, algo=XGB)
# When the helper hands back (train, validation, test) score lists, expose them
# at module level: the plotting cell reads train_scores / validation_scores.
if xgb_scores is not None:
    train_scores, validation_scores, test_scores = xgb_scores

[{1: 0.7407550077041603}, {1: 0.734206471494607}, {1: 0.7318952234206472}, {1: 0.7604006163328197}, {1: 0.734206471494607}]


In [None]:
# Summarize train/validation accuracy per max_depth across the CV folds and
# plot the mean and the spread side by side.
# NOTE(review): `plt` (matplotlib.pyplot) is not imported in any visible cell,
# and `train_scores` / `validation_scores` must exist at module level.
depths = param_grid['max_depth']
train_mean = {d: np.mean([fold[d] for fold in train_scores]) for d in depths}
train_std = {d: np.std([fold[d] for fold in train_scores]) for d in depths}
vali_mean = {d: np.mean([fold[d] for fold in validation_scores]) for d in depths}
vali_std = {d: np.std([fold[d] for fold in validation_scores]) for d in depths}

fig, (ax1,ax2) = plt.subplots(1,2, figsize=(16,6))
panels = (
    (ax1, 'mean', 'mean accuracy across folds', train_mean, vali_mean),
    (ax2, 'std', 'std of accuracy across folds', train_std, vali_std),
)
for ax, title, ylabel, train_stat, vali_stat in panels:
    ax.plot(list(train_stat.keys()), list(train_stat.values()), label='train')
    ax.plot(list(vali_stat.keys()), list(vali_stat.values()), label='validation')
    # Label the axes so each panel can be read on its own.
    ax.set_xlabel('max_depth')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.set_title(title)
plt.show()