In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score

In [2]:
clean_data = pd.read_csv('data/clean_data.csv')
clean_data_gender = pd.read_csv('data/clean_data_gender.csv')
clean_data_alignment = pd.read_csv('data/clean_data_alignment.csv')
clean_data_publisher = pd.read_csv('data/clean_data_publisher.csv')
clean_data_grouped = pd.read_csv('data/clean_data_grouped.csv')
clean_data_grouped_gender = pd.read_csv('data/clean_data_grouped_gender.csv')
clean_data_grouped_alignment = pd.read_csv('data/clean_data_grouped_alignment.csv')
clean_data_grouped_publisher = pd.read_csv('data/clean_data_grouped_publisher.csv')

In [3]:
def get_scoring(average):
    f1 = make_scorer(f1_score, average=average)
    precision = make_scorer(precision_score, average=average)
    recall = make_scorer(recall_score, average=average)
    scoring = {'accuracy': 'accuracy',
           'precision': precision,
           'recall': recall, 
           'f1': f1}
    return scoring

In [4]:
def nested_cross_validation(data, label, clf, grid, average='binary', scalar=True):
    y = data[label]
    X = data.drop(label, axis=1)

    scoring = get_scoring(average)
    f1 = make_scorer(f1_score, average=average)
    
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

    clf = GridSearchCV(estimator=clf, param_grid=grid, cv=inner_cv, scoring=f1, iid=True)
    
    if scalar == True:
        scalar = StandardScaler()
        pipeline = Pipeline([('scalar', scalar), ('clf', clf)])
    else:
        pipeline = Pipeline([('clf', clf)])
    
    nested_scores = cross_validate(estimator=pipeline, X=X, y=y, cv=outer_cv, scoring=scoring)
    return {'accuracy':[nested_scores['test_accuracy'].mean()], 
            'precision': [nested_scores['test_precision'].mean()],
            'recall': [nested_scores['test_recall'].mean()],
            'f1': [nested_scores['test_f1'].mean()],
            'f1_std': [nested_scores['test_accuracy'].std()]}

In [5]:
def cross_validation(data, label, clf, average='binary', scalar=True):
    y = data[label]
    X = data.drop(label, axis=1)

    scoring = get_scoring(average)
    
    if scalar == True:
        scalar = StandardScaler()
        pipeline = Pipeline([('scalar', scalar), ('clf', clf)])
    else:
        pipeline = Pipeline([('clf', clf)])

    scores = cross_validate(estimator=pipeline, X=X, y=y, cv=4, scoring=scoring)
    return {'accuracy':[scores['test_accuracy'].mean()], 
            'precision': [scores['test_precision'].mean()],
            'recall': [scores['test_recall'].mean()],
            'f1': [scores['test_f1'].mean()],
            'f1_std': [scores['test_accuracy'].std()]}

In [6]:
forest_grid = {'max_depth':[2,5,10,15,None]}
forest = RandomForestClassifier(n_estimators=10, random_state=1)

In [7]:
knn_grid = {'n_neighbors':[1, 3, 5, 7, 11]}
knn = KNeighborsClassifier()

In [8]:
logistic_grid = {'C': [0.1, 1]}
logistic = LogisticRegression(penalty='l1', solver='liblinear')

In [9]:
bayes = GaussianNB()

In [10]:
nested_scores = nested_cross_validation(clean_data, "Super Strength", forest, forest_grid)
nested_scores

{'accuracy': [0.8406432748538011],
 'precision': [0.8704726921019056],
 'recall': [0.8306451612903225],
 'f1': [0.8498661127622071],
 'f1_std': [0.025947718346928144]}

In [11]:
nested_scores = nested_cross_validation(clean_data_publisher, "Publisher", knn, knn_grid, average='micro')
nested_scores

{'accuracy': [0.5363685526260404],
 'precision': [0.5363685526260404],
 'recall': [0.5363685526260404],
 'f1': [0.5363685526260404],
 'f1_std': [0.04445321360643852]}

Interessante como todos os valores para publisher e alignment fica igual entre as metricas quando usado micro, da uma olhada no porque depois. Olhar quantos avisos voce recebe com average macro, talvez tenha que juntar mais valores

In [12]:
nested_scores_strength = nested_cross_validation(clean_data_grouped, "Super Strength", logistic, logistic_grid,  scalar=False)
nested_scores_strength

{'accuracy': [0.8435672514619883],
 'precision': [0.8514251437795732],
 'recall': [0.8629032258064516],
 'f1': [0.8570026787234732],
 'f1_std': [0.02424433326283185]}

In [13]:
nested_scores_bayes = cross_validation(clean_data_grouped, "Super Strength", bayes, scalar=False)
nested_scores_bayes

{'accuracy': [0.6608187134502924],
 'precision': [0.7890670011955174],
 'recall': [0.5134408602150538],
 'f1': [0.6217216596201706],
 'f1_std': [0.036285595456113595]}

In [14]:
def append(dict1, dict2):
    if bool(dict1):
        for key in dict1:
            dict1[key].append(dict2[key][0])
        return dict1
    else:
        return dict2.copy()

result = append({}, nested_scores_strength)
result = append(result, nested_scores_bayes)
append(result, nested_scores)

{'accuracy': [0.8435672514619883, 0.6608187134502924, 0.5363685526260404],
 'precision': [0.8514251437795732, 0.7890670011955174, 0.5363685526260404],
 'recall': [0.8629032258064516, 0.5134408602150538, 0.5363685526260404],
 'f1': [0.8570026787234732, 0.6217216596201706, 0.5363685526260404],
 'f1_std': [0.02424433326283185, 0.036285595456113595, 0.04445321360643852]}

In [21]:
def pred_model(data, data_grouped, label, model, grid, scores, average):
    score = cross_validation(data, label, model, average=average)
    scores = append(scores, score)
    score = nested_cross_validation(data, label, model, grid, average=average)  
    scores = append(scores, score)
    score = cross_validation(data_grouped, label, model, average=average, scalar=False)
    scores = append(scores, score)
    score = nested_cross_validation(data_grouped, label, model, grid, average=average, scalar=False)
    scores = append(scores, score)
    return scores

In [22]:
def pred(data, data_grouped, label, average='binary'):
    scores = {}
    
    scores = pred_model(data, data_grouped, label, forest, forest_grid, scores, average)

    scores = pred_model(data, data_grouped, label, knn, knn_grid, scores, average)

    scores = pred_model(data, data_grouped, label, logistic, logistic_grid, scores, average)

    
    score = cross_validation(data, label, bayes, average=average)
    scores = append(scores, score)
    score = cross_validation(data_grouped, label, bayes, average=average, scalar=False)
    scores = append(scores, score)
    
    return scores

### Super Strength

In [24]:
index = ['RandomForest_default','RandomForest', 'RandomForest_agrupado_default', 'RandomForest_agrupado', 
         'knn_default', 'knn', 'knn_agrupado_default', 'knn_agrupado', 
         'logistic_default', 'logistic', 'logistic_agrupado_default', 'logistic_agrupado',
         'bayes', 'bayes_agrupado']
scores = pred(clean_data, clean_data_grouped, 'Super Strength')
strength = pd.DataFrame(scores, index=index)
strength

Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.820175,0.856072,0.803763,0.828443,0.041839
RandomForest,0.840643,0.870473,0.830645,0.849866,0.025948
RandomForest_agrupado_default,0.840643,0.868573,0.833333,0.849246,0.042448
RandomForest_agrupado,0.839181,0.867885,0.830645,0.848685,0.026316
knn_default,0.70614,0.775242,0.645161,0.703672,0.055228
knn,0.703216,0.78487,0.626344,0.696211,0.026918
knn_agrupado_default,0.795322,0.931498,0.674731,0.779486,0.037673
knn_agrupado,0.770468,0.886178,0.663978,0.757746,0.01956
logistic_default,0.815789,0.851822,0.801075,0.825072,0.015746
logistic,0.845029,0.88007,0.827957,0.852843,0.018723


### Accelerated Healing

In [25]:
scores = pred(clean_data, clean_data_grouped, 'Accelerated Healing')
healing = pd.DataFrame(scores, index=index)
healing

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.786581,0.69958,0.356499,0.463991,0.027379
RandomForest,0.791018,0.684817,0.444149,0.527617,0.026361
RandomForest_agrupado_default,0.785067,0.706208,0.356267,0.464115,0.015362
RandomForest_agrupado,0.792292,0.684117,0.431776,0.527574,0.032959
knn_default,0.757374,0.744792,0.15148,0.244949,0.018377
knn,0.754535,0.554993,0.406337,0.466209,0.037952
knn_agrupado_default,0.785161,0.735714,0.319265,0.429399,0.028891
knn_agrupado,0.783605,0.650972,0.453631,0.529267,0.00878
logistic_default,0.742711,0.526906,0.497456,0.509493,0.032649
logistic,0.767582,0.588088,0.481267,0.527302,0.013823


### Stamina

In [26]:
scores = pred(clean_data, clean_data_grouped, 'Stamina')
stamina = pd.DataFrame(scores, index=index)
stamina

Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.783653,0.790609,0.695395,0.739335,0.006157
RandomForest,0.803933,0.800556,0.748246,0.771604,0.028667
RandomForest_agrupado_default,0.776436,0.787758,0.682544,0.72827,0.030918
RandomForest_agrupado,0.786542,0.774714,0.735088,0.752162,0.014592
knn_default,0.732558,0.809063,0.516754,0.629651,0.028046
knn,0.692852,0.692402,0.549518,0.612101,0.022649
knn_agrupado_default,0.779258,0.826665,0.63943,0.717325,0.016906
knn_agrupado,0.777821,0.809827,0.655702,0.722455,0.020905
logistic_default,0.735294,0.711918,0.67886,0.693902,0.020182
logistic,0.798187,0.778803,0.771623,0.772934,0.03854


### Flight

In [27]:
scores = pred(clean_data, clean_data_grouped, 'Flight')
stamina = pd.DataFrame(scores, index=index)
stamina

Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.774927,0.742073,0.451852,0.559555,0.032594
RandomForest,0.801132,0.826363,0.483923,0.601496,0.040078
RandomForest_agrupado_default,0.768976,0.758396,0.419949,0.537096,0.009435
RandomForest_agrupado,0.782109,0.737064,0.506902,0.598521,0.032707
knn_default,0.714967,0.661433,0.237542,0.344228,0.015225
knn,0.688437,0.538317,0.34697,0.41819,0.039283
knn_agrupado_default,0.738376,0.724432,0.287795,0.410113,0.021848
knn_agrupado,0.719208,0.584836,0.4383,0.500689,0.026035
logistic_default,0.745618,0.606146,0.57963,0.591406,0.023503
logistic,0.801209,0.754082,0.56633,0.645776,0.028703


### Gender

In [28]:
scores = pred(clean_data_gender, clean_data_grouped_gender, 'Gender')
gender = pd.DataFrame(scores, index=index)
gender

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.802164,0.733333,0.455051,0.560337,0.015073
RandomForest,0.820875,0.834199,0.444066,0.576513,0.020777
RandomForest_agrupado_default,0.713412,0.462946,0.18548,0.255347,0.00693
RandomForest_agrupado,0.718022,0.468847,0.184848,0.264071,0.020964
knn_default,0.691566,0.376187,0.168561,0.232391,0.013364
knn,0.621419,0.332115,0.353535,0.342167,0.034821
knn_agrupado_default,0.70099,0.434405,0.241793,0.309474,0.022551
knn_agrupado,0.66973,0.396989,0.404167,0.399099,0.029519
logistic_default,0.688461,0.433261,0.410354,0.420534,0.013497
logistic,0.703979,0.464788,0.421086,0.440948,0.022729


### Alignment

In [30]:
scores = pred(clean_data_alignment, clean_data_grouped_alignment, 'Alignment', average='micro')
alignment = pd.DataFrame(scores, index=index)
alignment





Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.652338,0.652338,0.652338,0.652338,0.036744
RandomForest,0.663085,0.663085,0.663085,0.663085,0.001716
RandomForest_agrupado_default,0.624608,0.624608,0.624608,0.624608,0.051377
RandomForest_agrupado,0.653901,0.653901,0.653901,0.653901,0.010125
knn_default,0.647765,0.647765,0.647765,0.647765,0.025249
knn,0.646241,0.646241,0.646241,0.646241,0.013266
knn_agrupado_default,0.624674,0.624674,0.624674,0.624674,0.034141
knn_agrupado,0.64467,0.64467,0.64467,0.64467,0.011116
logistic_default,0.614107,0.614107,0.614107,0.614107,0.014999
logistic,0.690523,0.690523,0.690523,0.690523,0.0318


### Publisher

In [32]:
scores = pred(clean_data_publisher, clean_data_grouped_publisher, 'Publisher', average='micro')
publisher = pd.DataFrame(scores, index=index)
publisher





Unnamed: 0,accuracy,precision,recall,f1,f1_std
RandomForest_default,0.585864,0.585864,0.585864,0.585864,0.012047
RandomForest,0.607574,0.607574,0.607574,0.607574,0.02946
RandomForest_agrupado_default,0.580175,0.580175,0.580175,0.580175,0.050209
RandomForest_agrupado,0.604366,0.604366,0.604366,0.604366,0.005588
knn_default,0.502789,0.502789,0.502789,0.502789,0.038196
knn,0.536369,0.536369,0.536369,0.536369,0.044453
knn_agrupado_default,0.521444,0.521444,0.521444,0.521444,0.040549
knn_agrupado,0.541268,0.541268,0.541268,0.541268,0.032275
logistic_default,0.593812,0.593812,0.593812,0.593812,0.03945
logistic,0.641744,0.641744,0.641744,0.641744,0.02445
