In [38]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score

In [59]:
# Load the cleaned dataset and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column holding 0, 1, 2, ... which looks
# like a row index that was written into the CSV. nested_cross_validation keeps every column
# except the label as a feature, so this column is fed to the models as well; consider
# pd.read_csv(..., index_col=0) — confirm against how the file was saved.
clean_data = pd.read_csv('data/clean_data.csv')
clean_data.head()

Unnamed: 0,Gender,Height,Weight,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,...,Publisher_DC Comics,Publisher_Dark Horse Comics,Publisher_George Lucas,Publisher_Image Comics,Publisher_Marvel Comics,Publisher_NBC - Heroes,Publisher_outros,Alignment_bad,Alignment_good,Alignment_neutral
0,0,203.0,441.0,False,True,False,False,False,True,False,...,0,0,0,0,1,0,0,0,1,0
1,0,191.0,65.0,True,True,False,False,True,True,False,...,0,1,0,0,0,0,0,0,1,0
2,0,185.0,90.0,False,False,True,False,False,False,False,...,1,0,0,0,0,0,0,0,1,0
3,0,203.0,441.0,False,True,False,False,False,False,False,...,0,0,0,0,1,0,0,1,0,0
4,0,183.0,81.0,False,False,False,True,False,False,False,...,0,0,0,0,1,0,0,1,0,0


In [79]:
# Load the three label-specific variants of the cleaned data. Each one is later passed to
# nested_cross_validation with "Gender", "Alignment" and "Publisher" as the target column.
# presumably each file stores its target as a single column instead of one-hot dummies — TODO confirm
clean_data_gender = pd.read_csv('data/clean_data_gender.csv')
clean_data_alignment = pd.read_csv('data/clean_data_alignment.csv')
clean_data_publisher = pd.read_csv('data/clean_data_publisher.csv')

In [95]:
def nested_cross_validation(data, label, clf, average='binary', scalar=True, param_grid=None):
    """Estimate the F1 score of a tuned classifier with nested cross-validation.

    An inner 3-fold grid search selects the hyper-parameters and an outer
    4-fold loop scores the tuned model. Both loops use shuffled
    ``StratifiedKFold`` splits seeded with ``random_state=1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the target column and the feature columns. Every
        column other than ``label`` is used as a feature.
    label : str
        Name of the target column in ``data``.
    clf : estimator
        Classifier placed in the pipeline under the step name ``'clf'``.
        With the default grid it must accept a ``max_depth`` parameter.
    average : str, default 'binary'
        Forwarded to ``f1_score`` as its ``average`` argument; used for both
        the inner model selection and the outer scoring.
    scalar : bool, default True
        When truthy, a ``StandardScaler`` step named ``'scalar'`` is placed
        in front of the classifier inside the pipeline.
    param_grid : dict or None, default None
        Grid handed to ``GridSearchCV``. Keys address pipeline steps, e.g.
        ``'clf__max_depth'``. ``None`` selects the original grid over
        ``clf__max_depth``.

    Returns
    -------
    The value returned by ``cross_val_score``: one F1 score per outer fold
    (four with the splits used here).
    """
    y = data[label]
    X = data.drop(label, axis=1)

    f1 = make_scorer(f1_score, average=average)
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)

    # Build the step list without rebinding the `scalar` flag to an estimator.
    steps = []
    if scalar:
        steps.append(('scalar', StandardScaler()))
    steps.append(('clf', clf))
    pipeline = Pipeline(steps)

    if param_grid is None:
        param_grid = {'clf__max_depth': [2, 5, 10, 15, None]}

    # `iid` is intentionally not passed: the argument was deprecated in
    # scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
    search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring=f1)
    nested_scores = cross_val_score(estimator=search, X=X, y=y, cv=outer_cv, scoring=f1)
    return nested_scores

# Seed the forest so the nested-CV scores can be reproduced on a re-run; the CV splitters
# inside nested_cross_validation are already seeded with random_state=1.
forest = RandomForestClassifier(n_estimators=10, random_state=1)
# Nested-CV F1 for predicting the "Super Strength" column from all other columns of clean_data.
nested_scores = nested_cross_validation(clean_data, "Super Strength", forest)

In [96]:
# Display the outer-fold F1 scores of the "Super Strength" run.
nested_scores

array([0.82162162, 0.85082873, 0.84444444, 0.83798883])

In [87]:
# Nested-CV F1 for the "Stamina" target (default binary averaging, scaling enabled).
nested_scores = nested_cross_validation(data=clean_data, label="Stamina", clf=forest)
nested_scores

array([0.77037037, 0.7483871 , 0.77777778, 0.73333333])

In [88]:
# Nested-CV F1 for the "Flight" target (default binary averaging, scaling enabled).
nested_scores = nested_cross_validation(data=clean_data, label="Flight", clf=forest)
nested_scores

array([0.52380952, 0.56818182, 0.55172414, 0.52380952])

In [89]:
# Nested-CV F1 for the "Accelerated Healing" target (default binary averaging, scaling enabled).
nested_scores = nested_cross_validation(data=clean_data, label="Accelerated Healing", clf=forest)
nested_scores

array([0.56338028, 0.56410256, 0.55263158, 0.5       ])

In [90]:
# Nested-CV F1 for the "Gender" target, using the gender-specific dataset and the default
# average='binary'.
# NOTE(review): this run printed repeated 'precision' / 'predicted' warnings, which suggests
# that in some folds the model predicted no positive samples — check the class balance of
# Gender and whether binary F1 is the right metric here.
nested_scores = nested_cross_validation(data=clean_data_gender, label="Gender", clf=forest)
nested_scores

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


array([0.57534247, 0.51515152, 0.56756757, 0.57534247])

In [91]:
# Nested-CV F1 for the "Alignment" target with micro averaging.
# NOTE(review): if Alignment is a single-label multiclass column, micro-averaged F1 is the
# same number as accuracy — confirm that this is the intended metric.
nested_scores = nested_cross_validation(
    data=clean_data_alignment,
    label="Alignment",
    clf=forest,
    average='micro',
)
nested_scores

array([0.68292683, 0.66463415, 0.68711656, 0.69135802])

In [92]:
# Nested-CV F1 for the "Publisher" target with micro averaging.
# NOTE(review): if Publisher is a single-label multiclass column, micro-averaged F1 is the
# same number as accuracy — confirm that this is the intended metric.
nested_scores = nested_cross_validation(
    data=clean_data_publisher,
    label="Publisher",
    clf=forest,
    average='micro',
)
nested_scores

array([0.58181818, 0.63803681, 0.63125   , 0.68553459])

In [93]:
# Load the grouped variant of the dataset and report its (rows, columns) dimensions.
# NOTE(review): how the columns were grouped is not visible in this notebook section.
clean_data_grouped = pd.read_csv('data/clean_data_grouped.csv')
clean_data_grouped.shape

(684, 198)

In [97]:
# Nested-CV F1 for "Super Strength" on the grouped dataset, this time without the
# StandardScaler step (scalar=False).
nested_scores = nested_cross_validation(
    data=clean_data_grouped,
    label="Super Strength",
    clf=forest,
    scalar=False,
)
nested_scores

array([0.76300578, 0.83428571, 0.85393258, 0.80898876])