In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

training = pd.read_csv('setEntrenamiento.csv', low_memory = False)
test = pd.read_csv('setTesteo.csv', low_memory = False)

In [2]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve

In [3]:
training["person"] = training["person"].astype(str)
test["person"] = test["person"].astype(str)

In [4]:
label_array = np.array(training['label'])
data_array = np.array(training.drop(columns=['label', 'person']))

In [5]:
#CrossValidation y Grid Search RandomForest
x_train, x_test, y_train, y_test = train_test_split(data_array, label_array, test_size = 0, random_state = 0)
classifier = RandomForestClassifier(n_estimators = 300, random_state= 0)
all_accuracies = cross_val_score(estimator = classifier, X= x_train, y = y_train, cv = 5)
print(all_accuracies)

[0.94877156 0.94797386 0.94901961 0.94875817 0.94849673]


In [6]:
print(all_accuracies.mean())

0.9486039858006279


In [7]:
print(all_accuracies.std())

0.00035585480691379323


In [8]:
#Empieza el gridSearch
grid_param = {
    "n_estimators": [300, 500, 800],
    "criterion": ["gini", "entropy"],
    "bootstrap": [True, False]
}

In [9]:
from  sklearn.metrics import SCORERS

In [10]:
#metricas que se pueden usar
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [11]:
gd_sr = GridSearchCV(estimator=classifier, param_grid=grid_param, scoring="roc_auc", cv=5, n_jobs=1)

In [12]:
#Esto va a tardar
gd_sr.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'n_estimators': [300, 500, 800], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [13]:
#muestra el mejor hiperparametro
best_parameters = gd_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 800}


In [14]:
gd_sr.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
#Muestra el mejor score
gd_sr.best_score_

0.8173777837905141

In [16]:
x_train, x_test, y_train, y_test = train_test_split(data_array, label_array, test_size = 0.33, random_state = 7)

In [17]:
def logisticReg(x_train, x_test, y_train, y_test):
    logReg = LogisticRegression(solver='sag', random_state=1)
    logReg.fit(x_train, y_train)
    y_pred = logReg.predict(x_test)
    #return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [18]:
def knn(k, x_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    #return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [19]:
def decisionTree(max_depth, x_train, x_test, y_train, y_test):
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
#     return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [20]:
def xgboost(max_depth, n_estimators, x_train, x_test, y_train, y_test):
    xgb = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    #return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [21]:
#result = logisticReg(x_train, x_test, y_train, y_test)
#result

In [22]:
#result1 = knn(10, x_train, x_test, y_train, y_test)
#result1

In [23]:
#resultDecTree = decisionTree(150, x_train, x_test, y_train, y_test)
#resultDecTree

In [24]:
#resultXgboost = xgboost(10, 200,x_train, x_test, y_train, y_test )
#resultXgboost

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:
x_train_nn = scaler.transform(x_train)
x_test_nn = scaler.transform(x_test)

In [27]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)
mlp.fit(x_train_nn, y_train)
predictions = mlp.predict(x_test_nn)
roc_auc_score(y_test, predictions)

0.5256460221347669

In [28]:
def neural_networks(hidden_layer_sizes, max_iter, x_train, x_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,max_iter=max_iter)
    mlp.fit(x_train_nn, y_train)
    predictions = mlp.predict(x_test_nn)
    return roc_auc_score(y_test, predictions)

In [29]:
neural_networks((15, 15, 15), 600, x_train, x_test, y_train, y_test)

0.5169424840127176

In [30]:
grid = dict(hidden_layer_sizes=[(10, 10, 10), (11, 11, 11), (12, 12, 12), (15, 15 ,15)], max_iter=[500, 600, 700, 800])
gd_nn = GridSearchCV(estimator=mlp, param_grid=grid, n_jobs=1)
res = gd_nn.fit(x_train_nn, y_train)



In [31]:
res

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'hidden_layer_sizes': [(10, 10, 10), (11, 11, 11), (12, 12, 12), (15, 15, 15)], 'max_iter': [500, 600, 700, 800]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
gd_nn.best_estimator_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(11, 11, 11), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [33]:
gd_nn.best_score_
#(15, 15, 15) y 700

0.9245356641173716

In [34]:
grid = dict(hidden_layer_sizes=[(14, 14, 14), (15, 15, 15), (16, 16, 16), (17, 17, 17), (18, 18 ,18)], max_iter=[600, 700, 800, 900])
gd_nn = GridSearchCV(estimator=mlp, param_grid=grid, n_jobs=1)
res = gd_nn.fit(x_train_nn, y_train)



In [35]:
gd_nn.best_estimator_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(17, 17, 17), learning_rate='constant',
       learning_rate_init=0.001, max_iter=600, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [36]:
gd_nn.best_score_
#(18, 18, 18) y 800

0.9233650694552833

In [37]:
grd = {
    'hidden_layer_sizes': [(5, 5 ,5), (6, 6, 6), (7, 7, 7), (8, 8, 8)],
    'max_iter': [750, 775, 800, 825],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
gd_nn = GridSearchCV(estimator=mlp, param_grid=grid, n_jobs=1, cv=3)
res = gd_nn.fit(x_train_nn, y_train)
print(gd_nn.best_estimator_)
print(gd_nn.best_score_)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(16, 16, 16), learning_rate='constant',
       learning_rate_init=0.001, max_iter=600, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.9251599812704854
