In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

training = pd.read_csv('setEntrenamiento.csv', low_memory = False)
test = pd.read_csv('setTesteo.csv', low_memory = False)

In [31]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve

In [32]:
training["person"] = training["person"].astype(str)
test["person"] = test["person"].astype(str)

Unnamed: 0,person,viewed product,search engine hit,checkout,searched products,generic listing,visited site,ad campaign hit,brand listing,lead,...,Samsung Galaxy Gran Prime Duos TV,32GB,64GB,16GB,128GB,256GB,8GB,4GB,512MB,label


In [33]:
label_array = np.array(training['label'])
data_array = np.array(training.drop(columns=['label', 'person']))

In [40]:
#CrossValidation y Grid Search RandomForest
x_train, x_test, y_train, y_test = train_test_split(data_array, label_array, test_size = 0, random_state = 0)
classifier = RandomForestClassifier(n_estimators = 300, random_state= 0)
all_accuracies = cross_val_score(estimator = classifier, X= x_train, y = y_train, cv = 5)
print(all_accuracies)

[0.94877156 0.94797386 0.94901961 0.94875817 0.94849673]


In [41]:
print(all_accuracies.mean())

0.9486039858006279


In [42]:
print(all_accuracies.std())

0.00035585480691379323


In [70]:
#Empieza el gridSearch
grid_param = {
    "n_estimators": [300, 500, 800],
    "criterion": ["gini", "entropy"],
    "bootstrap": [True, False]
}

In [47]:
from  sklearn.metrics import SCORERS

In [48]:
#metricas que se pueden usar
SCORERS.keys()

['precision_samples',
 'f1_weighted',
 'balanced_accuracy',
 'f1_samples',
 'f1',
 'adjusted_mutual_info_score',
 'precision',
 'normalized_mutual_info_score',
 'neg_mean_squared_error',
 'precision_micro',
 'neg_mean_squared_log_error',
 'recall_samples',
 'homogeneity_score',
 'precision_weighted',
 'fowlkes_mallows_score',
 'average_precision',
 'roc_auc',
 'adjusted_rand_score',
 'recall_macro',
 'v_measure_score',
 'completeness_score',
 'r2',
 'recall_weighted',
 'recall',
 'precision_macro',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'mutual_info_score',
 'f1_macro',
 'f1_micro',
 'brier_score_loss',
 'recall_micro',
 'neg_median_absolute_error',
 'explained_variance',
 'accuracy']

In [50]:
gd_sr = GridSearchCV(estimator=classifier, param_grid=grid_param, scoring="roc_auc", cv=5, n_jobs=1)

In [51]:
#Esto va a tardar
gd_sr.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'n_estimators': [300, 500, 800], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [54]:
#muestra el mejor hiperparametro
best_parameters = gd_sr.best_params_
print(best_parameters)

{'n_estimators': 800, 'bootstrap': True, 'criterion': 'entropy'}


In [72]:
gd_sr.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [71]:
#Muestra el mejor score
gd_sr.best_score_

0.8173777837905141

In [61]:
x_train, x_test, y_train, y_test = train_test_split(data_array, label_array, test_size = 0.33, random_state = 7)

In [67]:
def logisticReg(x_train, x_test, y_train, y_test):
    logReg = LogisticRegression(solver='sag', random_state=1)
    logReg.fit(x_train, y_train)
    y_pred = logReg.predict(x_test)
    #return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [27]:
def knn(k, x_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    #return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [63]:
def decisionTree(max_depth, x_train, x_test, y_train, y_test):
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
#     return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [29]:
def xgboost(max_depth, n_estimators, x_train, x_test, y_train, y_test):
    xgb = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    #return np.sqrt(mean_squared_error(y_test, y_pred))
    return roc_auc_score(y_test, y_pred)
#TAMBIEN PUEDEN USAR PARA VER EL ACCURACY
#ACCURACY = accuracy_score(y_test, y_pred)

In [68]:
result = logisticReg(x_train, x_test, y_train, y_test)
result

0.5

In [69]:
result1 = knn(10, x_train, x_test, y_train, y_test)
result1

0.5

In [37]:
resultDecTree = decisionTree(150, x_train, x_test, y_train, y_test)
resultDecTree

0.560895563914086

In [39]:
resultXgboost = xgboost(10, 200,x_train, x_test, y_train, y_test )
resultXgboost

0.5117385200587032