In [1]:
# for data
import pandas as pd
import numpy as np
import re

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

from category_encoders.hashing import HashingEncoder

## for statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statistics import mean

#For scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import KNNImputer

# for machine learning
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from winsound import Beep

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

# Para evaluar modelos
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score\
, balanced_accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import metrics

# Mostrar todas las columnas
pd.set_option('display.max_columns', None)

In [2]:
# Cargar datasets originales
data = pd.read_csv('z_train.csv')
data_test = pd.read_csv('z_test.csv')

print(data.shape)
print(data_test.shape)

(15326, 14)
(3832, 13)


# DTypes

In [34]:
# Mean encoding
data_pre = data.copy()
mean_encoding = data_pre.groupby(['city'])['target'].mean()

In [176]:
# Fusionar datasets
# Agregar columna -1 al de test
data_test_target = data_test.copy()
data_test_target['target'] = -1

# Combinar datasets
data_clean = pd.concat([data, data_test_target], axis = 0).reset_index(drop = True)

In [35]:
# CON predictions Kaggle
test_targets = pd.read_csv('submission_Voting_conparametrosmodeloganador__RF9_clean.csv')
test_targets = pd.concat([data_test, test_targets['target']], axis=1)

data_test_target = data_test.copy()
data_test_target['target'] = -1

# Dataset combinado que será limpiado
data_clean = pd.concat([data, test_targets], axis = 0).reset_index(drop = True)
data_clean = pd.concat([data_clean, data_test_target]).reset_index(drop = True)
print(data_clean.shape)

(22990, 14)


In [36]:
data_clean = data_clean.set_index("enrollee_id")

In [37]:
data_clean['city'] = data_clean['city'].map(mean_encoding)
#encoder = HashingEncoder(cols=['city'])
#data_clean = encoder.fit_transform(data_clean)

In [38]:
data_clean['city_development_index'] = data_clean['city_development_index'].round(3)

In [39]:
data_clean = data_clean.drop(['gender'], axis=1)

In [40]:
# Transformar education_level
education_level_mapping = {'Primary School':0,
                           'High School':1,
                           'Graduate':2,
                           'Masters':3,
                           'Phd':4}
data_clean['education_level'] = data_clean['education_level'].map(education_level_mapping)

In [41]:
data_clean['experience'] = data_clean['experience'].str.replace('<1', '0')
data_clean['experience'] = data_clean['experience'].str.replace('>20', '21')
data_clean['experience'] = data_clean['experience'].astype(float)

In [42]:
company_size_mapping = {
    '<10'          :    1,
    '10/49'        :    2, 
    '100-500'      :    3, 
    '1000-4999'    :    4, 
    '10000+'       :    5, 
    '50-99'        :    6, 
    '500-999'      :    7, 
    '5000-9999'    :    8
}
data_clean['company_size'] = data_clean['company_size'].map(company_size_mapping)

In [43]:
last_new_job_mapping = {'never':0,
                        '1':1,
                        '2':2,
                        '3':3,
                        '4':4,
                        '>4':5}

data_clean['last_new_job'] = data_clean['last_new_job'].map(last_new_job_mapping).astype(float)

# Nulos categóricos

In [44]:
# Menos de 10%
data_clean['enrolled_university'] = data_clean['enrolled_university'].fillna(str(data_clean['enrolled_university'].mode()[0]))
data_clean['education_level'] = data_clean['education_level'].fillna(str(data_clean['education_level'].mode()[0]))

# Más de 10%
#data_clean['gender'] = data_clean['gender'].fillna('missing')
data_clean['major_discipline'] = data_clean['major_discipline'].fillna('missing')
data_clean['company_size'] = data_clean['company_size'].fillna(0)
data_clean['company_type'] = data_clean['company_size'].fillna('missing')

# Dummies

In [45]:
data_clean = pd.get_dummies(data=data_clean, columns=['relevent_experience', 'enrolled_university', 'major_discipline', 'company_type'])
data_clean.head(4)

Unnamed: 0_level_0,city,city_development_index,education_level,experience,company_size,last_new_job,training_hours,target,relevent_experience_Has relevent experience,relevent_experience_No relevent experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM,major_discipline_missing,company_type_0.0,company_type_1.0,company_type_2.0,company_type_3.0,company_type_4.0,company_type_5.0,company_type_6.0,company_type_7.0,company_type_8.0
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
8949,0.213898,0.92,2,21.0,0.0,1.0,36,1.0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
29725,0.163636,0.776,2,15.0,6.0,5.0,47,0.0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
11561,0.598794,0.624,2,5.0,0.0,0.0,83,0.0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
33241,0.380952,0.789,2,0.0,0.0,0.0,52,1.0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0


# Nulos numéricos

In [46]:
imputer = KNNImputer(n_neighbors=5)
data_clean = pd.DataFrame(imputer.fit_transform(data_clean),columns = data_clean.columns)

# Scaling

In [47]:
# Scaler | MinMax
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(data_clean.drop("target", axis=1))

dtf_scaled= pd.DataFrame(X, columns=data_clean.drop("target", axis=1).columns, index=data_clean.index)
dtf_scaled["target"] = data_clean["target"]
data_clean = dtf_scaled.copy()

# Separar datasets y balanceo

In [48]:
# Separar datasets
data_clean_test = data_clean[data_clean['target'] == -1]
data_clean_test = data_clean_test.drop('target', axis=1)
data_clean = data_clean[data_clean['target'] != -1]

X = data_clean.drop(['target'], axis=1)
y = data_clean.target

In [49]:
# SMOTE
smote = SMOTE(random_state=1, k_neighbors=5)
X, y = smote.fit_resample(X, y)

In [50]:
# DIVIDIR DATASETS
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =1)

# Voting

In [51]:
model_name = 'Voting'
estimators = [('xgb', XGBClassifier(n_estimators=1000, max_depth=2, min_child_weight=8, random_state=1)),('rf', RandomForestClassifier(random_state=1, max_depth=9, max_features=6, min_samples_leaf=5, n_estimators=100)), ('svc', SVC(C=.9, probability=True))]
model = VotingClassifier(estimators=estimators, voting='soft')
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)

# Metrics
print(f"AUC: {metrics.roc_auc_score(y_test, y_pred_test, average='macro')}")

# Train score
y_pred_train = model.predict(X_train)
print(f"Train AUC: {metrics.roc_auc_score(y_train, y_pred_train, average='macro')}")

AUC: 0.8417755879985974
Train AUC: 0.8555762536918556


# XGClassifier

In [163]:
model = XGBClassifier(random_state=1)
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)

# Metrics
print(f"AUC: {metrics.roc_auc_score(y_test, y_pred_test, average='macro')}")

# Train score
y_pred_train = model.predict(X_train)
print(f"Train AUC: {metrics.roc_auc_score(y_train, y_pred_train, average='macro')}")

AUC: 0.8204440549424552
Train AUC: 0.8290086294001963


In [143]:
kfold = KFold(n_splits = 10, random_state=1, shuffle=True)
cv_r2_scores = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc')
print('kfold scores:', cv_r2_scores)
print('kfold mean score.', np.mean(cv_r2_scores))

# LightGBM

In [166]:
model = LGBMClassifier(random_state=1)
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)

# Metrics
print(f"AUC: {metrics.roc_auc_score(y_test, y_pred_test, average='macro')}")

# Train score
y_pred_train = model.predict(X_train)
print(f"Train AUC: {metrics.roc_auc_score(y_train, y_pred_train, average='macro')}")

AUC: 0.8471416008531757
Train AUC: 0.8724858632347017


In [145]:
model = CatBoostClassifier(random_state=1)
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)

# Metrics
print(f"AUC: {metrics.roc_auc_score(y_test, y_pred_test, average='macro')}")

# Train score
y_pred_train = model.predict(X_train)
print(f"Train AUC: {metrics.roc_auc_score(y_train, y_pred_train, average='macro')}")

Learning rate set to 0.035731
0:	learn: 0.6819172	total: 24ms	remaining: 23.9s
1:	learn: 0.6716458	total: 51.6ms	remaining: 25.8s
2:	learn: 0.6620449	total: 73.5ms	remaining: 24.4s
3:	learn: 0.6524413	total: 92ms	remaining: 22.9s
4:	learn: 0.6442823	total: 116ms	remaining: 23.1s
5:	learn: 0.6358325	total: 134ms	remaining: 22.2s
6:	learn: 0.6276157	total: 164ms	remaining: 23.3s
7:	learn: 0.6164296	total: 190ms	remaining: 23.6s
8:	learn: 0.6064305	total: 236ms	remaining: 26s
9:	learn: 0.6008396	total: 279ms	remaining: 27.6s
10:	learn: 0.5952271	total: 320ms	remaining: 28.7s
11:	learn: 0.5875353	total: 350ms	remaining: 28.8s
12:	learn: 0.5824613	total: 395ms	remaining: 30s
13:	learn: 0.5772613	total: 444ms	remaining: 31.3s
14:	learn: 0.5732526	total: 475ms	remaining: 31.2s
15:	learn: 0.5669840	total: 507ms	remaining: 31.2s
16:	learn: 0.5634076	total: 561ms	remaining: 32.5s
17:	learn: 0.5589770	total: 623ms	remaining: 34s
18:	learn: 0.5556950	total: 675ms	remaining: 34.9s
19:	learn: 0.5522

162:	learn: 0.4111788	total: 6.04s	remaining: 31s
163:	learn: 0.4108884	total: 6.07s	remaining: 30.9s
164:	learn: 0.4104529	total: 6.12s	remaining: 30.9s
165:	learn: 0.4100076	total: 6.14s	remaining: 30.9s
166:	learn: 0.4098646	total: 6.17s	remaining: 30.8s
167:	learn: 0.4095607	total: 6.22s	remaining: 30.8s
168:	learn: 0.4088217	total: 6.25s	remaining: 30.8s
169:	learn: 0.4086553	total: 6.29s	remaining: 30.7s
170:	learn: 0.4081264	total: 6.31s	remaining: 30.6s
171:	learn: 0.4080258	total: 6.34s	remaining: 30.5s
172:	learn: 0.4079065	total: 6.36s	remaining: 30.4s
173:	learn: 0.4073163	total: 6.39s	remaining: 30.3s
174:	learn: 0.4071899	total: 6.41s	remaining: 30.2s
175:	learn: 0.4062254	total: 6.48s	remaining: 30.3s
176:	learn: 0.4054437	total: 6.52s	remaining: 30.3s
177:	learn: 0.4051851	total: 6.55s	remaining: 30.2s
178:	learn: 0.4050292	total: 6.59s	remaining: 30.2s
179:	learn: 0.4040297	total: 6.62s	remaining: 30.1s
180:	learn: 0.4038397	total: 6.63s	remaining: 30s
181:	learn: 0.40

322:	learn: 0.3569662	total: 12s	remaining: 25.1s
323:	learn: 0.3563350	total: 12s	remaining: 25s
324:	learn: 0.3560355	total: 12s	remaining: 25s
325:	learn: 0.3556444	total: 12.1s	remaining: 24.9s
326:	learn: 0.3553614	total: 12.1s	remaining: 24.9s
327:	learn: 0.3549079	total: 12.1s	remaining: 24.8s
328:	learn: 0.3546576	total: 12.1s	remaining: 24.7s
329:	learn: 0.3543695	total: 12.2s	remaining: 24.7s
330:	learn: 0.3541408	total: 12.2s	remaining: 24.6s
331:	learn: 0.3538702	total: 12.2s	remaining: 24.6s
332:	learn: 0.3536315	total: 12.2s	remaining: 24.5s
333:	learn: 0.3531680	total: 12.3s	remaining: 24.5s
334:	learn: 0.3528536	total: 12.3s	remaining: 24.4s
335:	learn: 0.3525396	total: 12.3s	remaining: 24.4s
336:	learn: 0.3522325	total: 12.4s	remaining: 24.3s
337:	learn: 0.3516975	total: 12.4s	remaining: 24.3s
338:	learn: 0.3514683	total: 12.4s	remaining: 24.3s
339:	learn: 0.3512755	total: 12.5s	remaining: 24.2s
340:	learn: 0.3506398	total: 12.5s	remaining: 24.2s
341:	learn: 0.3497806	

481:	learn: 0.3241057	total: 16.8s	remaining: 18s
482:	learn: 0.3239664	total: 16.8s	remaining: 18s
483:	learn: 0.3238911	total: 16.8s	remaining: 17.9s
484:	learn: 0.3237857	total: 16.9s	remaining: 17.9s
485:	learn: 0.3236622	total: 16.9s	remaining: 17.9s
486:	learn: 0.3235594	total: 16.9s	remaining: 17.8s
487:	learn: 0.3234532	total: 16.9s	remaining: 17.8s
488:	learn: 0.3233742	total: 17s	remaining: 17.7s
489:	learn: 0.3232526	total: 17s	remaining: 17.7s
490:	learn: 0.3230996	total: 17s	remaining: 17.6s
491:	learn: 0.3229694	total: 17s	remaining: 17.6s
492:	learn: 0.3228344	total: 17.1s	remaining: 17.6s
493:	learn: 0.3227215	total: 17.1s	remaining: 17.5s
494:	learn: 0.3226048	total: 17.1s	remaining: 17.5s
495:	learn: 0.3224278	total: 17.2s	remaining: 17.4s
496:	learn: 0.3223253	total: 17.2s	remaining: 17.4s
497:	learn: 0.3222274	total: 17.2s	remaining: 17.3s
498:	learn: 0.3220666	total: 17.2s	remaining: 17.3s
499:	learn: 0.3219069	total: 17.3s	remaining: 17.3s
500:	learn: 0.3217945	to

642:	learn: 0.3068444	total: 21.7s	remaining: 12s
643:	learn: 0.3067380	total: 21.7s	remaining: 12s
644:	learn: 0.3066403	total: 21.7s	remaining: 12s
645:	learn: 0.3065422	total: 21.8s	remaining: 11.9s
646:	learn: 0.3064058	total: 21.8s	remaining: 11.9s
647:	learn: 0.3063578	total: 21.8s	remaining: 11.9s
648:	learn: 0.3062214	total: 21.8s	remaining: 11.8s
649:	learn: 0.3061448	total: 21.9s	remaining: 11.8s
650:	learn: 0.3060121	total: 21.9s	remaining: 11.7s
651:	learn: 0.3059380	total: 21.9s	remaining: 11.7s
652:	learn: 0.3058445	total: 21.9s	remaining: 11.7s
653:	learn: 0.3057488	total: 22s	remaining: 11.6s
654:	learn: 0.3056390	total: 22s	remaining: 11.6s
655:	learn: 0.3055595	total: 22s	remaining: 11.5s
656:	learn: 0.3054413	total: 22s	remaining: 11.5s
657:	learn: 0.3053545	total: 22.1s	remaining: 11.5s
658:	learn: 0.3052675	total: 22.1s	remaining: 11.4s
659:	learn: 0.3051978	total: 22.1s	remaining: 11.4s
660:	learn: 0.3051126	total: 22.1s	remaining: 11.4s
661:	learn: 0.3049963	tota

801:	learn: 0.2926996	total: 27.1s	remaining: 6.68s
802:	learn: 0.2926486	total: 27.1s	remaining: 6.64s
803:	learn: 0.2925726	total: 27.1s	remaining: 6.61s
804:	learn: 0.2925187	total: 27.1s	remaining: 6.57s
805:	learn: 0.2924523	total: 27.2s	remaining: 6.54s
806:	learn: 0.2923861	total: 27.2s	remaining: 6.5s
807:	learn: 0.2923282	total: 27.2s	remaining: 6.47s
808:	learn: 0.2922241	total: 27.2s	remaining: 6.43s
809:	learn: 0.2921461	total: 27.3s	remaining: 6.39s
810:	learn: 0.2920788	total: 27.3s	remaining: 6.36s
811:	learn: 0.2919732	total: 27.3s	remaining: 6.32s
812:	learn: 0.2918949	total: 27.3s	remaining: 6.29s
813:	learn: 0.2918168	total: 27.4s	remaining: 6.25s
814:	learn: 0.2917196	total: 27.4s	remaining: 6.22s
815:	learn: 0.2916381	total: 27.4s	remaining: 6.18s
816:	learn: 0.2915927	total: 27.4s	remaining: 6.15s
817:	learn: 0.2914713	total: 27.5s	remaining: 6.11s
818:	learn: 0.2913965	total: 27.5s	remaining: 6.07s
819:	learn: 0.2913077	total: 27.5s	remaining: 6.04s
820:	learn: 0

963:	learn: 0.2804462	total: 31.7s	remaining: 1.18s
964:	learn: 0.2803666	total: 31.7s	remaining: 1.15s
965:	learn: 0.2803315	total: 31.7s	remaining: 1.12s
966:	learn: 0.2802498	total: 31.7s	remaining: 1.08s
967:	learn: 0.2801976	total: 31.8s	remaining: 1.05s
968:	learn: 0.2801566	total: 31.8s	remaining: 1.02s
969:	learn: 0.2800824	total: 31.8s	remaining: 984ms
970:	learn: 0.2800137	total: 31.8s	remaining: 951ms
971:	learn: 0.2798616	total: 31.9s	remaining: 918ms
972:	learn: 0.2797735	total: 31.9s	remaining: 885ms
973:	learn: 0.2797069	total: 31.9s	remaining: 852ms
974:	learn: 0.2796270	total: 32s	remaining: 819ms
975:	learn: 0.2795641	total: 32s	remaining: 787ms
976:	learn: 0.2795010	total: 32s	remaining: 753ms
977:	learn: 0.2794211	total: 32s	remaining: 721ms
978:	learn: 0.2793661	total: 32.1s	remaining: 688ms
979:	learn: 0.2793006	total: 32.1s	remaining: 655ms
980:	learn: 0.2792668	total: 32.1s	remaining: 622ms
981:	learn: 0.2792360	total: 32.2s	remaining: 589ms
982:	learn: 0.279189

# Stacking

In [197]:
estimators_m = [('XGB',XGBClassifier(random_state=1)),('SVC', SVC(random_state=1))]
model = StackingClassifier(estimators=estimators_m, final_estimator=LogisticRegression())
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)

# Metrics 
print(f"AUC: {metrics.roc_auc_score(y_test, y_pred_test, average='macro')}")

# Train score
y_pred_train = model.predict(X_train)
print(f"Train AUC: {metrics.roc_auc_score(y_train, y_pred_train, average='macro')}")

duration = 3000  # milliseconds
freq = 480  # Hz
Beep(freq, duration)

AUC: 0.8263084092370927
Train AUC: 0.8341702834604954


In [170]:
estimators_m = [('XGB',XGBClassifier(random_state=1)),('Cat', CatBoostClassifier(random_state=1)), ('LGBM', LGBMClassifier(random_state=1))]
model = VotingClassifier(estimators=estimators_m, voting='soft')
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)

# Metrics
print(f"AUC: {metrics.roc_auc_score(y_test, y_pred_test, average='macro')}")

# Train score
y_pred_train = model.predict(X_train)
print(f"Train AUC: {metrics.roc_auc_score(y_train, y_pred_train, average='macro')}")

Learning rate set to 0.035731
0:	learn: 0.6807358	total: 35.4ms	remaining: 35.4s
1:	learn: 0.6684956	total: 62.7ms	remaining: 31.3s
2:	learn: 0.6574849	total: 127ms	remaining: 42.3s
3:	learn: 0.6471013	total: 186ms	remaining: 46.3s
4:	learn: 0.6378323	total: 238ms	remaining: 47.4s
5:	learn: 0.6282261	total: 267ms	remaining: 44.2s
6:	learn: 0.6202715	total: 337ms	remaining: 47.8s
7:	learn: 0.6123178	total: 356ms	remaining: 44.1s
8:	learn: 0.6048645	total: 408ms	remaining: 44.9s
9:	learn: 0.5982733	total: 449ms	remaining: 44.4s
10:	learn: 0.5919222	total: 471ms	remaining: 42.3s
11:	learn: 0.5857094	total: 499ms	remaining: 41.1s
12:	learn: 0.5802392	total: 529ms	remaining: 40.1s
13:	learn: 0.5754808	total: 546ms	remaining: 38.5s
14:	learn: 0.5709449	total: 570ms	remaining: 37.4s
15:	learn: 0.5663515	total: 600ms	remaining: 36.9s
16:	learn: 0.5606609	total: 619ms	remaining: 35.8s
17:	learn: 0.5572286	total: 652ms	remaining: 35.6s
18:	learn: 0.5527152	total: 693ms	remaining: 35.8s
19:	learn

167:	learn: 0.4070776	total: 4.22s	remaining: 20.9s
168:	learn: 0.4062711	total: 4.24s	remaining: 20.8s
169:	learn: 0.4057447	total: 4.26s	remaining: 20.8s
170:	learn: 0.4055772	total: 4.29s	remaining: 20.8s
171:	learn: 0.4053061	total: 4.33s	remaining: 20.8s
172:	learn: 0.4048695	total: 4.35s	remaining: 20.8s
173:	learn: 0.4044246	total: 4.38s	remaining: 20.8s
174:	learn: 0.4043214	total: 4.4s	remaining: 20.7s
175:	learn: 0.4041561	total: 4.44s	remaining: 20.8s
176:	learn: 0.4035686	total: 4.46s	remaining: 20.8s
177:	learn: 0.4034835	total: 4.5s	remaining: 20.8s
178:	learn: 0.4028107	total: 4.52s	remaining: 20.7s
179:	learn: 0.4026185	total: 4.56s	remaining: 20.8s
180:	learn: 0.4020546	total: 4.59s	remaining: 20.8s
181:	learn: 0.4019016	total: 4.62s	remaining: 20.8s
182:	learn: 0.4017030	total: 4.67s	remaining: 20.8s
183:	learn: 0.4015345	total: 4.71s	remaining: 20.9s
184:	learn: 0.4012585	total: 4.72s	remaining: 20.8s
185:	learn: 0.4011037	total: 4.75s	remaining: 20.8s
186:	learn: 0.

331:	learn: 0.3540162	total: 8.54s	remaining: 17.2s
332:	learn: 0.3535296	total: 8.56s	remaining: 17.1s
333:	learn: 0.3529237	total: 8.59s	remaining: 17.1s
334:	learn: 0.3525043	total: 8.62s	remaining: 17.1s
335:	learn: 0.3523004	total: 8.64s	remaining: 17.1s
336:	learn: 0.3520279	total: 8.66s	remaining: 17s
337:	learn: 0.3514705	total: 8.68s	remaining: 17s
338:	learn: 0.3506651	total: 8.71s	remaining: 17s
339:	learn: 0.3504763	total: 8.74s	remaining: 17s
340:	learn: 0.3502236	total: 8.76s	remaining: 16.9s
341:	learn: 0.3497551	total: 8.78s	remaining: 16.9s
342:	learn: 0.3490356	total: 8.8s	remaining: 16.9s
343:	learn: 0.3488339	total: 8.82s	remaining: 16.8s
344:	learn: 0.3482051	total: 8.86s	remaining: 16.8s
345:	learn: 0.3478576	total: 8.87s	remaining: 16.8s
346:	learn: 0.3477744	total: 8.9s	remaining: 16.7s
347:	learn: 0.3473442	total: 8.93s	remaining: 16.7s
348:	learn: 0.3471002	total: 8.96s	remaining: 16.7s
349:	learn: 0.3467086	total: 8.98s	remaining: 16.7s
350:	learn: 0.3462860	

495:	learn: 0.3215284	total: 13.1s	remaining: 13.3s
496:	learn: 0.3213265	total: 13.1s	remaining: 13.2s
497:	learn: 0.3212608	total: 13.1s	remaining: 13.2s
498:	learn: 0.3210994	total: 13.1s	remaining: 13.2s
499:	learn: 0.3209891	total: 13.2s	remaining: 13.2s
500:	learn: 0.3209251	total: 13.2s	remaining: 13.2s
501:	learn: 0.3208445	total: 13.3s	remaining: 13.2s
502:	learn: 0.3206734	total: 13.3s	remaining: 13.2s
503:	learn: 0.3205558	total: 13.4s	remaining: 13.1s
504:	learn: 0.3204875	total: 13.4s	remaining: 13.1s
505:	learn: 0.3202443	total: 13.4s	remaining: 13.1s
506:	learn: 0.3201389	total: 13.5s	remaining: 13.1s
507:	learn: 0.3199632	total: 13.5s	remaining: 13.1s
508:	learn: 0.3199081	total: 13.5s	remaining: 13s
509:	learn: 0.3198086	total: 13.5s	remaining: 13s
510:	learn: 0.3196866	total: 13.6s	remaining: 13s
511:	learn: 0.3194011	total: 13.6s	remaining: 13s
512:	learn: 0.3192963	total: 13.6s	remaining: 12.9s
513:	learn: 0.3191887	total: 13.6s	remaining: 12.9s
514:	learn: 0.319056

660:	learn: 0.3054083	total: 17.5s	remaining: 8.98s
661:	learn: 0.3053431	total: 17.5s	remaining: 8.95s
662:	learn: 0.3052807	total: 17.5s	remaining: 8.92s
663:	learn: 0.3051917	total: 17.6s	remaining: 8.89s
664:	learn: 0.3050612	total: 17.6s	remaining: 8.86s
665:	learn: 0.3049753	total: 17.6s	remaining: 8.83s
666:	learn: 0.3048689	total: 17.6s	remaining: 8.8s
667:	learn: 0.3048128	total: 17.6s	remaining: 8.77s
668:	learn: 0.3047622	total: 17.7s	remaining: 8.74s
669:	learn: 0.3046890	total: 17.7s	remaining: 8.71s
670:	learn: 0.3045282	total: 17.7s	remaining: 8.68s
671:	learn: 0.3042315	total: 17.7s	remaining: 8.65s
672:	learn: 0.3041208	total: 17.8s	remaining: 8.63s
673:	learn: 0.3040099	total: 17.8s	remaining: 8.6s
674:	learn: 0.3039202	total: 17.8s	remaining: 8.57s
675:	learn: 0.3038562	total: 17.8s	remaining: 8.54s
676:	learn: 0.3037827	total: 17.9s	remaining: 8.52s
677:	learn: 0.3037272	total: 17.9s	remaining: 8.48s
678:	learn: 0.3035518	total: 17.9s	remaining: 8.46s
679:	learn: 0.

820:	learn: 0.2914622	total: 21.7s	remaining: 4.73s
821:	learn: 0.2913681	total: 21.7s	remaining: 4.7s
822:	learn: 0.2912923	total: 21.7s	remaining: 4.67s
823:	learn: 0.2912139	total: 21.8s	remaining: 4.65s
824:	learn: 0.2911416	total: 21.8s	remaining: 4.62s
825:	learn: 0.2910859	total: 21.8s	remaining: 4.59s
826:	learn: 0.2910325	total: 21.8s	remaining: 4.56s
827:	learn: 0.2909667	total: 21.8s	remaining: 4.54s
828:	learn: 0.2909218	total: 21.8s	remaining: 4.51s
829:	learn: 0.2908490	total: 21.9s	remaining: 4.48s
830:	learn: 0.2907887	total: 21.9s	remaining: 4.45s
831:	learn: 0.2906934	total: 21.9s	remaining: 4.42s
832:	learn: 0.2905401	total: 21.9s	remaining: 4.4s
833:	learn: 0.2904935	total: 22s	remaining: 4.37s
834:	learn: 0.2904685	total: 22s	remaining: 4.34s
835:	learn: 0.2903912	total: 22s	remaining: 4.32s
836:	learn: 0.2902855	total: 22s	remaining: 4.29s
837:	learn: 0.2901931	total: 22.1s	remaining: 4.26s
838:	learn: 0.2900995	total: 22.1s	remaining: 4.24s
839:	learn: 0.2900594	

980:	learn: 0.2802476	total: 26.9s	remaining: 520ms
981:	learn: 0.2801981	total: 26.9s	remaining: 493ms
982:	learn: 0.2801498	total: 27s	remaining: 466ms
983:	learn: 0.2800954	total: 27s	remaining: 439ms
984:	learn: 0.2800304	total: 27s	remaining: 411ms
985:	learn: 0.2799533	total: 27.1s	remaining: 384ms
986:	learn: 0.2798841	total: 27.1s	remaining: 357ms
987:	learn: 0.2798243	total: 27.1s	remaining: 329ms
988:	learn: 0.2797552	total: 27.2s	remaining: 302ms
989:	learn: 0.2796752	total: 27.2s	remaining: 275ms
990:	learn: 0.2796287	total: 27.2s	remaining: 247ms
991:	learn: 0.2795756	total: 27.3s	remaining: 220ms
992:	learn: 0.2794923	total: 27.3s	remaining: 192ms
993:	learn: 0.2793611	total: 27.3s	remaining: 165ms
994:	learn: 0.2792970	total: 27.3s	remaining: 137ms
995:	learn: 0.2792368	total: 27.4s	remaining: 110ms
996:	learn: 0.2791724	total: 27.4s	remaining: 82.5ms
997:	learn: 0.2791261	total: 27.4s	remaining: 55ms
998:	learn: 0.2790989	total: 27.5s	remaining: 27.5ms
999:	learn: 0.279

# Publicar

In [52]:
estimators = [('xgb', XGBClassifier(n_estimators=1000, max_depth=2, min_child_weight=8, random_state=1)),('rf', RandomForestClassifier(random_state=1, max_depth=9, max_features=6, min_samples_leaf=5, n_estimators=100)), ('svc', SVC(C=.9, probability=True))]
model = VotingClassifier(estimators=estimators, voting='soft')
model.fit(X_train, y_train)
model.fit(X, y)
duration = 3000  # milliseconds
freq = 480  # Hz
Beep(freq, duration)

In [53]:
y_predict = model.predict(data_clean_test)
y_predict[:50]

array([1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1.,
       1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1.,
       1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.])

In [54]:
target_column = pd.Series(y_predict).astype('int64')
submission = pd.concat([data_test['enrollee_id'], target_column], axis=1) 
submission.columns = ['enrollee_id', 'target']
print(submission.shape)
submission.head()

(3832, 2)


Unnamed: 0,enrollee_id,target
0,23603,1
1,22499,0
2,10465,1
3,8293,0
4,4246,0


In [55]:
nombre_modelo = 'Voting_conparametrosmodeloganador__RF9_clean'
nombre_archivo = f'submission_{nombre_modelo}.csv'
submission.to_csv(nombre_archivo, index=False)