In [23]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
import category_encoders as ce


# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('once')

In [24]:
# Load the prepared train/test tables and map +/-infinity to NaN in both,
# so infinite values are handled as missing values downstream.
train_set, test_set = (
    pd.read_csv(csv_path, encoding='utf-8').replace([np.inf, -np.inf], np.nan)
    for csv_path in ('train_prepared.csv', 'test_prepared.csv')
)

In [25]:
# Separate the label column from the predictors; the opportunity identifier
# is dropped from the feature matrix as well.
y = train_set['Target']
X = train_set.drop(['Target', 'Opportunity_ID'], axis='columns')

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [34]:
# Column selection by type
# ==============================================================================
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import make_column_selector

# Preprocessing
# ==============================================================================

# Names of the numeric columns in each split. Only numeric_cols (from X_train)
# is used by the transformer below; numeric_cols_test is computed alongside it.
numeric_cols, numeric_cols_test = (
    frame.select_dtypes(include=['float64', 'int']).columns.to_list()
    for frame in (X_train, X_test)
)

# Numeric columns: fill missing values with SimpleImputer's default strategy,
# then discretize into 5 quantile bins. The second step is named 'scaler'
# but it bins the values rather than scaling them.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', KBinsDiscretizer(n_bins=5, strategy='quantile')),
])

# Apply the numeric pipeline to the numeric columns; every other column is
# forwarded untouched.
preprocessor = ColumnTransformer(
    [('numeric', numeric_transformer, numeric_cols)],
    remainder='passthrough',
)

In [35]:
# Random forest classifier.
# - max_features='sqrt' is what the former 'auto' alias meant for classifiers;
#   'auto' itself is rejected by recent scikit-learn releases.
# - random_state is fixed (same seed as the train/test split) so the fitted
#   forest, and the scores computed from it, can be reproduced on a re-run.
clf = RandomForestClassifier(n_estimators=200, min_samples_split=7,
                             min_samples_leaf=1, max_features='sqrt',
                             max_depth=32, bootstrap=False, criterion='gini',
                             random_state=123)


# Preprocessing and model are chained in a single pipeline so the same
# transformations are applied when fitting and when predicting.
pipe = Pipeline([('preprocessing', preprocessor), ('rf', clf)])

# Train
# ==============================================================================
# The result is bound to _ so the fitted pipeline is not echoed as cell output.
_ = pipe.fit(X=X_train, y=y_train)



In [36]:
# Log loss of the fitted pipeline on the 20% hold-out split.
holdout_proba = pipe.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=holdout_proba)

0.3628189473201792

In [37]:
# Score the labelled test file: class probabilities from the fitted pipeline,
# evaluated with log loss against the test file's own Target column.
test_features = test_set.drop(columns=['Opportunity_ID', 'Target'])
pred = pipe.predict_proba(test_features)
log_loss(test_set['Target'], pred)

0.46316922619408646

In [38]:
# Build the output table: one row per opportunity, with Target replaced by the
# predicted probability, and write it to CSV.
final = test_set.loc[:, ['Opportunity_ID', 'Target']].copy()
# Assign the probability array directly so values are matched by position.
# Wrapping it in a DataFrame would align on index labels and silently produce
# NaN if test_set ever lost its default 0..n-1 index (e.g. after filtering).
# [:, 1] takes the second probability column; assumes that is the positive
# class (check pipe.classes_) -- TODO confirm.
final['Target'] = pipe.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))[:, 1]
final.to_csv('new_cat.csv', index=False)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it runs.
# It sketches the hyper-parameter search space (random_grid) consumed by the
# randomized search in the following disabled cell.
# NOTE(review): 'auto' in max_features is no longer accepted by recent
# scikit-learn releases; drop it before re-enabling this cell.
'''
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

max_features = ['auto', 'sqrt', 'log2']

max_depth = [int(x) for x in np.linspace(10, 110, num = 10)]
max_depth.append(None)

min_samples_split = [2, 5, 7,8,10, 12, 13, 15, 17,18]

min_samples_leaf = [1, 2, 4,5,6,7,8,9,10]

bootstrap = [True, False]
criterion = ['gini', 'entropy']

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion':criterion}
print(random_grid)
'''

In [None]:
# Disabled cell (string literal, not executed): randomized search over
# random_grid with 100 sampled candidates and 2-fold cross-validation, fitted
# directly on X_train (a bare RandomForestClassifier, not the preprocessing
# pipeline), followed by class probabilities for X_test.
# If re-enabled, it rebinds the name clf and requires random_grid to exist.
'''
clf=RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train,y_train)

y_pred=rf_random.predict_proba(X_test)
'''

In [None]:
# Disabled: shows the best hyper-parameters found by the randomized search
# (only meaningful once rf_random has been fitted).
#rf_random.best_params_

In [None]:
# Disabled: hold-out log loss of the randomized-search predictions
# (needs y_pred from the disabled search cell).
#print(log_loss(y_test, y_pred))

In [None]:
# Disabled cell (string literal, not executed): fits a standalone
# RandomForestClassifier (200 trees, max_depth 32, no bootstrap) directly on
# X_train, without the preprocessing pipeline, and prints its hold-out log loss.
# NOTE(review): max_features='auto' is no longer accepted by recent
# scikit-learn releases; use 'sqrt' if this cell is re-enabled.
'''
c = RandomForestClassifier(n_estimators= 200, min_samples_split= 7, 
                           min_samples_leaf=1, max_features= 'auto', 
                           max_depth= 32,bootstrap= False, criterion='gini')
c.fit(X_train, y_train)
print(log_loss(y_test, c.predict_proba(X_test)))
'''