In [21]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
import category_encoders as ce


# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.impute import KNNImputer

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('once')

In [22]:
# Load the prepared train/test CSVs and turn +/-inf into NaN so that the
# imputation step further down can treat them like any other missing value.
train_set, test_set = (
    pd.read_csv(csv_path, encoding='utf-8').replace([-np.inf, np.inf], np.nan)
    for csv_path in ('train_prepared.csv', 'test_prepared.csv')
)

In [23]:
# Target vector first, then the feature matrix: every column except the label
# and the opportunity identifier.
y = train_set['Target']
X = train_set.drop(['Target', 'Opportunity_ID'], axis=1)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [25]:
# Disabled hyper-parameter grid for a randomized search, kept for reference.
# The whole cell is one string literal, so none of the code inside it runs and
# `random_grid` is never defined; because the literal is the cell's last
# expression, Jupyter echoes it back as the cell output.
# NOTE(review): 'auto' for max_features is rejected by recent scikit-learn
# releases -- confirm the accepted values before re-enabling this grid.
'''
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 5)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
'''

"\nn_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 5)]\n\nmax_features = ['auto', 'sqrt']\n\nmax_depth = [int(x) for x in np.linspace(10, 110, num = 5)]\nmax_depth.append(None)\n\nmin_samples_split = [2, 5, 10]\n\nmin_samples_leaf = [1, 2, 4]\n\nbootstrap = [True, False]\n\nrandom_grid = {'n_estimators': n_estimators,\n               'max_features': max_features,\n               'max_depth': max_depth,\n               'min_samples_split': min_samples_split,\n               'min_samples_leaf': min_samples_leaf,\n               'bootstrap': bootstrap}\nprint(random_grid)\n"

In [26]:
# Disabled randomized hyper-parameter search, kept for reference.
# The cell is a single string literal, so nothing here executes: `rf_random`
# and `y_pred` are never created. The code inside also reads `random_grid`,
# which is itself only defined inside a disabled string cell, so both cells
# would have to be re-enabled together.
'''
clf=RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train,y_train)

y_pred=rf_random.predict_proba(X_test)
'''

'\nclf=RandomForestClassifier()\nrf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)\n\nrf_random.fit(X_train,y_train)\n\ny_pred=rf_random.predict_proba(X_test)\n'

In [27]:
#rf_random.best_params_

In [28]:
#print(log_loss(y_test, y_pred))

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

# Random forest with fixed hyper-parameters (presumably taken from the disabled
# randomized search -- TODO confirm).
# random_state seeds the per-split feature sub-sampling so that a fresh
# Restart & Run All reproduces the same fitted model and the same metrics; the
# value mirrors the seed already used for the train/validation split.
c = RandomForestClassifier(n_estimators=50, min_samples_split=2,
                           min_samples_leaf=5, max_features='sqrt',
                           max_depth=25, bootstrap=False,
                           random_state=123)

# Columns that receive imputation + scaling; every other column is passed
# through to the classifier untouched (see `remainder` below).
numeric_cols = X_train.select_dtypes(include=['int', 'float64']).columns.to_list()

# Transformations for the numeric variables: fill missing values with
# SimpleImputer's default strategy, then standardise.
numeric_transformer = Pipeline(
                        steps=[
                            ('imputer', SimpleImputer()),
                            ('scaler', StandardScaler())
                        ]
                      )

preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, numeric_cols)
                        ],
                    remainder='passthrough'
                    )

# Preprocessing and classifier are chained so the imputer/scaler statistics are
# learned from the training split only.
pipe = Pipeline([('preprocessing', preprocessor),('rf',c)])

pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Convertibility',
                                                   'Total_Taxable_Amount_USD',
                                                   'Year_Creation',
                                                   'Month_Creation',
                                                   'Year_Delivery',
                                                   'Days_Passed',
                                                   'Wait_Time_Days',
                             

In [30]:
from sklearn.metrics import f1_score

# Hold-out performance: log-loss on the predicted probabilities (printed) and
# F1 on the hard class predictions (shown as the cell result).
holdout_proba = pipe.predict_proba(X_test)
print(log_loss(y_test, holdout_proba))
holdout_labels = pipe.predict(X_test)
f1_score(y_test, holdout_labels)

0.3431545729262328


0.8655139289145054

In [31]:
# Same two metrics on the training split, for comparison with the hold-out
# numbers.
train_proba = pipe.predict_proba(X_train)
print(log_loss(y_train, train_proba))
train_labels = pipe.predict(X_train)
f1_score(y_train, train_labels)

0.14300618872292314


0.983218163869694

# EN TEST_SET

In [32]:
# Score the external test set using exactly the columns (and column order) the
# pipeline was fitted on. Selecting by X_train.columns, rather than dropping
# only 'Opportunity_ID' and 'Target', keeps this cell re-runnable after the
# following cells have added 'Prediction' / 'Prediction_Aux' to test_set.
test_pred = pipe.predict_proba(test_set[X_train.columns])

In [33]:
# Column 1 of the probability matrix is stored as the prediction for each row.
test_set['Prediction'] = test_pred[:, 1]
test_set[['Opportunity_ID', 'Prediction']]

Unnamed: 0,Opportunity_ID,Prediction
0,10689,0.718500
1,10690,0.520365
2,10691,0.468365
3,10692,0.332762
4,10693,0.928786
...,...,...
1562,12364,0.945413
1563,12365,0.302048
1564,12366,0.312817
1565,12367,0.454833


In [34]:
# Complementary probability, used below to check which orientation of the
# score matches the Target labels.
test_set['Prediction_Aux'] = 1 - test_set['Prediction']
test_set[['Opportunity_ID', 'Prediction', 'Prediction_Aux']]

Unnamed: 0,Opportunity_ID,Prediction,Prediction_Aux
0,10689,0.718500,0.281500
1,10690,0.520365,0.479635
2,10691,0.468365,0.531635
3,10692,0.332762,0.667238
4,10693,0.928786,0.071214
...,...,...,...
1562,12364,0.945413,0.054587
1563,12365,0.302048,0.697952
1564,12366,0.312817,0.687183
1565,12367,0.454833,0.545167


In [35]:
# Log-loss of the complementary probability against the test labels.
log_loss(test_set['Target'], test_set['Prediction_Aux'])

1.7667264714063666

In [36]:
# Log-loss of the stored prediction against the test labels.
log_loss(test_set['Target'], test_set['Prediction'])

0.47131555916723217

In [37]:
# Submission frame: the opportunity id plus the predicted probability,
# published under the column name 'Target'.
submission = (
    test_set[['Opportunity_ID', 'Prediction']]
    .rename(columns={'Prediction': 'Target'})
)

In [38]:
# Preview the first five rows of the submission before writing it out.
submission.head(n=5)

Unnamed: 0,Opportunity_ID,Target
0,10689,0.7185
1,10690,0.520365
2,10691,0.468365
3,10692,0.332762
4,10693,0.928786


In [39]:
from pathlib import Path

# Write the submission file. The output directory is created first because
# to_csv does not create missing parent directories and would otherwise fail
# on a fresh checkout.
submission_path = Path('predictions/sub_rf.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)