In [106]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
import category_encoders as ce


# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.impute import KNNImputer

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('once')

In [107]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

In [108]:
train_fts = train_set.columns

imputer_train = KNNImputer()
imputer_train.fit(train_set)
train_set = pd.DataFrame(imputer_train.transform(train_set))

train_set.columns = train_fts

In [109]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [110]:
rf = RandomForestClassifier()

In [111]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [112]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [113]:
f_imp = rf.feature_importances_
f = X_test.columns
fts = ['Opportunity_ID', 'Target']

for i in range (len(f_imp)):
    print("{: >10}\t\t{: >50}".format(f[i],f_imp[i]))
    if f_imp[i]>=0.0095:
        fts.append(f[i])

Pricing, Delivery_Terms_Quote_Appr		                             0.0019192720370139285
Pricing, Delivery_Terms_Approved		                              0.005222124708653475
Bureaucratic_Code_0_Approval		                              0.002519549024576439
Bureaucratic_Code_0_Approved		                             0.0018572589336858963
Quote_Type		                             0.0006448300272668097
Convertibility		                              0.009991123669211219
Total_Amount_Sum_USD		                              0.021714094145734296
Total_Taxable_Amount_USD		                               0.01647423970545952
Year_Creation		                              0.005561843469507605
Month_Creation		                              0.008597326507364193
Year_Delivery		                              0.004499679835403851
Month_Delivery		                              0.008686675451262837
Days_Passed		                              0.021734540954510453
Wait_Time_Days		                               0.0726493

In [114]:
len(fts)-2

34

In [115]:
train_filtered = train_set.loc[:, fts]

X = train_filtered.drop(columns=['Opportunity_ID', 'Target'])
y = train_filtered.Target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [116]:
'''
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
'''

"\nn_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]\n\nmax_features = ['auto', 'sqrt']\n\nmax_depth = [int(x) for x in np.linspace(10, 110, num = 5)]\nmax_depth.append(None)\n\nmin_samples_split = [2, 5, 10]\n\nmin_samples_leaf = [1, 2, 4]\n\nbootstrap = [True, False]\n\nrandom_grid = {'n_estimators': n_estimators,\n               'max_features': max_features,\n               'max_depth': max_depth,\n               'min_samples_split': min_samples_split,\n               'min_samples_leaf': min_samples_leaf,\n               'bootstrap': bootstrap}\nprint(random_grid)\n"

In [117]:
'''
clf=RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train,y_train)

y_pred=rf_random.predict_proba(X_test)
'''

'\nclf=RandomForestClassifier()\nrf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)\n\nrf_random.fit(X_train,y_train)\n\ny_pred=rf_random.predict_proba(X_test)\n'

In [118]:
#rf_random.best_params_

In [119]:
#print(log_loss(y_test, y_pred))

In [120]:
c = RandomForestClassifier(n_estimators= 2000, min_samples_split= 2, 
                           min_samples_leaf=5, max_features= 'sqrt', 
                           max_depth= 30,bootstrap= False)
c.fit(X_train, y_train)
print(log_loss(y_test, c.predict_proba(X_test)))

0.31158504240223006


# EN TEST_SET

In [121]:
test_filtered = test_set.loc[:, fts]

In [122]:
X_test_set = test_filtered.drop(columns=['Target', 'Opportunity_ID'])
y_test_set = test_filtered.Target

X_test_fts = X_test_set.columns

imputer_test = KNNImputer()
imputer_test.fit(X_test_set)
X_test_set = pd.DataFrame(imputer_test.transform(X_test_set))

X_test_set.columns=X_test_fts

In [123]:
test_pred = c.predict_proba(X_test_set)

In [124]:
log_loss(y_test_set, test_pred)

0.4538450051358958

In [125]:
test_filtered['Prediction'] = pd.DataFrame(test_pred)[1].to_list()
test_filtered.loc[:, ['Opportunity_ID', 'Target', 'Prediction']]

Unnamed: 0,Opportunity_ID,Target,Prediction
0,10689,1,0.730973
1,10690,1,0.515754
2,10691,1,0.419551
3,10692,1,0.401279
4,10693,1,0.939666
...,...,...,...
1562,12364,1,0.960352
1563,12365,1,0.323449
1564,12366,1,0.289360
1565,12367,0,0.363214


In [126]:
test_filtered['Prediction_Aux'] = 1 - test_filtered.Prediction
test_filtered.loc[:, ['Opportunity_ID', 'Target', 'Prediction', 'Prediction_Aux']]

Unnamed: 0,Opportunity_ID,Target,Prediction,Prediction_Aux
0,10689,1,0.730973,0.269027
1,10690,1,0.515754,0.484246
2,10691,1,0.419551,0.580449
3,10692,1,0.401279,0.598721
4,10693,1,0.939666,0.060334
...,...,...,...,...
1562,12364,1,0.960352,0.039648
1563,12365,1,0.323449,0.676551
1564,12366,1,0.289360,0.710640
1565,12367,0,0.363214,0.636786


In [128]:
log_loss(test_filtered.Target, test_filtered.Prediction_Aux)

1.6025260398414778

In [129]:
submission = test_filtered.loc[:, ['Opportunity_ID', 'Prediction']]
submission.columns = ['Opportunity_ID', 'Target']

In [130]:
submission.head()

Unnamed: 0,Opportunity_ID,Target
0,10689,0.730973
1,10690,0.515754
2,10691,0.419551
3,10692,0.401279
4,10693,0.939666


In [131]:
submission.to_csv('predictions/sub_rf.csv', index=False)