In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
import math

# Preprocesado y modelado
# ==============================================================================
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

import pickle

from datetime import datetime

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/FeatureEngineering/df_time_series.csv" )
df = df.drop(columns = {'Unnamed: 0', 'Delivery_Year', 'Delivery_Quarter'})
df

Unnamed: 0,Region,Total_Amount,Años en entregar,Month Created,Product_Name,Opportunity_Owner,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
0,EMEA,2.773332,0,2,Other,Other,0,1,0,2.496126,2.585640,,3.088401
1,EMEA,2.740425,0,2,Product_Name_210,Other,1,1,0,2.486798,2.545803,2.585640,2.098247
2,EMEA,2.999379,0,3,Product_Name_432,Other,1,1,0,2.499777,2.852610,2.545803,2.583974
3,APAC,2.780462,0,7,Other,Other,0,0,0,2.448872,2.594236,2.852610,1.552569
4,APAC,2.647444,0,2,Product_Name_91,Person_Name_11,1,1,1,2.467621,2.468684,,3.019307
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,Americas,2.845924,1,6,Product_Name_488,Person_Name_8,1,1,1,2.544620,2.706302,2.740744,2.097463
13579,Americas,2.819796,1,6,Product_Name_483,Person_Name_8,1,1,1,2.544620,2.706302,2.740744,2.097463
13580,Americas,2.845924,1,6,Product_Name_488,Person_Name_8,1,1,1,2.544620,2.706302,2.740744,2.097463
13581,Americas,3.032890,1,10,Product_Name_445,Person_Name_8,1,1,0,2.544620,2.706302,2.740744,2.097463


In [3]:
df = df.fillna(0)

In [4]:
df.describe()

Unnamed: 0,Total_Amount,Años en entregar,Month Created,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
count,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0
mean,2.65108,0.255025,6.574321,0.569388,0.77332,0.554001,2.449497,2.444527,2.339767,2.183309
std,0.17725,0.466251,3.421165,0.49518,0.4187,0.497094,0.098289,0.142587,0.531066,0.214954
min,0.0,-1.0,1.0,0.0,0.0,0.0,2.219173,1.336753,0.0,0.0
25%,2.588674,0.0,4.0,0.0,1.0,0.0,2.437621,2.401642,2.389793,2.088779
50%,2.656112,0.0,6.0,1.0,1.0,1.0,2.491107,2.465992,2.466176,2.159336
75%,2.753412,0.0,10.0,1.0,1.0,1.0,2.499777,2.54416,2.546746,2.217261
max,3.104954,5.0,12.0,1.0,1.0,1.0,2.553549,2.9739,2.85261,3.240224


# Encode

In [5]:
Mean_encoded_subject_region = df.groupby(['Region'])['Decision'].mean().to_dict() 

df['Region'] =  df['Region'].map(Mean_encoded_subject_region)

#--------------------------------------------------------------------------------------------

mean_encoded_product = df.groupby("Product_Name")["Decision"].mean().to_dict()

df["Product_Name"] = df["Product_Name"].map(mean_encoded_product)

#--------------------------------------------------------------------------------------------

#Mean_encoded_subject_quarter = df.groupby(['Delivery_Quarter'])['Decision'].mean().to_dict() 

#df['Delivery_Quarter'] =  df['Delivery_Quarter'].map(Mean_encoded_subject_quarter)

#--------------------------------------------------------------------------------------------

Mean_encoded_subject_owner = df.groupby(['Opportunity_Owner'])['Decision'].mean().to_dict() 

df['Opportunity_Owner'] =  df['Opportunity_Owner'].map(Mean_encoded_subject_owner)

# Split

In [6]:
X_test = df.tail(math.floor(df.shape[0]*0.20)).drop(columns = {'Decision'})
y_test = df.tail(math.floor(df.shape[0]*0.20))['Decision']
X_train = df.head(round(df.shape[0]*0.80)).drop(columns = {'Decision'})
y_train = df.head(round(df.shape[0]*0.80))['Decision']
X_train

Unnamed: 0,Region,Total_Amount,Años en entregar,Month Created,Product_Name,Opportunity_Owner,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
0,0.532255,2.773332,0,2,0.445844,0.000000,0,1,2.496126,2.585640,0.000000,3.088401
1,0.532255,2.740425,0,2,0.283582,0.000000,1,1,2.486798,2.545803,2.585640,2.098247
2,0.532255,2.999379,0,3,0.009174,0.000000,1,1,2.499777,2.852610,2.545803,2.583974
3,0.587888,2.780462,0,7,0.445844,0.000000,0,0,2.448872,2.594236,2.852610,1.552569
4,0.587888,2.647444,0,2,0.900000,0.371429,1,1,2.467621,2.468684,0.000000,3.019307
...,...,...,...,...,...,...,...,...,...,...,...,...
10861,0.587888,2.817755,0,9,0.055556,0.252427,1,1,2.448872,2.594527,2.441786,2.357726
10862,0.587888,2.793234,0,10,0.333333,0.252427,1,1,2.448872,2.594527,2.441786,2.357726
10863,0.587888,2.541749,0,10,0.722798,0.252427,1,1,2.448872,2.594527,2.441786,2.357726
10864,0.587888,2.772403,0,10,0.333333,0.252427,1,1,2.448872,2.594527,2.441786,2.357726


# Random Search

In [7]:
import sklearn as sk
import sklearn.neural_network


lr = 0.001           # learning rate
nn = [2, 16, 8, 1]  # número de neuronas por capa.
nr = [3, 15, 10]
nd = [1,20,6,2]

# Creamos el objeto del modelo de red neuronal multicapa.
modelRegresor = sk.neural_network.MLPRegressor(solver='adam', 
                                     activation = 'logistic',
                                     learning_rate_init=lr,
                                     hidden_layer_sizes=tuple(nn[1:]),
                                     verbose=True,
                                     n_iter_no_change=1000,
                                     batch_size = 64)

In [8]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [9]:
params = {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'alpha': [0.0001,0.10, 0.001,1, 10],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'hidden_layer_sizes': [tuple(nn[1:]), tuple(nr[1:]), tuple(nd[1:])],
        'learning_rate_init' : [0.1,0.01, 0.001],
        'n_iter_no_change' : [10, 1000, 100, 10000],
        'batch_size' : [64, 100, 200]
        }

In [10]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
random_search = RandomizedSearchCV(modelRegresor, param_distributions=params, n_iter=param_comb,scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001)

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:   57.7s finished


Iteration 1, loss = 1.81114933
Iteration 2, loss = 0.71168308
Iteration 3, loss = 0.47251980
Iteration 4, loss = 0.32832728
Iteration 5, loss = 0.24221854
Iteration 6, loss = 0.18935910
Iteration 7, loss = 0.15643221
Iteration 8, loss = 0.13314376
Iteration 9, loss = 0.11757707
Iteration 10, loss = 0.10706051
Iteration 11, loss = 0.09986258
Iteration 12, loss = 0.09541704
Iteration 13, loss = 0.09247522
Iteration 14, loss = 0.09057594
Iteration 15, loss = 0.08951520
Iteration 16, loss = 0.08853406
Iteration 17, loss = 0.08827133
Iteration 18, loss = 0.08759050
Iteration 19, loss = 0.08728386
Iteration 20, loss = 0.08712352
Iteration 21, loss = 0.08653697
Iteration 22, loss = 0.08618845
Iteration 23, loss = 0.08599126
Iteration 24, loss = 0.08574941
Iteration 25, loss = 0.08584665
Iteration 26, loss = 0.08543501
Iteration 27, loss = 0.08491701
Iteration 28, loss = 0.08498510
Iteration 29, loss = 0.08475467
Iteration 30, loss = 0.08481799
Iteration 31, loss = 0.08393912
Iteration 32, los

In [11]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([24.82265306, 25.25925215,  9.15331276, 10.6287547 ,  4.09762645]), 'std_fit_time': array([0.4947289 , 2.32881417, 1.73726636, 2.34139674, 0.70209254]), 'mean_score_time': array([0.01242375, 0.01068377, 0.00936476, 0.00783896, 0.00909344]), 'std_score_time': array([0.00014572, 0.00134891, 0.00039552, 0.00166166, 0.00268359]), 'param_n_iter_no_change': masked_array(data=[1000, 1000, 100, 10000, 10],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate_init': masked_array(data=[0.1, 0.01, 0.001, 0.001, 0.01],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=['constant', 'invscaling', 'adaptive', 'constant',
                   'invscaling'],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_hidden_layer_sizes': masked_arra

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_iter_no_change,param_learning_rate_init,param_learning_rate,param_hidden_layer_sizes,param_batch_size,param_alpha,param_activation,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,24.822653,0.494729,0.012424,0.000146,1000,0.1,constant,"(15, 10)",64,10.0,logistic,"{'n_iter_no_change': 1000, 'learning_rate_init...",0.58352,0.474337,0.593845,0.550567,0.054068,5
1,25.259252,2.328814,0.010684,0.001349,1000,0.01,invscaling,"(16, 8, 1)",100,0.1,relu,"{'n_iter_no_change': 1000, 'learning_rate_init...",0.5,0.855963,0.863045,0.739669,0.169496,4
2,9.153313,1.737266,0.009365,0.000396,100,0.001,adaptive,"(15, 10)",200,0.0001,identity,"{'n_iter_no_change': 100, 'learning_rate_init'...",0.857491,0.853278,0.855688,0.855486,0.001726,2
3,10.628755,2.341397,0.007839,0.001662,10000,0.001,constant,"(16, 8, 1)",200,0.0001,identity,"{'n_iter_no_change': 10000, 'learning_rate_ini...",0.857724,0.858423,0.855885,0.857344,0.00107,1
4,4.097626,0.702093,0.009093,0.002684,10,0.01,invscaling,"(15, 10)",64,0.1,logistic,"{'n_iter_no_change': 10, 'learning_rate_init':...",0.851716,0.851321,0.84825,0.850429,0.00155,3


In [13]:
LinReg_pred_random = random_search.predict(X_test)

In [14]:
probando = pd.DataFrame()
probando['target'] = LinReg_pred_random
probando.describe()

Unnamed: 0,target
count,2716.0
mean,0.518174
std,0.276749
min,-0.353489
25%,0.320076
50%,0.591459
75%,0.720125
max,1.163884


In [15]:
random_search.score(X_test, y_test)

0.8416370450945766

In [16]:
loss_random = log_loss(y_test  , LinReg_pred_random)
print(f"Log loss is {loss_random}")

Log loss is 0.5959359587587496


# No Serch

In [17]:
import sklearn as sk
import sklearn.neural_network


lr = 0.001           # learning rate
nn = [2, 16, 8, 1]  # número de neuronas por capa.

# Creamos el objeto del modelo de red neuronal multicapa.
modelRegresor = sk.neural_network.MLPRegressor(solver='adam', 
                                     activation = 'logistic',
                                     learning_rate_init=lr,
                                     hidden_layer_sizes=tuple(nn[1:]),
                                     verbose=True,
                                     n_iter_no_change=1000,
                                     batch_size = 64)

In [18]:
modelRegresor.fit(X_train, y_train)

Iteration 1, loss = 0.37413850
Iteration 2, loss = 0.19945453
Iteration 3, loss = 0.14182870
Iteration 4, loss = 0.12716625
Iteration 5, loss = 0.12458183
Iteration 6, loss = 0.12425088
Iteration 7, loss = 0.12421126
Iteration 8, loss = 0.12421647
Iteration 9, loss = 0.12421145
Iteration 10, loss = 0.12420953
Iteration 11, loss = 0.12420744
Iteration 12, loss = 0.12419793
Iteration 13, loss = 0.12420567
Iteration 14, loss = 0.12418888
Iteration 15, loss = 0.12414361
Iteration 16, loss = 0.12381134
Iteration 17, loss = 0.12027286
Iteration 18, loss = 0.10778773
Iteration 19, loss = 0.09150661
Iteration 20, loss = 0.08532256
Iteration 21, loss = 0.08314646
Iteration 22, loss = 0.08161652
Iteration 23, loss = 0.08086650
Iteration 24, loss = 0.08032618
Iteration 25, loss = 0.07991739
Iteration 26, loss = 0.07968840
Iteration 27, loss = 0.07944583
Iteration 28, loss = 0.07925249
Iteration 29, loss = 0.07930982
Iteration 30, loss = 0.07901708
Iteration 31, loss = 0.07890894
Iteration 32, los

MLPRegressor(activation='logistic', batch_size=64,
             hidden_layer_sizes=(16, 8, 1), n_iter_no_change=1000,
             verbose=True)

In [19]:
LinReg_pred = modelRegresor.predict(X_test)

In [20]:
probando = pd.DataFrame()
probando['target'] = LinReg_pred
probando.describe()

Unnamed: 0,target
count,2716.0
mean,0.602217
std,0.309079
min,-0.017504
25%,0.30068
50%,0.696388
75%,0.868411
max,1.015976


In [21]:
modelRegresor.score(X_test, y_test)

0.34252770842732805

In [22]:
loss = log_loss(y_test  , LinReg_pred)
print(f"Log loss is {loss}")

Log loss is 0.5345242151147312


In [23]:
#pickle.dump(modelRegresor, open('modeloRegresorRN.p', 'wb'))

# Kaggle

In [24]:
DataFrame_test = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/FeatureEngineering/df_time_series_test.csv" )
DataFrame_test

Unnamed: 0.1,Unnamed: 0,Opportunity_ID,Region,Total_Amount,Años en entregar,Month Created,Product_Name,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
0,171,10792,EMEA,12.087389,0,1,Product_Name_201,Person_Name_13,2019,Q1,1,1,11.753909,11.001991,,11.000704
1,226,10806,EMEA,10.325097,0,1,Product_Name_241,Person_Name_13,2019,Q1,0,1,11.753909,11.001991,,11.000704
2,238,10812,EMEA,11.479095,0,1,Product_Name_303,Person_Name_13,2019,Q1,1,1,11.753909,11.001991,,11.000704
3,270,10836,EMEA,10.155219,0,1,Product_Name_182,Person_Name_13,2019,Q1,1,1,11.753909,11.001991,,11.000704
4,282,10845,EMEA,9.975017,0,1,Product_Name_283,Person_Name_13,2019,Q1,1,1,11.753909,11.001991,,11.000704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2519,12347,Americas,15.340732,1,4,Product_Name_487,Person_Name_9,2020,Q2,1,1,13.556432,15.849836,0.905656,15.849826
1563,2520,12348,Americas,16.193551,1,4,Product_Name_475,Person_Name_9,2020,Q2,1,1,13.556432,15.849836,0.905656,15.849826
1564,2521,12349,Americas,15.831542,1,4,Product_Name_432,Person_Name_9,2020,Q2,1,1,13.556432,15.849836,0.905656,15.849826
1565,2522,12350,Americas,16.193551,1,4,Product_Name_475,Person_Name_9,2020,Q3,1,1,14.756877,16.193551,0.939492,16.193544


In [25]:
subir = pd.DataFrame()
subir['Opportunity_ID'] = DataFrame_test['Opportunity_ID']
DataFrame_test = DataFrame_test.drop(columns = {'Opportunity_ID', 'Unnamed: 0', 'Delivery_Year', 'Delivery_Quarter'}).fillna(0)
DataFrame_test

Unnamed: 0,Region,Total_Amount,Años en entregar,Month Created,Product_Name,Opportunity_Owner,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
0,EMEA,12.087389,0,1,Product_Name_201,Person_Name_13,1,1,11.753909,11.001991,0.000000,11.000704
1,EMEA,10.325097,0,1,Product_Name_241,Person_Name_13,0,1,11.753909,11.001991,0.000000,11.000704
2,EMEA,11.479095,0,1,Product_Name_303,Person_Name_13,1,1,11.753909,11.001991,0.000000,11.000704
3,EMEA,10.155219,0,1,Product_Name_182,Person_Name_13,1,1,11.753909,11.001991,0.000000,11.000704
4,EMEA,9.975017,0,1,Product_Name_283,Person_Name_13,1,1,11.753909,11.001991,0.000000,11.000704
...,...,...,...,...,...,...,...,...,...,...,...,...
1562,Americas,15.340732,1,4,Product_Name_487,Person_Name_9,1,1,13.556432,15.849836,0.905656,15.849826
1563,Americas,16.193551,1,4,Product_Name_475,Person_Name_9,1,1,13.556432,15.849836,0.905656,15.849826
1564,Americas,15.831542,1,4,Product_Name_432,Person_Name_9,1,1,13.556432,15.849836,0.905656,15.849826
1565,Americas,16.193551,1,4,Product_Name_475,Person_Name_9,1,1,14.756877,16.193551,0.939492,16.193544


# Encoding

In [26]:
DataFrame_test['Region'] =  DataFrame_test['Region'].map(Mean_encoded_subject_region)

#--------------------------------------------------------------------------------------------

DataFrame_test["Product_Name"] = DataFrame_test["Product_Name"].map(mean_encoded_product)

#--------------------------------------------------------------------------------------------

#DataFrame_test['Delivery_Quarter'] =  DataFrame_test['Delivery_Quarter'].map(Mean_encoded_subject_quarter)

#--------------------------------------------------------------------------------------------

DataFrame_test['Opportunity_Owner'] =  DataFrame_test['Opportunity_Owner'].map(Mean_encoded_subject_owner)
DataFrame_test = DataFrame_test.fillna(0.406068)

In [27]:
DataFrame_test.isnull().sum()

Region                                 0
Total_Amount                           0
Años en entregar                       0
Month Created                          0
Product_Name                           0
Opportunity_Owner                      0
Pricing, Delivery_Terms_Approved       0
Pricing, Delivery_Terms_Quote_Appr     0
Promedio_Region_Por_Quarter            0
Promedio_Owner_Por_Year_And_Quarter    0
Lag_1                                  0
Delta                                  0
dtype: int64

# Prediction

In [28]:
pred_posta = modelRegresor.predict(DataFrame_test)

In [29]:
subir['target'] = pred_posta
subir.set_index('Opportunity_ID', inplace = True)
subir

Unnamed: 0_level_0,target
Opportunity_ID,Unnamed: 1_level_1
10792,-0.029849
10806,-0.027832
10812,-0.029611
10836,-0.027267
10845,-0.027453
...,...
12347,-0.030298
12348,-0.030325
12349,-0.030327
12350,-0.030320


In [30]:
subir.describe()

Unnamed: 0,target
count,1567.0
mean,-0.02607
std,0.028361
min,-0.030502
25%,-0.030085
50%,-0.029474
75%,-0.028094
max,0.690071


In [31]:
#subir.to_csv('RedesNeuronales_Entrega_9.csv')

# Random Search

In [32]:
pred_posta = random_search.predict(DataFrame_test)

In [34]:
subir['target'] = pred_posta
subir

Unnamed: 0_level_0,target
Opportunity_ID,Unnamed: 1_level_1
10792,-0.136945
10806,0.601262
10812,-0.042961
10836,0.775432
10845,0.665877
...,...
12347,-0.267525
12348,-0.604637
12349,-0.700715
12350,-0.478323


In [35]:
subir.describe()

Unnamed: 0,target
count,1567.0
mean,0.353762
std,0.652268
min,-1.645667
25%,-0.067649
50%,0.325813
75%,0.721734
max,3.610881
