In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm


# Preprocesado y modelado
# ==============================================================================
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

from sklearn.metrics import log_loss

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

import pickle

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/Training.csv" )
df

Unnamed: 0,ID,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Stage,Prod_Category_A
0,27761,EMEA,,1,1,1,1,0,Bureaucratic_Code_4,6/16/2015,...,Q2,2016,NaT,10,EUR,5272800.00,EUR,5272800.0,Closed Lost,Prod_Category_A_None
1,27760,EMEA,,0,0,0,0,0,Bureaucratic_Code_4,6/16/2015,...,Q1,2016,NaT,0,EUR,48230.00,EUR,48230.0,Closed Won,Prod_Category_A_None
2,27446,Americas,NW America,0,0,0,0,0,Bureaucratic_Code_4,4/21/2015,...,Q1,2016,NaT,0,USD,83865.60,USD,83865.6,Closed Won,Prod_Category_A_None
3,16808,Americas,NW America,1,0,1,0,0,Bureaucratic_Code_5,7/27/2013,...,Q1,2018,NaT,14,USD,7421881.50,USD,7421881.5,Closed Lost,Prod_Category_A_None
4,16805,Americas,NW America,1,0,1,0,0,Bureaucratic_Code_5,7/27/2013,...,Q1,2018,NaT,25,USD,13357192.50,USD,13357192.5,Closed Lost,Prod_Category_A_None
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16942,8781,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q1,2016,NaT,0,EUR,103350.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16943,8786,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q2,2016,NaT,0,EUR,93015.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16944,8792,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q1,2016,NaT,0,EUR,103350.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16945,28561,Americas,NE America,1,1,1,1,0,Bureaucratic_Code_4,10/20/2015,...,Q2,2016,NaT,4,USD,2346796.88,USD,0.0,Closed Lost,Prod_Category_A_None


In [3]:
df.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount'] = df['Total_Amount']*0.0096
df.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount'] = df['Total_Amount']*1.17
df.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount'] = df['Total_Amount']*0.70
df.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount'] = df['Total_Amount']*1.29
df.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount_Currency'] = 'USD'

In [4]:
short_df = df[['Region','Total_Amount','TRF','Pricing, Delivery_Terms_Approved','Pricing, Delivery_Terms_Quote_Appr','Stage' ]].rename(columns={'Stage': 'Decision'})
short_df = short_df[ (short_df['Decision'] == 'Closed Won') | (short_df['Decision'] == 'Closed Lost') ]
short_df['Decision'] = np.where(short_df['Decision'] == 'Closed Won',1,0)
short_df

Unnamed: 0,Region,Total_Amount,TRF,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
0,EMEA,6169176.00,10,1,1,0
1,EMEA,56429.10,0,0,0,1
2,Americas,83865.60,0,0,0,1
3,Americas,7421881.50,14,0,1,0
4,Americas,13357192.50,25,0,1,0
...,...,...,...,...,...,...
16942,EMEA,120919.50,0,1,1,1
16943,EMEA,108827.55,0,1,1,1
16944,EMEA,120919.50,0,1,1,1
16945,Americas,2346796.88,4,1,1,0


In [5]:
short_df = short_df[short_df['Total_Amount'] > 0]
short_df.describe()

Unnamed: 0,Total_Amount,TRF,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
count,16772.0,16772.0,16772.0,16772.0,16772.0
mean,1010612.0,2.28166,0.58544,0.807834,0.56803
std,5873179.0,12.084678,0.492661,0.394014,0.495365
min,0.01,0.0,0.0,0.0,0.0
25%,4750.453,0.0,0.0,1.0,0.0
50%,77140.0,0.0,1.0,1.0,1.0
75%,291466.9,1.0,1.0,1.0,1.0
max,315000100.0,500.0,1.0,1.0,1.0


In [6]:
short_df.Total_Amount = np.log(short_df.Total_Amount)
short_df.shape

(16772, 6)

# Encode

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
                                        short_df.drop(columns = 'Decision'),
                                        short_df['Decision'],
                                        random_state = 123
                                    )
# One-hot-encoding de las variables categóricas
# ==============================================================================
# Se identifica el nobre de las columnas numéricas y categóricas
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.to_list()
numeric_cols = X_train.select_dtypes(include=['float64', 'int']).columns.to_list()

# Se aplica one-hot-encoding solo a las columnas categóricas
preprocessor = ColumnTransformer(
                    [('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                    remainder='passthrough'
               )

# Una vez que se ha definido el objeto ColumnTransformer, con el método fit()
# se aprenden las transformaciones con los datos de entrenamiento y se aplican a
# los dos conjuntos con transform(). Ambas operaciones a la vez con fit_transform().
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

#El resultado devuelto por ColumnTransformer es un numpy array, por lo que se pierden los nombres de las columnas. Es interesante poder inspeccionar cómo queda el set de datos tras el preprocesado en formato dataframe. Por defecto, OneHotEncoder ordena las nuevas columnas de izquierda a derecha por orden alfabético.

# Convertir el output del ColumnTransformer en dataframe y añadir nombre columnas
# ==============================================================================
# Nombre de todas las columnas
encoded_cat = preprocessor.named_transformers_['onehot'].get_feature_names(cat_cols)
labels = np.concatenate([encoded_cat,numeric_cols])

# Conversión a dataframe
X_train_prep = pd.DataFrame(X_train_prep, columns=labels)
X_test_prep  = pd.DataFrame(X_test_prep, columns=labels)
X_train_prep.info()

AttributeError: 'numpy.ndarray' object has no attribute 'info'

In [8]:
X_train_prep

Unnamed: 0,Region_APAC,Region_Americas,Region_EMEA,Region_Japan,Region_Middle East,Total_Amount,TRF,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr"
0,0.0,0.0,0.0,1.0,0.0,16.166902,30.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,9.922952,0.0,1.0,1.0
2,1.0,0.0,0.0,0.0,0.0,9.682704,0.0,1.0,1.0
3,0.0,0.0,0.0,1.0,0.0,10.608936,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,11.353461,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
12574,0.0,0.0,0.0,1.0,0.0,4.086312,0.0,0.0,1.0
12575,0.0,0.0,0.0,1.0,0.0,5.933751,0.0,1.0,1.0
12576,1.0,0.0,0.0,0.0,0.0,8.825266,0.0,1.0,1.0
12577,0.0,0.0,0.0,1.0,0.0,5.714604,0.0,1.0,1.0


In [9]:
model = MLPClassifier((2, ), random_state = 0, learning_rate_init = 0.1, activation = "logistic")

In [10]:
model.fit(X_train_prep, y_train)

MLPClassifier(activation='logistic', hidden_layer_sizes=(2,),
              learning_rate_init=0.1, random_state=0)

In [11]:
model.predict(X_test_prep)

array([0, 1, 1, ..., 1, 1, 1])

In [12]:
model.score(X_test_prep, y_test)

0.7624612449320296

In [13]:
LinReg_pred = model.predict(X_test_prep)
loss = log_loss(y_test  , LinReg_pred)
print(f"Log loss is {loss}")

Log loss is 8.204452411936442


# Regresion

In [14]:
modelRegresor = MLPRegressor(activation = 'relu', hidden_layer_sizes = (60,60,60), solver = 'lbfgs')

In [15]:
modelRegresor.fit(X_train_prep, y_train)

MLPRegressor(hidden_layer_sizes=(60, 60, 60), solver='lbfgs')

In [16]:
LinReg_pred = modelRegresor.predict(X_test_prep)

In [17]:
probando = pd.DataFrame()
probando['target'] = LinReg_pred
probando.describe()

Unnamed: 0,target
count,4193.0
mean,0.56811
std,0.311251
min,-0.139106
25%,0.358669
50%,0.629729
75%,0.802438
max,1.42581


In [18]:
modelRegresor.score(X_test_prep, y_test)

0.39301277686071334

In [19]:
loss = log_loss(y_test  , LinReg_pred)
print(f"Log loss is {loss}")

Log loss is 0.5167036964798567


In [20]:
pickle.dump(modelRegresor, open('modeloRegresorRN.p', 'wb'))

# Kaggle

In [21]:
DataFrame_test = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/Test/Test.csv" )
DataFrame_test

Unnamed: 0,ID,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Month,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Prod_Category_A
0,6140,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,162240.0,EUR,367419.0,Prod_Category_A_None
1,6146,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,78624.0,EUR,367419.0,Prod_Category_A_None
2,6151,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,126555.0,EUR,367419.0,Prod_Category_A_None
3,6118,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 6,Q2,2019,NaT,1,EUR,243360.0,EUR,757783.5,Prod_Category_A_None
4,6124,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 6,Q2,2019,NaT,0,EUR,157248.0,EUR,757783.5,Prod_Category_A_None
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,16345,EMEA,KSA,1,1,0,0,0,Bureaucratic_Code_4,6/12/2017,...,2019 - 5,Q2,2019,NaT,0,USD,124740.0,USD,147750.0,Prod_Category_A_None
2547,15218,Americas,SE America,1,1,0,0,0,Bureaucratic_Code_4,6/8/2018,...,2019 - 10,Q4,2019,NaT,0,USD,45054.9,USD,45054.9,Prod_Category_A_None
2548,15224,Americas,SE America,1,1,1,1,0,Bureaucratic_Code_4,6/8/2018,...,2019 - 10,Q4,2019,NaT,0,USD,100122.0,USD,100122.0,Prod_Category_A_None
2549,7286,Americas,NE America,1,1,0,0,0,Bureaucratic_Code_4,8/29/2018,...,2019 - 8,Q3,2019,NaT,0,USD,143220.0,USD,143220.0,Prod_Category_A_None


In [22]:
DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'JPY', 'Total_Amount'] = DataFrame_test['Total_Amount']*0.0096
DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'JPY', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'EUR', 'Total_Amount'] = DataFrame_test['Total_Amount']*1.17
DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'EUR', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'AUD', 'Total_Amount'] = DataFrame_test['Total_Amount']*0.70
DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'AUD', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'GBP', 'Total_Amount'] = DataFrame_test['Total_Amount']*1.29
DataFrame_test.loc[DataFrame_test['Total_Amount_Currency'] == 'GBP', 'Total_Amount_Currency'] = 'USD'

In [23]:
DataFrame_test = DataFrame_test[['Opportunity_ID','Region','Total_Amount','TRF','Pricing, Delivery_Terms_Approved','Pricing, Delivery_Terms_Quote_Appr' ]]
DataFrame_test = DataFrame_test.drop_duplicates('Opportunity_ID',keep = 'last')
subir = pd.DataFrame()
subir['Opportunity_ID'] = DataFrame_test['Opportunity_ID']
DataFrame_test = DataFrame_test.drop(columns = ['Opportunity_ID'])
DataFrame_test

Unnamed: 0,Region,Total_Amount,TRF,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr"
2,EMEA,148069.350,0,1,1
7,EMEA,131582.880,0,1,1
8,Americas,21037.500,0,1,1
14,Americas,228327.000,1,1,1
15,Americas,5752.500,0,0,0
...,...,...,...,...,...
2546,EMEA,124740.000,0,1,1
2547,Americas,45054.900,0,1,1
2548,Americas,100122.000,0,1,1
2549,Americas,143220.000,0,1,1


In [24]:
DataFrame_test.Total_Amount = np.log(DataFrame_test.Total_Amount)
DataFrame_test['Total_Amount'].describe()

count    1567.000000
mean       11.581272
std         1.843534
min         4.516018
25%        10.700326
50%        11.664994
75%        12.496506
max        19.996115
Name: Total_Amount, dtype: float64

# Encoding

In [25]:
# One-hot-encoding de las variables categóricas
# ==============================================================================
# Se identifica el nobre de las columnas numéricas y categóricas
cat_cols = DataFrame_test.select_dtypes(include=['object', 'category']).columns.to_list()
numeric_cols = DataFrame_test.select_dtypes(include=['float64', 'int']).columns.to_list()

# Se aplica one-hot-encoding solo a las columnas categóricas
preprocessor = ColumnTransformer(
                    [('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                    remainder='passthrough'
               )

# Una vez que se ha definido el objeto ColumnTransformer, con el método fit()
# se aprenden las transformaciones con los datos de entrenamiento y se aplican a
# los dos conjuntos con transform(). Ambas operaciones a la vez con fit_transform().
DataFrame_test_prep = preprocessor.fit_transform(DataFrame_test)

#El resultado devuelto por ColumnTransformer es un numpy array, por lo que se pierden los nombres de las columnas. Es interesante poder inspeccionar cómo queda el set de datos tras el preprocesado en formato dataframe. Por defecto, OneHotEncoder ordena las nuevas columnas de izquierda a derecha por orden alfabético.

# Convertir el output del ColumnTransformer en dataframe y añadir nombre columnas
# ==============================================================================
# Nombre de todas las columnas
encoded_cat = preprocessor.named_transformers_['onehot'].get_feature_names(cat_cols)
labels = np.concatenate([encoded_cat, numeric_cols])

# Conversión a dataframe
DataFrame_test_prep = pd.DataFrame(DataFrame_test_prep, columns=labels)
DataFrame_test_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Data columns (total 9 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Region_APAC                         1567 non-null   float64
 1   Region_Americas                     1567 non-null   float64
 2   Region_EMEA                         1567 non-null   float64
 3   Region_Japan                        1567 non-null   float64
 4   Region_Middle East                  1567 non-null   float64
 5   Total_Amount                        1567 non-null   float64
 6   TRF                                 1567 non-null   float64
 7   Pricing, Delivery_Terms_Approved    1567 non-null   float64
 8   Pricing, Delivery_Terms_Quote_Appr  1567 non-null   float64
dtypes: float64(9)
memory usage: 110.3 KB


# Prediction

In [26]:
pred_posta = modelRegresor.predict(DataFrame_test_prep)

In [27]:
subir['target'] = pred_posta
subir.set_index('Opportunity_ID', inplace = True)
subir

Unnamed: 0_level_0,target
Opportunity_ID,Unnamed: 1_level_1
10689,0.598296
10690,0.615872
10691,0.607911
10692,0.501724
10693,0.705794
...,...
12364,0.623824
12365,0.600705
12366,0.539873
12367,0.512258


In [28]:
subir.describe()

Unnamed: 0,target
count,1567.0
mean,0.558376
std,0.222193
min,-0.056309
25%,0.405358
50%,0.609417
75%,0.705481
max,1.177333


In [29]:
subir.to_csv('RedesNeuronales_Entrega_3.csv')

# Otra forma

In [30]:
import sklearn as sk
import sklearn.neural_network


lr = 0.01           # learning rate
nn = [2, 16, 8, 1]  # número de neuronas por capa.

# Creamos el objeto del modelo de red neuronal multicapa.
clf = sk.neural_network.MLPRegressor(solver='adam', 
                                     activation = 'logistic',
                                     learning_rate_init=lr,
                                     hidden_layer_sizes=tuple(nn[1:]),
                                     verbose=True,
                                     n_iter_no_change=1000,
                                     batch_size = 64)

In [31]:
clf.fit(X_train_prep, y_train)

Iteration 1, loss = 0.12023946
Iteration 2, loss = 0.09245277
Iteration 3, loss = 0.08299989
Iteration 4, loss = 0.07935505
Iteration 5, loss = 0.07795114
Iteration 6, loss = 0.07739887
Iteration 7, loss = 0.07745922
Iteration 8, loss = 0.07735022
Iteration 9, loss = 0.07683985
Iteration 10, loss = 0.07699831
Iteration 11, loss = 0.07701800
Iteration 12, loss = 0.07669531
Iteration 13, loss = 0.07654976
Iteration 14, loss = 0.07658646
Iteration 15, loss = 0.07614724
Iteration 16, loss = 0.07627567
Iteration 17, loss = 0.07629762
Iteration 18, loss = 0.07610948
Iteration 19, loss = 0.07633619
Iteration 20, loss = 0.07613818
Iteration 21, loss = 0.07600729
Iteration 22, loss = 0.07592808
Iteration 23, loss = 0.07604916
Iteration 24, loss = 0.07597046
Iteration 25, loss = 0.07579305
Iteration 26, loss = 0.07613769
Iteration 27, loss = 0.07571758
Iteration 28, loss = 0.07604861
Iteration 29, loss = 0.07597759
Iteration 30, loss = 0.07555006
Iteration 31, loss = 0.07568707
Iteration 32, los

MLPRegressor(activation='logistic', batch_size=64,
             hidden_layer_sizes=(16, 8, 1), learning_rate_init=0.01,
             n_iter_no_change=1000, verbose=True)

In [32]:
predict = clf.predict(X_test_prep)

In [33]:
clf.score(X_test_prep, y_test)

0.40042091651989475

In [34]:
loss = log_loss(y_test  , predict)
print(f"Log loss is {loss}")

Log loss is 0.44699259861741203


In [35]:
pred_posta = clf.predict(DataFrame_test_prep)

In [36]:
subir['target'] = pred_posta
subir

Unnamed: 0_level_0,target
Opportunity_ID,Unnamed: 1_level_1
10689,0.581747
10690,0.622592
10691,0.573534
10692,0.515179
10693,0.686917
...,...
12364,0.641084
12365,0.566921
12366,0.554812
12367,0.546703


In [37]:
subir.to_csv('RedesNeuronales_Entrega_4.csv')