In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
import math

# Preprocesado y modelado
# ==============================================================================
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

import pickle

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/FeatureEngineering/df_time_series.csv" )
df = df.drop(columns = {'Unnamed: 0', 'Delivery_Year', 'Delivery_Quarter'})
df

Unnamed: 0,Region,Total_Amount,Años en entregar,Month Created,Product_Name,Opportunity_Owner,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
0,EMEA,2.773332,0,2,Product_Name_191,Other,0,1,0,2.496126,2.585640,,3.088401
1,EMEA,2.740425,0,2,Product_Name_210,Other,1,1,0,2.486798,2.545803,2.585640,2.098247
2,EMEA,2.999379,0,3,Product_Name_432,Other,1,1,0,2.499777,2.852610,2.545803,2.583974
3,APAC,2.780462,0,7,Product_Name_461,Other,0,0,0,2.448872,2.594236,2.852610,1.552569
4,APAC,2.647444,0,2,Product_Name_91,Person_Name_11,1,1,1,2.467621,2.468684,,3.019307
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13578,Americas,2.845924,1,6,Product_Name_488,Person_Name_8,1,1,1,2.544620,2.706302,2.740744,2.097463
13579,Americas,2.819796,1,6,Product_Name_483,Person_Name_8,1,1,1,2.544620,2.706302,2.740744,2.097463
13580,Americas,2.845924,1,6,Product_Name_488,Person_Name_8,1,1,1,2.544620,2.706302,2.740744,2.097463
13581,Americas,3.032890,1,10,Product_Name_445,Person_Name_8,1,1,0,2.544620,2.706302,2.740744,2.097463


In [3]:
df = df.fillna(0)

In [4]:
df.describe()

Unnamed: 0,Total_Amount,Años en entregar,Month Created,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
count,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0,13583.0
mean,2.65108,0.255025,6.574321,0.569388,0.77332,0.554001,2.449497,2.444527,2.339767,2.183309
std,0.17725,0.466251,3.421165,0.49518,0.4187,0.497094,0.098289,0.142587,0.531066,0.214954
min,0.0,-1.0,1.0,0.0,0.0,0.0,2.219173,1.336753,0.0,0.0
25%,2.588674,0.0,4.0,0.0,1.0,0.0,2.437621,2.401642,2.389793,2.088779
50%,2.656112,0.0,6.0,1.0,1.0,1.0,2.491107,2.465992,2.466176,2.159336
75%,2.753412,0.0,10.0,1.0,1.0,1.0,2.499777,2.54416,2.546746,2.217261
max,3.104954,5.0,12.0,1.0,1.0,1.0,2.553549,2.9739,2.85261,3.240224


# Encode

In [5]:
Mean_encoded_subject_region = df.groupby(['Region'])['Decision'].mean().to_dict() 

df['Region'] =  df['Region'].map(Mean_encoded_subject_region)

#--------------------------------------------------------------------------------------------

mean_encoded_product = df.groupby("Product_Name")["Decision"].mean().to_dict()

df["Product_Name"] = df["Product_Name"].map(mean_encoded_product)

#--------------------------------------------------------------------------------------------

#Mean_encoded_subject_quarter = df.groupby(['Delivery_Quarter'])['Decision'].mean().to_dict() 

#df['Delivery_Quarter'] =  df['Delivery_Quarter'].map(Mean_encoded_subject_quarter)

#--------------------------------------------------------------------------------------------

Mean_encoded_subject_owner = df.groupby(['Opportunity_Owner'])['Decision'].mean().to_dict() 

df['Opportunity_Owner'] =  df['Opportunity_Owner'].map(Mean_encoded_subject_owner)

# Split

In [6]:
X_test = df.tail(math.floor(df.shape[0]*0.20)).drop(columns = {'Decision'})
y_test = df.tail(math.floor(df.shape[0]*0.20))['Decision']
X_train = df.head(round(df.shape[0]*0.80)).drop(columns = {'Decision'})
y_train = df.head(round(df.shape[0]*0.80))['Decision']
X_train

Unnamed: 0,Region,Total_Amount,Años en entregar,Month Created,Product_Name,Opportunity_Owner,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
0,0.532255,2.773332,0,2,Product_Name_191,0.000000,0,1,2.496126,2.585640,0.000000,3.088401
1,0.532255,2.740425,0,2,Product_Name_210,0.000000,1,1,2.486798,2.545803,2.585640,2.098247
2,0.532255,2.999379,0,3,Product_Name_432,0.000000,1,1,2.499777,2.852610,2.545803,2.583974
3,0.587888,2.780462,0,7,Product_Name_461,0.000000,0,0,2.448872,2.594236,2.852610,1.552569
4,0.587888,2.647444,0,2,Product_Name_91,0.371429,1,1,2.467621,2.468684,0.000000,3.019307
...,...,...,...,...,...,...,...,...,...,...,...,...
10861,0.587888,2.817755,0,9,Product_Name_326,0.252427,1,1,2.448872,2.594527,2.441786,2.357726
10862,0.587888,2.793234,0,10,Product_Name_324,0.252427,1,1,2.448872,2.594527,2.441786,2.357726
10863,0.587888,2.541749,0,10,Product_Name_111,0.252427,1,1,2.448872,2.594527,2.441786,2.357726
10864,0.587888,2.772403,0,10,Product_Name_324,0.252427,1,1,2.448872,2.594527,2.441786,2.357726


# Regresion

In [7]:
import sklearn as sk
import sklearn.neural_network


lr = 0.001           # learning rate
nn = [2, 16, 8, 1]  # número de neuronas por capa.

# Creamos el objeto del modelo de red neuronal multicapa.
modelRegresor = sk.neural_network.MLPRegressor(solver='adam', 
                                     activation = 'logistic',
                                     learning_rate_init=lr,
                                     hidden_layer_sizes=tuple(nn[1:]),
                                     verbose=True,
                                     n_iter_no_change=1000,
                                     batch_size = 64)

In [8]:
modelRegresor.fit(X_train, y_train)

ValueError: could not convert string to float: 'Product_Name_191'

In [None]:
LinReg_pred = modelRegresor.predict(X_test)

In [None]:
probando = pd.DataFrame()
probando['target'] = LinReg_pred
probando.describe()

In [None]:
modelRegresor.score(X_test, y_test)

In [None]:
loss = log_loss(y_test  , LinReg_pred)
print(f"Log loss is {loss}")

In [None]:
#pickle.dump(modelRegresor, open('modeloRegresorRN.p', 'wb'))

# Kaggle

In [None]:
DataFrame_test = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/FeatureEngineering/df_time_series_test.csv" )
DataFrame_test

In [None]:
subir = pd.DataFrame()
subir['Opportunity_ID'] = DataFrame_test['Opportunity_ID']
DataFrame_test = DataFrame_test.drop(columns = {'Opportunity_ID', 'Unnamed: 0', 'Delivery_Year', 'Delivery_Quarter'}).fillna(0)
DataFrame_test

# Encoding

In [None]:
DataFrame_test['Region'] =  DataFrame_test['Region'].map(Mean_encoded_subject_region) 

#DataFrame_test['Delivery_Quarter'] =  DataFrame_test['Delivery_Quarter'].map(Mean_encoded_subject_quarter)

DataFrame_test['Opportunity_Owner'] =  DataFrame_test['Opportunity_Owner'].map(Mean_encoded_subject_owner)
DataFrame_test = DataFrame_test.fillna(0.406068)

In [None]:
DataFrame_test.isnull().sum()

# Prediction

In [None]:
pred_posta = modelRegresor.predict(DataFrame_test)

In [None]:
subir['target'] = pred_posta
subir.set_index('Opportunity_ID', inplace = True)
subir

In [None]:
subir.describe()

In [None]:
#subir.to_csv('RedesNeuronales_Entrega_8.csv')