In [329]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

# Configuración matplotlib
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
style.use('ggplot') or plt.style.use('ggplot')

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [330]:
df = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/Training.csv" )
df

Unnamed: 0,ID,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Stage,Prod_Category_A
0,27761,EMEA,,1,1,1,1,0,Bureaucratic_Code_4,6/16/2015,...,Q2,2016,NaT,10,EUR,5272800.00,EUR,5272800.0,Closed Lost,Prod_Category_A_None
1,27760,EMEA,,0,0,0,0,0,Bureaucratic_Code_4,6/16/2015,...,Q1,2016,NaT,0,EUR,48230.00,EUR,48230.0,Closed Won,Prod_Category_A_None
2,27446,Americas,NW America,0,0,0,0,0,Bureaucratic_Code_4,4/21/2015,...,Q1,2016,NaT,0,USD,83865.60,USD,83865.6,Closed Won,Prod_Category_A_None
3,16808,Americas,NW America,1,0,1,0,0,Bureaucratic_Code_5,7/27/2013,...,Q1,2018,NaT,14,USD,7421881.50,USD,7421881.5,Closed Lost,Prod_Category_A_None
4,16805,Americas,NW America,1,0,1,0,0,Bureaucratic_Code_5,7/27/2013,...,Q1,2018,NaT,25,USD,13357192.50,USD,13357192.5,Closed Lost,Prod_Category_A_None
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16942,8781,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q1,2016,NaT,0,EUR,103350.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16943,8786,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q2,2016,NaT,0,EUR,93015.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16944,8792,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q1,2016,NaT,0,EUR,103350.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16945,28561,Americas,NE America,1,1,1,1,0,Bureaucratic_Code_4,10/20/2015,...,Q2,2016,NaT,4,USD,2346796.88,USD,0.0,Closed Lost,Prod_Category_A_None


In [331]:
df.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount'] = df['Total_Amount']*0.0096
df.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount'] = df['Total_Amount']*1.17
df.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount'] = df['Total_Amount']*0.70
df.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount'] = df['Total_Amount']*1.29
df.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount_Currency'] = 'USD'

In [332]:
short_df = df[['Region','Total_Amount','TRF','Opportunity_Owner','Delivery_Year','Delivery_Quarter','Pricing, Delivery_Terms_Approved','Pricing, Delivery_Terms_Quote_Appr','Stage' ]].rename(columns={'Stage': 'Decision'})
short_df = short_df[ (short_df['Decision'] == 'Closed Won') | (short_df['Decision'] == 'Closed Lost') ]
short_df['Decision'] = np.where(short_df['Decision'] == 'Closed Won',1,0)
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
0,EMEA,6169176.00,10,Person_Name_18,2016,Q2,1,1,0
1,EMEA,56429.10,0,Person_Name_20,2016,Q1,0,0,1
2,Americas,83865.60,0,Person_Name_8,2016,Q1,0,0,1
3,Americas,7421881.50,14,Person_Name_8,2018,Q1,0,1,0
4,Americas,13357192.50,25,Person_Name_8,2018,Q1,0,1,0
...,...,...,...,...,...,...,...,...,...
16942,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16943,EMEA,108827.55,0,Person_Name_13,2016,Q2,1,1,1
16944,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16945,Americas,2346796.88,4,Person_Name_3,2016,Q2,1,1,0


In [333]:
short_df = short_df[short_df['Total_Amount'] > 0]
short_df.describe()

Unnamed: 0,Total_Amount,TRF,Delivery_Year,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
count,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0
mean,1010612.0,2.28166,2017.134927,0.58544,0.807834,0.56803
std,5873179.0,12.084678,0.820031,0.492661,0.394014,0.495365
min,0.01,0.0,2016.0,0.0,0.0,0.0
25%,4750.453,0.0,2016.0,0.0,1.0,0.0
50%,77140.0,0.0,2017.0,1.0,1.0,1.0
75%,291466.9,1.0,2018.0,1.0,1.0,1.0
max,315000100.0,500.0,2019.0,1.0,1.0,1.0


In [334]:
short_df.loc[(short_df.apply(lambda x: x.map(x.value_counts()))['Opportunity_Owner']) < round(short_df.shape[0] * 0.01),'Opportunity_Owner'] = 'Other'
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
0,EMEA,6169176.00,10,Person_Name_18,2016,Q2,1,1,0
1,EMEA,56429.10,0,Other,2016,Q1,0,0,1
2,Americas,83865.60,0,Person_Name_8,2016,Q1,0,0,1
3,Americas,7421881.50,14,Person_Name_8,2018,Q1,0,1,0
4,Americas,13357192.50,25,Person_Name_8,2018,Q1,0,1,0
...,...,...,...,...,...,...,...,...,...
16942,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16943,EMEA,108827.55,0,Person_Name_13,2016,Q2,1,1,1
16944,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16945,Americas,2346796.88,4,Person_Name_3,2016,Q2,1,1,0


In [335]:
short_df['Promedio'] = short_df.groupby(['Delivery_Year','Opportunity_Owner','Delivery_Quarter'])['Total_Amount'].transform('mean')
short_df = short_df.sort_values(['Opportunity_Owner','Delivery_Year', 'Delivery_Quarter'])
short_df.head(25)

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio
1,EMEA,56429.1,0,Other,2016,Q1,0,0,1,1036480.0
180,EMEA,125482.5,0,Other,2016,Q1,0,0,0,1036480.0
182,EMEA,76658.4,0,Other,2016,Q1,0,0,1,1036480.0
183,EMEA,287469.0,0,Other,2016,Q1,0,0,1,1036480.0
184,EMEA,246402.0,0,Other,2016,Q1,0,0,1,1036480.0
185,EMEA,126732.9375,0,Other,2016,Q1,0,0,0,1036480.0
194,Japan,86835.84,0,Other,2016,Q1,0,0,1,1036480.0
195,Japan,95040.0,0,Other,2016,Q1,1,1,1,1036480.0
196,Japan,326.4,0,Other,2016,Q1,1,1,1,1036480.0
197,Japan,249.6,0,Other,2016,Q1,1,1,1,1036480.0


# Lag_1 y Delta

In [336]:
last_person = 'NaN'
i = 0
Lag_1 = []
for person in short_df['Opportunity_Owner']:
    if person == last_person:
        if (short_df['Delivery_Quarter'][short_df.index[i]]) == (short_df['Delivery_Quarter'][short_df.index[i - 1]]):
            Lag_1.append(Lag_1[i - 1])
        else:
            Lag_1.append(short_df['Promedio'][short_df.index[i - 1]])
    else:
        Lag_1.append(np.nan)
        last_person = person
    i = i + 1
short_df['Lag_1'] = Lag_1
short_df['Delta'] = short_df['Promedio'] - short_df['Lag_1'].fillna(0)
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio,Lag_1,Delta
1,EMEA,56429.10,0,Other,2016,Q1,0,0,1,1.036480e+06,,1.036480e+06
180,EMEA,125482.50,0,Other,2016,Q1,0,0,0,1.036480e+06,,1.036480e+06
182,EMEA,76658.40,0,Other,2016,Q1,0,0,1,1.036480e+06,,1.036480e+06
183,EMEA,287469.00,0,Other,2016,Q1,0,0,1,1.036480e+06,,1.036480e+06
184,EMEA,246402.00,0,Other,2016,Q1,0,0,1,1.036480e+06,,1.036480e+06
...,...,...,...,...,...,...,...,...,...,...,...,...
14179,Americas,1938332.88,6,Person_Name_8,2019,Q3,1,1,1,1.441206e+07,1.221451e+07,2.197554e+06
14180,Americas,1243338.53,4,Person_Name_8,2019,Q3,1,1,1,1.441206e+07,1.221451e+07,2.197554e+06
14181,Americas,1938332.88,6,Person_Name_8,2019,Q3,1,1,1,1.441206e+07,1.221451e+07,2.197554e+06
15466,Americas,66787517.13,195,Person_Name_8,2019,Q3,1,1,0,1.441206e+07,1.221451e+07,2.197554e+06


# Log

In [337]:
short_df.Total_Amount = np.log(short_df.Total_Amount)
short_df.Lag_1 = np.log(short_df.Lag_1)
short_df.Promedio = np.log(short_df.Promedio)
short_df.Delta = np.log(short_df.Delta + 1 - min(short_df.Delta))
short_df.describe()

Unnamed: 0,Total_Amount,TRF,Delivery_Year,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio,Lag_1,Delta
count,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0,15520.0,16772.0
mean,10.470044,2.28166,2017.134927,0.58544,0.807834,0.56803,12.334148,12.624562,16.055206
std,3.308478,12.084678,0.820031,0.492661,0.394014,0.495365,2.343259,2.257019,0.294646
min,-4.60517,0.0,2016.0,0.0,0.0,0.0,7.959184,7.959184,0.0
25%,8.465995,0.0,2016.0,0.0,1.0,0.0,10.971975,11.505608,16.040237
50%,11.253377,0.0,2017.0,1.0,1.0,1.0,13.159299,13.455352,16.086761
75%,12.582682,1.0,2018.0,1.0,1.0,1.0,14.080868,14.262234,16.105114
max,19.568083,500.0,2019.0,1.0,1.0,1.0,17.296622,17.296622,17.334309


In [338]:
short_df.to_csv('df_mejorado.csv')

# Kaggle

In [339]:
DataFrame_test = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/Test/Test.csv" )
DataFrame_test

Unnamed: 0,ID,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Month,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Prod_Category_A
0,6140,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,162240.0,EUR,367419.0,Prod_Category_A_None
1,6146,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,78624.0,EUR,367419.0,Prod_Category_A_None
2,6151,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,126555.0,EUR,367419.0,Prod_Category_A_None
3,6118,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 6,Q2,2019,NaT,1,EUR,243360.0,EUR,757783.5,Prod_Category_A_None
4,6124,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 6,Q2,2019,NaT,0,EUR,157248.0,EUR,757783.5,Prod_Category_A_None
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,16345,EMEA,KSA,1,1,0,0,0,Bureaucratic_Code_4,6/12/2017,...,2019 - 5,Q2,2019,NaT,0,USD,124740.0,USD,147750.0,Prod_Category_A_None
2547,15218,Americas,SE America,1,1,0,0,0,Bureaucratic_Code_4,6/8/2018,...,2019 - 10,Q4,2019,NaT,0,USD,45054.9,USD,45054.9,Prod_Category_A_None
2548,15224,Americas,SE America,1,1,1,1,0,Bureaucratic_Code_4,6/8/2018,...,2019 - 10,Q4,2019,NaT,0,USD,100122.0,USD,100122.0,Prod_Category_A_None
2549,7286,Americas,NE America,1,1,0,0,0,Bureaucratic_Code_4,8/29/2018,...,2019 - 8,Q3,2019,NaT,0,USD,143220.0,USD,143220.0,Prod_Category_A_None


In [340]:
DataFrame_test.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount'] = DataFrame_test['Total_Amount']*0.0096
DataFrame_test.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount'] = DataFrame_test['Total_Amount']*1.17
DataFrame_test.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount'] = DataFrame_test['Total_Amount']*0.70
DataFrame_test.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount'] = DataFrame_test['Total_Amount']*1.29
DataFrame_test.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount_Currency'] = 'USD'

In [341]:
DataFrame_test = DataFrame_test[['Opportunity_ID','Region','Total_Amount','TRF','Delivery_Year','Opportunity_Owner','Delivery_Quarter','Pricing, Delivery_Terms_Approved','Pricing, Delivery_Terms_Quote_Appr' ]]
DataFrame_test = DataFrame_test.drop_duplicates('Opportunity_ID',keep = 'last')
DataFrame_test

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr"
2,10689,EMEA,126555.0,0,2019,Person_Name_13,Q2,1,1
7,10690,EMEA,112464.0,0,2019,Person_Name_13,Q2,1,1
8,10691,Americas,21037.5,0,2019,Person_Name_9,Q4,1,1
14,10692,Americas,228327.0,1,2019,Person_Name_8,Q4,1,1
15,10693,Americas,5752.5,0,2019,Person_Name_64,Q2,0,0
...,...,...,...,...,...,...,...,...,...
2546,12364,EMEA,124740.0,0,2019,Person_Name_13,Q2,1,1
2547,12365,Americas,45054.9,0,2019,Person_Name_38,Q4,1,1
2548,12366,Americas,100122.0,0,2019,Person_Name_38,Q4,1,1
2549,12367,Americas,143220.0,0,2019,Person_Name_9,Q3,1,1


In [342]:
for persona in DataFrame_test.drop_duplicates('Opportunity_Owner', keep = 'last')['Opportunity_Owner']:
    flag = False
    for chequeo in short_df.drop_duplicates('Opportunity_Owner', keep = 'last')['Opportunity_Owner']:
        if persona == chequeo:
            flag = True
    if flag == False:
        DataFrame_test.loc[DataFrame_test['Opportunity_Owner'] == persona, 'Opportunity_Owner'] = 'Other'
DataFrame_test

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr"
2,10689,EMEA,126555.0,0,2019,Person_Name_13,Q2,1,1
7,10690,EMEA,112464.0,0,2019,Person_Name_13,Q2,1,1
8,10691,Americas,21037.5,0,2019,Other,Q4,1,1
14,10692,Americas,228327.0,1,2019,Person_Name_8,Q4,1,1
15,10693,Americas,5752.5,0,2019,Person_Name_64,Q2,0,0
...,...,...,...,...,...,...,...,...,...
2546,12364,EMEA,124740.0,0,2019,Person_Name_13,Q2,1,1
2547,12365,Americas,45054.9,0,2019,Person_Name_38,Q4,1,1
2548,12366,Americas,100122.0,0,2019,Person_Name_38,Q4,1,1
2549,12367,Americas,143220.0,0,2019,Other,Q3,1,1


In [343]:
DataFrame_test['Opportunity_Owner'].value_counts()

Other             506
Person_Name_64    130
Person_Name_18     99
Person_Name_13     97
Person_Name_8      94
Person_Name_16     82
Person_Name_62     64
Person_Name_4      62
Person_Name_3      60
Person_Name_38     57
Person_Name_46     53
Person_Name_32     53
Person_Name_50     42
Person_Name_65     42
Person_Name_54     28
Person_Name_43     28
Person_Name_60     22
Person_Name_52     18
Person_Name_49     18
Person_Name_66     11
Person_Name_23      1
Name: Opportunity_Owner, dtype: int64

In [344]:
DataFrame_test['Promedio'] = DataFrame_test.groupby(['Delivery_Year','Opportunity_Owner','Delivery_Quarter'])['Total_Amount'].transform('mean')
DataFrame_test = DataFrame_test.sort_values(['Opportunity_Owner','Delivery_Year', 'Delivery_Quarter'])
DataFrame_test.head(25)

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio
117,10757,Americas,4851.0,0,2019,Other,Q1,0,0,641846.154359
125,10758,Americas,4568.85,0,2019,Other,Q1,0,1,641846.154359
145,10772,Japan,3967200.0,0,2019,Other,Q1,1,1,641846.154359
150,10777,APAC,97574.4,0,2019,Other,Q1,0,0,641846.154359
151,10778,EMEA,50960.0,0,2019,Other,Q1,1,1,641846.154359
154,10779,EMEA,5668.65,0,2019,Other,Q1,1,1,641846.154359
172,10793,Japan,16045120.0,0,2019,Other,Q1,0,0,641846.154359
173,10794,Japan,16045120.0,0,2019,Other,Q1,0,0,641846.154359
232,10808,EMEA,101376.0,0,2019,Other,Q1,0,1,641846.154359
236,10811,APAC,53727.3,0,2019,Other,Q1,0,0,641846.154359


# Lag_1 y Delta

In [345]:
last_person = 'NaN'
i = 0
Lag_1_test = []
for person in DataFrame_test['Opportunity_Owner']:
    if person == last_person:
        if (DataFrame_test['Delivery_Quarter'][DataFrame_test.index[i]]) == (DataFrame_test['Delivery_Quarter'][DataFrame_test.index[i - 1]]):
            Lag_1_test.append(Lag_1_test[i - 1])
        else:
            Lag_1_test.append(short_df['Promedio'][DataFrame_test.index[i - 1]])
    else:
        Lag_1_test.append(np.nan)
        last_person = person
    i = i + 1
DataFrame_test['Lag_1'] = Lag_1_test
DataFrame_test['Delta'] = DataFrame_test['Promedio'] - DataFrame_test['Lag_1'].fillna(0)
DataFrame_test

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio,Lag_1,Delta
117,10757,Americas,4851.00,0,2019,Other,Q1,0,0,6.418462e+05,,6.418462e+05
125,10758,Americas,4568.85,0,2019,Other,Q1,0,1,6.418462e+05,,6.418462e+05
145,10772,Japan,3967200.00,0,2019,Other,Q1,1,1,6.418462e+05,,6.418462e+05
150,10777,APAC,97574.40,0,2019,Other,Q1,0,0,6.418462e+05,,6.418462e+05
151,10778,EMEA,50960.00,0,2019,Other,Q1,1,1,6.418462e+05,,6.418462e+05
...,...,...,...,...,...,...,...,...,...,...,...,...
1445,11690,Americas,64749955.60,175,2020,Person_Name_8,Q1,0,1,1.608113e+08,13.851341,1.608112e+08
1451,11691,Americas,11099948.20,30,2020,Person_Name_8,Q1,0,1,1.608113e+08,13.851341,1.608112e+08
1524,11748,Americas,38999961.00,100,2020,Person_Name_8,Q1,0,1,1.608113e+08,13.851341,1.608112e+08
969,11385,Americas,8462961.00,22,2020,Person_Name_8,Q2,1,1,8.462961e+06,13.501731,8.462947e+06


# Log

In [346]:
DataFrame_test.Total_Amount = np.log(DataFrame_test.Total_Amount)
DataFrame_test.Lag_1 = np.log(DataFrame_test.Lag_1)
DataFrame_test.Promedio = np.log(DataFrame_test.Promedio)
DataFrame_test.Delta = np.log(DataFrame_test.Delta + 1 - min(DataFrame_test.Delta))
DataFrame_test.describe()

Unnamed: 0,Opportunity_ID,Total_Amount,TRF,Delivery_Year,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio,Lag_1,Delta
count,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1061.0,1567.0
mean,11554.0485,11.930953,3.067007,2019.031908,0.493937,0.619655,13.195414,2.431769,13.165366
std,492.205946,1.990895,36.395461,0.179407,0.500123,0.485626,1.49659,0.279604,1.551913
min,10689.0,4.359014,0.0,2018.0,0.0,0.0,8.630522,2.077836,0.0
25%,11080.5,10.917225,0.0,2019.0,0.0,0.0,11.932543,2.081695,11.895047
50%,11583.0,11.705435,0.0,2019.0,0.0,1.0,13.372104,2.577129,13.363342
75%,11976.5,12.797007,1.0,2019.0,1.0,1.0,14.24677,2.6608,14.243117
max,12368.0,19.996115,1272.0,2020.0,1.0,1.0,19.924758,2.775416,19.924746


In [347]:
DataFrame_test.to_csv('df_mejorado_test.csv')