In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as ticker
import seaborn as sns

# Preprocesado y modelado
# ==============================================================================
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

# Configuración matplotlib
# ==============================================================================
plt.rcParams['image.cmap'] = "bwr"
#plt.rcParams['figure.dpi'] = "100"
plt.rcParams['savefig.bbox'] = "tight"
style.use('ggplot') or plt.style.use('ggplot')

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/Training.csv" )
df

Unnamed: 0,ID,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Stage,Prod_Category_A
0,27761,EMEA,,1,1,1,1,0,Bureaucratic_Code_4,6/16/2015,...,Q2,2016,NaT,10,EUR,5272800.00,EUR,5272800.0,Closed Lost,Prod_Category_A_None
1,27760,EMEA,,0,0,0,0,0,Bureaucratic_Code_4,6/16/2015,...,Q1,2016,NaT,0,EUR,48230.00,EUR,48230.0,Closed Won,Prod_Category_A_None
2,27446,Americas,NW America,0,0,0,0,0,Bureaucratic_Code_4,4/21/2015,...,Q1,2016,NaT,0,USD,83865.60,USD,83865.6,Closed Won,Prod_Category_A_None
3,16808,Americas,NW America,1,0,1,0,0,Bureaucratic_Code_5,7/27/2013,...,Q1,2018,NaT,14,USD,7421881.50,USD,7421881.5,Closed Lost,Prod_Category_A_None
4,16805,Americas,NW America,1,0,1,0,0,Bureaucratic_Code_5,7/27/2013,...,Q1,2018,NaT,25,USD,13357192.50,USD,13357192.5,Closed Lost,Prod_Category_A_None
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16942,8781,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q1,2016,NaT,0,EUR,103350.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16943,8786,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q2,2016,NaT,0,EUR,93015.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16944,8792,EMEA,Austria,1,1,1,1,0,Bureaucratic_Code_4,1/15/2016,...,Q1,2016,NaT,0,EUR,103350.00,EUR,299715.0,Closed Won,Prod_Category_A_None
16945,28561,Americas,NE America,1,1,1,1,0,Bureaucratic_Code_4,10/20/2015,...,Q2,2016,NaT,4,USD,2346796.88,USD,0.0,Closed Lost,Prod_Category_A_None


In [3]:
df.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount'] = df['Total_Amount']*0.0096
df.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount'] = df['Total_Amount']*1.17
df.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount'] = df['Total_Amount']*0.70
df.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount_Currency'] = 'USD'

df.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount'] = df['Total_Amount']*1.29
df.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount_Currency'] = 'USD'

In [4]:
short_df = df[['Region','Total_Amount','TRF','Opportunity_Owner','Delivery_Year','Delivery_Quarter','Pricing, Delivery_Terms_Approved','Pricing, Delivery_Terms_Quote_Appr','Stage' ]].rename(columns={'Stage': 'Decision'})
short_df = short_df[ (short_df['Decision'] == 'Closed Won') | (short_df['Decision'] == 'Closed Lost') ]
short_df['Decision'] = np.where(short_df['Decision'] == 'Closed Won',1,0)
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
0,EMEA,6169176.00,10,Person_Name_18,2016,Q2,1,1,0
1,EMEA,56429.10,0,Person_Name_20,2016,Q1,0,0,1
2,Americas,83865.60,0,Person_Name_8,2016,Q1,0,0,1
3,Americas,7421881.50,14,Person_Name_8,2018,Q1,0,1,0
4,Americas,13357192.50,25,Person_Name_8,2018,Q1,0,1,0
...,...,...,...,...,...,...,...,...,...
16942,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16943,EMEA,108827.55,0,Person_Name_13,2016,Q2,1,1,1
16944,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16945,Americas,2346796.88,4,Person_Name_3,2016,Q2,1,1,0


In [5]:
short_df = short_df[short_df['Total_Amount'] > 0]
short_df.describe()

Unnamed: 0,Total_Amount,TRF,Delivery_Year,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
count,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0
mean,1010612.0,2.28166,2017.134927,0.58544,0.807834,0.56803
std,5873179.0,12.084678,0.820031,0.492661,0.394014,0.495365
min,0.01,0.0,2016.0,0.0,0.0,0.0
25%,4750.453,0.0,2016.0,0.0,1.0,0.0
50%,77140.0,0.0,2017.0,1.0,1.0,1.0
75%,291466.9,1.0,2018.0,1.0,1.0,1.0
max,315000100.0,500.0,2019.0,1.0,1.0,1.0


# Cambio los OpportunityOwner < 5 a Other

In [6]:
short_df.loc[(short_df.apply(lambda x: x.map(x.value_counts()))['Opportunity_Owner']) < 5,'Opportunity_Owner'] = 'Other'
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision
0,EMEA,6169176.00,10,Person_Name_18,2016,Q2,1,1,0
1,EMEA,56429.10,0,Person_Name_20,2016,Q1,0,0,1
2,Americas,83865.60,0,Person_Name_8,2016,Q1,0,0,1
3,Americas,7421881.50,14,Person_Name_8,2018,Q1,0,1,0
4,Americas,13357192.50,25,Person_Name_8,2018,Q1,0,1,0
...,...,...,...,...,...,...,...,...,...
16942,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16943,EMEA,108827.55,0,Person_Name_13,2016,Q2,1,1,1
16944,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1
16945,Americas,2346796.88,4,Person_Name_3,2016,Q2,1,1,0


# Creo El promedio de las regiones por quarter

In [7]:
short_df['Promedio_Region_Por_Quarter'] = short_df.groupby(['Region','Delivery_Quarter'])['Total_Amount'].transform('mean')
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter
0,EMEA,6169176.00,10,Person_Name_18,2016,Q2,1,1,0,1.102320e+06
1,EMEA,56429.10,0,Person_Name_20,2016,Q1,0,0,1,1.080380e+06
2,Americas,83865.60,0,Person_Name_8,2016,Q1,0,0,1,2.273304e+06
3,Americas,7421881.50,14,Person_Name_8,2018,Q1,0,1,0,2.273304e+06
4,Americas,13357192.50,25,Person_Name_8,2018,Q1,0,1,0,2.273304e+06
...,...,...,...,...,...,...,...,...,...,...
16942,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1,1.080380e+06
16943,EMEA,108827.55,0,Person_Name_13,2016,Q2,1,1,1,1.102320e+06
16944,EMEA,120919.50,0,Person_Name_13,2016,Q1,1,1,1,1.080380e+06
16945,Americas,2346796.88,4,Person_Name_3,2016,Q2,1,1,0,2.225721e+06


# Ordeno por Anio, OpportunityOwner y Quarter

In [8]:
short_df['Promedio_Owner_Por_Year_And_Quarter'] = short_df.groupby(['Delivery_Year','Opportunity_Owner','Delivery_Quarter'])['Total_Amount'].transform('mean')
short_df = short_df.sort_values(['Opportunity_Owner','Delivery_Year', 'Delivery_Quarter'])
short_df.head(25)

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter
1923,EMEA,580580.0,1,Other,2016,Q1,0,1,0,1080380.0,580580.0
1593,EMEA,345746.0,1,Other,2017,Q2,1,1,0,1102320.0,345746.0
1817,EMEA,33698320.0,80,Other,2017,Q3,1,1,0,1083249.0,33698320.0
14286,APAC,651061.1,2,Other,2018,Q4,0,0,0,1217925.0,651061.1
2153,APAC,87360.0,0,Person_Name_11,2016,Q1,1,1,1,1553124.0,335516.2
16184,APAC,115637.5,0,Person_Name_11,2016,Q1,0,1,0,1553124.0,335516.2
16279,APAC,521664.0,1,Person_Name_11,2016,Q1,0,1,0,1553124.0,335516.2
16280,APAC,210630.0,0,Person_Name_11,2016,Q1,0,1,0,1553124.0,335516.2
16677,APAC,14250.0,0,Person_Name_11,2016,Q1,0,0,0,1553124.0,335516.2
16680,APAC,820000.0,0,Person_Name_11,2016,Q1,0,0,0,1553124.0,335516.2


# Lag_1 y Delta

In [9]:
last_person = 'NaN'
i = 0
Lag_1 = []
for person in short_df['Opportunity_Owner']:
    if person == last_person:
        if (short_df['Delivery_Quarter'][short_df.index[i]]) == (short_df['Delivery_Quarter'][short_df.index[i - 1]]):
            Lag_1.append(Lag_1[i - 1])
        else:
            Lag_1.append(short_df['Promedio_Owner_Por_Year_And_Quarter'][short_df.index[i - 1]])
    else:
        Lag_1.append(np.nan)
        last_person = person
    i = i + 1
short_df['Lag_1'] = Lag_1
short_df['Delta'] = short_df['Promedio_Owner_Por_Year_And_Quarter'] - short_df['Lag_1'].fillna(0)
short_df

Unnamed: 0,Region,Total_Amount,TRF,Opportunity_Owner,Delivery_Year,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
1923,EMEA,5.805800e+05,1,Other,2016,Q1,0,1,0,1.080380e+06,5.805800e+05,,5.805800e+05
1593,EMEA,3.457460e+05,1,Other,2017,Q2,1,1,0,1.102320e+06,3.457460e+05,5.805800e+05,-2.348340e+05
1817,EMEA,3.369832e+07,80,Other,2017,Q3,1,1,0,1.083249e+06,3.369832e+07,3.457460e+05,3.335257e+07
14286,APAC,6.510611e+05,2,Other,2018,Q4,0,0,0,1.217925e+06,6.510611e+05,3.369832e+07,-3.304726e+07
2153,APAC,8.736000e+04,0,Person_Name_11,2016,Q1,1,1,1,1.553124e+06,3.355162e+05,,3.355162e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14179,Americas,1.938333e+06,6,Person_Name_8,2019,Q3,1,1,1,2.239389e+06,1.441206e+07,1.221451e+07,2.197554e+06
14180,Americas,1.243339e+06,4,Person_Name_8,2019,Q3,1,1,1,2.239389e+06,1.441206e+07,1.221451e+07,2.197554e+06
14181,Americas,1.938333e+06,6,Person_Name_8,2019,Q3,1,1,1,2.239389e+06,1.441206e+07,1.221451e+07,2.197554e+06
15466,Americas,6.678752e+07,195,Person_Name_8,2019,Q3,1,1,0,2.239389e+06,1.441206e+07,1.221451e+07,2.197554e+06


# Log

In [10]:
short_df.Total_Amount = np.log(short_df.Total_Amount)
short_df.Lag_1 = np.log(short_df.Lag_1)
short_df['Promedio_Region_Por_Quarter'] = np.log(short_df['Promedio_Region_Por_Quarter'])
short_df['Promedio_Owner_Por_Year_And_Quarter'] = np.log(short_df['Promedio_Owner_Por_Year_And_Quarter'])
short_df.Delta = np.log(short_df.Delta + 1 - min(short_df.Delta))
short_df.describe()

Unnamed: 0,Total_Amount,TRF,Delivery_Year,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Decision,Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
count,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0,16772.0,15482.0,16772.0
mean,10.470044,2.28166,2017.134927,0.58544,0.807834,0.56803,13.255775,12.244222,12.503113,17.309752
std,3.308478,12.084678,0.820031,0.492661,0.394014,0.495365,1.329157,2.325201,2.239367,0.148853
min,-4.60517,0.0,2016.0,0.0,0.0,0.0,11.058881,3.806662,5.234312,0.0
25%,8.465995,0.0,2016.0,0.0,1.0,0.0,11.464494,10.971975,11.304912,17.303476
50%,11.253377,0.0,2017.0,1.0,1.0,1.0,13.892824,12.847837,13.179942,17.313452
75%,12.582682,1.0,2018.0,1.0,1.0,1.0,14.012659,14.000724,14.12654,17.319557
max,19.568083,500.0,2019.0,1.0,1.0,1.0,15.245204,19.568083,17.528668,19.542568


In [11]:
short_df.to_csv('df_time_series.csv')

# Kaggle

In [12]:
DataFrame_test = pd.read_csv( "/home/bautista/Datos/Machine-Learning-Datos/Test/Test.csv" )
DataFrame_test

Unnamed: 0,ID,Region,Territory,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Month,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Prod_Category_A
0,6140,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,162240.0,EUR,367419.0,Prod_Category_A_None
1,6146,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,78624.0,EUR,367419.0,Prod_Category_A_None
2,6151,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 5,Q2,2019,NaT,0,EUR,126555.0,EUR,367419.0,Prod_Category_A_None
3,6118,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 6,Q2,2019,NaT,1,EUR,243360.0,EUR,757783.5,Prod_Category_A_None
4,6124,EMEA,Germany,1,1,1,1,0,Bureaucratic_Code_4,7/5/2017,...,2019 - 6,Q2,2019,NaT,0,EUR,157248.0,EUR,757783.5,Prod_Category_A_None
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,16345,EMEA,KSA,1,1,0,0,0,Bureaucratic_Code_4,6/12/2017,...,2019 - 5,Q2,2019,NaT,0,USD,124740.0,USD,147750.0,Prod_Category_A_None
2547,15218,Americas,SE America,1,1,0,0,0,Bureaucratic_Code_4,6/8/2018,...,2019 - 10,Q4,2019,NaT,0,USD,45054.9,USD,45054.9,Prod_Category_A_None
2548,15224,Americas,SE America,1,1,1,1,0,Bureaucratic_Code_4,6/8/2018,...,2019 - 10,Q4,2019,NaT,0,USD,100122.0,USD,100122.0,Prod_Category_A_None
2549,7286,Americas,NE America,1,1,0,0,0,Bureaucratic_Code_4,8/29/2018,...,2019 - 8,Q3,2019,NaT,0,USD,143220.0,USD,143220.0,Prod_Category_A_None


In [13]:
DataFrame_test.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount'] = DataFrame_test['Total_Amount']*0.0096
DataFrame_test.loc[df['Total_Amount_Currency'] == 'JPY', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount'] = DataFrame_test['Total_Amount']*1.17
DataFrame_test.loc[df['Total_Amount_Currency'] == 'EUR', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount'] = DataFrame_test['Total_Amount']*0.70
DataFrame_test.loc[df['Total_Amount_Currency'] == 'AUD', 'Total_Amount_Currency'] = 'USD'

DataFrame_test.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount'] = DataFrame_test['Total_Amount']*1.29
DataFrame_test.loc[df['Total_Amount_Currency'] == 'GBP', 'Total_Amount_Currency'] = 'USD'

In [14]:
DataFrame_test = DataFrame_test[['Opportunity_ID','Region','Total_Amount','TRF','Delivery_Year','Opportunity_Owner','Delivery_Quarter','Pricing, Delivery_Terms_Approved','Pricing, Delivery_Terms_Quote_Appr' ]]
DataFrame_test = DataFrame_test.drop_duplicates('Opportunity_ID',keep = 'last')
DataFrame_test

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr"
2,10689,EMEA,126555.0,0,2019,Person_Name_13,Q2,1,1
7,10690,EMEA,112464.0,0,2019,Person_Name_13,Q2,1,1
8,10691,Americas,21037.5,0,2019,Person_Name_9,Q4,1,1
14,10692,Americas,228327.0,1,2019,Person_Name_8,Q4,1,1
15,10693,Americas,5752.5,0,2019,Person_Name_64,Q2,0,0
...,...,...,...,...,...,...,...,...,...
2546,12364,EMEA,124740.0,0,2019,Person_Name_13,Q2,1,1
2547,12365,Americas,45054.9,0,2019,Person_Name_38,Q4,1,1
2548,12366,Americas,100122.0,0,2019,Person_Name_38,Q4,1,1
2549,12367,Americas,143220.0,0,2019,Person_Name_9,Q3,1,1


In [15]:
#for persona in DataFrame_test.drop_duplicates('Opportunity_Owner', keep = 'last')['Opportunity_Owner']:
#    flag = False
#    for chequeo in short_df.drop_duplicates('Opportunity_Owner', keep = 'last')['Opportunity_Owner']:
#        if persona == chequeo:
#            flag = True
#    if flag == False:
#        DataFrame_test.loc[DataFrame_test['Opportunity_Owner'] == persona, 'Opportunity_Owner'] = 'Other'
#DataFrame_test

In [16]:
DataFrame_test['Promedio_Region_Por_Quarter'] = DataFrame_test.groupby(['Region','Delivery_Quarter'])['Total_Amount'].transform('mean')
DataFrame_test

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter
2,10689,EMEA,126555.0,0,2019,Person_Name_13,Q2,1,1,2.562380e+05
7,10690,EMEA,112464.0,0,2019,Person_Name_13,Q2,1,1,2.562380e+05
8,10691,Americas,21037.5,0,2019,Person_Name_9,Q4,1,1,1.910773e+06
14,10692,Americas,228327.0,1,2019,Person_Name_8,Q4,1,1,1.910773e+06
15,10693,Americas,5752.5,0,2019,Person_Name_64,Q2,0,0,7.748300e+05
...,...,...,...,...,...,...,...,...,...,...
2546,12364,EMEA,124740.0,0,2019,Person_Name_13,Q2,1,1,2.562380e+05
2547,12365,Americas,45054.9,0,2019,Person_Name_38,Q4,1,1,1.910773e+06
2548,12366,Americas,100122.0,0,2019,Person_Name_38,Q4,1,1,1.910773e+06
2549,12367,Americas,143220.0,0,2019,Person_Name_9,Q3,1,1,2.563482e+06


In [17]:
DataFrame_test['Promedio_Owner_Por_Year_And_Quarter'] = DataFrame_test.groupby(['Delivery_Year','Opportunity_Owner','Delivery_Quarter'])['Total_Amount'].transform('mean')
DataFrame_test = DataFrame_test.sort_values(['Opportunity_Owner','Delivery_Year', 'Delivery_Quarter'])
DataFrame_test.head(25)

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter
171,10792,EMEA,177617.7,1,2019,Person_Name_13,Q1,1,1,127249.999307,59993.47619
226,10806,EMEA,30488.25,0,2019,Person_Name_13,Q1,0,1,127249.999307,59993.47619
238,10812,EMEA,96673.5,0,2019,Person_Name_13,Q1,1,1,127249.999307,59993.47619
270,10836,EMEA,25725.0,0,2019,Person_Name_13,Q1,1,1,127249.999307,59993.47619
282,10845,EMEA,21483.0,0,2019,Person_Name_13,Q1,1,1,127249.999307,59993.47619
311,10864,EMEA,41418.0,0,2019,Person_Name_13,Q1,0,0,127249.999307,59993.47619
322,10871,EMEA,10612.8,0,2019,Person_Name_13,Q1,0,0,127249.999307,59993.47619
361,10900,EMEA,10478.0,0,2019,Person_Name_13,Q1,1,1,127249.999307,59993.47619
365,10904,EMEA,20594.25,0,2019,Person_Name_13,Q1,1,1,127249.999307,59993.47619
366,10905,EMEA,52593.75,0,2019,Person_Name_13,Q1,0,1,127249.999307,59993.47619


# Lag_1 y Delta

In [18]:
last_person = 'NaN'
i = 0
Lag_1_test = []
for person in DataFrame_test['Opportunity_Owner']:
    if person == last_person:
        if (DataFrame_test['Delivery_Quarter'][DataFrame_test.index[i]]) == (DataFrame_test['Delivery_Quarter'][DataFrame_test.index[i - 1]]):
            Lag_1_test.append(Lag_1_test[i - 1])
        else:
            Lag_1_test.append(short_df['Promedio_Owner_Por_Year_And_Quarter'][DataFrame_test.index[i - 1]])
    else:
        Lag_1_test.append(np.nan)
        last_person = person
    i = i + 1
DataFrame_test['Lag_1'] = Lag_1_test
DataFrame_test['Delta'] = DataFrame_test['Promedio_Owner_Por_Year_And_Quarter'] - DataFrame_test['Lag_1'].fillna(0)
DataFrame_test

Unnamed: 0,Opportunity_ID,Region,Total_Amount,TRF,Delivery_Year,Opportunity_Owner,Delivery_Quarter,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
171,10792,EMEA,177617.70,1,2019,Person_Name_13,Q1,1,1,1.272500e+05,5.999348e+04,,5.999348e+04
226,10806,EMEA,30488.25,0,2019,Person_Name_13,Q1,0,1,1.272500e+05,5.999348e+04,,5.999348e+04
238,10812,EMEA,96673.50,0,2019,Person_Name_13,Q1,1,1,1.272500e+05,5.999348e+04,,5.999348e+04
270,10836,EMEA,25725.00,0,2019,Person_Name_13,Q1,1,1,1.272500e+05,5.999348e+04,,5.999348e+04
282,10845,EMEA,21483.00,0,2019,Person_Name_13,Q1,1,1,1.272500e+05,5.999348e+04,,5.999348e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2519,12347,Americas,4596160.80,12,2020,Person_Name_9,Q2,1,1,7.748300e+05,7.647092e+06,8.018046,7.647084e+06
2520,12348,Americas,10783749.90,28,2020,Person_Name_9,Q2,1,1,7.748300e+05,7.647092e+06,8.018046,7.647084e+06
2521,12349,Americas,7508464.43,20,2020,Person_Name_9,Q2,1,1,7.748300e+05,7.647092e+06,8.018046,7.647084e+06
2522,12350,Americas,10783749.90,28,2020,Person_Name_9,Q3,1,1,2.563482e+06,1.078375e+07,8.018046,1.078374e+07


# Log

In [19]:
DataFrame_test.Total_Amount = np.log(DataFrame_test.Total_Amount)
DataFrame_test.Lag_1 = np.log(DataFrame_test.Lag_1)
DataFrame_test.Promedio_Region_Por_Quarter = np.log(DataFrame_test.Promedio_Region_Por_Quarter)
DataFrame_test.Promedio_Owner_Por_Year_And_Quarter = np.log(DataFrame_test.Promedio_Owner_Por_Year_And_Quarter)
DataFrame_test.Delta = np.log(DataFrame_test.Delta + 1 - min(DataFrame_test.Delta))
DataFrame_test.describe()

Unnamed: 0,Opportunity_ID,Total_Amount,TRF,Delivery_Year,"Pricing, Delivery_Terms_Approved","Pricing, Delivery_Terms_Quote_Appr",Promedio_Region_Por_Quarter,Promedio_Owner_Por_Year_And_Quarter,Lag_1,Delta
count,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1567.0,1059.0,1567.0
mean,11554.0485,11.930953,3.067007,2019.031908,0.493937,0.619655,13.561275,12.729049,2.401218,12.725728
std,492.205946,1.990895,36.395461,0.179407,0.500123,0.485626,1.5019,1.587187,0.277472,1.60603
min,10689.0,4.359014,0.0,2018.0,0.0,0.0,11.733856,4.359014,2.077836,0.0
25%,11080.5,10.917225,0.0,2019.0,0.0,0.0,12.453862,11.671593,2.077836,11.670934
50%,11583.0,11.705435,0.0,2019.0,0.0,1.0,13.560399,12.416221,2.542886,12.415847
75%,11976.5,12.797007,1.0,2019.0,1.0,1.0,14.756877,13.43227,2.656727,13.432135
max,12368.0,19.996115,1272.0,2020.0,1.0,1.0,17.360418,19.924758,2.764433,19.924758


In [20]:
DataFrame_test.to_csv('df_time_series_test.csv')