In [1]:
import numpy as np
import category_encoders
import pandas as pd
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the raw training and test sets from the project's data directory.
TRAIN_CSV_PATH = 'data/train.csv'
TEST_CSV_PATH = 'data/test.csv'

train = pd.read_csv(TRAIN_CSV_PATH)
test = pd.read_csv(TEST_CSV_PATH)

# Limpieza de datos

Eliminar las columnas que tienen más de la mitad de sus datos vacíos

In [3]:
# Columns removed from both datasets; the same list is applied to train and
# test so the two frames keep an identical set of remaining columns.
# NOTE(review): 'Source ' carries a trailing space — presumably it matches the
# CSV header exactly; verify before "fixing" it.
sparse_columns = [
    'Sales_Contract_No',
    'Last_Activity',
    'Actual_Delivery_Date',
    'Price',
    'Size',
    'Product_Type',
    'Brand',
    'Product_Category_B',
    'Source ',
]

clean_train = train.drop(columns=sparse_columns)
clean_test = test.drop(columns=sparse_columns)

Filtrar los registros que estén en estado finalizado y convertir el estado a valores binarios:  
Closed Won = 1, Closed Lost = 0

In [4]:
# Keep only opportunities that reached a final stage. `.copy()` detaches the
# filtered rows from the frame they were selected from, so the column
# assignment below writes to an independent DataFrame instead of a slice
# (this is what avoids pandas' SettingWithCopyWarning).
final_stages = ['Closed Won', 'Closed Lost']
clean_train = clean_train[clean_train['Stage'].isin(final_stages)].copy()

# Binary target: 1 for 'Closed Won', 0 for 'Closed Lost'. A vectorized
# comparison replaces the per-row lambda; int64 matches the previous dtype.
clean_train['Stage'] = (clean_train['Stage'] == 'Closed Won').astype('int64')

Reemplazar los datos vacíos con "Nan"

In [5]:
# Replace every missing value, in every column of both frames, with the
# literal string "Nan".
# NOTE(review): this also applies to numeric columns, which presumably become
# object dtype wherever they had gaps — confirm the later numeric
# aggregations tolerate that.
MISSING_PLACEHOLDER = "Nan"

clean_train, clean_test = (
    frame.fillna(MISSING_PLACEHOLDER) for frame in (clean_train, clean_test)
)

# Creación de features

Convierto las columnas de fecha a tipo datetime

In [6]:
# Parse the date-like columns of both frames into datetimes.
# errors='coerce' turns anything unparseable into NaT instead of raising.
date_columns = [
    "Last_Modified_Date",
    "Opportunity_Created_Date",
    "Quote_Expiry_Date",
    "Planned_Delivery_Start_Date",
    "Planned_Delivery_End_Date",
]

for frame in (clean_train, clean_test):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column], errors='coerce')

Agrego la columna con la diferencia en días entre la fecha de creación de la oportunidad y su última modificación

In [7]:
# Whole days elapsed between the creation of the opportunity and its last
# modification, computed the same way for train and test.
for frame in (clean_train, clean_test):
    elapsed = frame["Last_Modified_Date"] - frame["Opportunity_Created_Date"]
    frame["diferencia_en_dias"] = elapsed.dt.days

Casteo de las variables numéricas

In [8]:
# Cast the numeric-looking columns of the training set; errors='coerce'
# replaces any value that cannot be parsed as a number with NaN.
# NOTE(review): "ASP_(converted)_Currency" looks like a currency-code column by
# name; if so, this cast presumably leaves it all-NaN — verify.
train_numeric_columns = ["ASP_(converted)", "ASP", "ASP_(converted)_Currency", "TRF"]

for column in train_numeric_columns:
    clean_train[column] = pd.to_numeric(clean_train[column], errors='coerce')

In [9]:
# Cast the numeric-looking columns of the test set; errors='coerce'
# replaces any value that cannot be parsed as a number with NaN.
# NOTE(review): "ASP_(converted)_Currency" looks like a currency-code column by
# name; if so, this cast presumably leaves it all-NaN — verify.
test_numeric_columns = ["ASP_(converted)", "ASP", "ASP_(converted)_Currency", "TRF"]

for column in test_numeric_columns:
    clean_test[column] = pd.to_numeric(clean_test[column], errors='coerce')

Agrupando por oportunidad, se agrega la cantidad de ítems por oportunidad

In [10]:
# Collapse the item-level training rows into one row per opportunity.
# Every column listed here defaults to 'first' (value of the first row in the
# group); the overrides below replace that default for selected columns.
# The list order is the column order of the aggregated frame.
train_output_columns = [
    'ID',
    'Region',
    'Territory',
    'Pricing, Delivery_Terms_Quote_Appr',
    'Bureaucratic_Code_0_Approval',
    'Bureaucratic_Code_0_Approved',
    'Submitted_for_Approval',
    'Bureaucratic_Code',
    'Account_Created_Date',
    'Billing_Country',
    'Account_Name',
    'Opportunity_Name',
    'Account_Owner',
    'Opportunity_Owner',
    'Account_Type',
    'Opportunity_Type',
    'Quote_Type',
    'Delivery_Terms',
    'Opportunity_Created_Date',
    'Currency',
    'Quote_Expiry_Date',
    'Last_Modified_Date',
    'Last_Modified_By',
    'Product_Family',
    'Product_Name',
    'ASP_Currency',
    'ASP',
    'ASP_(converted)_Currency',
    'ASP_(converted)',
    'Planned_Delivery_Start_Date',
    'Planned_Delivery_End_Date',
    'Month',
    'Delivery_Quarter',
    'Delivery_Year',
    'TRF',
    'Total_Amount_Currency',
    'Total_Amount',
    'Total_Taxable_Amount_Currency',
    'Total_Taxable_Amount',
    'Prod_Category_A',
    'diferencia_en_dias',
    'Stage',
]
train_aggregations = dict.fromkeys(train_output_columns, 'first')

# Updating keys that already exist keeps their position in the dict.
# NOTE(review): 'Total_Taxable_Amount_Currency' is summed while
# 'Total_Amount_Currency' takes 'first'; if it holds currency codes the sum
# presumably concatenates strings — confirm this is intended.
train_aggregations.update({
    'ID': 'count',  # number of items in the opportunity
    'ASP': 'mean',
    'ASP_(converted)_Currency': 'mean',
    'ASP_(converted)': 'mean',
    'TRF': 'mean',
    'Total_Amount': 'sum',
    'Total_Taxable_Amount_Currency': 'sum',
    'Total_Taxable_Amount': 'sum',
})

grouped_train = clean_train.groupby(["Opportunity_ID"])
opportunities_train = grouped_train.agg(train_aggregations).reset_index()

In [11]:
# Collapse the item-level test rows into one row per opportunity.
# Every column listed here defaults to 'first' (value of the first row in the
# group); the overrides below replace that default for selected columns.
# The list order is the column order of the aggregated frame.
test_output_columns = [
    'ID',
    'Region',
    'Territory',
    'Pricing, Delivery_Terms_Quote_Appr',
    'Bureaucratic_Code_0_Approval',
    'Bureaucratic_Code_0_Approved',
    'Submitted_for_Approval',
    'Bureaucratic_Code',
    'Account_Created_Date',
    'Billing_Country',
    'Account_Name',
    'Opportunity_Name',
    'Account_Owner',
    'Opportunity_Owner',
    'Account_Type',
    'Opportunity_Type',
    'Quote_Type',
    'Delivery_Terms',
    'Opportunity_Created_Date',
    'Currency',
    'Quote_Expiry_Date',
    'Last_Modified_Date',
    'Last_Modified_By',
    'Product_Family',
    'Product_Name',
    'ASP_Currency',
    'ASP',
    'ASP_(converted)_Currency',
    'ASP_(converted)',
    'Planned_Delivery_Start_Date',
    'Planned_Delivery_End_Date',
    'Month',
    'Delivery_Quarter',
    'Delivery_Year',
    'TRF',
    'Total_Amount_Currency',
    'Total_Amount',
    'Total_Taxable_Amount_Currency',
    'Total_Taxable_Amount',
    'Prod_Category_A',
    'diferencia_en_dias',
]
test_aggregations = dict.fromkeys(test_output_columns, 'first')

# Updating keys that already exist keeps their position in the dict.
# NOTE(review): 'Total_Taxable_Amount_Currency' is summed while
# 'Total_Amount_Currency' takes 'first'; if it holds currency codes the sum
# presumably concatenates strings — confirm this is intended.
test_aggregations.update({
    'ID': 'count',  # number of items in the opportunity
    'ASP': 'mean',
    'ASP_(converted)_Currency': 'mean',
    'ASP_(converted)': 'mean',
    'TRF': 'mean',
    'Total_Amount': 'sum',
    'Total_Taxable_Amount_Currency': 'sum',
    'Total_Taxable_Amount': 'sum',
})

grouped_test = clean_test.groupby(["Opportunity_ID"])
opportunities_test = grouped_test.agg(test_aggregations).reset_index()

In [12]:
# After the 'count' aggregation the ID column holds the number of items per
# opportunity, so give it a name that says so.
opportunities_train = opportunities_train.rename(columns={'ID': 'Items_Count'})

In [13]:
# After the 'count' aggregation the ID column holds the number of items per
# opportunity, so give it a name that says so.
opportunities_test = opportunities_test.rename(columns={'ID': 'Items_Count'})

División de las variables de tipo fecha en día del año (DOY) y año

In [14]:
# Split each date column of the training set into two numeric features,
# "<prefix>_DOY" (day of year) and "<prefix>_Year", then drop the raw dates.
# The prefix order below is the order in which the new columns are appended.
train_date_prefixes = [
    "Last_Modified",
    "Opportunity_Created",
    "Quote_Expiry",
    "Planned_Delivery_Start",
    "Planned_Delivery_End",
]

for prefix in train_date_prefixes:
    date_values = opportunities_train[prefix + "_Date"]
    opportunities_train[prefix + "_DOY"] = date_values.dt.dayofyear
    # Fix: "Opportunity_Created_Year" used to receive the full timestamp
    # because `.dt.year` was missing; every prefix now gets the year only.
    opportunities_train[prefix + "_Year"] = date_values.dt.year

opportunities_train = opportunities_train.drop(
    columns=[prefix + "_Date" for prefix in train_date_prefixes]
)



In [15]:
# Split each date column of the test set into two numeric features,
# "<prefix>_DOY" (day of year) and "<prefix>_Year", then drop the raw dates.
# The prefix order below is the order in which the new columns are appended.
test_date_prefixes = [
    "Last_Modified",
    "Opportunity_Created",
    "Quote_Expiry",
    "Planned_Delivery_Start",
    "Planned_Delivery_End",
]

for prefix in test_date_prefixes:
    date_values = opportunities_test[prefix + "_Date"]
    opportunities_test[prefix + "_DOY"] = date_values.dt.dayofyear
    # Fix: "Opportunity_Created_Year" used to receive the full timestamp
    # because `.dt.year` was missing; every prefix now gets the year only.
    opportunities_test[prefix + "_Year"] = date_values.dt.year

opportunities_test = opportunities_test.drop(
    columns=[prefix + "_Date" for prefix in test_date_prefixes]
)

Dejar la columna Stage al final, renombrada como Target

In [16]:
# Move the label to the last position under the name "Target": pop() removes
# "Stage" from the frame and returns it, and assigning a new column appends it.
opportunities_train["Target"] = opportunities_train.pop("Stage")

In [17]:
# Persist the opportunity-level frames for later steps.
# Note: `index=False` is not passed, so the DataFrame index is written as the
# first (unnamed) column of each file.
cleaned_outputs = (
    (opportunities_train, 'data/cleaned_train.csv'),
    (opportunities_test, 'data/cleaned_test.csv'),
)
for frame, destination in cleaned_outputs:
    frame.to_csv(destination)