In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import math

# Run-mode switch read by the date-formatting and train/test split cells:
#   True  -> the whole training CSV becomes `entrenamiento` and the test CSV is used as `test`.
#   False -> the training CSV is split by Opportunity_Created_Date year
#            (<= 2017 goes to `entrenamiento`, > 2017 goes to `test`).
PREDICCION_REAL = False

In [2]:
# LOAD THE RAW CSV FILES
# Paths are relative to the notebook's working directory.
RUTA_ENTRENAMIENTO = "Train_TP2_Datos_2020-2C.csv"
RUTA_TEST = "Test_TP2_Datos_2020-2C.csv"

entrenamiento_temp = pd.read_csv(RUTA_ENTRENAMIENTO)
test = pd.read_csv(RUTA_TEST)

In [3]:
# KEEP ONLY OPPORTUNITIES WHOSE STAGE IS 'Closed Won' OR 'Closed Lost'

# .copy() detaches the filtered rows from the raw frame: later cells assign
# columns on `entrenamiento_temp`, which would otherwise raise
# SettingWithCopyWarning because the frame would be a slice of another one.
entrenamiento_temp = entrenamiento_temp.loc[
    entrenamiento_temp['Stage'].isin(['Closed Won', 'Closed Lost'])
].copy()

In [4]:
# DATE FORMATTING

# Columns that are parsed to datetime by formato_fechas() when no explicit
# column list is given.
columnas_fecha = ['Month','Last_Modified_Date','Account_Created_Date','Opportunity_Created_Date','Quote_Expiry_Date','Planned_Delivery_Start_Date','Planned_Delivery_End_Date']

def formato_fechas(x, columnas=None):
    """Parse date columns of ``x`` to datetime, modifying ``x`` in place.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are converted with ``pd.to_datetime``.
    columnas : iterable of str, optional
        Names of the columns to convert. Defaults to the module-level
        ``columnas_fecha`` list, which is the original behaviour.

    Returns
    -------
    None
        The conversion is applied directly on ``x``.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in ``x``.
    """
    if columnas is None:
        columnas = columnas_fecha
    for columna in columnas:
        x[columna] = pd.to_datetime(x[columna])
        
formato_fechas(entrenamiento_temp)

# The test CSV is only parsed for a real prediction; otherwise `test` is
# rebuilt further down from the already-parsed training frame.
if PREDICCION_REAL:
    formato_fechas(test)

In [5]:
# SPLIT INTO TRAINING SET AND TEST SET

if PREDICCION_REAL:
    # Real prediction: train on everything, `test` stays as loaded from its CSV.
    entrenamiento = entrenamiento_temp
else:
    # Local validation: opportunities created up to 2017 are used for training
    # and the ones created afterwards for testing. Rows without a creation
    # date satisfy neither comparison and are therefore left out of both sets.
    # The year is kept in a standalone Series instead of a temporary column,
    # so nothing has to be dropped afterwards (DataFrame.drop no longer
    # accepts `axis` as a positional argument in pandas >= 2.0).
    anio_creacion = pd.to_datetime(entrenamiento_temp['Opportunity_Created_Date']).dt.year

    entrenamiento = entrenamiento_temp.loc[anio_creacion <= 2017].copy()
    test          = entrenamiento_temp.loc[anio_creacion > 2017].copy()

# Release the raw frame; only `entrenamiento` and `test` are used from here on.
del entrenamiento_temp


In [6]:
def recuperacion_y_conversion(df):
    """Recover missing values and add the amounts converted with the ASP ratio.

    The function works on a renamed copy of ``df`` (the caller's frame is not
    modified) and applies, in order:

    1. Rename the column ``'Source '`` (trailing space) to ``'Source'``.
    2. Build a conversion factor ``ASP_(converted) / ASP``. Missing factors are
       filled with the mean factor of the same ``Opportunity_ID`` and, if still
       missing, with the mean factor of the same ``ASP_Currency``. Rows whose
       factor is still missing or is infinite (positive or negative) are
       dropped.
    3. Drop every ``Opportunity_ID`` that has at least one row with a null
       ``Total_Amount``.
    4. Add ``Total_Amount(USD)`` and ``Total_Taxable_Amount(USD)`` as the
       original amounts multiplied by the conversion factor.
    5. Fill missing ``ASP_(converted)`` with the mean of the row's ``Region``.
    6. Rows whose ``Billing_Country`` is the literal string ``'None'`` get the
       most frequent ``Billing_Country`` and ``Account_Type`` of the ``APAC``
       region (skipped when there is nothing to fill or no APAC data).
    7. Fill missing ``Planned_Delivery_End_Date`` with the start date plus the
       rounded mean delivery duration of the same (``Region``,
       ``Product_Family``) and, if still missing, of the same ``Region``.

    All statistics are computed on the frame passed in, so calling the function
    on two frames uses each frame's own means and modes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above plus ``Total_Taxable_Amount`` and
        ``Planned_Delivery_Start_Date``. The two delivery-date columns must
        already be datetime-like, because they are subtracted from each other.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as the input (``'Source '`` renamed)
        followed by ``Total_Amount(USD)`` and ``Total_Taxable_Amount(USD)``,
        possibly with fewer rows.
    """
    # rename() returns a new frame, so every assignment below is local.
    df = df.rename(columns={'Source ': 'Source'})

    # --- Conversion factor (logic taken from TP1) ---------------------------
    df['Convertion_Factor'] = df['ASP_(converted)'] / df['ASP']

    # Rescue missing factors: first with the mean of the opportunity, then
    # with the mean of the currency (computed after the first fill).
    for clave in ('Opportunity_ID', 'ASP_Currency'):
        factor_medio = df.groupby(clave)['Convertion_Factor'].transform('mean')
        df['Convertion_Factor'] = df['Convertion_Factor'].fillna(factor_medio)

    # Drop rows that could not be rescued (NaN) and rows with an infinite
    # factor. np.isfinite also rejects -inf, which appears when ASP is 0 and
    # ASP_(converted) is negative. .copy() avoids SettingWithCopyWarning on
    # the assignments that follow.
    df = df.loc[np.isfinite(df['Convertion_Factor'])].copy()

    # --- Opportunities with any missing Total_Amount ------------------------
    df['Total_Amount_is_null'] = df['Total_Amount'].isnull().astype(int)
    nulos_por_oportunidad = df.groupby('Opportunity_ID')['Total_Amount_is_null'].transform('sum')
    df = df.loc[nulos_por_oportunidad == 0].copy()

    # --- Converted amounts --------------------------------------------------
    df['Total_Amount(USD)'] = df['Total_Amount'] * df['Convertion_Factor']
    df['Total_Taxable_Amount(USD)'] = df['Total_Taxable_Amount'] * df['Convertion_Factor']

    # --- Rescue ASP_(converted) with the mean of the region -----------------
    asp_medio_region = df.groupby('Region')['ASP_(converted)'].transform('mean')
    df['ASP_(converted)'] = df['ASP_(converted)'].fillna(asp_medio_region)

    # Helper columns are no longer needed.
    df = df.drop(columns=['Total_Amount_is_null', 'Convertion_Factor'])

    # --- Billing_Country and Account_Type -----------------------------------
    # Missing countries are stored as the string 'None'. They are completed
    # with the mode of the APAC region. (Original author's note: recovering
    # them from account name, opportunity name or opportunity id was tried,
    # but only this entry showed up.)
    # NOTE(review): the original author flagged this section as possibly
    # wrong; the APAC region is hard-coded regardless of the row's own region.
    sin_pais = df['Billing_Country'] == 'None'
    if sin_pais.any():
        apac = df.loc[df['Region'] == 'APAC']
        moda_pais = apac['Billing_Country'].mode()
        moda_cuenta = apac['Account_Type'].mode()
        # mode() is empty when there are no APAC rows; leave the values
        # untouched in that case instead of failing.
        if not moda_pais.empty:
            df.loc[sin_pais, 'Billing_Country'] = moda_pais.iloc[0]
        if not moda_cuenta.empty:
            df.loc[sin_pais, 'Account_Type'] = moda_cuenta.iloc[0]

    # --- Planned_Delivery_End_Date ------------------------------------------
    # Duration in days, computed once from the dates that are present; both
    # fallback levels below use these same durations.
    df['Planned_Delivery_Duration'] = (
        df['Planned_Delivery_End_Date'] - df['Planned_Delivery_Start_Date']
    ) / np.timedelta64(1, 'D')

    # First the mean of region + product family, then the mean of the region
    # alone for whatever is still missing.
    for claves in (['Region', 'Product_Family'], 'Region'):
        duracion_media = df.groupby(claves)['Planned_Delivery_Duration'].transform('mean').round()
        fin_estimado = df['Planned_Delivery_Start_Date'] + pd.to_timedelta(duracion_media, unit='D')
        df['Planned_Delivery_End_Date'] = df['Planned_Delivery_End_Date'].fillna(fin_estimado)

    df = df.drop(columns=['Planned_Delivery_Duration'])

    return df

In [7]:
# Each frame is cleaned on its own, so means and modes used for imputation
# come from the frame being processed.
entrenamiento, test = (
    recuperacion_y_conversion(entrenamiento),
    recuperacion_y_conversion(test),
)

In [8]:
# DROP COLUMNS THAT CANNOT BE RECOVERED

# One shared list guarantees that both frames lose exactly the same columns.
columnas_irrecuperables = [
    'ASP_(converted)_Currency',
    'Quote_Type',
    'Brand',
    'Product_Type',
    'Size',
    'Product_Category_B',
    'Price',
    'Currency',
    'Last_Activity',
    'Actual_Delivery_Date',
    'Prod_Category_A',
]

entrenamiento = entrenamiento.drop(columns=columnas_irrecuperables)
test = test.drop(columns=columnas_irrecuperables)

In [9]:
# CONVERT OBJECT COLUMNS TO CATEGORY
# The columns to convert are chosen from the training frame and the same
# names are then converted in the test frame.
# NOTE(review): each frame infers its categories from its own values, so the
# category sets (and their integer codes) may differ between the two frames;
# confirm that later cells do not rely on matching codes.
columnas_object = entrenamiento.select_dtypes(include=['object']).columns.tolist()
tipos_categoria = dict.fromkeys(columnas_object, 'category')

entrenamiento = entrenamiento.astype(tipos_categoria)
test = test.astype(tipos_categoria)

In [10]:
# SAVE INTERMEDIATE FILES
# The cleaned frames are pickled next to the notebook, keeping the dtypes
# (datetime, category) set above.
RUTA_ENTRENAMIENTO_LIMPIO = "entrenamiento-limpio.pkl"
RUTA_TEST_LIMPIO = "test-limpio.pkl"

entrenamiento.to_pickle(RUTA_ENTRENAMIENTO_LIMPIO)
test.to_pickle(RUTA_TEST_LIMPIO)

In [11]:
# Print a summary of the cleaned training frame (columns, non-null counts, dtypes).
entrenamiento.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12081 entries, 0 to 16946
Data columns (total 43 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   ID                                  12081 non-null  int64         
 1   Region                              12081 non-null  category      
 2   Territory                           12081 non-null  category      
 3   Pricing, Delivery_Terms_Quote_Appr  12081 non-null  int64         
 4   Pricing, Delivery_Terms_Approved    12081 non-null  int64         
 5   Bureaucratic_Code_0_Approval        12081 non-null  int64         
 6   Bureaucratic_Code_0_Approved        12081 non-null  int64         
 7   Submitted_for_Approval              12081 non-null  int64         
 8   Bureaucratic_Code                   12081 non-null  category      
 9   Account_Created_Date                12081 non-null  datetime64[ns]
 10  Source                