In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Execute the helper notebook in this kernel's namespace.
# Presumably this is what defines get_train(), which is called further down — verify in cargar_df.ipynb.
%run "../notebooks/cargar_df.ipynb"

# Reset matplotlib to its default style, then apply seaborn's "whitegrid" style.
plt.style.use('default')
sns.set(style="whitegrid")
# Display floats with thousands separators and two decimals, padded to a 20-character field.
pd.options.display.float_format = '{:20,.2f}'.format

In [2]:
# Relative path (with trailing slash) that is prefixed to the data file names read below.
data_folder  = "../data/"

In [3]:
# Load the raw training CSV with pandas' default dtype inference.
train = pd.read_csv(data_folder + "Entrenamieto_ECI_2020.csv")

### Análisis del set

In [4]:
# Get the basic info of the dataset: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16947 entries, 0 to 16946
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  16947 non-null  int64  
 1   Region                              16947 non-null  object 
 2   Territory                           16947 non-null  object 
 3   Pricing, Delivery_Terms_Quote_Appr  16947 non-null  int64  
 4   Pricing, Delivery_Terms_Approved    16947 non-null  int64  
 5   Bureaucratic_Code_0_Approval        16947 non-null  int64  
 6   Bureaucratic_Code_0_Approved        16947 non-null  int64  
 7   Submitted_for_Approval              16947 non-null  int64  
 8   Bureaucratic_Code                   16947 non-null  object 
 9   Account_Created_Date                16947 non-null  object 
 10  Source                              16947 non-null  object 
 11  Billing_Country                     16947

In [5]:
# Show the dimensions of the dataset as (rows, columns).
train.shape

(16947, 52)

In [6]:
# Look at the first rows of the dataset; transposed so that each column of the
# wide frame is shown as a row.
train.head().T

Unnamed: 0,0,1,2,3,4
ID,27761,27760,27446,16808,16805
Region,EMEA,EMEA,Americas,Americas,Americas
Territory,,,NW America,NW America,NW America
"Pricing, Delivery_Terms_Quote_Appr",1,0,0,1,1
"Pricing, Delivery_Terms_Approved",1,0,0,0,0
Bureaucratic_Code_0_Approval,1,0,0,1,1
Bureaucratic_Code_0_Approved,1,0,0,0,0
Submitted_for_Approval,0,0,0,0,0
Bureaucratic_Code,Bureaucratic_Code_4,Bureaucratic_Code_4,Bureaucratic_Code_4,Bureaucratic_Code_5,Bureaucratic_Code_5
Account_Created_Date,6/16/2015,6/16/2015,4/21/2015,7/27/2013,7/27/2013


In [7]:
# Explore the dtype of each column.
train.dtypes

ID                                      int64
Region                                 object
Territory                              object
Pricing, Delivery_Terms_Quote_Appr      int64
Pricing, Delivery_Terms_Approved        int64
Bureaucratic_Code_0_Approval            int64
Bureaucratic_Code_0_Approved            int64
Submitted_for_Approval                  int64
Bureaucratic_Code                      object
Account_Created_Date                   object
Source                                 object
Billing_Country                        object
Account_Name                           object
Opportunity_Name                       object
Opportunity_ID                          int64
Sales_Contract_No                      object
Account_Owner                          object
Opportunity_Owner                      object
Account_Type                           object
Opportunity_Type                       object
Quote_Type                             object
Delivery_Terms                    

In [8]:
# Count how many columns there are of each dtype.
train.dtypes.value_counts()

object     39
int64       9
float64     4
dtype: int64

In [9]:
# Count the null values in each column of the dataset.
# NOTE(review): the Territory value_counts further down shows 4999 rows holding the
# literal string 'None'; isnull() does not count those — confirm whether they should
# be treated as missing values.
train.isnull().sum()

ID                                       0
Region                                   0
Territory                                0
Pricing, Delivery_Terms_Quote_Appr       0
Pricing, Delivery_Terms_Approved         0
Bureaucratic_Code_0_Approval             0
Bureaucratic_Code_0_Approved             0
Submitted_for_Approval                   0
Bureaucratic_Code                        0
Account_Created_Date                     0
Source                                   0
Billing_Country                          0
Account_Name                             0
Opportunity_Name                         0
Opportunity_ID                           0
Sales_Contract_No                        0
Account_Owner                            0
Opportunity_Owner                        0
Account_Type                             0
Opportunity_Type                         0
Quote_Type                               0
Delivery_Terms                           0
Opportunity_Created_Date                 0
Brand      

In [10]:
# Inspect every column of the data frame in more detail: numeric columns get
# mean/std/quantiles, object (categorical-like) columns get unique/top/freq.
# include="all" replaces the former explicit dtype list: `np.object` was deprecated
# in NumPy 1.20 and removed in 1.24 (AttributeError on current installs), and
# `pd.Categorical` is an array class, not a dtype selector ('category' is the
# documented spelling). The list was meant to cover every dtype in the frame,
# so the resulting table is the same.
train.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
ID,16947.0,,,,17666.65,6940.86,4854.0,11953.0,18114.0,23845.5,28773.0
Region,16947.0,5.0,Japan,4892.0,,,,,,,
Territory,16947.0,76.0,,4999.0,,,,,,,
"Pricing, Delivery_Terms_Quote_Appr",16947.0,,,,0.81,0.39,0.0,1.0,1.0,1.0,1.0
"Pricing, Delivery_Terms_Approved",16947.0,,,,0.58,0.49,0.0,0.0,1.0,1.0,1.0
Bureaucratic_Code_0_Approval,16947.0,,,,0.48,0.5,0.0,0.0,0.0,1.0,1.0
Bureaucratic_Code_0_Approved,16947.0,,,,0.32,0.47,0.0,0.0,0.0,1.0,1.0
Submitted_for_Approval,16947.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bureaucratic_Code,16947.0,7.0,Bureaucratic_Code_4,12587.0,,,,,,,
Account_Created_Date,16947.0,809.0,4/20/2015,3011.0,,,,,,,


In [11]:
# Since there are int64 columns, check whether they could be stored in a smaller
# type by looking at the largest value each one holds.
train["ID"].max()

28773

In [12]:
# Largest value in this column, to judge how wide an integer type it needs.
train["Pricing, Delivery_Terms_Quote_Appr"].max()

1

In [13]:
# Largest value in this column, to judge how wide an integer type it needs.
train["Pricing, Delivery_Terms_Approved"].max()

1

In [14]:
# Largest value in this column, to judge how wide an integer type it needs.
train["Submitted_for_Approval"].max()

0

In [15]:
# Several columns can be optimized by changing their dtypes, so that loading the
# dataset uses less memory.
# A data frame loading function was also created for general use in every
# exploratory analysis (see cargar_df.ipynb).
# NOTE(review): get_train() is not defined in this notebook; presumably it comes from
# the %run of cargar_df.ipynb in the first cell — verify.
train2=get_train()

In [16]:
# Using the loading function, show the modified dtypes and their memory usage.
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16947 entries, 0 to 16946
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   ID                                  16947 non-null  int16         
 1   Region                              16947 non-null  category      
 2   Territory                           16947 non-null  category      
 3   Pricing, Delivery_Terms_Quote_Appr  16947 non-null  int16         
 4   Pricing, Delivery_Terms_Approved    16947 non-null  int16         
 5   Bureaucratic_Code_0_Approval        16947 non-null  category      
 6   Bureaucratic_Code_0_Approved        16947 non-null  category      
 7   Submitted_for_Approval              16947 non-null  int16         
 8   Bureaucratic_Code                   16947 non-null  category      
 9   Account_Created_Date                16947 non-null  datetime64[ns]
 10  Source                

In [17]:
# Number of rows for each distinct value of the Stage column.
train["Stage"].value_counts()

Closed Won       9533
Closed Lost      7350
Proposal           35
Negotiation        18
Qualification      11
Name: Stage, dtype: int64

In [18]:
# Number of rows for each distinct value of the Territory column.
train["Territory"].value_counts()

None              4999
Germany           1682
NW America        1568
Australia         1208
India              809
                  ... 
Egypt                1
Monaco               1
Czech Republic       1
Georgia              1
Slovenia             1
Name: Territory, Length: 76, dtype: int64

In [19]:
# Number of rows for each distinct value of the Region column.
train["Region"].value_counts()

Japan          4892
EMEA           4664
Americas       3945
APAC           3262
Middle East     184
Name: Region, dtype: int64

In [20]:
# Show column dtypes and non-null counts of the frame returned by get_train().
# NOTE(review): this repeats the earlier train2.info() cell — looks redundant; confirm
# before removing.
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16947 entries, 0 to 16946
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   ID                                  16947 non-null  int16         
 1   Region                              16947 non-null  category      
 2   Territory                           16947 non-null  category      
 3   Pricing, Delivery_Terms_Quote_Appr  16947 non-null  int16         
 4   Pricing, Delivery_Terms_Approved    16947 non-null  int16         
 5   Bureaucratic_Code_0_Approval        16947 non-null  category      
 6   Bureaucratic_Code_0_Approved        16947 non-null  category      
 7   Submitted_for_Approval              16947 non-null  int16         
 8   Bureaucratic_Code                   16947 non-null  category      
 9   Account_Created_Date                16947 non-null  datetime64[ns]
 10  Source                