## FASE 1. EXPLORACIÓN BÁSICA

In [1]:
# Tratamiento de datos

import pandas as pd
import numpy as np
import warnings

# Imputación de nulos usando métodos avanzados estadísticos

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización

import seaborn as sns
import matplotlib.pyplot as plt

# Configuración

pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
warnings.filterwarnings("ignore")

In [2]:
# Cargar csv 'Customer Flight Activity'

df_cfa = pd.read_csv("data/Customer Flight Activity.csv", index_col=0)
df_cfa.head()

Unnamed: 0_level_0,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100018,2017,1,3,0,3,1521,152.0,0,0
100102,2017,1,10,4,14,2030,203.0,0,0
100140,2017,1,6,0,6,1200,120.0,0,0
100214,2017,1,0,0,0,0,0.0,0,0
100272,2017,1,0,0,0,0,0.0,0,0


In [3]:
# Restablecer el índice para convertir 'Loyalty Number' en una columna:

df_cfa.reset_index(inplace=True)
df_cfa.head()

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0
3,100214,2017,1,0,0,0,0,0.0,0,0
4,100272,2017,1,0,0,0,0,0.0,0,0


In [4]:
# Exploración de columnas: 

df_cfa.columns

Index(['Loyalty Number', 'Year', 'Month', 'Flights Booked',
       'Flights with Companions', 'Total Flights', 'Distance',
       'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed'],
      dtype='object')

In [5]:
# Cantidad de filas y columnas del dataframe:

print(f"El número de filas que tenemos es {df_cfa.shape[0]}, y el número de columnas es {df_cfa.shape[1]}.")

El número de filas que tenemos es 405624, y el número de columnas es 10.


In [6]:
# Visualización de la estructura del dataframe:

df_cfa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405624 entries, 0 to 405623
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Loyalty Number               405624 non-null  int64  
 1   Year                         405624 non-null  int64  
 2   Month                        405624 non-null  int64  
 3   Flights Booked               405624 non-null  int64  
 4   Flights with Companions      405624 non-null  int64  
 5   Total Flights                405624 non-null  int64  
 6   Distance                     405624 non-null  int64  
 7   Points Accumulated           405624 non-null  float64
 8   Points Redeemed              405624 non-null  int64  
 9   Dollar Cost Points Redeemed  405624 non-null  int64  
dtypes: float64(1), int64(9)
memory usage: 30.9 MB


In [7]:
# Visualización estadísticas descriptivas de las columnas numéricas:
df_cfa.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,405624.0,550037.873084,258935.286969,100018.0,326961.0,550834.0,772194.0,999986.0
Year,405624.0,2017.5,0.500001,2017.0,2017.0,2017.5,2018.0,2018.0
Month,405624.0,6.5,3.452057,1.0,3.75,6.5,9.25,12.0
Flights Booked,405624.0,4.115052,5.225518,0.0,0.0,1.0,8.0,21.0
Flights with Companions,405624.0,1.031805,2.076869,0.0,0.0,0.0,1.0,11.0
Total Flights,405624.0,5.146858,6.521227,0.0,0.0,1.0,10.0,32.0
Distance,405624.0,1208.880059,1433.15532,0.0,0.0,488.0,2336.0,6293.0
Points Accumulated,405624.0,123.692721,146.599831,0.0,0.0,50.0,239.0,676.5
Points Redeemed,405624.0,30.696872,125.486049,0.0,0.0,0.0,0.0,876.0
Dollar Cost Points Redeemed,405624.0,2.484503,10.150038,0.0,0.0,0.0,0.0,71.0


In [8]:
# Visualización cantidad de valores unicos de todas las columnas del dataframe:
df_cfa.nunique()

Loyalty Number                 16737
Year                               2
Month                             12
Flights Booked                    22
Flights with Companions           12
Total Flights                     33
Distance                        4746
Points Accumulated              1549
Points Redeemed                  587
Dollar Cost Points Redeemed       49
dtype: int64

In [9]:
# Exploración de los valores de cada columna del df_cfa en el contexto de valoración cambio tipo de datos:

def print_unique_values(df_cfa):
    for column in df_cfa.columns:
        unique_values = df_cfa[column].unique()
        num_unique = len(unique_values)
        print(f"Columna: {column}")
        print(f"Número de valores únicos: {num_unique}")
        print(f"Valores únicos: {unique_values}")
        print("-" * 40)
print_unique_values(df_cfa)

Columna: Loyalty Number
Número de valores únicos: 16737
Valores únicos: [100018 100102 100140 ... 999731 999788 999891]
----------------------------------------
Columna: Year
Número de valores únicos: 2
Valores únicos: [2017 2018]
----------------------------------------
Columna: Month
Número de valores únicos: 12
Valores únicos: [ 1  9  2  3 11  4  5  7  6  8 10 12]
----------------------------------------
Columna: Flights Booked
Número de valores únicos: 22
Valores únicos: [ 3 10  6  0  8 11  9  4  7  5  2  1 12 13 14 16 15 17 18 19 20 21]
----------------------------------------
Columna: Flights with Companions
Número de valores únicos: 12
Valores únicos: [ 0  4  7  1  6  3  5  2 10  8  9 11]
----------------------------------------
Columna: Total Flights
Número de valores únicos: 33
Valores únicos: [ 3 14  6  0 15 11 12 10  8  9  7  5 16  2  1 17 13 22  4 19 18 21 26 20
 23 25 27 24 28 30 29 31 32]
----------------------------------------
Columna: Distance
Número de valores únicos:

In [10]:
# Comprobar Nulos: 
df_cfa.isnull().sum()

Loyalty Number                 0
Year                           0
Month                          0
Flights Booked                 0
Flights with Companions        0
Total Flights                  0
Distance                       0
Points Accumulated             0
Points Redeemed                0
Dollar Cost Points Redeemed    0
dtype: int64

In [11]:
# Comprobar que en la columna Loyalty Number (única columna que podría ser conflictiva en términos de duplicados) todos los valores son únicos:

df_cfa.duplicated("Loyalty Number").sum()

np.int64(388887)

In [12]:
# Cargar dataframe 'Customer Loyalty History':

df_clh = pd.read_csv("data/Customer Loyalty History.csv")
df_clh.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
1,549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
2,429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0
3,608370,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,3839.75,Standard,2013,2,,
4,530508,Canada,Quebec,Hull,J8Y 3Z5,Male,Bachelor,103495.0,Married,Star,3842.79,Standard,2014,10,,


In [13]:
# Exploración de columnas:

df_clh.columns

Index(['Loyalty Number', 'Country', 'Province', 'City', 'Postal Code',
       'Gender', 'Education', 'Salary', 'Marital Status', 'Loyalty Card',
       'CLV', 'Enrollment Type', 'Enrollment Year', 'Enrollment Month',
       'Cancellation Year', 'Cancellation Month'],
      dtype='object')

In [14]:
# Cantidad de filas y columnas del dataframe:

print(f"El número de filas que tenemos es {df_clh.shape[0]}, y el número de columnas es {df_clh.shape[1]}")

El número de filas que tenemos es 16737, y el número de columnas es 16


In [15]:
# Visualización de la estructura del dataframe:

df_clh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Loyalty Number      16737 non-null  int64  
 1   Country             16737 non-null  object 
 2   Province            16737 non-null  object 
 3   City                16737 non-null  object 
 4   Postal Code         16737 non-null  object 
 5   Gender              16737 non-null  object 
 6   Education           16737 non-null  object 
 7   Salary              12499 non-null  float64
 8   Marital Status      16737 non-null  object 
 9   Loyalty Card        16737 non-null  object 
 10  CLV                 16737 non-null  float64
 11  Enrollment Type     16737 non-null  object 
 12  Enrollment Year     16737 non-null  int64  
 13  Enrollment Month    16737 non-null  int64  
 14  Cancellation Year   2067 non-null   float64
 15  Cancellation Month  2067 non-null   float64
dtypes: f

In [16]:
# Visualización estadísticas descriptivas de las columnas numéricas:

df_clh.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,16737.0,549735.880445,258912.132453,100018.0,326603.0,550434.0,772019.0,999986.0
Salary,12499.0,79245.609409,35008.297285,-58486.0,59246.5,73455.0,88517.5,407228.0
CLV,16737.0,7988.896536,6860.98228,1898.01,3980.84,5780.18,8940.58,83325.38
Enrollment Year,16737.0,2015.253211,1.979111,2012.0,2014.0,2015.0,2017.0,2018.0
Enrollment Month,16737.0,6.669116,3.398958,1.0,4.0,7.0,10.0,12.0
Cancellation Year,2067.0,2016.503145,1.380743,2013.0,2016.0,2017.0,2018.0,2018.0
Cancellation Month,2067.0,6.962748,3.455297,1.0,4.0,7.0,10.0,12.0


In [17]:
# Visualización estadísticas descriptivas de las columnas categóricas o tipo object:

df_clh.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Country,16737,1,Canada,16737
Province,16737,11,Ontario,5404
City,16737,29,Toronto,3351
Postal Code,16737,55,V6E 3D9,911
Gender,16737,2,Female,8410
Education,16737,5,Bachelor,10475
Marital Status,16737,3,Married,9735
Loyalty Card,16737,3,Star,7637
Enrollment Type,16737,2,Standard,15766


In [18]:
# Visualicación cantidad de valores únicos de cada columna del dataframe:

df_clh.nunique()

Loyalty Number        16737
Country                   1
Province                 11
City                     29
Postal Code              55
Gender                    2
Education                 5
Salary                 5890
Marital Status            3
Loyalty Card              3
CLV                    7984
Enrollment Type           2
Enrollment Year           7
Enrollment Month         12
Cancellation Year         6
Cancellation Month       12
dtype: int64

In [19]:
def print_unique_values(df_clh):
    for column in df_cfa.columns:
        unique_values = df_cfa[column].unique()
        num_unique = len(unique_values)
        print(f"Columna: {column}")
        print(f"Número de valores únicos: {num_unique}")
        print(f"Valores únicos: {unique_values}")
        print("-" * 40)
print_unique_values(df_clh)

Columna: Loyalty Number
Número de valores únicos: 16737
Valores únicos: [100018 100102 100140 ... 999731 999788 999891]
----------------------------------------
Columna: Year
Número de valores únicos: 2
Valores únicos: [2017 2018]
----------------------------------------
Columna: Month
Número de valores únicos: 12
Valores únicos: [ 1  9  2  3 11  4  5  7  6  8 10 12]
----------------------------------------
Columna: Flights Booked
Número de valores únicos: 22
Valores únicos: [ 3 10  6  0  8 11  9  4  7  5  2  1 12 13 14 16 15 17 18 19 20 21]
----------------------------------------
Columna: Flights with Companions
Número de valores únicos: 12
Valores únicos: [ 0  4  7  1  6  3  5  2 10  8  9 11]
----------------------------------------
Columna: Total Flights
Número de valores únicos: 33
Valores únicos: [ 3 14  6  0 15 11 12 10  8  9  7  5 16  2  1 17 13 22  4 19 18 21 26 20
 23 25 27 24 28 30 29 31 32]
----------------------------------------
Columna: Distance
Número de valores únicos:

In [20]:
# Comprobar Nulos:
df_clh.isnull().sum()

Loyalty Number            0
Country                   0
Province                  0
City                      0
Postal Code               0
Gender                    0
Education                 0
Salary                 4238
Marital Status            0
Loyalty Card              0
CLV                       0
Enrollment Type           0
Enrollment Year           0
Enrollment Month          0
Cancellation Year     14670
Cancellation Month    14670
dtype: int64

In [21]:
# Comprobar que en la columna Loyalty Number (única columna que podría ser conflictiva en términos de duplicados) todos los valores son únicos:

df_clh.duplicated("Loyalty Number").sum()

np.int64(0)