In [231]:
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
#from scipy.stats import shapiro, kstest, poisson, chisquare, ttest_ind, levene, bartlett, sem, ppf
import scipy.stats as stats
from scipy.stats import shapiro, levene
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [232]:
# Abro los csv
df_flights = pd.read_csv("Customer Flight Activity.csv")
df_loyalty = pd.read_csv("Customer Loyalty History.csv")

## FASE 1: EXPLORACIÓN Y LIMPIEZA ##

In [233]:
# 1. EXPLORACIÓN INICIAL

In [234]:
# Exploro las 5 primeras filas de "df_flights"
df_flights.head()

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0
3,100214,2017,1,0,0,0,0,0.0,0,0
4,100272,2017,1,0,0,0,0,0.0,0,0


In [235]:
# Exploro las 5 últimas filas de "df_flights"
df_flights.tail()

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
405619,999902,2018,12,0,0,0,0,0.0,0,0
405620,999911,2018,12,0,0,0,0,0.0,0,0
405621,999940,2018,12,3,0,3,1233,123.0,0,0
405622,999982,2018,12,0,0,0,0,0.0,0,0
405623,999986,2018,12,0,0,0,0,0.0,0,0


In [236]:
# Exploro las 10 filas aleatorias de "df_flights"
df_flights.sample(10)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
17045,107119,2017,2,8,0,8,1584,158.0,0,0
311612,493640,2018,7,0,0,0,0,0.0,0,0
294915,504219,2018,6,13,2,15,1380,138.0,0,0
10412,118511,2017,8,0,0,0,0,0.0,0,0
102273,146099,2017,7,0,0,0,0,0.0,0,0
29535,769731,2017,2,0,0,0,0,0.0,0,0
45003,695965,2017,3,0,0,0,0,0.0,0,0
82132,871980,2017,5,3,0,3,1041,104.0,0,0
223977,328602,2018,2,0,0,0,0,0.0,0,0
52368,189914,2017,4,0,0,0,0,0.0,0,0


In [237]:
# Compruebo el total de filas y columnas de "df_flights"
df_flights.shape

(405624, 10)

In [238]:
# Exploro las 5 primeras filas de "df_loyalty"
df_loyalty.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
1,549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
2,429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0
3,608370,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,3839.75,Standard,2013,2,,
4,530508,Canada,Quebec,Hull,J8Y 3Z5,Male,Bachelor,103495.0,Married,Star,3842.79,Standard,2014,10,,


In [239]:
# Exploro las 5 últimas filas de "df_loyalty"
df_loyalty.tail()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
16732,823768,Canada,British Columbia,Vancouver,V6E 3Z3,Female,College,,Married,Star,61850.19,Standard,2012,12,,
16733,680886,Canada,Saskatchewan,Regina,S1J 3C5,Female,Bachelor,89210.0,Married,Star,67907.27,Standard,2014,9,,
16734,776187,Canada,British Columbia,Vancouver,V5R 1W3,Male,College,,Single,Star,74228.52,Standard,2014,3,,
16735,906428,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4,,
16736,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0


In [240]:
# Exploro las 10 filas aleatorias de "df_loyalty"
df_loyalty.sample(10)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
2461,876326,Canada,British Columbia,Whistler,V6T 1Y8,Female,College,,Single,Aurora,8178.52,Standard,2014,3,,
10405,876319,Canada,Ontario,Toronto,M2M 7K8,Female,Bachelor,69817.0,Married,Star,2417.76,Standard,2012,6,,
116,375511,Canada,Quebec,Montreal,H2Y 2W2,Female,Bachelor,59300.0,Single,Star,4070.05,Standard,2014,7,,
5748,202831,Canada,Newfoundland,St. John's,A1C 6H9,Male,College,,Single,Nova,4334.21,Standard,2014,1,,
6340,209675,Canada,Quebec,Montreal,H4G 3T4,Male,High School or Below,81391.0,Married,Nova,5046.89,Standard,2014,11,,
11283,975309,Canada,Ontario,Toronto,P5S 6R4,Female,College,,Married,Star,2606.05,Standard,2012,6,,
8877,563076,Canada,Ontario,Toronto,M2M 7K8,Male,Bachelor,80596.0,Married,Nova,11473.48,Standard,2016,5,,
58,645280,Canada,British Columbia,West Vancouver,V6V 8Z3,Male,Bachelor,75827.0,Married,Star,3939.01,Standard,2012,12,,
3321,444652,Canada,Alberta,Calgary,T3E 2V9,Male,College,,Single,Aurora,12067.46,Standard,2017,8,,
6348,335671,Canada,Ontario,Toronto,M1R 4K3,Male,College,,Married,Nova,5061.76,Standard,2013,8,,


In [241]:
# Compruebo el total de filas y columnas de "df_loyalty"
df_loyalty.shape

(16737, 16)

In [242]:
# Compruebo valores nulos para "df_flights"
# Observo que en este df NO tengo
df_flights.isnull().sum()

Loyalty Number                 0
Year                           0
Month                          0
Flights Booked                 0
Flights with Companions        0
Total Flights                  0
Distance                       0
Points Accumulated             0
Points Redeemed                0
Dollar Cost Points Redeemed    0
dtype: int64

In [243]:
# Compruebo valores nulos para "df_loyalty". 
df_loyalty.isnull().sum()

Loyalty Number            0
Country                   0
Province                  0
City                      0
Postal Code               0
Gender                    0
Education                 0
Salary                 4238
Marital Status            0
Loyalty Card              0
CLV                       0
Enrollment Type           0
Enrollment Year           0
Enrollment Month          0
Cancellation Year     14670
Cancellation Month    14670
dtype: int64

In [244]:
# En "df_loyalty" encuentro que tengo nulos en las columnas: "Salary", "Cancellation Year", "Cancellation Month". Miro de que tipo son estas columnas.
df_loyalty.dtypes

Loyalty Number          int64
Country                object
Province               object
City                   object
Postal Code            object
Gender                 object
Education              object
Salary                float64
Marital Status         object
Loyalty Card           object
CLV                   float64
Enrollment Type        object
Enrollment Year         int64
Enrollment Month        int64
Cancellation Year     float64
Cancellation Month    float64
dtype: object

In [245]:
# Lo observo en procentaje. En el siguiente punto de la fase exploración, analizo como voy a proceder con los datos nulos.
df_loyalty.isnull().sum() / df_loyalty.shape[0] * 100

Loyalty Number         0.000000
Country                0.000000
Province               0.000000
City                   0.000000
Postal Code            0.000000
Gender                 0.000000
Education              0.000000
Salary                25.321145
Marital Status         0.000000
Loyalty Card           0.000000
CLV                    0.000000
Enrollment Type        0.000000
Enrollment Year        0.000000
Enrollment Month       0.000000
Cancellation Year     87.650117
Cancellation Month    87.650117
dtype: float64

In [246]:
# Observo si hay duplicado en "df_flights". SI TENGO
df_flights.duplicated().sum()

1864

In [247]:
# Observo si hay duplicado en "df_loyalty". NO TENGO
df_loyalty.duplicated().sum()

0

In [248]:
# Solo tengo duplicados en "df_flights", analizo cuáles son. "keep=False" Duplicados en todas las columnas
# Observo que se repiten los valores en la columna "Loyalty Number"

df_flights[df_flights.duplicated(keep=False)]
 

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
41,101902,2017,1,0,0,0,0,0.0,0,0
42,101902,2017,1,0,0,0,0,0.0,0,0
226,112142,2017,1,0,0,0,0,0.0,0,0
227,112142,2017,1,0,0,0,0,0.0,0,0
477,126100,2017,1,0,0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
405111,971370,2018,12,0,0,0,0,0.0,0,0
405409,988392,2018,12,0,0,0,0,0.0,0,0
405410,988392,2018,12,0,0,0,0,0.0,0,0
405436,989528,2018,12,0,0,0,0,0.0,0,0


In [249]:
# Me di cuenta que colocando el indexcol=0 no me cogia "Loyalty Number" como columna. Lo cambié al leer los csv. 
df_flights.columns

Index(['Loyalty Number', 'Year', 'Month', 'Flights Booked',
       'Flights with Companions', 'Total Flights', 'Distance',
       'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed'],
      dtype='object')

In [250]:
# Veo los duplicados en la columna "Loyalty Number" 
df_flights.duplicated(subset = "Loyalty Number").sum()

388887

In [251]:
# Antes de unir los DataFrames debo gestionar los duplicados. 
# Como este atributo "Loyalty Number" representa un identificador único para cada cliente, decido eliminar los duplicados.

# Elimino las filas duplicadas del DataFrame
df_flights.drop_duplicates(inplace=True)

# Restablezco el índice del DataFrame
df_flights.reset_index(drop=True)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0
3,100214,2017,1,0,0,0,0,0.0,0,0
4,100272,2017,1,0,0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
403755,999902,2018,12,0,0,0,0,0.0,0,0
403756,999911,2018,12,0,0,0,0,0.0,0,0
403757,999940,2018,12,3,0,3,1233,123.0,0,0
403758,999982,2018,12,0,0,0,0,0.0,0,0


In [252]:
# Compruebo que se hayan eliminado los duplicados
df_flights.duplicated().sum()

0

In [253]:
# Ahora uniré los dos DataFrames, basándome en la columna común "Loyalty Number"
df_combinado = df_loyalty.merge(df_flights, on =["Loyalty Number"], how = "left")

RECORDATORIO PARA MÍ: (left join)
Si un Loyalty Number de loyalty tiene una coincidencia en flight, se agregarán las columnas de flight a esa fila.
Si un Loyalty Number de loyalty no tiene una coincidencia en flight, se mantendrá la fila de loyalty tal como está, pero las columnas de flight se llenarán con NaN (valores faltantes).

In [254]:
df_combinado.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,1,0,0,0,0,0.0,0,0
1,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,2,3,0,3,2823,282.0,0,0
2,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,3,0,0,0,0,0.0,0,0
3,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,4,0,0,0,0,0.0,0,0
4,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2018,10,6,2,8,3352,335.0,465,38


In [255]:
# Ahora compruebo el total de filas y columnas del nuevo DF "df_combinado"
df_combinado.shape

(403760, 25)

In [256]:
# Re-verifico que no tenga duplicados
df_combinado.duplicated().sum()

0

In [257]:
df_combinado.dtypes

Loyalty Number                   int64
Country                         object
Province                        object
City                            object
Postal Code                     object
Gender                          object
Education                       object
Salary                         float64
Marital Status                  object
Loyalty Card                    object
CLV                            float64
Enrollment Type                 object
Enrollment Year                  int64
Enrollment Month                 int64
Cancellation Year              float64
Cancellation Month             float64
Year                             int64
Month                            int64
Flights Booked                   int64
Flights with Companions          int64
Total Flights                    int64
Distance                         int64
Points Accumulated             float64
Points Redeemed                  int64
Dollar Cost Points Redeemed      int64
dtype: object

In [258]:
# Analizo las variables NÚMERICAS
df_combinado.select_dtypes(include=["int", "float"])

Unnamed: 0,Loyalty Number,Salary,CLV,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,83236.0,3839.14,2016,2,,,2017,1,0,0,0,0,0.0,0,0
1,480934,83236.0,3839.14,2016,2,,,2017,2,3,0,3,2823,282.0,0,0
2,480934,83236.0,3839.14,2016,2,,,2017,3,0,0,0,0,0.0,0,0
3,480934,83236.0,3839.14,2016,2,,,2017,4,0,0,0,0,0.0,0,0
4,480934,83236.0,3839.14,2016,2,,,2018,10,6,2,8,3352,335.0,465,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403755,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,8,0,0,0,0,0.0,0,0
403756,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,9,0,0,0,0,0.0,0,0
403757,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,10,0,0,0,0,0.0,0,0
403758,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,11,0,0,0,0,0.0,0,0


In [259]:
df_combinado.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,403760.0,549875.383713,258961.514684,100018.0,326699.0,550598.0,772152.0,999986.0
Salary,301500.0,79258.576285,34983.929798,-58486.0,59262.0,73479.0,88606.0,407228.0
CLV,403760.0,7990.864857,6863.31716,1898.01,3981.78,5776.34,8937.12,83325.38
Enrollment Year,403760.0,2015.250716,1.979427,2012.0,2014.0,2015.0,2017.0,2018.0
Enrollment Month,403760.0,6.667555,3.398829,1.0,4.0,7.0,10.0,12.0
Cancellation Year,49650.0,2016.50435,1.380482,2013.0,2016.0,2017.0,2018.0,2018.0
Cancellation Month,49650.0,6.962095,3.454362,1.0,4.0,7.0,10.0,12.0
Year,403760.0,2017.500352,0.5,2017.0,2017.0,2018.0,2018.0,2018.0
Month,403760.0,6.501335,3.451982,1.0,4.0,7.0,10.0,12.0
Flights Booked,403760.0,4.13405,5.230064,0.0,0.0,1.0,8.0,21.0


In [260]:
# # Analizo las variables CATEGÓRICAS
df_combinado.select_dtypes(include='object')

Unnamed: 0,Country,Province,City,Postal Code,Gender,Education,Marital Status,Loyalty Card,Enrollment Type
0,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
1,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
2,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
3,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
4,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
...,...,...,...,...,...,...,...,...,...
403755,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard
403756,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard
403757,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard
403758,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard


In [261]:
df_combinado.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Country,403760,1,Canada,403760
Province,403760,11,Ontario,130258
City,403760,29,Toronto,80775
Postal Code,403760,55,V6E 3D9,21944
Gender,403760,2,Female,202757
Education,403760,5,Bachelor,252567
Marital Status,403760,3,Married,234845
Loyalty Card,403760,3,Star,183745
Enrollment Type,403760,2,Standard,380419


In [262]:
df_combinado.columns

Index(['Loyalty Number', 'Country', 'Province', 'City', 'Postal Code',
       'Gender', 'Education', 'Salary', 'Marital Status', 'Loyalty Card',
       'CLV', 'Enrollment Type', 'Enrollment Year', 'Enrollment Month',
       'Cancellation Year', 'Cancellation Month', 'Year', 'Month',
       'Flights Booked', 'Flights with Companions', 'Total Flights',
       'Distance', 'Points Accumulated', 'Points Redeemed',
       'Dollar Cost Points Redeemed'],
      dtype='object')

In [263]:
# Obtengo los valore únicos de cada columna

columnas = ['Loyalty Number', 'Country', 'Province', 'City', 'Postal Code',
       'Gender', 'Education', 'Salary', 'Marital Status', 'Loyalty Card',
       'CLV', 'Enrollment Type', 'Enrollment Year', 'Enrollment Month',
       'Cancellation Year', 'Cancellation Month', 'Year', 'Month',
       'Flights Booked', 'Flights with Companions', 'Total Flights',
       'Distance', 'Points Accumulated', 'Points Redeemed',
       'Dollar Cost Points Redeemed']

valores_unicos = {col: df_combinado[col].unique() for col in columnas}


for col, valores in valores_unicos.items():
    print(f"Valores únicos en {col}:")
    print(valores)
    print() 


Valores únicos en Loyalty Number:
[480934 549612 429460 ... 776187 906428 652627]

Valores únicos en Country:
['Canada']

Valores únicos en Province:
['Ontario' 'Alberta' 'British Columbia' 'Quebec' 'Yukon' 'New Brunswick'
 'Manitoba' 'Nova Scotia' 'Saskatchewan' 'Newfoundland'
 'Prince Edward Island']

Valores únicos en City:
['Toronto' 'Edmonton' 'Vancouver' 'Hull' 'Whitehorse' 'Trenton' 'Montreal'
 'Dawson Creek' 'Quebec City' 'Fredericton' 'Ottawa' 'Tremblant' 'Calgary'
 'Thunder Bay' 'Whistler' 'Peace River' 'Winnipeg' 'Sudbury'
 'West Vancouver' 'Halifax' 'London' 'Regina' 'Kelowna' "St. John's"
 'Victoria' 'Kingston' 'Banff' 'Moncton' 'Charlottetown']

Valores únicos en Postal Code:
['M2Z 4K1' 'T3G 6Y6' 'V6E 3D9' 'P1W 1K4' 'J8Y 3Z5' 'Y2K 6R0' 'P5S 6R4'
 'K8V 4B2' 'H2Y 2W2' 'M8Y 4K8' 'U5I 4F1' 'G1B 3L5' 'H4G 3T4' 'M2M 7K8'
 'M2M 6J7' 'E3B 2H2' 'M1R 4K3' 'T9G 1W3' 'H2Y 4R4' 'V5R 1W3' 'P1L 8X8'
 'K1F 2R2' 'H5Y 2S9' 'V1E 4R6' 'H2T 2J6' 'T3E 2V9' 'H2T 9K8' 'K8T 5M5'
 'V6T 1Y8' 'P2T 6

In [264]:
# ENCUENTRO: 
# 1. Cancellation Year: está como decimal FLOAT. Debe ser cambiado a INT
# 2. Cancellation Month: está como decimal FLOAT. Lo mejor sería cambiarlo a string con el nombre del mes
# 3. Salary: tiene valores negativos
# 4. Valores nulos

In [265]:
# 2. LIMPIEZA DE DATOS

In [266]:
# Para hacer los cambios antes mencionados, hago primero manejo de nulos.
# sin embargo, debo primero convertir los valores negativos en la columna 'Salary' a positivos

In [267]:
# Uso la función abs() que convierte los valores negativos en positivos y deja los valores positivos tal como están
df_combinado['Salary'] = df_combinado['Salary'].abs()

In [268]:
df_combinado['Salary'].describe().T

count    301500.000000
mean      79371.732902
std       34726.433958
min        9081.000000
25%       59262.000000
50%       73479.000000
75%       88606.000000
max      407228.000000
Name: Salary, dtype: float64

In [269]:
# Ahora retomo los nulos que había comprobado en el paso anterior
df_combinado.isnull().mean() * 100

Loyalty Number                  0.000000
Country                         0.000000
Province                        0.000000
City                            0.000000
Postal Code                     0.000000
Gender                          0.000000
Education                       0.000000
Salary                         25.326927
Marital Status                  0.000000
Loyalty Card                    0.000000
CLV                             0.000000
Enrollment Type                 0.000000
Enrollment Year                 0.000000
Enrollment Month                0.000000
Cancellation Year              87.703091
Cancellation Month             87.703091
Year                            0.000000
Month                           0.000000
Flights Booked                  0.000000
Flights with Companions         0.000000
Total Flights                   0.000000
Distance                        0.000000
Points Accumulated              0.000000
Points Redeemed                 0.000000
Dollar Cost Poin

In [270]:
# Guardo en una variable las columnas con nulos, para darle manejo a los nulos más adelante, a través de la variable
nulos = df_combinado[["Salary", "Cancellation Year", "Cancellation Month"]]

In [271]:
# Observo que tipo de datos son esas tres columnas, para ver que manejo le doy. Son variables númericas. 
nulos.dtypes

Salary                float64
Cancellation Year     float64
Cancellation Month    float64
dtype: object

In [272]:
# SALARY: Comienzo con los nulos de "Salary". Saco la media y la mediana para analizar estos datos 

media_salary = round(df_combinado['Salary'].mean(), 2)
print("media:", media_salary)
mediana_salary = df_combinado['Salary'].median()
print("mediana:", mediana_salary)

media: 79371.73
mediana: 73479.0


In [273]:
# Pienso que la media y la mediana son muy parecidas, entonces usaré la media para imputar los nulos con el método SimpleImputer

In [274]:
# copio el código de la lección "modulo-3-leccion-06-02-imputacion-numericas"

# creamos el objeto de la clase del SimpleImputer
imputer_salary = SimpleImputer(strategy = "mean")

# ajustamos y transformamos los datos.
salary_imputado = imputer_salary.fit_transform(df_combinado[["Salary"]])

# comprobamos que es lo que nos devuelve este método. Nos devuelve un array. 
salary_imputado

array([[83236.],
       [83236.],
       [83236.],
       ...,
       [75049.],
       [75049.],
       [75049.]])

In [275]:
# lo siguiente que tenemos que hacer es introducir este array en nuestro DataFrame, en este caso sobreescribiremos la columna original. 
df_combinado["Salary"] = salary_imputado

# por último comprobamos si tenemos nulos. 
print(f"Después del 'SimpleImputer' tenemos {df_combinado["Salary"].isnull().sum()} nulos")

Después del 'SimpleImputer' tenemos 0 nulos


In [276]:
df_combinado["Salary"].isna().sum()

0

In [277]:
# En el caso de la columna "Cancellation Year" creo que es conveniente reemplazar los valores nulos por la moda. 
# Saco la moda para saber cuál es el año que más se repite.
moda_cy = df_combinado["Cancellation Year"].mode()[0]  
print("moda:", moda_cy)

moda: 2018.0


In [278]:
# Hago reemplazo de nulos
df_combinado["Cancellation Year"] = df_combinado["Cancellation Year"].fillna(moda_cy)

In [279]:
# Compruebo los nulos en la columna "Cancellation Year"
df_combinado["Cancellation Year"].isnull().sum()

0

In [280]:
# Hago procedimiento para la columna "Cancellation Month". Saco la moda para saber cuál es el mes que más se repite.
moda_cm = df_combinado["Cancellation Month"].mode()[0]  
print("moda:", moda_cm)


moda: 12.0


In [281]:
# Hago reemplazo de nulos
df_combinado["Cancellation Month"] = df_combinado["Cancellation Month"].fillna(moda_cy)

In [282]:
# Compruebo los nulos en la columna "Cancellation Month"
df_combinado["Cancellation Month"].isnull().sum() 

0

In [283]:
# Ahora si puedo convertir las columnas "Cancellation Year" y "Cancellation Month" de tipo float a int.
df_combinado["Cancellation Year"] = df_combinado["Cancellation Year"].astype(int)
df_combinado["Cancellation Month"] = df_combinado["Cancellation Month"].astype(int)

In [284]:
df_combinado.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,1,0,0,0,0,0.0,0,0
1,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,2,3,0,3,2823,282.0,0,0
2,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,3,0,0,0,0,0.0,0,0
3,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,4,0,0,0,0,0.0,0,0
4,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2018,10,6,2,8,3352,335.0,465,38


In [None]:
# df_combinado.to_csv('df_combinado.csv', index=False)