In [105]:
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
#from scipy.stats import shapiro, kstest, poisson, chisquare, ttest_ind, levene, bartlett, sem, ppf
import scipy.stats as stats
from scipy.stats import shapiro, levene
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [106]:
# Abro los csv
df_flights = pd.read_csv("../Customer Flight Activity.csv")
df_loyalty = pd.read_csv("../Customer Loyalty History.csv")

## FASE 1: EXPLORACIÓN Y LIMPIEZA ##

In [107]:
# 1. EXPLORACIÓN INICIAL

In [108]:
# Exploro las 5 primeras filas de "df_flights"
df_flights.head()

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0
3,100214,2017,1,0,0,0,0,0.0,0,0
4,100272,2017,1,0,0,0,0,0.0,0,0


In [109]:
# Exploro las 5 últimas filas de "df_flights"
df_flights.tail()

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
405619,999902,2018,12,0,0,0,0,0.0,0,0
405620,999911,2018,12,0,0,0,0,0.0,0,0
405621,999940,2018,12,3,0,3,1233,123.0,0,0
405622,999982,2018,12,0,0,0,0,0.0,0,0
405623,999986,2018,12,0,0,0,0,0.0,0,0


In [110]:
# Exploro las 10 filas aleatorias de "df_flights"
df_flights.sample(10)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
58725,528074,2017,4,0,0,0,0,0.0,0,0
402374,826136,2018,12,0,0,0,0,0.0,0,0
362817,520202,2018,10,5,0,5,1820,182.0,0,0
353991,950707,2018,9,4,2,6,1068,106.0,0,0
129309,684889,2017,8,0,0,0,0,0.0,0,0
380643,570796,2018,11,13,3,16,3120,312.0,0,0
263439,313424,2017,11,1,1,2,1522,152.0,528,43
405110,971370,2018,12,0,0,0,0,0.0,0,0
394983,979449,2018,6,0,0,0,0,0.0,0,0
286276,857914,2017,3,7,1,8,1544,154.0,475,38


In [111]:
# Compruebo el total de filas y columnas de "df_flights"
df_flights.shape

(405624, 10)

In [112]:
# Exploro las 5 primeras filas de "df_loyalty"
df_loyalty.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
1,549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
2,429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0
3,608370,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,3839.75,Standard,2013,2,,
4,530508,Canada,Quebec,Hull,J8Y 3Z5,Male,Bachelor,103495.0,Married,Star,3842.79,Standard,2014,10,,


In [113]:
# Exploro las 5 últimas filas de "df_loyalty"
df_loyalty.tail()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
16732,823768,Canada,British Columbia,Vancouver,V6E 3Z3,Female,College,,Married,Star,61850.19,Standard,2012,12,,
16733,680886,Canada,Saskatchewan,Regina,S1J 3C5,Female,Bachelor,89210.0,Married,Star,67907.27,Standard,2014,9,,
16734,776187,Canada,British Columbia,Vancouver,V5R 1W3,Male,College,,Single,Star,74228.52,Standard,2014,3,,
16735,906428,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4,,
16736,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0


In [114]:
# Exploro las 10 filas aleatorias de "df_loyalty"
df_loyalty.sample(10)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
3376,419748,Canada,British Columbia,Vancouver,V6E 3D9,Male,Bachelor,61813.0,Married,Aurora,12731.95,Standard,2012,12,,
9933,270909,Canada,Yukon,Whitehorse,Y2K 6R0,Male,College,,Single,Star,2188.66,Standard,2014,5,,
10338,190429,Canada,Ontario,Thunder Bay,K8T 5M5,Male,Bachelor,36994.0,Divorced,Nova,20878.41,2018 Promotion,2018,2,,
12326,338872,Canada,Saskatchewan,Regina,S6J 3G0,Female,Bachelor,54550.0,Divorced,Star,3589.37,Standard,2016,12,,
51,869743,Canada,Ontario,Sudbury,M5V 1G5,Female,Bachelor,73236.0,Single,Star,3924.42,Standard,2016,11,2017.0,7.0
15900,237888,Canada,Ontario,Toronto,M8Y 4K8,Male,Bachelor,85364.0,Married,Star,12741.85,Standard,2017,5,,
11261,203133,Canada,British Columbia,Dawson Creek,U5I 4F1,Female,Bachelor,69942.0,Married,Star,2600.27,Standard,2013,5,2018.0,6.0
7906,327498,Canada,Alberta,Peace River,T9O 2W2,Male,Bachelor,64702.0,Divorced,Nova,7956.15,Standard,2016,1,2016.0,9.0
5856,650657,Canada,British Columbia,Vancouver,V6E 3Z3,Female,College,,Single,Nova,4463.61,Standard,2015,6,,
3580,907252,Canada,New Brunswick,Fredericton,E3B 2H2,Male,College,,Divorced,Aurora,14862.76,Standard,2015,5,,


In [115]:
# Compruebo el total de filas y columnas de "df_loyalty"
df_loyalty.shape

(16737, 16)

In [116]:
# Compruebo valores nulos para "df_flights"
# Observo que en este df NO tengo
df_flights.isnull().sum()

Loyalty Number                 0
Year                           0
Month                          0
Flights Booked                 0
Flights with Companions        0
Total Flights                  0
Distance                       0
Points Accumulated             0
Points Redeemed                0
Dollar Cost Points Redeemed    0
dtype: int64

In [117]:
# Compruebo valores nulos para "df_loyalty". 
df_loyalty.isnull().sum()

Loyalty Number            0
Country                   0
Province                  0
City                      0
Postal Code               0
Gender                    0
Education                 0
Salary                 4238
Marital Status            0
Loyalty Card              0
CLV                       0
Enrollment Type           0
Enrollment Year           0
Enrollment Month          0
Cancellation Year     14670
Cancellation Month    14670
dtype: int64

In [118]:
# En "df_loyalty" encuentro que tengo nulos en las columnas: "Salary", "Cancellation Year", "Cancellation Month". Miro de que tipo son estas columnas.
df_loyalty.dtypes

Loyalty Number          int64
Country                object
Province               object
City                   object
Postal Code            object
Gender                 object
Education              object
Salary                float64
Marital Status         object
Loyalty Card           object
CLV                   float64
Enrollment Type        object
Enrollment Year         int64
Enrollment Month        int64
Cancellation Year     float64
Cancellation Month    float64
dtype: object

In [119]:
# Lo observo en procentaje. En el siguiente punto de la fase exploración, analizo como voy a proceder con los datos nulos.
df_loyalty.isnull().sum() / df_loyalty.shape[0] * 100

Loyalty Number         0.000000
Country                0.000000
Province               0.000000
City                   0.000000
Postal Code            0.000000
Gender                 0.000000
Education              0.000000
Salary                25.321145
Marital Status         0.000000
Loyalty Card           0.000000
CLV                    0.000000
Enrollment Type        0.000000
Enrollment Year        0.000000
Enrollment Month       0.000000
Cancellation Year     87.650117
Cancellation Month    87.650117
dtype: float64

In [120]:
# Observo si hay duplicado en "df_flights". SI TENGO
df_flights.duplicated().sum()

1864

In [121]:
# Observo si hay duplicado en "df_loyalty". NO TENGO
df_loyalty.duplicated().sum()

0

In [122]:
# Solo tengo duplicados en "df_flights", analizo cuáles son. "keep=False" Duplicados en todas las columnas
# Observo que se repiten los valores en la columna "Loyalty Number", sin embargo no se eliminan ya que es info relevante

df_flights[df_flights.duplicated(keep=False)]
 

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
41,101902,2017,1,0,0,0,0,0.0,0,0
42,101902,2017,1,0,0,0,0,0.0,0,0
226,112142,2017,1,0,0,0,0,0.0,0,0
227,112142,2017,1,0,0,0,0,0.0,0,0
477,126100,2017,1,0,0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
405111,971370,2018,12,0,0,0,0,0.0,0,0
405409,988392,2018,12,0,0,0,0,0.0,0,0
405410,988392,2018,12,0,0,0,0,0.0,0,0
405436,989528,2018,12,0,0,0,0,0.0,0,0


In [123]:
# Me di cuenta que colocando el indexcol=0 no me cogia "Loyalty Number" como columna. Lo cambié al leer los csv. 
df_flights.columns

Index(['Loyalty Number', 'Year', 'Month', 'Flights Booked',
       'Flights with Companions', 'Total Flights', 'Distance',
       'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed'],
      dtype='object')

In [124]:
# Acá observo los valores negativos del df "df_loyalty" que descubrí más adelante. 
df_loyalty[df_loyalty["Salary"] < 0]

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
1082,542976,Canada,Quebec,Montreal,H2Y 4R4,Male,High School or Below,-49830.0,Divorced,Star,24127.5,2018 Promotion,2018,3,,
1894,959977,Canada,British Columbia,Vancouver,V5R 1W3,Female,Bachelor,-12497.0,Married,Aurora,9453.0,2018 Promotion,2018,3,,
2471,232755,Canada,British Columbia,Vancouver,V1E 4R6,Female,Bachelor,-46683.0,Single,Nova,4787.81,2018 Promotion,2018,3,,
3575,525245,Canada,British Columbia,Victoria,V10 6T5,Male,Bachelor,-45962.0,Married,Star,2402.33,2018 Promotion,2018,3,,
3932,603070,Canada,British Columbia,West Vancouver,V6V 8Z3,Female,Bachelor,-19325.0,Single,Star,2893.74,2018 Promotion,2018,3,,
4712,491242,Canada,British Columbia,Dawson Creek,U5I 4F1,Male,Bachelor,-43234.0,Married,Star,7597.91,2018 Promotion,2018,3,,
6560,115505,Canada,Newfoundland,St. John's,A1C 6H9,Male,Bachelor,-10605.0,Married,Nova,5860.17,2018 Promotion,2018,4,,
6570,430398,Canada,Newfoundland,St. John's,A1C 6H9,Male,Bachelor,-17534.0,Married,Nova,49423.8,2018 Promotion,2018,3,,
7373,152016,Canada,Ontario,Toronto,P1J 8T7,Female,Bachelor,-58486.0,Married,Aurora,5067.21,2018 Promotion,2018,2,,
8576,194065,Canada,Ontario,Sudbury,M5V 1G5,Female,Bachelor,-31911.0,Married,Nova,2888.85,2018 Promotion,2018,2,,


In [125]:
# Antes de unir los PD, decido convertir los salarios en positivo, ya que puede tratarse de un error de digitación. 
df_loyalty["Salary"] = df_loyalty["Salary"].abs()

In [126]:
# Ahora uniré los dos DataFrames, basándome en la columna común "Loyalty Number"
df_combinado = df_loyalty.merge(df_flights, on =["Loyalty Number"], how = "left")

RECORDATORIO PARA MÍ: (left join)
Si un Loyalty Number de loyalty tiene una coincidencia en flight, se agregarán las columnas de flight a esa fila.
Si un Loyalty Number de loyalty no tiene una coincidencia en flight, se mantendrá la fila de loyalty tal como está, pero las columnas de flight se llenarán con NaN (valores faltantes).

In [127]:
df_combinado.shape

(405624, 25)

In [128]:
df_combinado.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,1,0,0,0,0,0.0,0,0
1,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,2,3,0,3,2823,282.0,0,0
2,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,3,0,0,0,0,0.0,0,0
3,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,4,0,0,0,0,0.0,0,0
4,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2018,10,6,2,8,3352,335.0,465,38


In [129]:
df_combinado.tail()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
405619,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0,2018,8,0,0,0,0,0.0,0,0
405620,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0,2018,9,0,0,0,0,0.0,0,0
405621,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0,2018,10,0,0,0,0,0.0,0,0
405622,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0,2018,11,0,0,0,0,0.0,0,0
405623,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0,2018,12,0,0,0,0,0.0,0,0


In [130]:
df_combinado.sample(10)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
149808,166955,Canada,Manitoba,Winnipeg,R2C 0M5,Female,High School or Below,57604.0,Married,Nova,4826.83,Standard,2016,11,,,2017,1,7,0,7,1554,155.0,0,0
95648,170152,Canada,British Columbia,West Vancouver,V6V 8Z3,Female,Bachelor,74689.0,Married,Aurora,23187.15,Standard,2018,9,,,2017,9,0,0,0,0,0.0,0,0
88049,950467,Canada,British Columbia,Vancouver,V6E 3Z3,Male,College,,Married,Star,2425.3,2018 Promotion,2018,4,,,2018,5,11,8,19,3363,336.0,0,0
154231,237094,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,98937.0,Married,Nova,5024.03,Standard,2017,3,,,2017,4,0,0,0,0,0.0,0,0
339139,193212,Canada,British Columbia,Vancouver,V6E 3Z3,Female,Bachelor,104525.0,Single,Star,5816.71,Standard,2017,1,,,2018,8,0,0,0,0,0.0,0,0
287324,163757,Canada,Ontario,Ottawa,K1F 2R2,Male,Bachelor,62886.0,Single,Star,2756.54,Standard,2015,11,,,2018,9,8,0,8,2560,256.0,0,0
46100,784271,Canada,Quebec,Montreal,H4G 3T4,Male,Doctor,244617.0,Married,Aurora,6943.56,Standard,2014,11,,,2018,9,0,0,0,0,0.0,0,0
229599,380788,Canada,Quebec,Montreal,H2Y 2W2,Female,Bachelor,98888.0,Married,Nova,18938.54,Standard,2016,2,2018.0,2.0,2018,5,0,0,0,0,0.0,0,0
109084,535413,Canada,Quebec,Tremblant,H5Y 2S9,Male,Bachelor,64165.0,Married,Nova,2874.07,Standard,2018,1,,,2017,4,0,0,0,0,0.0,0,0
306986,268439,Canada,Ontario,Kingston,M9K 2P4,Male,Bachelor,75819.0,Married,Star,4179.49,Standard,2017,3,,,2017,3,0,0,0,0,0.0,0,0


In [131]:
# Elimino la columna "Country" ya que todas las ciudades y provincias hacen referencia al país Canadá
df_combinado.drop(columns = 'Country', inplace = True)

In [132]:
# Compruebo que se haya eliminado
df_combinado.head(1)

Unnamed: 0,Loyalty Number,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,,2017,1,0,0,0,0,0.0,0,0


In [133]:
df_combinado.dtypes

Loyalty Number                   int64
Province                        object
City                            object
Postal Code                     object
Gender                          object
Education                       object
Salary                         float64
Marital Status                  object
Loyalty Card                    object
CLV                            float64
Enrollment Type                 object
Enrollment Year                  int64
Enrollment Month                 int64
Cancellation Year              float64
Cancellation Month             float64
Year                             int64
Month                            int64
Flights Booked                   int64
Flights with Companions          int64
Total Flights                    int64
Distance                         int64
Points Accumulated             float64
Points Redeemed                  int64
Dollar Cost Points Redeemed      int64
dtype: object

In [134]:
# Analizo las variables NÚMERICAS
df_combinado.select_dtypes(include=["int", "float"])

Unnamed: 0,Loyalty Number,Salary,CLV,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,83236.0,3839.14,2016,2,,,2017,1,0,0,0,0,0.0,0,0
1,480934,83236.0,3839.14,2016,2,,,2017,2,3,0,3,2823,282.0,0,0
2,480934,83236.0,3839.14,2016,2,,,2017,3,0,0,0,0,0.0,0,0
3,480934,83236.0,3839.14,2016,2,,,2017,4,0,0,0,0,0.0,0,0
4,480934,83236.0,3839.14,2016,2,,,2018,10,6,2,8,3352,335.0,465,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405619,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,8,0,0,0,0,0.0,0,0
405620,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,9,0,0,0,0,0.0,0,0
405621,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,10,0,0,0,0,0.0,0,0
405622,652627,75049.0,83325.38,2015,12,2016.0,8.0,2018,11,0,0,0,0,0.0,0,0


In [135]:
df_combinado.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,405624.0,550037.873084,258935.286969,100018.0,326961.0,550834.0,772194.0,999986.0
Salary,302952.0,79381.440228,34735.904533,9081.0,59262.0,73479.0,88612.0,407228.0
CLV,405624.0,7991.976226,6863.663857,1898.01,3985.32,5776.34,8936.82,83325.38
Enrollment Year,405624.0,2015.252529,1.979427,2012.0,2014.0,2015.0,2017.0,2018.0
Enrollment Month,405624.0,6.668008,3.399766,1.0,4.0,7.0,10.0,12.0
Cancellation Year,50064.0,2016.499521,1.384336,2013.0,2016.0,2017.0,2018.0,2018.0
Cancellation Month,50064.0,6.966443,3.4483,1.0,4.0,7.0,10.0,12.0
Year,405624.0,2017.5,0.500001,2017.0,2017.0,2017.5,2018.0,2018.0
Month,405624.0,6.5,3.452057,1.0,3.75,6.5,9.25,12.0
Flights Booked,405624.0,4.115052,5.225518,0.0,0.0,1.0,8.0,21.0


In [137]:
# # Analizo las variables CATEGÓRICAS
df_combinado.select_dtypes(include='object')

Unnamed: 0,Province,City,Postal Code,Gender,Education,Marital Status,Loyalty Card,Enrollment Type
0,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
1,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
2,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
3,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
4,Ontario,Toronto,M2Z 4K1,Female,Bachelor,Married,Star,Standard
...,...,...,...,...,...,...,...,...
405619,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard
405620,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard
405621,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard
405622,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,Married,Star,Standard


In [138]:
df_combinado.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Province,405624,11,Ontario,130896
City,405624,29,Toronto,81096
Postal Code,405624,55,V6E 3D9,21984
Gender,405624,2,Female,203640
Education,405624,5,Bachelor,253752
Marital Status,405624,3,Married,235800
Loyalty Card,405624,3,Star,184224
Enrollment Type,405624,2,Standard,382200


In [139]:
df_combinado.columns

Index(['Loyalty Number', 'Province', 'City', 'Postal Code', 'Gender',
       'Education', 'Salary', 'Marital Status', 'Loyalty Card', 'CLV',
       'Enrollment Type', 'Enrollment Year', 'Enrollment Month',
       'Cancellation Year', 'Cancellation Month', 'Year', 'Month',
       'Flights Booked', 'Flights with Companions', 'Total Flights',
       'Distance', 'Points Accumulated', 'Points Redeemed',
       'Dollar Cost Points Redeemed'],
      dtype='object')

In [141]:
# Obtengo los valore únicos de cada columna

columnas = ['Loyalty Number', 'Province', 'City', 'Postal Code', 'Gender',
       'Education', 'Salary', 'Marital Status', 'Loyalty Card', 'CLV',
       'Enrollment Type', 'Enrollment Year', 'Enrollment Month',
       'Cancellation Year', 'Cancellation Month', 'Year', 'Month',
       'Flights Booked', 'Flights with Companions', 'Total Flights',
       'Distance', 'Points Accumulated', 'Points Redeemed',
       'Dollar Cost Points Redeemed']

valores_unicos = {col: df_combinado[col].unique() for col in columnas}


for col, valores in valores_unicos.items():
    print(f"Valores únicos en {col}:")
    print(valores)
    print() 


Valores únicos en Loyalty Number:
[480934 549612 429460 ... 776187 906428 652627]

Valores únicos en Province:
['Ontario' 'Alberta' 'British Columbia' 'Quebec' 'Yukon' 'New Brunswick'
 'Manitoba' 'Nova Scotia' 'Saskatchewan' 'Newfoundland'
 'Prince Edward Island']

Valores únicos en City:
['Toronto' 'Edmonton' 'Vancouver' 'Hull' 'Whitehorse' 'Trenton' 'Montreal'
 'Dawson Creek' 'Quebec City' 'Fredericton' 'Ottawa' 'Tremblant' 'Calgary'
 'Thunder Bay' 'Whistler' 'Peace River' 'Winnipeg' 'Sudbury'
 'West Vancouver' 'Halifax' 'London' 'Regina' 'Kelowna' "St. John's"
 'Victoria' 'Kingston' 'Banff' 'Moncton' 'Charlottetown']

Valores únicos en Postal Code:
['M2Z 4K1' 'T3G 6Y6' 'V6E 3D9' 'P1W 1K4' 'J8Y 3Z5' 'Y2K 6R0' 'P5S 6R4'
 'K8V 4B2' 'H2Y 2W2' 'M8Y 4K8' 'U5I 4F1' 'G1B 3L5' 'H4G 3T4' 'M2M 7K8'
 'M2M 6J7' 'E3B 2H2' 'M1R 4K3' 'T9G 1W3' 'H2Y 4R4' 'V5R 1W3' 'P1L 8X8'
 'K1F 2R2' 'H5Y 2S9' 'V1E 4R6' 'H2T 2J6' 'T3E 2V9' 'H2T 9K8' 'K8T 5M5'
 'V6T 1Y8' 'P2T 6G3' 'T9O 2W2' 'V6E 3Z3' 'R6Y 4T5' 'M5V 

In [None]:
# ENCUENTRO: 
# 1. Cancellation Year: está como decimal FLOAT. Debe ser cambiado a INT
# 2. Cancellation Month: está como decimal FLOAT. Lo mejor sería cambiarlo a string con el nombre del mes
# 3. Salary: tiene valores negativos
# 4. Valores nulos

In [None]:
# 2. LIMPIEZA DE DATOS

In [None]:
# Para hacer los cambios antes mencionados, hago primero manejo de nulos.

In [None]:
# Ahora retomo los nulos que había comprobado en el paso anterior
df_combinado.isnull().mean() * 100

Loyalty Number                  0.000000
Country                         0.000000
Province                        0.000000
City                            0.000000
Postal Code                     0.000000
Gender                          0.000000
Education                       0.000000
Salary                         25.312112
Marital Status                  0.000000
Loyalty Card                    0.000000
CLV                             0.000000
Enrollment Type                 0.000000
Enrollment Year                 0.000000
Enrollment Month                0.000000
Cancellation Year              87.657535
Cancellation Month             87.657535
Year                            0.000000
Month                           0.000000
Flights Booked                  0.000000
Flights with Companions         0.000000
Total Flights                   0.000000
Distance                        0.000000
Points Accumulated              0.000000
Points Redeemed                 0.000000
Dollar Cost Poin

In [None]:
# Guardo en una variable las columnas con nulos, para darle manejo a los nulos más adelante, a través de la variable
nulos = df_combinado[["Salary", "Cancellation Year", "Cancellation Month"]]

In [None]:
# Observo que tipo de datos son esas tres columnas, para ver que manejo le doy. Son variables númericas. 
nulos.dtypes

Salary                float64
Cancellation Year     float64
Cancellation Month    float64
dtype: object

In [None]:
# SALARY: Comienzo con los nulos de "Salary". Saco la media y la mediana para analizar estos datos 

media_salary = round(df_combinado['Salary'].mean(), 2)
print("media:", media_salary)
mediana_salary = df_combinado['Salary'].median()
print("mediana:", mediana_salary)

media: 79381.44
mediana: 73479.0


In [None]:
# DEBO CAMBIAR POR LA MEDIANA

In [None]:
# Pienso que la media y la mediana son muy parecidas, entonces usaré la media para imputar los nulos con el método SimpleImputer

In [None]:
# copio el código de la lección "modulo-3-leccion-06-02-imputacion-numericas"

# creamos el objeto de la clase del SimpleImputer
imputer_salary = SimpleImputer(strategy = "mean")

# ajustamos y transformamos los datos.
salary_imputado = imputer_salary.fit_transform(df_combinado[["Salary"]])

# comprobamos que es lo que nos devuelve este método. Nos devuelve un array. 
salary_imputado

array([[83236.],
       [83236.],
       [83236.],
       ...,
       [75049.],
       [75049.],
       [75049.]])

In [None]:
# lo siguiente que tenemos que hacer es introducir este array en nuestro DataFrame, en este caso sobreescribiremos la columna original. 
df_combinado["Salary"] = salary_imputado

# por último comprobamos si tenemos nulos. 
print(f"Después del 'SimpleImputer' tenemos {df_combinado["Salary"].isnull().sum()} nulos")

Después del 'SimpleImputer' tenemos 0 nulos


In [None]:
df_combinado["Salary"].isna().sum()

0

In [None]:
# En el caso de la columna "Cancellation Year" creo que es conveniente reemplazar los valores nulos por la moda. 
# Saco la moda para saber cuál es el año que más se repite.
moda_cy = df_combinado["Cancellation Year"].mode()[0]  
print("moda:", moda_cy)

moda: 2018.0


In [None]:
# Hago reemplazo de nulos
df_combinado["Cancellation Year"] = df_combinado["Cancellation Year"].fillna(moda_cy)

In [None]:
# Compruebo los nulos en la columna "Cancellation Year"
df_combinado["Cancellation Year"].isnull().sum()

0

In [None]:
# Hago procedimiento para la columna "Cancellation Month". Saco la moda para saber cuál es el mes que más se repite.
moda_cm = df_combinado["Cancellation Month"].mode()[0]  
print("moda:", moda_cm)


moda: 11.0


In [None]:
# Hago reemplazo de nulos
df_combinado["Cancellation Month"] = df_combinado["Cancellation Month"].fillna(moda_cy)

In [None]:
# Compruebo los nulos en la columna "Cancellation Month"
df_combinado["Cancellation Month"].isnull().sum() 

0

In [None]:
# Ahora si puedo convertir las columnas "Cancellation Year" y "Cancellation Month" de tipo float a int.
df_combinado["Cancellation Year"] = df_combinado["Cancellation Year"].astype(int)
df_combinado["Cancellation Month"] = df_combinado["Cancellation Month"].astype(int)

In [None]:
df_combinado.head()

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,1,0,0,0,0,0.0,0,0
1,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,2,3,0,3,2823,282.0,0,0
2,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,3,0,0,0,0,0.0,0,0
3,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,4,0,0,0,0,0.0,0,0
4,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2018,10,6,2,8,3352,335.0,465,38


In [None]:
# df_combinado.to_csv('df_combinado.csv', index=False)