In [1]:
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
#from scipy.stats import shapiro, kstest, poisson, chisquare, ttest_ind, levene, bartlett, sem, ppf
import scipy.stats as stats
from scipy.stats import shapiro, levene
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

# Display configuration
# -----------------------------------------------------------------------
# Remove the column display cap so every DataFrame column is rendered.
pd.options.display.max_columns = None

# Warning handling
# -----------------------------------------------------------------------
import warnings
# Install a blanket "ignore" filter that matches every warning category.
# NOTE(review): this also silences deprecation notices — confirm that is intended.
warnings.simplefilter("ignore")

In [None]:
# Load the two CSV exports; the first column of each file becomes the index.
# NOTE(review): the rendered output labels that index "Loyalty Number"; in the
# flight-activity file it presumably repeats per customer/month, so the index
# may not be unique — confirm before relying on index-based lookups or joins.
df_flights = pd.read_csv(
    "Customer Flight Activity.csv",
    index_col=0,
)
df_loyalty = pd.read_csv(
    "Customer Loyalty History.csv",
    index_col=0,
)

## FASE 1: EXPLORACIÓN Y LIMPIEZA ##

In [3]:
# EXPLORACIÓN INICIAL

In [4]:
# Preview the first 5 rows of `df_flights`.
df_flights.head(n=5)

Unnamed: 0_level_0,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100018,2017,1,3,0,3,1521,152.0,0,0
100102,2017,1,10,4,14,2030,203.0,0,0
100140,2017,1,6,0,6,1200,120.0,0,0
100214,2017,1,0,0,0,0,0.0,0,0
100272,2017,1,0,0,0,0,0.0,0,0


In [5]:
# Preview the last 5 rows of `df_flights`.
df_flights.tail(n=5)

Unnamed: 0_level_0,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
999902,2018,12,0,0,0,0,0.0,0,0
999911,2018,12,0,0,0,0,0.0,0,0
999940,2018,12,3,0,3,1233,123.0,0,0
999982,2018,12,0,0,0,0,0.0,0,0
999986,2018,12,0,0,0,0,0.0,0,0


In [6]:
# Preview 10 randomly drawn rows of `df_flights`.
# A fixed `random_state` makes the draw repeatable, so this cell shows the same
# rows after every Restart & Run All instead of a different sample each run.
df_flights.sample(n=10, random_state=42)

Unnamed: 0_level_0,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
917191,2018,6,0,0,0,0,0.0,0,0
365887,2018,1,0,0,0,0,0.0,0,0
727348,2018,11,1,0,1,2935,293.0,0,0
193827,2018,4,0,0,0,0,0.0,0,0
309408,2017,12,6,0,6,876,87.0,0,0
800196,2018,1,0,0,0,0,0.0,0,0
373665,2018,12,0,0,0,0,0.0,0,0
200168,2018,6,14,0,14,2044,204.0,0,0
472356,2018,3,0,0,0,0,0.0,0,0
756791,2018,3,14,2,16,4464,481.68,0,0


In [7]:
# Check the total number of rows and columns of `df_flights` as a (rows, columns) tuple.
df_flights.shape

(405624, 9)

In [8]:
# Preview the first 5 rows of `df_loyalty`.
df_loyalty.head(n=5)

Unnamed: 0_level_0,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0
608370,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,3839.75,Standard,2013,2,,
530508,Canada,Quebec,Hull,J8Y 3Z5,Male,Bachelor,103495.0,Married,Star,3842.79,Standard,2014,10,,


In [9]:
# Preview the last 5 rows of `df_loyalty`.
df_loyalty.tail(n=5)

Unnamed: 0_level_0,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
823768,Canada,British Columbia,Vancouver,V6E 3Z3,Female,College,,Married,Star,61850.19,Standard,2012,12,,
680886,Canada,Saskatchewan,Regina,S1J 3C5,Female,Bachelor,89210.0,Married,Star,67907.27,Standard,2014,9,,
776187,Canada,British Columbia,Vancouver,V5R 1W3,Male,College,,Single,Star,74228.52,Standard,2014,3,,
906428,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4,,
652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0


In [10]:
# Preview 10 randomly drawn rows of `df_loyalty`.
# A fixed `random_state` makes the draw repeatable, so this cell shows the same
# rows after every Restart & Run All instead of a different sample each run.
df_loyalty.sample(n=10, random_state=42)

Unnamed: 0_level_0,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
247514,Canada,Ontario,Toronto,P1L 8X8,Female,Bachelor,82509.0,Single,Nova,3319.62,Standard,2013,7,,
942819,Canada,Quebec,Montreal,H2Y 4R4,Female,Doctor,80786.0,Married,Aurora,7679.68,Standard,2014,4,,
971097,Canada,Ontario,Sudbury,M5V 1G5,Male,College,,Single,Aurora,6872.04,Standard,2014,10,,
979135,Canada,Ontario,Toronto,P2T 6G3,Male,College,,Single,Star,2279.75,2018 Promotion,2018,3,,
778027,Canada,British Columbia,Vancouver,V1E 4R6,Female,Bachelor,67115.0,Married,Aurora,15362.35,Standard,2018,5,,
414433,Canada,Manitoba,Winnipeg,R3R 3T4,Female,Bachelor,89230.0,Divorced,Aurora,21892.35,Standard,2013,2,,
364264,Canada,Ontario,Toronto,M2M 7K8,Female,Bachelor,73945.0,Single,Nova,12157.33,Standard,2016,3,,
612719,Canada,British Columbia,Vancouver,V5R 1W3,Female,Doctor,250798.0,Single,Star,2482.41,Standard,2012,7,,
267352,Canada,Ontario,Toronto,M2M 7K8,Male,College,,Single,Star,19069.5,Standard,2013,6,,
200012,Canada,Ontario,Toronto,M2Z 4K1,Female,College,,Single,Star,2161.69,Standard,2016,6,,


In [11]:
# Check the total number of rows and columns of `df_loyalty` as a (rows, columns) tuple.
df_loyalty.shape

(16737, 15)