In [76]:
# Imports and global notebook configuration (single setup cell).
import pandas as pd
import numpy as np

# Visualization
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Assess the linearity of the relationships between variables
# and the distribution of the variables
# ------------------------------------------------------------------------------
# Disabled earlier import list, kept for reference:
#from scipy.stats import shapiro, kstest, poisson, chisquare, ttest_ind, levene, bartlett, sem, ppf
import scipy.stats as stats
from scipy.stats import shapiro, levene
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

# Warning handling
# -----------------------------------------------------------------------
# NOTE(review): filterwarnings("ignore") with no category or module filter
# silences every warning for the rest of the session, including deprecation
# notices; consider narrowing it to the specific warnings that are noisy.
import warnings
warnings.filterwarnings("ignore")

In [77]:
# Load the two CSV files (flight activity and loyalty history) into DataFrames
df_flights = pd.read_csv(filepath_or_buffer="Customer Flight Activity.csv")
df_loyalty = pd.read_csv(filepath_or_buffer="Customer Loyalty History.csv")

## PHASE 1: EXPLORATION AND CLEANING ##

In [78]:
# EXPLORACIÓN INICIAL

In [79]:
# Preview the first 5 rows of "df_flights"
df_flights.head(n=5)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0
3,100214,2017,1,0,0,0,0,0.0,0,0
4,100272,2017,1,0,0,0,0,0.0,0,0


In [80]:
# Preview the last 5 rows of "df_flights"
df_flights.tail(n=5)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
405619,999902,2018,12,0,0,0,0,0.0,0,0
405620,999911,2018,12,0,0,0,0,0.0,0,0
405621,999940,2018,12,3,0,3,1233,123.0,0,0
405622,999982,2018,12,0,0,0,0,0.0,0,0
405623,999986,2018,12,0,0,0,0,0.0,0,0


In [81]:
# Inspect 10 randomly drawn rows of "df_flights".
# A fixed random_state makes the draw reproducible: re-running the notebook
# shows the same rows instead of a different sample on every execution.
df_flights.sample(n=10, random_state=42)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
205417,240500,2018,1,0,0,0,0,0.0,0,0
43966,191057,2018,2,0,0,0,0,0.0,0,0
54203,287224,2017,4,0,0,0,0,0.0,0,0
190709,356460,2017,12,3,2,5,3220,322.0,0,0
241658,608493,2018,3,12,0,12,2412,260.28,0,0
223749,349767,2018,2,1,1,2,352,43.75,0,0
378875,476288,2018,11,4,3,7,1358,135.0,0,0
350149,744181,2018,9,0,0,0,0,0.0,0,0
166193,847559,2017,10,9,0,9,1899,189.0,0,0
103816,229436,2017,7,0,0,0,0,0.0,0,0


In [82]:
# Check the total number of rows and columns of "df_flights"
df_flights.shape

(405624, 10)

In [83]:
# Preview the first 5 rows of "df_loyalty"
df_loyalty.head(n=5)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
1,549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
2,429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0
3,608370,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,3839.75,Standard,2013,2,,
4,530508,Canada,Quebec,Hull,J8Y 3Z5,Male,Bachelor,103495.0,Married,Star,3842.79,Standard,2014,10,,


In [84]:
# Preview the last 5 rows of "df_loyalty"
df_loyalty.tail(n=5)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
16732,823768,Canada,British Columbia,Vancouver,V6E 3Z3,Female,College,,Married,Star,61850.19,Standard,2012,12,,
16733,680886,Canada,Saskatchewan,Regina,S1J 3C5,Female,Bachelor,89210.0,Married,Star,67907.27,Standard,2014,9,,
16734,776187,Canada,British Columbia,Vancouver,V5R 1W3,Male,College,,Single,Star,74228.52,Standard,2014,3,,
16735,906428,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4,,
16736,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0


In [85]:
# Inspect 10 randomly drawn rows of "df_loyalty".
# A fixed random_state makes the draw reproducible: re-running the notebook
# shows the same rows instead of a different sample on every execution.
df_loyalty.sample(n=10, random_state=42)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
11051,114214,Canada,Quebec,Montreal,H2T 9K8,Female,College,,Married,Star,2554.55,Standard,2017,5,2018.0,4.0
12601,131463,Canada,British Columbia,Vancouver,V1E 4R6,Male,Bachelor,55406.0,Married,Star,4128.09,Standard,2013,4,,
11973,562084,Canada,Ontario,Toronto,M8Y 4K8,Male,Bachelor,96153.0,Married,Star,2882.35,Standard,2014,11,,
9829,990512,Canada,Ontario,Toronto,P1W 1K4,Male,College,,Single,Star,1904.0,Standard,2015,10,,
10192,392682,Canada,British Columbia,Victoria,V10 6T5,Female,College,,Married,Star,2356.07,Standard,2017,10,,
1999,208234,Canada,Quebec,Montreal,H4G 3T4,Female,College,,Divorced,Aurora,7281.25,Standard,2014,7,,
7811,345244,Canada,Ontario,Thunder Bay,K8T 5M5,Male,Bachelor,103320.0,Divorced,Nova,7756.44,Standard,2014,8,,
4287,106689,Canada,Ontario,Toronto,P1W 1K4,Female,Bachelor,61554.0,Married,Nova,2795.41,Standard,2017,6,,
12029,454690,Canada,British Columbia,Vancouver,V5R 1W3,Male,Bachelor,70951.0,Married,Star,2939.33,Standard,2017,3,,
1880,523188,Canada,British Columbia,Dawson Creek,U5I 4F1,Male,College,,Divorced,Aurora,8714.92,2018 Promotion,2018,3,,


In [86]:
# Check the total number of rows and columns of "df_loyalty"
df_loyalty.shape

(16737, 16)

In [87]:
# Count missing values per column in "df_flights".
# Finding: this DataFrame has no missing values.
df_flights.isna().sum()

Loyalty Number                 0
Year                           0
Month                          0
Flights Booked                 0
Flights with Companions        0
Total Flights                  0
Distance                       0
Points Accumulated             0
Points Redeemed                0
Dollar Cost Points Redeemed    0
dtype: int64

In [88]:
# Count missing values per column in "df_loyalty".
df_loyalty.isna().sum()

Loyalty Number            0
Country                   0
Province                  0
City                      0
Postal Code               0
Gender                    0
Education                 0
Salary                 4238
Marital Status            0
Loyalty Card              0
CLV                       0
Enrollment Type           0
Enrollment Year           0
Enrollment Month          0
Cancellation Year     14670
Cancellation Month    14670
dtype: int64

In [89]:
# "df_loyalty" has nulls in the columns "Salary", "Cancellation Year" and "Cancellation Month". Check which dtype these columns have.
df_loyalty.dtypes

Loyalty Number          int64
Country                object
Province               object
City                   object
Postal Code            object
Gender                 object
Education              object
Salary                float64
Marital Status         object
Loyalty Card           object
CLV                   float64
Enrollment Type        object
Enrollment Year         int64
Enrollment Month        int64
Cancellation Year     float64
Cancellation Month    float64
dtype: object

In [90]:
# Same null counts expressed as a percentage of the total number of rows.
# How the nulls will be handled is analysed in the next step of the exploration phase.
df_loyalty.isna().sum().div(len(df_loyalty)).mul(100)

Loyalty Number         0.000000
Country                0.000000
Province               0.000000
City                   0.000000
Postal Code            0.000000
Gender                 0.000000
Education              0.000000
Salary                25.321145
Marital Status         0.000000
Loyalty Card           0.000000
CLV                    0.000000
Enrollment Type        0.000000
Enrollment Year        0.000000
Enrollment Month       0.000000
Cancellation Year     87.650117
Cancellation Month    87.650117
dtype: float64

In [91]:
# Count fully duplicated rows in "df_flights" (the first occurrence of each row is not counted)
df_flights.duplicated(keep="first").sum()

1864

In [92]:
# Count fully duplicated rows in "df_loyalty" (the first occurrence of each row is not counted)
df_loyalty.duplicated(keep="first").sum()

0

In [93]:
# Only "df_flights" has duplicates, so list which rows they are.
# duplicated() without a subset compares all columns; keep=False flags every
# copy of a repeated row instead of only the later occurrences.
# Observation: the same "Loyalty Number" values appear repeated in these rows.

df_flights.loc[lambda frame: frame.duplicated(keep=False)]
 

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
41,101902,2017,1,0,0,0,0,0.0,0,0
42,101902,2017,1,0,0,0,0,0.0,0,0
226,112142,2017,1,0,0,0,0,0.0,0,0
227,112142,2017,1,0,0,0,0,0.0,0,0
477,126100,2017,1,0,0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
405111,971370,2018,12,0,0,0,0,0.0,0,0
405409,988392,2018,12,0,0,0,0,0.0,0,0
405410,988392,2018,12,0,0,0,0,0.0,0,0
405436,989528,2018,12,0,0,0,0,0.0,0,0


In [None]:
# I noticed that with index_col=0 "Loyalty Number" was not picked up as a column, so I changed that when reading the CSVs.
df_flights.columns

Index(['Loyalty Number', 'Year', 'Month', 'Flights Booked',
       'Flights with Companions', 'Total Flights', 'Distance',
       'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed'],
      dtype='object')

In [101]:
# Count rows whose "Loyalty Number" already appeared in an earlier row.
# NOTE(review): the table carries "Year" and "Month" on every row, so a
# customer presumably appears once per period and a high count is expected.
# To detect true duplicate records, confirm whether the key should be
# ("Loyalty Number", "Year", "Month").
df_flights["Loyalty Number"].duplicated().sum()

388887