In [1]:
# Import the libraries we need

# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualization
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluate the linearity of the relationships between the variables
# and the distribution of the variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # so that every column of a DataFrame is displayed

# Warning handling
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")  # installs a blanket "ignore" filter, so warnings are not shown

In [4]:
# Load the CSV with the joined data created in the previous phase,
# using its first column as the DataFrame index

df = pd.read_csv(
    'data/data_join.csv',
    index_col=0,
)

In [5]:
# Display the first row of the loaded DataFrame
df.head(1)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,100018,2017,1,3,0,3,1521,152.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,


In [6]:
# We already checked this in the previous phase: the data contains duplicated rows

print(f'El numero de duplidados de este data frame es {df.duplicated().sum()}') 

El numero de duplidados de este data frame es 1864


In [13]:
# Check whether the "Loyalty Number" column contains repeated values
# NOTE(review): the printed text says "de este data frame", but the count below
# is for the "Loyalty Number" column only

print(f'El numero de duplidados de este data frame es {df["Loyalty Number"].duplicated().sum()}') 
 

# Inspect the first rows for one sample value of this column

df[df["Loyalty Number"] == 100018].head(6)




El numero de duplidados de este data frame es 388887


Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,100018,2017,1,3,0,3,1521,152.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
16901,100018,2017,2,2,2,4,1320,132.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
46353,100018,2018,10,6,4,10,3110,311.0,385,31,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
50703,100018,2017,4,4,0,4,924,92.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
67604,100018,2017,5,0,0,0,0,0.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
84505,100018,2017,6,4,1,5,4330,433.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,


In [17]:
# The data is a per-month, per-year history of the flights taken, so keeping duplicated rows would make no sense

df.drop_duplicates(inplace=True)

# Confirm that no duplicated rows remain
df.duplicated().sum()


0

Vamos a unificar el formato de los nombres de las columnas: los queremos en minúscula y con cada espacio reemplazado por un guion bajo.

In [22]:

# Map every current column name to its lowercase form,
# with each space replaced by an underscore
new_columns = {name: name.lower().replace(' ', '_') for name in df.columns}


# Apply the mapping to the column labels, modifying the DataFrame in place
df.rename(columns=new_columns, inplace=True)


# Display 10 randomly sampled rows to check that the renaming was applied
df.sample(10)

Unnamed: 0,loyalty_number,year,month,flights_booked,flights_with_companions,total_flights,distance,points_accumulated,points_redeemed,dollar_cost_points_redeemed,country,province,city,postal_code,gender,education,salary,marital_status,loyalty_card,clv,enrollment_type,enrollment_year,enrollment_month,cancellation_year,cancellation_month
67137,973846,2017,4,0,0,0,0,0.0,0,0,Canada,British Columbia,Kelowna,V09 2E9,Male,Bachelor,47116.0,Single,Nova,16172.94,2018 Promotion,2018,2,,
4527,343001,2017,1,0,0,0,0,0.0,0,0,Canada,Nova Scotia,Halifax,B3J 9S2,Female,College,,Married,Aurora,5310.86,Standard,2016,3,,
30392,817527,2017,2,0,0,0,0,0.0,0,0,Canada,Ontario,Toronto,M2Z 4K1,Male,Bachelor,83172.0,Married,Aurora,7782.13,Standard,2015,3,,
182341,807256,2017,11,5,4,9,1197,119.0,0,0,Canada,Nova Scotia,Halifax,B3J 9S2,Female,College,,Married,Star,2585.44,Standard,2012,7,,
107188,410552,2017,7,5,0,5,1330,133.0,0,0,Canada,British Columbia,Vancouver,V1E 4R6,Female,Bachelor,70532.0,Married,Star,2452.74,Standard,2016,4,,
138977,301794,2017,9,11,3,14,2436,243.0,0,0,Canada,British Columbia,Dawson Creek,U5I 4F1,Male,College,,Single,Star,4620.39,Standard,2012,5,,
210104,488679,2018,1,3,0,3,1515,151.0,0,0,Canada,Ontario,Toronto,M8Y 4K8,Male,Bachelor,48629.0,Married,Star,3362.41,Standard,2016,4,,
100975,975531,2017,6,0,0,0,0,0.0,0,0,Canada,Manitoba,Winnipeg,R6Y 4T5,Male,College,,Single,Star,1898.01,Standard,2018,11,,
280916,659400,2018,5,0,0,0,0,0.0,0,0,Canada,Nova Scotia,Halifax,B3C 2M8,Male,Bachelor,72769.0,Married,Star,8679.8,Standard,2012,11,,
64396,828643,2017,4,0,0,0,0,0.0,0,0,Canada,British Columbia,Vancouver,V1E 4R6,Male,College,,Married,Nova,11490.34,Standard,2017,4,,


In [26]:
# Function that lowercases text columns and strips the whitespace
# before and after each value


def modificar_lower_strip(lista_columnas, dataframe=None):
    """Strip surrounding whitespace and lowercase the values of the given columns.

    The columns are reassigned on the target DataFrame, so it is modified
    in place.

    Args:
        lista_columnas: Iterable of column names to normalise. Each column
            must support the pandas ``.str`` accessor.
        dataframe: DataFrame to modify. When omitted, the notebook-level
            ``df`` is used, which keeps existing one-argument calls working.

    Returns:
        None.
    """
    objetivo = df if dataframe is None else dataframe
    # Loop over the columns the caller passed in, not over a notebook global,
    # so the function works with any list of column names.
    for columna in lista_columnas:
        objetivo[columna] = objetivo[columna].str.strip().str.lower()


# Call the function on every column whose dtype is object
columnas_ojeto = df.select_dtypes(include = "object").columns
modificar_lower_strip(columnas_ojeto)




Unnamed: 0,loyalty_number,year,month,flights_booked,flights_with_companions,total_flights,distance,points_accumulated,points_redeemed,dollar_cost_points_redeemed,country,province,city,postal_code,gender,education,salary,marital_status,loyalty_card,clv,enrollment_type,enrollment_year,enrollment_month,cancellation_year,cancellation_month
0,100018,2017,1,3,0,3,1521,152.0,0,0,canada,alberta,edmonton,t9g 1w3,female,bachelor,92552.0,married,aurora,7919.20,standard,2016,8,,
1,100102,2017,1,10,4,14,2030,203.0,0,0,canada,ontario,toronto,m1r 4k3,male,college,,single,nova,2887.74,standard,2013,3,,
2,100140,2017,1,6,0,6,1200,120.0,0,0,canada,british columbia,dawson creek,u5i 4f1,female,college,,divorced,nova,2838.07,standard,2016,7,,
3,100214,2017,1,0,0,0,0,0.0,0,0,canada,british columbia,vancouver,v5r 1w3,male,bachelor,63253.0,married,star,4170.57,standard,2015,8,,
4,100272,2017,1,0,0,0,0,0.0,0,0,canada,ontario,toronto,p1l 8x8,female,bachelor,91163.0,divorced,star,6622.05,standard,2014,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405619,999902,2018,12,0,0,0,0,0.0,0,0,canada,ontario,toronto,m1r 4k3,male,college,,married,aurora,7290.07,standard,2014,5,,
405620,999911,2018,12,0,0,0,0,0.0,0,0,canada,newfoundland,st. john's,a1c 6h9,male,doctor,217943.0,single,nova,8564.77,standard,2012,8,,
405621,999940,2018,12,3,0,3,1233,123.0,0,0,canada,quebec,quebec city,g1b 3l5,female,bachelor,47670.0,married,nova,20266.50,standard,2017,7,,
405622,999982,2018,12,0,0,0,0,0.0,0,0,canada,british columbia,victoria,v10 6t5,male,college,,married,star,2631.56,standard,2018,7,,


In [39]:
# Check for null values

df.isnull().sum()

# Only 3 columns contain null values
# We take nulls in the cancellation columns to mean the customer has not cancelled, so we will set those NaN values to 0 as a new special category

loyalty_number                      0
year                                0
month                               0
flights_booked                      0
flights_with_companions             0
total_flights                       0
distance                            0
points_accumulated                  0
points_redeemed                     0
dollar_cost_points_redeemed         0
country                             0
province                            0
city                                0
postal_code                         0
gender                              0
education                           0
salary                         102260
marital_status                      0
loyalty_card                        0
clv                                 0
enrollment_type                     0
enrollment_year                     0
enrollment_month                    0
cancellation_year              354110
cancellation_month             354110
dtype: int64

In [42]:
# List the distinct values of the salary column
df['salary'].unique()

array([92552.,    nan, 63253., ..., 23160., 97206., 56345.])