In [77]:

# Import the libraries we need

# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualization
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Assess the linearity of the relationships between variables
# and the distribution of the variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

# Warning handling
# -----------------------------------------------------------------------
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from pandas/seaborn.
warnings.filterwarnings("ignore")

In [78]:
# Load the two CSV exports into DataFrames:
#   df1 -> monthly flight activity per customer
#   df2 -> loyalty-programme history per customer
df1, df2 = (
    pd.read_csv(ruta, sep=",")
    for ruta in (
        "data/Customer Flight Activity.csv",
        "data/Customer Loyalty History.csv",
    )
)


## Comenzamos por el primer archivo 

In [79]:
# Look at the top of the frame: first 3 rows
df1.head(3)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0
1,100102,2017,1,10,4,14,2030,203.0,0,0
2,100140,2017,1,6,0,6,1200,120.0,0,0


In [80]:
# Look at the last 3 rows

df1.tail(3) 

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
405621,999940,2018,12,3,0,3,1233,123.0,0,0
405622,999982,2018,12,0,0,0,0,0.0,0,0
405623,999986,2018,12,0,0,0,0,0.0,0,0


In [81]:
# Look at 3 random rows (no random_state is set, so the rows differ on every run)

df1.sample(3)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
395079,439675,2018,12,0,0,0,0,0.0,0,0
126559,540497,2017,8,0,0,0,0,0.0,0,0
144771,608537,2017,9,0,0,0,0,0.0,0,0


In [82]:
#Voy a crear una función para entender el data frame


def entender_datos(df):
    """Print a quick exploratory overview of a DataFrame to standard output.

    The report is written in this order: shape, descriptive statistics
    (transposed ``describe()``), column labels, number of fully duplicated
    rows, total and per-column null counts, number of columns per dtype,
    and the output of ``df.info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is emitted through ``print`` / ``df.info()``.
    """
    # Single definition of the section separator used throughout the report
    separador = "============================================================================"

    # Header
    print("INFO SOBRE NUESTROS DATOS")
    print(separador)

    # General information
    filas, columnas = df.shape
    print(f"El número de filas es: {filas}")
    print(f"El número de columnas es: {columnas}")
    print(separador)

    # Descriptive statistics
    print("DESCRIPCIÓN ESTADÍSTICA")
    if columnas == 0:
        # describe() raises ValueError on a frame without columns; report it
        # instead of aborting so the remaining sections are still printed.
        print("El DataFrame no tiene columnas que describir")
    else:
        print(df.describe().T)
    print(separador)

    # Column labels
    print("COLUMNAS")
    print(df.columns)
    print(separador)

    # Fully duplicated rows
    print(f"Número de filas duplicadas: {df.duplicated().sum()}")
    print(separador)

    # Total null values
    print(f"Total de valores nulos: {df.isna().sum().sum()}")
    print(separador)

    # Null values per column
    print("VALORES NULOS POR COLUMNA")
    print(df.isna().sum())
    print(separador)

    # Number of columns per dtype
    print("TIPOS DE DATOS")
    print(df.dtypes.value_counts())
    print(separador)

    # Detailed information; info() prints by itself and returns None
    print("INFORMACIÓN DETALLADA")
    df.info()
    print(separador)



In [83]:
# Call the overview function on our first DataFrame

entender_datos(df1)

INFO SOBRE NUESTROS DATOS
El número de filas es: 405624
El número de columnas es: 10
DESCRIPCIÓN ESTADÍSTICA
                                count           mean            std       min  \
Loyalty Number               405624.0  550037.873084  258935.286969  100018.0   
Year                         405624.0    2017.500000       0.500001    2017.0   
Month                        405624.0       6.500000       3.452057       1.0   
Flights Booked               405624.0       4.115052       5.225518       0.0   
Flights with Companions      405624.0       1.031805       2.076869       0.0   
Total Flights                405624.0       5.146858       6.521227       0.0   
Distance                     405624.0    1208.880059    1433.155320       0.0   
Points Accumulated           405624.0     123.692721     146.599831       0.0   
Points Redeemed              405624.0      30.696872     125.486049       0.0   
Dollar Cost Points Redeemed  405624.0       2.484503      10.150038       0.0   


In [84]:
# Count how many rows repeat an already-seen 'Loyalty Number'.
# The value is printed explicitly: a notebook only displays the LAST expression
# of a cell, so as a bare expression here the count was computed and discarded.
print(f"Filas con 'Loyalty Number' repetido: {df1['Loyalty Number'].duplicated().sum()}")


# Inspect one customer (Loyalty Number 999940, picked arbitrarily) to see what its rows look like
df1[df1['Loyalty Number'] == 999940].head(3)


Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
16898,999940,2017,1,0,0,0,0,0.0,0,0
33799,999940,2017,2,0,0,0,0,0.0,0,0
50700,999940,2017,3,0,0,0,0,0.0,0,0


## Compruebo el segundo archivo

In [85]:
# Look at the first 3 rows (head() selects rows, not columns)
df2.head(3)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,
1,549612,Canada,Alberta,Edmonton,T3G 6Y6,Male,College,,Divorced,Star,3839.61,Standard,2016,3,,
2,429460,Canada,British Columbia,Vancouver,V6E 3D9,Male,College,,Single,Star,3839.75,Standard,2014,7,2018.0,1.0


In [86]:
# Look at the last 3 rows
df2.tail(3)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
16734,776187,Canada,British Columbia,Vancouver,V5R 1W3,Male,College,,Single,Star,74228.52,Standard,2014,3,,
16735,906428,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4,,
16736,652627,Canada,Manitoba,Winnipeg,R2C 0M5,Female,Bachelor,75049.0,Married,Star,83325.38,Standard,2015,12,2016.0,8.0


In [87]:
# Look at 3 random rows (no random_state is set, so the rows differ on every run)
df2.sample(3)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
11226,320183,Canada,Ontario,Toronto,M2M 6J7,Female,Bachelor,74964.0,Married,Star,4362.93,2018 Promotion,2018,2,,
2634,461937,Canada,Newfoundland,St. John's,A1C 6H9,Male,College,,Single,Aurora,8558.93,Standard,2017,3,,
1089,763113,Canada,Quebec,Tremblant,H5Y 2S9,Male,College,,Married,Aurora,5325.72,Standard,2015,1,,


Analizo el segundo archivo con la función `entender_datos`

In [88]:
# Call the overview function defined earlier on the second DataFrame

entender_datos(df2)

INFO SOBRE NUESTROS DATOS
El número de filas es: 16737
El número de columnas es: 16
DESCRIPCIÓN ESTADÍSTICA
                      count           mean            std        min  \
Loyalty Number      16737.0  549735.880445  258912.132453  100018.00   
Salary              12499.0   79245.609409   35008.297285  -58486.00   
CLV                 16737.0    7988.896536    6860.982280    1898.01   
Enrollment Year     16737.0    2015.253211       1.979111    2012.00   
Enrollment Month    16737.0       6.669116       3.398958       1.00   
Cancellation Year    2067.0    2016.503145       1.380743    2013.00   
Cancellation Month   2067.0       6.962748       3.455297       1.00   

                          25%        50%        75%        max  
Loyalty Number      326603.00  550434.00  772019.00  999986.00  
Salary               59246.50   73455.00   88517.50  407228.00  
CLV                   3980.84    5780.18    8940.58   83325.38  
Enrollment Year       2014.00    2015.00    2017.00    

He comprobado que en el segundo archivo hay columnas de tipo `object`.
Miramos los valores únicos de las columnas de tipo `object`.

In [89]:
# Subset of df2 that keeps only the object-dtype (text) columns
df2_cat = df2.select_dtypes(include="object")


# Column labels of that categorical subset
columnas_cat = df2_cat.columns
print(f"Las columnas del DataFrame de variables categóricas son {columnas_cat}")

# For every categorical column, report its distinct values and how often each one occurs
for columna in columnas_cat:
    valores_unicos = df2_cat[columna].unique()
    frecuencias = df2_cat[columna].value_counts()
    print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n")
    print(f"Sus valores únicos son: {valores_unicos}\n")
    print(f"Las frecuencias de los valores únicos de las categorías son: {frecuencias} ")

Las columnas del DataFrame de variables categóricas son Index(['Country', 'Province', 'City', 'Postal Code', 'Gender', 'Education',
       'Marital Status', 'Loyalty Card', 'Enrollment Type'],
      dtype='object')
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'COUNTRY' -----------

Sus valores únicos son: ['Canada']

Las frecuencias de los valores únicos de las categorías son: Country
Canada    16737
Name: count, dtype: int64 
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'PROVINCE' -----------

Sus valores únicos son: ['Ontario' 'Alberta' 'British Columbia' 'Quebec' 'Yukon' 'New Brunswick'
 'Manitoba' 'Nova Scotia' 'Saskatchewan' 'Newfoundland'
 'Prince Edward Island']

Las frecuencias de los valores únicos de las categorías son: Province
Ontario                 5404
British Columbia        4409
Quebec                  3300
Alberta                  969
Manitoba                 658
New Brunswick            636
Nova Scotia              518
Saskatchewan             409
Newfoundland       

In [90]:
# Show one row of df1 as a reminder of its columns
df1.head(1)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,100018,2017,1,3,0,3,1521,152.0,0,0


In [91]:
# Show one row of df2 as a reminder of its columns
df2.head(1)

Unnamed: 0,Loyalty Number,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,


## Unimos los dos Dataframe

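En este caso vamos a usar el método `join`, que combina dos DataFrames en función de los índices de sus filas. Es una forma cómoda de hacer uniones basadas en índices en lugar de columnas. Con el parámetro `on` también permite cruzar una columna del DataFrame de la izquierda con el índice del DataFrame de la derecha, que es lo que haremos aquí con `Loyalty Number`.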

In [92]:
# Check again how many rows and columns each DataFrame has

print(f'Las filas y columnas de df1 son: {df1.shape}')
print(f'Las filas y columnas de df2 son: {df2.shape}')

Las filas y columnas de df1 son: (405624, 10)
Las filas y columnas de df2 son: (16737, 16)


In [93]:
# Move the 'Loyalty Number' column into the index of df2.
# The guard plus rebinding (instead of inplace=True) keeps this cell idempotent:
# re-running it on an already-indexed frame would otherwise raise a KeyError,
# because 'Loyalty Number' is no longer among the columns.
if "Loyalty Number" in df2.columns:
    df2 = df2.set_index("Loyalty Number")

# Check how the DataFrame looks now
df2.head(1)

Unnamed: 0_level_0,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
Loyalty Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
480934,Canada,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,,


In [94]:
# Order matters here: the DataFrame that still has the key as a regular column (df1) calls join, and the DataFrame that carries the shared key in its index (df2) goes inside the parentheses.
df_join = df1.join(df2, on = "Loyalty Number")
df_join.sample() # sample() only pulls a random row from the DataFrame as a spot check

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
270792,207865,2018,1,8,1,9,1647,164.0,442,36,Canada,Ontario,Toronto,M2Z 4K1,Female,Master,118530.0,Single,Star,3600.93,Standard,2016,12,,


In [95]:
# Shape of the joined frame. The label previously said "df1" although the value
# printed is df_join.shape, which made the output misleading.
print(f'Las filas y columnas de df_join son: {df_join.shape}')

Las filas y columnas de df1 son: (405624, 25)


In [96]:
# Run the overview function on the joined DataFrame
entender_datos(df_join)

INFO SOBRE NUESTROS DATOS
El número de filas es: 405624
El número de columnas es: 25
DESCRIPCIÓN ESTADÍSTICA
                                count           mean            std  \
Loyalty Number               405624.0  550037.873084  258935.286969   
Year                         405624.0    2017.500000       0.500001   
Month                        405624.0       6.500000       3.452057   
Flights Booked               405624.0       4.115052       5.225518   
Flights with Companions      405624.0       1.031805       2.076869   
Total Flights                405624.0       5.146858       6.521227   
Distance                     405624.0    1208.880059    1433.155320   
Points Accumulated           405624.0     123.692721     146.599831   
Points Redeemed              405624.0      30.696872     125.486049   
Dollar Cost Points Redeemed  405624.0       2.484503      10.150038   
Salary                       302952.0   79268.825953   34992.133508   
CLV                          405624.0  