In [1]:
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
#from scipy.stats import shapiro, kstest, poisson, chisquare, ttest_ind, levene, bartlett, sem, ppf
import scipy.stats as stats
from scipy.stats import shapiro, levene
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

# Configuration
# -----------------------------------------------------------------------
# No cap on displayed columns, so every column of a DataFrame is rendered.
pd.options.display.max_columns = None

# Warning handling
# -----------------------------------------------------------------------
import warnings
# Install a blanket "ignore" filter: warnings raised after this point are not shown.
warnings.filterwarnings(action="ignore")

In [2]:
# Open the CSV (path is relative to the notebook's working directory)
# and preview the first rows.
ruta_csv = "../df_combinado.csv"
df = pd.read_csv(ruta_csv)
df.head()

Unnamed: 0,Loyalty Number,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed
0,480934,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,1,0,0,0,0,0.0,0,0
1,480934,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,2,3,0,3,2823,282.0,0,0
2,480934,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,3,0,0,0,0,0.0,0,0
3,480934,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2017,4,0,0,0,0,0.0,0,0
4,480934,Ontario,Toronto,M2Z 4K1,Female,Bachelor,83236.0,Married,Star,3839.14,Standard,2016,2,2018,2018,2018,10,6,2,8,3352,335.0,465,38


## Fase 3: Evaluación de Diferencias en Reservas de Vuelos por Nivel Educativo ##

In [3]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['Loyalty Number', 'Province', 'City', 'Postal Code', 'Gender',
       'Education', 'Salary', 'Marital Status', 'Loyalty Card', 'CLV',
       'Enrollment Type', 'Enrollment Year', 'Enrollment Month',
       'Cancellation Year', 'Cancellation Month', 'Year', 'Month',
       'Flights Booked', 'Flights with Companions', 'Total Flights',
       'Distance', 'Points Accumulated', 'Points Redeemed',
       'Dollar Cost Points Redeemed'],
      dtype='object')

In [4]:
# 1. Data preparation: restrict the dataset to the relevant columns only,
#    'Flights Booked' and 'Education'.

columnas_relevantes = ['Flights Booked', 'Education']
df_filtrado = df[columnas_relevantes]
df_filtrado.head()

Unnamed: 0,Flights Booked,Education
0,0,Bachelor
1,3,Bachelor
2,0,Bachelor
3,0,Bachelor
4,6,Bachelor


In [7]:
# Descriptive analysis: group the rows by education level and compute basic
# descriptive statistics (count, mean, standard deviation, min/max and
# quartiles) of the number of flights booked within each group.

vuelos_por_educacion = df_filtrado.groupby('Education')['Flights Booked']
vuelos_por_educacion.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bachelor,253752.0,4.091093,5.216995,0.0,0.0,1.0,8.0,21.0
College,102672.0,4.153012,5.242136,0.0,0.0,1.0,8.0,21.0
Doctor,17856.0,4.146281,5.250093,0.0,0.0,1.0,8.0,21.0
High School or Below,19008.0,4.155777,5.234551,0.0,0.0,1.0,8.0,21.0
Master,12336.0,4.184014,5.210294,0.0,0.0,1.0,8.0,21.0


In [None]:
# Prueba Estadística: Realiza una prueba de A/B testing para determinar si existe una diferencia significativa en el número de vuelos reservados 
# entre los diferentes niveles educativos.

In [11]:
# Filter again, this time keeping 'Loyalty Number' as well, because the
# per-customer aggregation used for the test needs it.

columnas_con_cliente = ['Loyalty Number', 'Flights Booked', 'Education']
df_filtrado2 = df[columnas_con_cliente]
df_filtrado2.head()

Unnamed: 0,Loyalty Number,Flights Booked,Education
0,480934,0,Bachelor
1,480934,3,Bachelor
2,480934,0,Bachelor
3,480934,0,Bachelor
4,480934,6,Bachelor


In [12]:
# List the distinct values found in the 'Education' column.
df['Education'].unique()

array(['Bachelor', 'College', 'Master', 'High School or Below', 'Doctor'],
      dtype=object)

In [15]:
# Collapse the data to one row per customer: average 'Flights Booked' over all
# of a customer's rows (the author's intent is a monthly average per customer).
# 'Education' is part of the grouping key so it is carried into the result as a
# column; this assumes each customer has a single education level - TODO confirm.

claves_cliente = ['Loyalty Number', 'Education']
media_vuelos_reser = (
    df_filtrado2
    .groupby(claves_cliente)['Flights Booked']
    .mean()
    .reset_index()  # turn the group keys back into regular columns
)
media_vuelos_reser.head()

Unnamed: 0,Loyalty Number,Education,Flights Booked
0,100018,Bachelor,6.541667
1,100102,College,7.208333
2,100140,College,6.333333
3,100214,Bachelor,3.291667
4,100272,Bachelor,5.291667


Agrupo los niveles educativos en dos grupos (A y B):

- Grupo A: 'High School or Below', 'Bachelor', 'College' (niveles más bajos)
- Grupo B: 'Master', 'Doctor' (niveles más altos)