In [1]:
# Import the libraries used throughout the notebook

# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import re

# Null imputation using advanced statistical methods
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
# The experimental flag below has to be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Visualization libraries
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Assess the linearity of the relationships between variables
# and the distribution of the variables
# -----------------------------------------------------------------------------
from scipy.stats import chi2_contingency, ttest_ind, shapiro, kstest, poisson, chisquare, expon

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

# Warning handling
# -----------------------------------------------------------------------
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas/sklearn.
warnings.filterwarnings("ignore")


| `Nombre columna English`         | `En Español`                        |
|--------------------------------|-----------------------------------|
| Loyalty Number                 | Número de fidelización            |
| Year                           | Año                               |
| Month                          | Mes                               |
| Flights Booked                 | Vuelos Reservados                 |
| Flights with Companions        | Vuelos con acompañantes           |
| Total Flights                  | Total de vuelos                   |
| Distance                       | Distancia                         |
| Points Accumulated             | Puntos acumulados                 |
| Points Redeemed                | Puntos canjeados                  |
| Dollar Cost Points Redeemed    | Costo en dólares de los puntos canjeados |
| Country                        | País                    |
| Province                       | Provincia               |
| City                           | Ciudad                  |
| Postal Code                    | Código Postal           |
| Gender                         | Género                  |
| Education                      | Educación               |
| Salary                         | Salario                 |
| Marital Status                 | Estado Civil            |
| Loyalty Card                   | Tarjeta de Lealtad      |
| CLV                            | Valor del Cliente (CLV) |
| Enrollment Type                | Tipo de Inscripción     |
| Enrollment Year                | Año de Inscripción      |
| Enrollment Month               | Mes de Inscripción      |
| Cancellation Year              | Año de Cancelación      |
| Cancellation Month             | Mes de Cancelación      |


In [4]:
# Load the dataset from a CSV path relative to the current working directory
# (presumably already de-duplicated, judging by the file name — TODO confirm).
df_union = pd.read_csv('DF_sin_duplicados.csv')
# Show 10 random rows; no random_state is passed, so the rows differ on every run.
df_union.sample(10)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
197850,882466,2017,12,16,3,19,2755,275.0,438,36,Canada,Saskatchewan,Regina,S1J 3C5,Male,Bachelor,84048.0,Single,Aurora,10876.48,Standard,2016,3,,
257305,666702,2017,12,2,0,2,1088,108.0,0,0,Canada,Ontario,Toronto,M2Z 4K1,Female,Doctor,199894.0,Married,Nova,6503.4,Standard,2017,4,2017.0,12.0
367025,924787,2017,1,0,0,0,0,0.0,0,0,Canada,British Columbia,Vancouver,V6E 3Z3,Male,College,,Divorced,Star,2004.35,Standard,2016,3,2018.0,12.0
36509,185919,2018,8,4,3,7,3619,361.0,458,37,Canada,British Columbia,Victoria,V10 6T5,Female,College,,Married,Star,2226.59,Standard,2013,11,,
241721,728021,2018,3,6,1,7,2205,237.6,0,0,Canada,British Columbia,Vancouver,V6E 3Z3,Male,Bachelor,85364.0,Married,Star,12741.85,Standard,2013,12,,
120244,239970,2017,8,0,0,0,0,0.0,0,0,Canada,British Columbia,Vancouver,V1E 4R6,Male,Doctor,206392.0,Married,Star,5146.81,Standard,2015,8,,
313166,658330,2018,7,0,0,0,0,0.0,0,0,Canada,British Columbia,Vancouver,V5R 1W3,Female,Bachelor,58228.0,Divorced,Star,3895.96,Standard,2017,2,,
33330,985211,2017,2,0,0,0,0,0.0,0,0,Canada,Quebec,Montreal,H2T 2J6,Male,College,,Married,Nova,6714.07,Standard,2014,4,,
135736,167578,2017,9,2,2,4,2352,235.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Master,100817.0,Married,Nova,16301.97,Standard,2012,7,,
246132,672600,2018,3,0,0,0,0,0.0,0,0,Canada,Ontario,Ottawa,K1F 2R2,Male,College,,Single,Nova,19228.46,Standard,2017,2,,


In [5]:

# EXPLORAR LOS DATAFRAME EN GENERAL 

def exploracion_datos(df):
    """Print a general exploratory report of a DataFrame to stdout.

    The report contains, in order: shape, column names, ``df.info()``,
    percentage of nulls (overall and per column), unique values per column,
    number of duplicated values per column, and ``describe()`` summaries for
    the numeric and the object (categorical) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore. It is only read, never modified.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    separador = '____________________________________________________________\n'

    print(' Filas y Columnas del DATAFRAME \n')
    print(f"El número de filas que tenemos es de {df.shape[0]}.\nEl número de columnas es de {df.shape[1]}\n")
    print(separador)

    print(' Nombre de todas las Columnas del DATAFRAME: \n')
    print(df.columns)
    print(separador)

    print('INFORMACIÓN GENERAL DEL DATAFRAME \n')
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()
    print(separador)

    print('Ver los NULOS del DataFrame \n')
    # Share of null cells over ALL cells, as a percentage. Averaging the
    # per-column null counts (the previous formula) is not a percentage.
    # An empty frame has no cells, so report 0.0 instead of NaN.
    porcentaje_nulos_total = df.isnull().to_numpy().mean() * 100 if df.size else 0.0
    print(f'Nulos de todo el data: --> {porcentaje_nulos_total} \n')
    for columna, porcentaje_nulos in df.isnull().mean().mul(100).items():
        print(f'La columna {columna}: {porcentaje_nulos}')
    print(separador)

    print('Valores ÚNICOS por columna:\n')
    for columna in df.columns:
        # unique() keeps NaN as a value, so nulls count as one distinct entry
        valores_unicos = df[columna].unique()
        print(f'La columna {columna}: {len(valores_unicos)}')
        print(f'La columna {columna}: {valores_unicos}')

    print(separador)

    print('Valores DUPLICADOS por columna es de:\n')
    for columna in df.columns:
        cantidad_duplicados = df[columna].duplicated().sum()
        print(f'La columna {columna}: {cantidad_duplicados}')
    print(separador)

    print('--> RESUMEN ESTADÍSTICO \n')
    print('<<< Variables Numéricas >>> \n')
    print(df.describe().T)
    print('<<< Variables Categóricas >>> \n')
    # describe(include='object') raises ValueError when no column matches,
    # so check first and report the absence instead of crashing.
    if df.select_dtypes(include='object').shape[1] == 0:
        print('No hay columnas categóricas (object) en el DataFrame.')
    else:
        print(df.describe(include='object').T)
   

In [6]:
# Print the exploratory report (shape, nulls, uniques, duplicates, describe) for df_union
exploracion_datos(df_union)

 Filas y Columnas del DATAFRAME 

El número de filas que tenemos es de 403760.
El número de columnas es de 25

____________________________________________________________

 Nombre de todas las Columnas del DATAFRAME: 

Index(['Loyalty Number', 'Year', 'Month', 'Flights Booked',
       'Flights with Companions', 'Total Flights', 'Distance',
       'Points Accumulated', 'Points Redeemed', 'Dollar Cost Points Redeemed',
       'Country', 'Province', 'City', 'Postal Code', 'Gender', 'Education',
       'Salary', 'Marital Status', 'Loyalty Card', 'CLV', 'Enrollment Type',
       'Enrollment Year', 'Enrollment Month', 'Cancellation Year',
       'Cancellation Month'],
      dtype='object')
____________________________________________________________

INFORMACIÓN GENERAL DEL DATAFRAME 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403760 entries, 0 to 403759
Data columns (total 25 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       ----

In [15]:
# Restrict the check to the numeric columns
numeric_columns = df_union.select_dtypes(include='number').columns

# Boolean Series indexed by column name: True where the column holds any value below zero
negative_values = df_union[numeric_columns].lt(0).any()

# Collect and report the names of the flagged columns
columns_with_negatives = [
    column for column, has_negative in negative_values.items() if has_negative
]
print(f"Columns with negative values: {columns_with_negatives}")

# Rows that are negative in at least one of the flagged columns
rows_with_negatives = df_union.loc[df_union[columns_with_negatives].lt(0).any(axis=1)]
rows_with_negatives


Columns with negative values: ['Salary']


Unnamed: 0,Loyalty_Number,Year,Month,Flights_Booked,Flights_with_Companions,Total_Flights,Distance,Points_Accumulated,Points_Redeemed,Dollar_Cost_Points_Redeemed,Country,Province,City,Postal_Code,Gender,Education,Salary,Marital_Status,Loyalty_Card,CLV,Enrollment_Type,Enrollment_Year,Enrollment_Month
291,115505,2017,1,0,0,0,0,0.0,0,0,Canada,Newfoundland,St. John's,A1C 6H9,Male,Bachelor,-10605.0,Married,Nova,5860.17,2018 Promotion,2018,4
956,152016,2017,1,0,0,0,0,0.0,0,0,Canada,Ontario,Toronto,P1J 8T7,Female,Bachelor,-58486.0,Married,Aurora,5067.21,2018 Promotion,2018,2
1716,194065,2017,1,0,0,0,0,0.0,0,0,Canada,Ontario,Sudbury,M5V 1G5,Female,Bachelor,-31911.0,Married,Nova,2888.85,2018 Promotion,2018,2
2081,212128,2017,1,3,0,3,1959,195.0,0,0,Canada,Ontario,Toronto,P2T 6G3,Male,Bachelor,-49001.0,Married,Nova,3130.68,2018 Promotion,2018,2
2433,232755,2017,1,0,0,0,0,0.0,0,0,Canada,British Columbia,Vancouver,V1E 4R6,Female,Bachelor,-46683.0,Single,Nova,4787.81,2018 Promotion,2018,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398813,734647,2018,12,15,0,15,1440,144.0,0,0,Canada,Saskatchewan,Regina,S1J 3C5,Male,Bachelor,-46303.0,Married,Nova,11280.73,2018 Promotion,2018,4
399884,790475,2018,12,7,1,8,1056,105.0,0,0,Canada,Ontario,Trenton,K8V 4B2,Female,Bachelor,-34079.0,Married,Nova,12913.50,2018 Promotion,2018,2
402017,906428,2018,12,13,0,13,2912,291.0,0,0,Canada,Yukon,Whitehorse,Y2K 6R0,Male,Bachelor,-57297.0,Married,Star,10018.66,2018 Promotion,2018,4
402459,430398,2018,12,5,5,10,660,66.0,539,44,Canada,Newfoundland,St. John's,A1C 6H9,Male,Bachelor,-17534.0,Married,Nova,49423.80,2018 Promotion,2018,3


In [16]:
# Replace every Salary by its absolute value so negative salaries become positive (NaN stays NaN).
# NOTE(review): this assumes the minus sign is a data-entry error — confirm before relying on it.
df_union['Salary'] = df_union['Salary'].abs()

In [17]:
# Display the Salary column to eyeball the values
df_union.Salary

0          92552.0
1              NaN
2              NaN
3          63253.0
4          91163.0
            ...   
403755         NaN
403756    217943.0
403757     47670.0
403758         NaN
403759     46594.0
Name: Salary, Length: 403760, dtype: float64

In [19]:
# Numeric columns are the only candidates for negative values
numeric_columns = df_union.select_dtypes(include='number').columns

# Per-column flag: does the column contain at least one value below zero?
negative_values = df_union.loc[:, numeric_columns].lt(0).any()

# Names of the flagged columns, printed as a plain list
columns_with_negatives = list(negative_values.loc[negative_values].index)
print(f"Columns with negative values: {columns_with_negatives}")


Columns with negative values: []


In [11]:
# Display the Salary column
df_union['Salary']

0          92552.0
1              NaN
2              NaN
3          63253.0
4          91163.0
            ...   
403755         NaN
403756    217943.0
403757     47670.0
403758         NaN
403759     46594.0
Name: Salary, Length: 403760, dtype: float64

In [20]:
# Replace every Salary by its absolute value (NaN stays NaN).
# NOTE(review): the same statement already appears in an earlier cell; abs() is idempotent,
# so running it again is harmless, but one of the two cells is redundant.
df_union['Salary'] = df_union['Salary'].abs()

In [7]:
# Drop both cancellation columns: per the author's note they are about 88% null
df_union.drop(columns=["Cancellation Year", "Cancellation Month"], inplace=True)

In [9]:
# Normalize the column names: every space becomes an underscore
df_union.columns = df_union.columns.map(lambda nombre: nombre.replace(" ", "_"))
# Show one random row to confirm the new names
df_union.sample(n=1)

Unnamed: 0,Loyalty_Number,Year,Month,Flights_Booked,Flights_with_Companions,Total_Flights,Distance,Points_Accumulated,Points_Redeemed,Dollar_Cost_Points_Redeemed,Country,Province,City,Postal_Code,Gender,Education,Salary,Marital_Status,Loyalty_Card,CLV,Enrollment_Type,Enrollment_Year,Enrollment_Month
349854,818995,2018,9,8,8,16,3920,392.0,0,0,Canada,British Columbia,Dawson Creek,U5I 4F1,Male,Bachelor,70226.0,Married,Nova,3208.23,Standard,2014,11
