# Limpieza de datos y gestión de nulos

In [1]:
# Data handling.
import pandas as pd
# Remove pandas' display truncation: None means "no limit" for both options.
# NOTE(review): with max_rows unlimited, displaying a whole large DataFrame
# renders every row — keep using .head()/.sample() on big frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Imputers for filling missing values.
from sklearn.impute import SimpleImputer
# IterativeImputer is experimental in scikit-learn: this enabling import must
# run before `from sklearn.impute import IterativeImputer`, so keep the order.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings that may
# matter when fitting the imputers — consider filtering specific categories.
warnings.filterwarnings("ignore")

In [2]:
def apertura_exploracion(csv, directorio="../files"):
    """Read a CSV file into a DataFrame and print a first exploratory report.

    The report shows, in order: the first rows, the shape, the number of
    fully duplicated rows, the output of ``DataFrame.info()`` (dtypes and
    non-null counts) and the transposed ``DataFrame.describe()`` statistics.

    Args:
        csv: File name without the ``.csv`` extension.
        directorio: Folder that contains the file. Defaults to ``"../files"``,
            the location that used to be hard-coded, so existing calls keep
            reading the same path.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist or
        cannot be read (a message is printed instead of raising). If the file
        loads but a reporting step fails, the error is printed and the
        DataFrame is still returned.
    """
    # `display` is only injected as a global inside IPython/Jupyter. Import it
    # explicitly and fall back to print so the function also works when it is
    # called from a plain script or module.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    ruta = f"{directorio}/{csv}.csv"

    # Guard only the read: these are the failures that mean "no data".
    try:
        df = pd.read_csv(ruta)
    except FileNotFoundError:
        print(f"Error: No se encontró el archivo '{ruta}'.")
        return None
    except Exception as e:
        print(f"Ocurrió un error: {e}")
        return None

    # The report is best-effort: a failure here must not discard data that
    # was loaded successfully.
    try:
        # First rows
        display(df.head())

        # Shape: number of rows and columns
        print(f"-----\n\nEl DataFrame tiene {df.shape[0]} filas y {df.shape[1]} columnas.\n-----")

        # Number of fully duplicated rows
        print(f"\nEl número de filas duplicadas es {df.duplicated().sum()}\n-----")

        # Dtype and non-null count per column (info() prints by itself)
        print("\nInformación del DataFrame:")
        df.info()

        # Descriptive statistics, transposed so each column is a row
        print("-----\n\nEstadísticas descriptivas:")
        display(df.describe().T)
    except Exception as e:
        print(f"Ocurrió un error: {e}")

    # The caller is expected to assign the returned DataFrame to a variable
    return df
    

In [4]:
# Load ../files/Full Loyalty Program.csv and print its exploration report;
# df is None if the file could not be loaded.
df = apertura_exploracion("Full Loyalty Program")

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,100018,2017,1,3,0,3,1521,152.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
1,100102,2017,1,10,4,14,2030,203.0,0,0,Canada,Ontario,Toronto,M1R 4K3,Male,College,,Single,Nova,2887.74,Standard,2013,3,,
2,100140,2017,1,6,0,6,1200,120.0,0,0,Canada,British Columbia,Dawson Creek,U5I 4F1,Female,College,,Divorced,Nova,2838.07,Standard,2016,7,,
3,100428,2017,1,6,0,6,606,60.0,0,0,Canada,British Columbia,Vancouver,V6E 3D9,Male,Bachelor,63478.0,Married,Aurora,5845.43,Standard,2012,8,,
4,100550,2017,1,3,0,3,2037,203.0,0,0,Canada,Quebec,Montreal,H2Y 4R4,Female,Bachelor,54133.0,Married,Nova,7861.8,Standard,2013,2,,


-----

El DataFrame tiene 207632 filas y 25 columnas.
-----

El número de filas duplicadas es 0
-----

Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207632 entries, 0 to 207631
Data columns (total 25 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Loyalty Number               207632 non-null  int64  
 1   Year                         207632 non-null  int64  
 2   Month                        207632 non-null  int64  
 3   Flights Booked               207632 non-null  int64  
 4   Flights with Companions      207632 non-null  int64  
 5   Total Flights                207632 non-null  int64  
 6   Distance                     207632 non-null  int64  
 7   Points Accumulated           207632 non-null  float64
 8   Points Redeemed              207632 non-null  int64  
 9   Dollar Cost Points Redeemed  207632 non-null  int64  
 10  Country                      207632 non-null  

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,207632.0,549240.808411,258456.99958,100018.0,326707.0,549612.0,772019.0,999986.0
Year,207632.0,2017.536295,0.498682,2017.0,2017.0,2018.0,2018.0,2018.0
Month,207632.0,6.641394,3.448752,1.0,4.0,7.0,10.0,12.0
Flights Booked,207632.0,8.03905,4.668948,1.0,4.0,8.0,11.0,21.0
Flights with Companions,207632.0,2.015706,2.538359,0.0,0.0,1.0,4.0,11.0
Total Flights,207632.0,10.054756,5.807808,1.0,6.0,10.0,14.0,32.0
Distance,207632.0,2361.633876,1135.83517,90.0,1458.0,2298.0,3150.0,6293.0
Points Accumulated,207632.0,241.6426,116.118272,9.0,149.0,235.0,323.0,676.5
Points Redeemed,207632.0,59.96854,170.314671,0.0,0.0,0.0,0.0,876.0
Dollar Cost Points Redeemed,207632.0,4.853655,13.775511,0.0,0.0,0.0,0.0,71.0
