# Limpieza datos y gestión nulos

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings("ignore")

In [2]:
def apertura_exploracion(csv):

    """ Función para leer csv, convertir a df y hacer una primera exploración."""
    
    try:
        # Convertir el csv a DataFrame
        df = pd.read_csv(f"../files/{csv}.csv")        

        # Muestro las primeras filas
        display(df.head())

        # Obtengo las listas
        print(f"-----\n\nEl DataFrame tiene {df.shape[0]} filas y {df.shape[1]} columnas.\n-----")

        # Consulto si hay filas duplicadas
        print(f"\nEl número de filas duplicadas es {df.duplicated().sum()}\n-----")

        # Muestro el tipo de dato y si hay nulos por cada columna
        print("\nInformación del DataFrame:")
        df.info()

        # Muestro las estadísticas de columnas numéricas
        print("-----\n\nEstadísticas descriptivas:")
        display(df.describe().T)

        # Me devuelve un df que tendré que igualar a una variable
        return df  

    # Excepciones en caso de no enconrar el archivo o de que haya un error
    except FileNotFoundError:
        print(f"Error: No se encontró el archivo '../files/{csv}.csv'.")
        return None  
    
    except Exception as e:
        print(f"Ocurrió un error: {e}")
        return None 
    

In [3]:
df = apertura_exploracion("Full Loyalty Program")

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month
0,100018,2017,1,3,0,3,1521,152.0,0,0,Canada,Alberta,Edmonton,T9G 1W3,Female,Bachelor,92552.0,Married,Aurora,7919.2,Standard,2016,8,,
1,100102,2017,1,10,4,14,2030,203.0,0,0,Canada,Ontario,Toronto,M1R 4K3,Male,College,,Single,Nova,2887.74,Standard,2013,3,,
2,100140,2017,1,6,0,6,1200,120.0,0,0,Canada,British Columbia,Dawson Creek,U5I 4F1,Female,College,,Divorced,Nova,2838.07,Standard,2016,7,,
3,100428,2017,1,6,0,6,606,60.0,0,0,Canada,British Columbia,Vancouver,V6E 3D9,Male,Bachelor,63478.0,Married,Aurora,5845.43,Standard,2012,8,,
4,100550,2017,1,3,0,3,2037,203.0,0,0,Canada,Quebec,Montreal,H2Y 4R4,Female,Bachelor,54133.0,Married,Nova,7861.8,Standard,2013,2,,


-----

El DataFrame tiene 207632 filas y 25 columnas.
-----

El número de filas duplicadas es 0
-----

Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207632 entries, 0 to 207631
Data columns (total 25 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Loyalty Number               207632 non-null  int64  
 1   Year                         207632 non-null  int64  
 2   Month                        207632 non-null  int64  
 3   Flights Booked               207632 non-null  int64  
 4   Flights with Companions      207632 non-null  int64  
 5   Total Flights                207632 non-null  int64  
 6   Distance                     207632 non-null  int64  
 7   Points Accumulated           207632 non-null  float64
 8   Points Redeemed              207632 non-null  int64  
 9   Dollar Cost Points Redeemed  207632 non-null  int64  
 10  Country                      207632 non-null  

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Loyalty Number,207632.0,549240.808411,258456.99958,100018.0,326707.0,549612.0,772019.0,999986.0
Year,207632.0,2017.536295,0.498682,2017.0,2017.0,2018.0,2018.0,2018.0
Month,207632.0,6.641394,3.448752,1.0,4.0,7.0,10.0,12.0
Flights Booked,207632.0,8.03905,4.668948,1.0,4.0,8.0,11.0,21.0
Flights with Companions,207632.0,2.015706,2.538359,0.0,0.0,1.0,4.0,11.0
Total Flights,207632.0,10.054756,5.807808,1.0,6.0,10.0,14.0,32.0
Distance,207632.0,2361.633876,1135.83517,90.0,1458.0,2298.0,3150.0,6293.0
Points Accumulated,207632.0,241.6426,116.118272,9.0,149.0,235.0,323.0,676.5
Points Redeemed,207632.0,59.96854,170.314671,0.0,0.0,0.0,0.0,876.0
Dollar Cost Points Redeemed,207632.0,4.853655,13.775511,0.0,0.0,0.0,0.0,71.0


In [4]:
round(df.isna().sum()/df.shape[0]*100, 2)

Loyalty Number                  0.00
Year                            0.00
Month                           0.00
Flights Booked                  0.00
Flights with Companions         0.00
Total Flights                   0.00
Distance                        0.00
Points Accumulated              0.00
Points Redeemed                 0.00
Dollar Cost Points Redeemed     0.00
Country                         0.00
Province                        0.00
City                            0.00
Postal Code                     0.00
Gender                          0.00
Education                       0.00
Salary                         25.45
Marital Status                  0.00
Loyalty Card                    0.00
CLV                             0.00
Enrollment Type                 0.00
Enrollment Year                 0.00
Enrollment Month                0.00
Cancellation Year              96.36
Cancellation Month             96.36
dtype: float64

## Cancellation Year y Cancellation Month

Creamos una columna nueva que me indique si el cliente está activo o no, y estas las dejamos como están. 

In [5]:
# miro el tipo de dato que contiene la columna "Cancellation Month"
df["Cancellation Month"].unique()

array([nan,  3.,  8.,  6., 10.,  1.,  4.,  2., 12., 11.,  5.,  7.,  9.])

In [6]:
# Creo una lista para hacerlo fuera del df en pequeño
lista = [3.0, "nan", 7.8, "nan", "nan"]

# Bucle para iterar por los elementos de la lista y me los imprima como active o cancelled
for datos in lista:
    if datos == "nan":
        print("Active")

    else:
        print("Cancelled")
        # prints para comprobar que funciona

Cancelled
Active
Cancelled
Active
Active


In [7]:
# Creo por asignación directa una nueva columna "Loyalty Status"
# Active si el valor en "Cancellation Month" es NaN
# Cancelled si no es NaN

df["Loyalty Status"] = df["Cancellation Month"].apply( lambda dato: "Active" if pd.isna(dato) else "Cancelled")

In [8]:
# compruebo que no haya nulos en la nueva columna
df["Loyalty Status"].isna().sum()

0

## Salary valores negativos

Sustituimos los valores negativos por valores absolutos

In [9]:
# controlo cuantos valores negativos hay en Salary 
# son más que en la primera exploración porque al unir los df, 
# con cada vuelo que el cliente haya podido realizar, vemos todos sus datos.
df[df["Salary"] < 0].shape

(206, 26)

In [10]:
# transformo solo en la columna Salary los negativos a positivos
df["Salary"] = df["Salary"].abs()

In [11]:
# Compruebo de nuevo que haya salido correctamente
df[df["Salary"] < 0].shape

(0, 26)

## Salary Nulos

- Los nulos corresponden siempre a los clientes que pertenecen a la categoria "College" en la variable Education. 

- Como es una categoría formativa similar a "Bachellor" y "High School or Below", haremos la mediana.

- Sustituiremos los nulos de esos clientes de categoría "College" por la mediana

In [12]:
# Compruebo las categorías dentro de Education
df["Education"].unique()

array(['Bachelor', 'College', 'Master', 'Doctor', 'High School or Below'],
      dtype=object)

In [13]:
# Calculo la mediana del salario de las dos categorias juntas ("Bachelor", "High School or Bellow")
mediana = df[df["Education"].isin(["Bachelor", "High School or Bellow"])]["Salary"].median()

In [14]:
# Sustiyo los valores nulos en Salary por la mediana
# Serán todos los valores nulos porque en el análisis previo 
# se ha podido controlar que todos los nulos eran "College"
df["Salary"] = df["Salary"].fillna(mediana)

In [15]:
# Comprobamos los nulos después del cambio
df["Salary"].isna().sum()

0

In [16]:
# Comprobamos que la cifra sea la correcta
df[df["Education"]== "College"].sample(3)

Unnamed: 0,Loyalty Number,Year,Month,Flights Booked,Flights with Companions,Total Flights,Distance,Points Accumulated,Points Redeemed,Dollar Cost Points Redeemed,Country,Province,City,Postal Code,Gender,Education,Salary,Marital Status,Loyalty Card,CLV,Enrollment Type,Enrollment Year,Enrollment Month,Cancellation Year,Cancellation Month,Loyalty Status
170988,974041,2018,3,3,1,4,896,111.25,537,43,Canada,New Brunswick,Fredericton,E3B 2H2,Female,College,72421.0,Single,Nova,3097.92,Standard,2016,6,,,Active
101649,734727,2018,1,6,2,8,1296,129.0,0,0,Canada,Quebec,Tremblant,H5Y 2S9,Female,College,72421.0,Single,Aurora,6778.84,Standard,2015,7,,,Active
198404,175017,2018,12,5,0,5,3685,368.0,0,0,Canada,British Columbia,Whistler,V6T 1Y8,Female,College,72421.0,Married,Nova,17349.03,Standard,2016,11,,,Active


In [17]:
# Comrpobamos que solo queden los nulos de las columnas "Cancellation Month" y "Cancellation Year"
round(df.isna().sum()/df.shape[0]*100, 2)

Loyalty Number                  0.00
Year                            0.00
Month                           0.00
Flights Booked                  0.00
Flights with Companions         0.00
Total Flights                   0.00
Distance                        0.00
Points Accumulated              0.00
Points Redeemed                 0.00
Dollar Cost Points Redeemed     0.00
Country                         0.00
Province                        0.00
City                            0.00
Postal Code                     0.00
Gender                          0.00
Education                       0.00
Salary                          0.00
Marital Status                  0.00
Loyalty Card                    0.00
CLV                             0.00
Enrollment Type                 0.00
Enrollment Year                 0.00
Enrollment Month                0.00
Cancellation Year              96.36
Cancellation Month             96.36
Loyalty Status                  0.00
dtype: float64

## Guardamos un nuevo csv clean nonulls

In [18]:
df.to_csv("../files/Full Loyalty Program Clean Nonulls.csv", index=False)