<a href="https://colab.research.google.com/github/Andru-1987/74235-_DataScience_I/blob/main/clase_4/ejercicio-practioco/clase_imputacion_arreglado_imputacion_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder


In [None]:
# Raw CSV location of the Pima Indians diabetes dataset used in the first cells.
URL_DATASET = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"

In [None]:
# The source CSV ships without a header row. Without header=None pandas would
# consume the first patient record as column labels (and drop it from the
# data), so the nine columns are named explicitly here.
dataframe = pd.read_csv(
    URL_DATASET,
    header=None,
    names=[
        "pregnancies",
        "glucose",
        "blood_pressure",
        "skin_thickness",
        "insulin",
        "bmi",
        "diabetes_pedigree",
        "age",
        "outcome",
    ],
)

In [None]:
# Bare expression: the notebook renders the loaded frame as the cell output.
dataframe

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
dataframe.info()

In [None]:
# Raw CSV location of the market dataset handed to MarketStore below.
MARKET_STORE_URL = "https://raw.githubusercontent.com/Andru-1987/csv_files_ds/refs/heads/main/market_data.csv"

In [None]:
class MarketStore:
    """Load a CSV dataset from a URL and explore / impute its missing values.

    Typical flow: ``get_dataframe()`` to download the data, then either
    ``imputar_manual()`` (mode / mean fill, in place) or
    ``imputacion_knn_imputer()`` (KNN-based fill stored in a separate frame),
    and the ``plot_*`` helpers to inspect the resulting distributions.
    """

    def __init__(self, url):
        """Remember the CSV location; nothing is downloaded yet.

        Args:
            url: Path or URL accepted by ``pandas.read_csv``.
        """
        self.url = url
        # Raw data; populated by get_dataframe().
        self.dataframe = None
        # KNN-imputed copy of the data; populated by imputacion_knn_imputer().
        self.dataframe_imputed_knn = None

    @staticmethod
    def _require(frame, producer):
        """Return *frame*, or raise a clear error when it was never produced."""
        if frame is None:
            raise RuntimeError(
                f"No data available yet: call {producer}() first."
            )
        return frame

    @staticmethod
    def _categorical_columns(frame):
        """Return the labels of the object, string and categorical columns."""
        types = pd.api.types
        return [
            col
            for col, dtype in frame.dtypes.items()
            if types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ]

    @staticmethod
    def _nearest_codes(values, n_classes):
        """Map imputed (possibly fractional) label codes to valid integer codes.

        KNN imputation averages the neighbours' codes, so the result is often
        fractional. Rounding picks the nearest class; plain truncation would
        systematically favour the lower code. Clipping is purely defensive.
        """
        rounded = np.rint(np.asarray(values, dtype=float))
        return np.clip(rounded, 0, n_classes - 1).astype(int)

    @staticmethod
    def _plot_column(frame, columna):
        """Draw boxplot + histogram (numeric) or a frequency bar chart."""
        serie = frame[columna]
        plt.figure(figsize=(16, 9))

        if pd.api.types.is_numeric_dtype(serie):
            # plt.boxplot renders an empty box when the data contains NaN,
            # so plot the observed values only.
            observados = serie.dropna()

            plt.subplot(1, 2, 1)
            plt.boxplot(observados)
            plt.title(f'Boxplot de {columna}')

            plt.subplot(1, 2, 2)
            plt.hist(observados, bins=20, edgecolor='k')
            plt.title(f'Histograma de {columna}')
        else:
            serie.value_counts().plot(kind='bar')
            plt.title(f'Frecuencia de {columna}')
            plt.xlabel(columna)
            plt.ylabel('Frecuencia')
            plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

    def get_dataframe(self):
        """Read the CSV at ``self.url``, store it on the instance and return it."""
        self.dataframe = pd.read_csv(self.url)
        return self.dataframe

    def get_information(self):
        """Print the first rows, the dtype/null summary and basic statistics.

        Raises:
            RuntimeError: if ``get_dataframe()`` has not been called yet.
        """
        frame = self._require(self.dataframe, "get_dataframe")
        print("\nInformacion de los primeros registros")
        print(frame.head())
        print("\nInformacion sobre los datos  columnas y valores nulls")
        # DataFrame.info() prints by itself and returns None; wrapping it in
        # print() would emit a stray "None" line.
        frame.info()
        print("\nInformacion estadistica de los datos")
        print(frame.describe().transpose())

    def nullish_counting(self):
        """Show a bar chart with the percentage of null values per column.

        Columns are sorted from most to least incomplete.

        Raises:
            RuntimeError: if ``get_dataframe()`` has not been called yet.
        """
        frame = self._require(self.dataframe, "get_dataframe")
        # The mean of a boolean mask is the fraction of True values.
        null_percentage = frame.isnull().mean() * 100
        null_percentage_sorted = null_percentage.sort_values(ascending=False)

        plt.figure(figsize=(16, 9))
        ax = null_percentage_sorted.plot(kind='bar', color="skyblue")
        ax.set_xlabel("Columnas")
        ax.set_ylabel("Porcentaje de valores nulos")
        ax.set_title("Porcentaje de valores nulos por columna")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        plt.show()

    def imputar_manual(self):
        """Fill missing values in ``self.dataframe`` in place.

        Categorical (object / string / category) columns receive their most
        frequent value; numeric columns receive their mean. A column with no
        observed value at all is left untouched.

        Raises:
            RuntimeError: if ``get_dataframe()`` has not been called yet.
        """
        frame = self._require(self.dataframe, "get_dataframe")

        # Assign the filled column back instead of calling
        # fillna(inplace=True) on frame[col]: that chained form is deprecated
        # and does not modify the frame under pandas Copy-on-Write.
        for col in self._categorical_columns(frame):
            modes = frame[col].mode()
            if not modes.empty:  # empty when every value is missing
                frame[col] = frame[col].fillna(modes.iloc[0])

        for col in frame.select_dtypes(include=['number']).columns:
            frame[col] = frame[col].fillna(frame[col].mean())

    def imputacion_knn_imputer(self, n_neighbors=4):
        """Impute missing values with ``KNNImputer`` into a separate frame.

        Categorical columns are label-encoded (missing entries stay NaN),
        every column is imputed, and the categorical columns are decoded back
        to their original labels. ``self.dataframe`` is not modified; the
        result is stored in ``self.dataframe_imputed_knn``.

        Args:
            n_neighbors: Number of neighbours used by ``KNNImputer``.

        Raises:
            RuntimeError: if ``get_dataframe()`` has not been called yet.
        """
        source = self._require(self.dataframe, "get_dataframe")
        df_encoder = source.copy()
        encoders = {}

        for col in self._categorical_columns(df_encoder):
            print(f"Se le aplicará el siguiente mapeo a la columna: {col}")
            present = df_encoder[col].notna()
            if not present.any():
                # Nothing to learn labels from; treated as an empty column.
                continue
            le = LabelEncoder()
            # Build a float column so missing entries stay real NaN and the
            # imputer receives numeric input for every encoded column.
            codes = np.full(len(df_encoder), np.nan)
            codes[present.to_numpy()] = le.fit_transform(
                df_encoder.loc[present, col]
            )
            df_encoder[col] = codes
            encoders[col] = le

        # KNNImputer drops columns without any observed value, which would
        # misalign the column labels below; impute only columns with data.
        observed = df_encoder.notna().any().to_numpy()
        to_impute = df_encoder.loc[:, observed]

        imputer = KNNImputer(n_neighbors=n_neighbors)
        data_imputed = imputer.fit_transform(to_impute)
        df_decoded = pd.DataFrame(
            data_imputed,
            columns=to_impute.columns,
            index=to_impute.index,  # keep the original row labels
        )

        for col, le in encoders.items():
            print(f"Se le aplicará el decoding para: {col}")
            nearest = self._nearest_codes(df_decoded[col], len(le.classes_))
            df_decoded[col] = le.inverse_transform(nearest)

        # Restore the original column order; entirely-missing columns come
        # back as all-NaN columns.
        self.dataframe_imputed_knn = df_decoded.reindex(
            columns=df_encoder.columns
        )

    def plot_distribution(self, columna):
        """Plot the distribution of *columna* in the raw dataframe.

        Raises:
            RuntimeError: if ``get_dataframe()`` has not been called yet.
        """
        frame = self._require(self.dataframe, "get_dataframe")
        self._plot_column(frame, columna)

    def plot_distribution_knn(self, columna):
        """Plot the distribution of *columna* in the KNN-imputed dataframe.

        Raises:
            RuntimeError: if ``imputacion_knn_imputer()`` has not been run yet.
        """
        frame = self._require(
            self.dataframe_imputed_knn, "imputacion_knn_imputer"
        )
        self._plot_column(frame, columna)



In [None]:
# Create the helper for the market dataset; no data is downloaded at this point.
market_store = MarketStore(MARKET_STORE_URL)


In [None]:
# Read the CSV into market_store.dataframe; the returned frame is shown as the cell output.
market_store.get_dataframe()

In [None]:
# market_store.get_information()

In [None]:
# market_store.nullish_counting()

In [None]:
# market_store.dataframe.Outlet_Size.value_counts()
# market_store.imputar_manual()

In [None]:
# market_store.nullish_counting()

In [None]:
# market_store.plot_distribution("Outlet_Size")
# market_store.plot_distribution("Item_Weight")

In [None]:
# Run the KNN imputation with 4 neighbours; the result is stored in
# market_store.dataframe_imputed_knn (the raw dataframe is copied, not modified).
market_store.imputacion_knn_imputer(4)

In [None]:
# As the plots below are meant to show, the values imputed with KNNImputer
# should look much more uniform, giving us a more suitable way to impute the data.

market_store.plot_distribution_knn("Outlet_Size")
market_store.plot_distribution_knn("Item_Weight")
