In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score 
import statsmodels.formula.api as smf
import pyreadr

In [66]:
data_proyecto = pyreadr.read_r('data_encriptada.rds')

# Extraer el DataFrame contenido
data_proyecto = data_proyecto[None]  # Usualmente el objeto principal está bajo la clave None


data_proyecto

Unnamed: 0,IDENTIFICADOR,y_rango,y_dicotomica,x4,x5,x6,x7,x8,x9,x10,...,x103,x104,x105,x106,x107,x108,x109,x1,x2,x3
0,100913.0,y_0,0,2021-08-31,1,S,C,932.0,True,4.940656e-324,...,,,,187313.297258,83616.305251,0.284405,0.357849,6151.726167,19.0,119255.573915
1,99176.0,y_30,0,2021-05-31,1,S,C,828.0,True,0.000000e+00,...,,,,164352.312433,127734.957850,0.246262,0.309129,9591.393465,1.0,37411.199713
2,107305.0,y_30,0,2022-05-31,0,S,C,868.0,True,4.940656e-324,...,0,21,21,112131.529063,80433.365550,0.423221,0.467257,11251.425064,1.0,33641.425657
3,107539.0,y_30,0,2022-05-31,1,N,C,891.0,True,4.940656e-324,...,0,60,60,374626.594517,187313.297258,0.340834,0.360938,10138.006103,0.0,69805.958238
4,97503.0,y_30,0,2021-03-31,1,M,C,927.0,False,4.940656e-324,...,,,,482180.681329,399735.279859,0.468886,0.589293,28548.623991,0.0,70557.339480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,96263.0,y_30,0,2021-02-28,0,M,C,978.0,False,4.940656e-324,...,,,,,,,,,4.0,181157.899713
2319,96136.0,y_30,0,2021-02-28,1,M,C,924.0,False,4.940656e-324,...,,,,,,,,,2.0,107827.161102
2320,96805.0,y_30,0,2021-02-28,1,M,C,828.0,False,0.000000e+00,...,,,,114804.924126,89226.624968,0.459019,0.623195,18101.531467,1.0,46306.244967
2321,105061.0,y_30,0,2022-03-31,0,S,C,794.0,True,0.000000e+00,...,,,,241694.577107,187845.526249,0.323695,0.340907,12605.183419,0.0,41210.746430


In [67]:
# Lista de las variables que deseas modificar
variables = ['x10', 'x13', 'x23', 'x32', 'x36', 'x44', 'x47', 'x48', 
             'x56', 'x57', 'x58', 'x59', 'x60', 'x67', 'x68', 'x68', 'x2']

# Crear un diccionario para almacenar el número de categorías por variable
num_categorias = {}

# Iterar sobre las variables y asignar categorías
for var in variables:
    # Reemplazar 'ND' por NaN (valor faltante)
    data_proyecto[var] = data_proyecto[var].replace('ND', np.nan)
    
    # Obtener los valores únicos de la variable, ignorando NaN
    unique_values = data_proyecto[var].dropna().unique()
    
    # Ordenar los valores únicos
    unique_values = sorted(unique_values)
    
    # Crear un diccionario de mapeo de valores a categorías
    value_to_category = {val: i+1 for i, val in enumerate(unique_values)}
    
    # Asegurar que la categoría 0 esté incluida
    value_to_category[np.nan] = 0  # Añadir la categoría para NaN (Desconocido)
    
    # Asignar las categorías a la variable
    data_proyecto[f'cat_{var}'] = data_proyecto[var].map(value_to_category)
    
    # Convertir a tipo categórico
    data_proyecto[f'cat_{var}'] = data_proyecto[f'cat_{var}'].astype('category')
    
    # Guardamos el número de categorías para cada variable
    num_categorias[var] = len(unique_values) + 1  # +1 por la categoría 'Desconocido'

# Imprimir cuántas categorías se asignaron a cada variable
print("\nNúmero de categorías por variable:")
print(num_categorias)

# Mostrar el dataframe con las nuevas columnas
data_proyecto




Número de categorías por variable:
{'x10': 14, 'x13': 13, 'x23': 6, 'x32': 4, 'x36': 49, 'x44': 6, 'x47': 9, 'x48': 7, 'x56': 7, 'x57': 4, 'x58': 4, 'x59': 12, 'x60': 19, 'x67': 5, 'x68': 5, 'x2': 25}


  data_proyecto[var] = data_proyecto[var].replace('ND', np.nan)


Unnamed: 0,IDENTIFICADOR,y_rango,y_dicotomica,x4,x5,x6,x7,x8,x9,x10,...,cat_x47,cat_x48,cat_x56,cat_x57,cat_x58,cat_x59,cat_x60,cat_x67,cat_x68,cat_x2
0,100913.0,y_0,0,2021-08-31,1,S,C,932.0,True,4.940656e-324,...,7,2,1,1,1,1,6,1,1,20
1,99176.0,y_30,0,2021-05-31,1,S,C,828.0,True,0.000000e+00,...,5,1,1,1,1,1,1,1,1,2
2,107305.0,y_30,0,2022-05-31,0,S,C,868.0,True,4.940656e-324,...,6,2,1,1,2,2,3,1,1,2
3,107539.0,y_30,0,2022-05-31,1,N,C,891.0,True,4.940656e-324,...,6,2,1,1,1,2,2,1,1,1
4,97503.0,y_30,0,2021-03-31,1,M,C,927.0,False,4.940656e-324,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,96263.0,y_30,0,2021-02-28,0,M,C,978.0,False,4.940656e-324,...,0,0,0,0,0,0,0,1,0,5
2319,96136.0,y_30,0,2021-02-28,1,M,C,924.0,False,4.940656e-324,...,0,0,0,0,0,0,0,1,0,3
2320,96805.0,y_30,0,2021-02-28,1,M,C,828.0,False,0.000000e+00,...,0,0,0,0,0,0,0,1,0,2
2321,105061.0,y_30,0,2022-03-31,0,S,C,794.0,True,0.000000e+00,...,5,1,1,1,1,1,1,1,1,1


In [68]:
# Lista de las variables que deseas modificar
variables = ['x10', 'x13', 'x23', 'x32', 'x36', 'x44', 'x47', 'x48', 
             'x56', 'x57', 'x58', 'x59', 'x60', 'x67', 'x68', 'x68', 'x2']

# Iterar sobre las variables y crear los dataframes con la columna original y categorizada
for var in variables:
    # Crear el dataframe para la variable con las dos columnas (original y categorizada)
    revisa = data_proyecto[[f'cat_{var}', var]].copy()
    
    # Mostrar las primeras 20 filas del dataframe
    print(f"Primeras 20 filas de {var}:")
    print(revisa.head(20))
    print("\n" + "-"*50)  # Separador para facilitar la lectura

Primeras 20 filas de x10:
   cat_x10            x10
0        2  4.940656e-324
1        1   0.000000e+00
2        2  4.940656e-324
3        2  4.940656e-324
4        2  4.940656e-324
5        2  4.940656e-324
6        1   0.000000e+00
7        2  4.940656e-324
8        1   0.000000e+00
9        1   0.000000e+00
10       1   0.000000e+00
11       1   0.000000e+00
12       2  4.940656e-324
13       1  -0.000000e+00
14       2  4.940656e-324
15       2  4.940656e-324
16       3  9.881313e-324
17       2  4.940656e-324
18       1   0.000000e+00
19       1   0.000000e+00

--------------------------------------------------
Primeras 20 filas de x13:
   cat_x13            x13
0        2  4.940656e-324
1        1   0.000000e+00
2        1   0.000000e+00
3        1   0.000000e+00
4        2  4.940656e-324
5        1   0.000000e+00
6        1   0.000000e+00
7        1   0.000000e+00
8        1   0.000000e+00
9        1   0.000000e+00
10       1   0.000000e+00
11       1   0.000000e+00
12       1  

In [69]:
columnas_especificas = ['x8', 'x11', 'x12', 'x14', 'x15', 'x31', 'x33', 'x34', 
                        'x35', 'x40', 'x42', 'x51', 'x52', 'x53', 'x54', 'x55', 
                        'x61', 'x65', 'x69', 'x70', 'x71', 'x74', 'x75']

# Rellenar valores NA con la media solo en las columnas específicas si son numéricas
for columna in columnas_especificas:
    if pd.api.types.is_numeric_dtype(data_proyecto[columna]):
        media = data_proyecto[columna].mean()  # Calcula la media de la columna
        data_proyecto[columna].fillna(media, inplace=True)  # Rellena los NA con la media

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_proyecto[columna].fillna(media, inplace=True)  # Rellena los NA con la media


In [70]:
datos_limpios_de_na = data_proyecto[['x8', 'x11', 'x12', 'x14', 'x15', 'x31', 'x33', 'x34', 
                        'x35', 'x40', 'x42', 'x51', 'x52', 'x53', 'x54', 'x55', 
                        'x61', 'x65', 'x69', 'x70', 'x71', 'x74', 'x75','cat_x10', 'cat_x13', 'cat_x23', 'cat_x32', 'cat_x36', 'cat_x44', 'cat_x47', 'cat_x48', 
             'cat_x56', 'cat_x57', 'cat_x58', 'cat_x59', 'cat_x60', 'cat_x67', 'cat_x68', 'cat_x68', 'cat_x2']].copy()




datos_limpios_de_na

Unnamed: 0,x8,x11,x12,x14,x15,x31,x33,x34,x35,x40,...,cat_x48,cat_x56,cat_x57,cat_x58,cat_x59,cat_x60,cat_x67,cat_x68,cat_x68.1,cat_x2
0,932.0,4135.239559,41269.186417,23199.002121,157088.053130,0.0,0.956862,4.940656e-324,0.000000e+00,18984.214667,...,2,1,1,1,1,6,1,1,1,20
1,828.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000e+00,7.410985e-323,0.000000,...,1,1,1,1,1,1,1,1,1,2
2,868.0,9965.394903,99453.425637,0.000000,0.000000,0.0,1.000000,0.000000e+00,0.000000e+00,2203.238803,...,2,1,1,2,2,3,1,1,1,2
3,891.0,14286.986459,142582.382258,0.000000,0.000000,0.0,1.000000,1.482197e-323,0.000000e+00,3158.694989,...,2,1,1,1,2,2,1,1,1,1
4,927.0,14897.806895,148678.295702,34.423622,233.093633,0.0,1.000000,0.000000e+00,1.333977e-322,3320.553583,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,978.0,24910.529289,248604.043930,42228.128232,197237.010143,0.0,1.000000,0.000000e+00,0.000000e+00,38399.398278,...,0,0,0,0,0,0,1,0,0,5
2319,924.0,4240.247431,42317.152167,27201.797626,123778.404428,0.0,0.956862,4.940656e-324,7.410985e-323,22125.252355,...,0,0,0,0,0,0,1,0,0,3
2320,828.0,0.000000,0.000000,30087.924433,203735.205819,0.0,1.000000,0.000000e+00,0.000000e+00,23435.816488,...,0,0,0,0,0,0,1,0,0,2
2321,794.0,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,2.964394e-323,2.964394e-323,0.000000,...,1,1,1,1,1,1,1,1,1,1
