In [44]:
# Librerias:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn import linear_model
from sklearn.metrics import roc_curve, auc, confusion_matrix

import matplotlib.pyplot as plt

In [45]:
# Guardar datos en variables:
x_data = pd.read_csv('G:/trainX_reto1.csv', sep=';', decimal='.', index_col=0, na_values='?')
y_data = pd.read_csv('G:/trainY_reto1.csv', sep=';', decimal='.', index_col=0, na_values='?')

In [46]:
# Separación en train y test:
x_train, x_test = train_test_split(x_data, test_size=0.2, random_state=5)
y_train, y_test = train_test_split(y_data, test_size=0.2, random_state=5)

# Ordenar los df por id:
x_train.sort_index(inplace=True)
y_train.sort_index(inplace=True)

# Limpieza de datos:

In [47]:
# Exploración de datos:
print(x_train.info())
print(x_train.describe())
print('\nModas:\n', x_train.mode(axis=0, dropna=False))
N_x, D_x = x_train.shape

print('\n\n', y_train.info())
print(y_train.describe())
N_y, D_y = y_train.shape

<class 'pandas.core.frame.DataFrame'>
Index: 256 entries, 0 to 399
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     253 non-null    float64
 1   bp      247 non-null    float64
 2   sg      229 non-null    float64
 3   al      229 non-null    float64
 4   su      227 non-null    float64
 5   rbc     154 non-null    object 
 6   pc      216 non-null    object 
 7   pcc     253 non-null    object 
 8   ba      253 non-null    object 
 9   bgr     227 non-null    float64
 10  bu      244 non-null    float64
 11  sc      246 non-null    float64
 12  sod     202 non-null    float64
 13  pot     201 non-null    float64
 14  hemo    225 non-null    float64
 15  pcv     209 non-null    float64
 16  wbcc    187 non-null    float64
 17  rbcc    169 non-null    float64
 18  htn     254 non-null    object 
 19  dm      254 non-null    object 
 20  cad     254 non-null    object 
 21  appet   256 non-null    object 
 22  pe     

In [48]:
# Codificación de los datos tipo 'object':
def encode_object(df):
    df_code = df.copy()
    code_to_categ = {}

    for col in df_code.columns:
        if df_code[col].dtype == 'object':
            df_code[col] = df_code[col].astype('category')
            codes = df_code[col].cat.codes.replace(-1, np.nan)  # Reemplaza -1 por NaN
            code_to_categ[col] = dict(zip(codes, df_code[col]))
            df_code[col] = codes

    return df_code, code_to_categ

x_train_df, code_to_categ_x = encode_object(x_train)
y_train_df, code_to_categ_y = encode_object(y_train)

In [49]:
# Visualización de valores NaN:
missig_data = x_train_df.isna()

missing_values_column = missig_data.sum(axis=0)
missing_values_row = missig_data.sum(axis=1)

mask_mayorq0 = missing_values_column > 0
mask_mayorq1 = missing_values_row > 0

print(f'Columnas con valores nulos:\n{missing_values_column[mask_mayorq0]}\n')
print(f'Filas con valores nulos:\n{missing_values_row[mask_mayorq1]}\n')

missing_count_row = missing_values_row.value_counts().sort_index()
print(f'Valores NaN en cada fila:\n{missing_count_row}')
missing_count_col = missing_values_column.value_counts().sort_index()
print(f'Valores NaN en cada columna:\n{missing_count_col}')

Columnas con valores nulos:
age       3
bp        9
sg       27
al       27
su       29
rbc     102
pc       40
pcc       3
ba        3
bgr      29
bu       12
sc       10
sod      54
pot      55
hemo     31
pcv      47
wbcc     69
rbcc     87
htn       2
dm        2
cad       2
dtype: int64

Filas con valores nulos:
0      3
1      5
2      3
6      3
7      3
      ..
336    2
349    2
350    2
378    2
381    2
Length: 157, dtype: int64

Valores NaN en cada fila:
0     99
1     31
2     22
3     23
4     24
5     20
6      4
7     13
8      4
9      9
10     2
11     5
Name: count, dtype: int64
Valores NaN en cada columna:
0      3
2      3
3      3
9      1
10     1
12     1
27     2
29     2
31     1
40     1
47     1
54     1
55     1
69     1
87     1
102    1
Name: count, dtype: int64


In [50]:
# Sustitución de valores NaN por valor 'mean' columnas 'rbc':
mean_rbc = x_train_df['rbc'].mean()
x_train_df['rbc'].fillna(mean_rbc, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_train_df['rbc'].fillna(mean_rbc, inplace=True)


In [51]:
#-- IMPUTACIÓN MULTIVARIANTE SIN COL. DUMMIES --#
imputer = IterativeImputer()
train_imputed = imputer.fit_transform(x_train_df)
x_train_df = pd.DataFrame(train_imputed, columns=x_train_df.columns)              # Convertir de nuevo a DataFrame
print(x_train_df.isna().sum())                                              # Comprobamos que no quedan valores NaN

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
dtype: int64


In [52]:
# #-- IMPUTACIÓN MULTIVARIANTE --#

# # Creas una copia del DataFrame original:
# x_train_df_copy = x_train_df.copy()

# # Creas un DataFrame para almacenar las columnas dummies
# dummies_df = pd.DataFrame()

# # Para cada columna en el DataFrame original
# for column in x_train_df_copy.columns:
#     # Creas una columna dummy que indica si un valor fue imputado o no
#     dummies_df[column + '_imputed'] = x_train_df_copy[column].isna().astype(int)

# # Imputación multivariante de los datos NaN:
# imputer = IterativeImputer()
# train_imputed = imputer.fit_transform(x_train_df_copy)
# x_train_df_copy = pd.DataFrame(train_imputed, columns=x_train_df_copy.columns)

# # Restableces los índices de los DataFrames
# x_train_df.reset_index(drop=True, inplace=True)
# y_train_df.reset_index(drop=True, inplace=True)
# dummies_df.reset_index(drop=True, inplace=True)

# # Concatenas el DataFrame de columnas dummies y el DataFrame original
# x_train_df = pd.concat([dummies_df, x_train_df_copy], axis=1)

# print(x_train_df.isna().sum())                                              # Comprobamos que no quedan valores NaN



In [53]:
print(x_train_df.shape)

# Aumento de dimensionalidad:
degree = 2
interaction_only = True

polyf = PolynomialFeatures(degree=degree, interaction_only=interaction_only)
polyf.set_output(transform="pandas")

x_train_dim = polyf.fit_transform(x_train_df)
print(x_train_dim.shape)

(256, 24)
(256, 301)


In [54]:
#-- Escalados a intervalos diferentes --#

# Escalado a intervalo unidad [0,1]:
scalerUnit = MinMaxScaler()
x_train_dim_unit = scalerUnit.fit_transform(x_train_dim)

# Escalado al máx. de los valores abs.:
scalerMaxAbs = MaxAbsScaler()
x_train_dim_MaxAbs = scalerMaxAbs.fit_transform(x_train_dim)

# Estandarizamos:
scalerStd = StandardScaler()
x_train_dim_std = scalerStd.fit_transform(x_train_dim)

In [55]:
#-- FILTRADO --#

# Filtrado por varianza:
selector_var = VarianceThreshold()
selector_var.set_output(transform="pandas")

x_train_var = selector_var.fit_transform(x_train_dim)
x_train_unit_var = selector_var.fit_transform(x_train_dim_unit)
x_train_MaxAbs_var = selector_var.fit_transform(x_train_dim_MaxAbs)
x_train_sdt_var = selector_var.fit_transform(x_train_dim_std)

# Filtrado por correlación:

corr_matrix_var = x_train_dim.corr().abs()  # Calculas la matriz de correlación
upper = corr_matrix_var.where(np.triu(np.ones(corr_matrix_var.shape), k=1).astype(bool)) # Seleccionas el triángulo superior de la matriz de correlación
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] # Encuentras las columnas con correlación mayor a 0.95 (puedes ajustar este valor a tus necesidades)
x_train_corr = x_train_dim.drop(x_train_dim[to_drop], axis=1)    # Eliminas las columnas altamente correlacionadas

x_train_dim_unit_df = pd.DataFrame(x_train_dim_unit)    # Conviertes el array de numpy a un DataFrame de pandas
corr_matrix_unit = x_train_dim_unit_df.corr().abs() # Ahora puedes calcular la matriz de correlación
upper = corr_matrix_unit.where(np.triu(np.ones(corr_matrix_unit.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_train_unit_corr = x_train_dim_unit_df.drop(x_train_dim_unit_df.columns[to_drop], axis=1)  # Eliminas las columnas altamente correlacionadas

x_train_dim_MaxAbs_df = pd.DataFrame(x_train_dim_MaxAbs)    # Convertir x_train_dim_MaxAbs a DataFrame de Pandas
corr_matrix_MaxAbs = x_train_dim_MaxAbs_df.corr().abs() # Para x_train_MaxAbs_corr
upper = corr_matrix_MaxAbs.where(np.triu(np.ones(corr_matrix_MaxAbs.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_train_MaxAbs_corr = x_train_dim_MaxAbs_df.drop(x_train_dim_MaxAbs_df.columns[to_drop], axis=1)

x_train_dim_std_df = pd.DataFrame(x_train_dim_std)  # Convertir x_train_dim_std a DataFrame de Pandas
corr_matrix_std = x_train_dim_std_df.corr().abs()# Para x_train_std_corr
upper = corr_matrix_std.where(np.triu(np.ones(corr_matrix_std.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_train_std_corr = x_train_dim_std_df.drop(x_train_dim_std_df.columns[to_drop], axis=1)

In [56]:
#-- ENTRENAMOS LOS MODELOS LINEALES --#

reg_model_var = linear_model.LinearRegression()
reg_model_var.fit(x_train_var, y_train_df['class'])

reg_model_unit_dim = linear_model.LinearRegression()
reg_model_unit_dim.fit(x_train_unit_var, y_train_df['class'])

reg_model_MaxAbs_dim = linear_model.LinearRegression()
reg_model_MaxAbs_dim.fit(x_train_MaxAbs_var, y_train_df['class'])

reg_model_std_dim = linear_model.LinearRegression()
reg_model_std_dim.fit(x_train_sdt_var, y_train_df['class'])

reg_model_corr = linear_model.LinearRegression()
reg_model_corr.fit(x_train_corr, y_train_df['class'])

reg_model_unit_corr = linear_model.LinearRegression()
reg_model_unit_corr.fit(x_train_unit_corr, y_train_df['class'])

reg_model_MaxAbs_corr = linear_model.LinearRegression()
reg_model_MaxAbs_corr.fit(x_train_MaxAbs_corr, y_train_df['class'])

reg_model_std_cor = linear_model.LinearRegression()
reg_model_std_cor.fit(x_train_std_corr, y_train_df['class'])

In [57]:
#-- COMPROVACIÓN Y COMPARACIÓN DE MODELOS --#

# Procesado de los test:

# Ordenar los df por id:
x_test.sort_index(inplace=True)
y_test.sort_index(inplace=True)

# Codificación de los datos tipo 'object':
def encode_object(df):
    df_code = df.copy()
    code_to_categ = {}

    for col in df_code.columns:
        if df_code[col].dtype == 'object':
            df_code[col] = df_code[col].astype('category')
            codes = df_code[col].cat.codes.replace(-1, np.nan)  # Reemplaza -1 por NaN
            code_to_categ[col] = dict(zip(codes, df_code[col]))
            df_code[col] = codes

    return df_code, code_to_categ

x_test_df, code_to_categ_x = encode_object(x_test)
y_test_df, code_to_categ_y = encode_object(y_test)


# Visualización de valores NaN:
missig_data = x_test_df.isna()

missing_values_column = missig_data.sum(axis=0)
missing_values_row = missig_data.sum(axis=1)

mask_mayorq0 = missing_values_column > 0
mask_mayorq1 = missing_values_row > 0

print(f'Columnas con valores nulos:\n{missing_values_column[mask_mayorq0]}\n')
print(f'Filas con valores nulos:\n{missing_values_row[mask_mayorq1]}\n')

missing_count_row = missing_values_row.value_counts().sort_index()
print(f'Valores NaN en cada fila:\n{missing_count_row}')
missing_count_col = missing_values_column.value_counts().sort_index()
print(f'Valores NaN en cada columna:\n{missing_count_col}')

# Sustitución de valores NaN por valor 'mean' columnas 'rbc':
test_mean_rbc = x_test_df['rbc'].mean()
x_test_df['rbc'].fillna(test_mean_rbc, inplace=True)


#-- IMPUTACIÓN MULTIVARIANTE SIN COL. DUMMIES --#
test_imputed = imputer.transform(x_test_df)
x_test_df = pd.DataFrame(test_imputed, columns=x_test_df.columns)              # Convertir de nuevo a DataFrame
print('Comprobamos que no quedan valores NaN:\n', x_test_df.isna().sum())                                              # Comprobamos que no quedan valores NaN

print(x_test_df.shape)
x_test_dim = polyf.transform(x_test_df)
print(x_test_dim.shape)

Columnas con valores nulos:
age       4
sg        4
al        4
su        4
rbc      18
pc        6
bgr       7
bu        4
sc        4
sod      16
pot      16
hemo      8
pcv      10
wbcc     14
rbcc     19
appet     1
pe        1
ane       1
dtype: int64

Filas con valores nulos:
4       2
18      1
23     10
35      2
39      1
44      2
62      1
65      6
67      8
83      4
102     2
109    10
112     6
114     5
117     3
119     7
123     5
126     1
141     2
143     4
150     3
154     1
163     1
182     3
184     1
191     1
206     3
220     6
228    11
232     9
268     6
280     5
294     3
303     2
363     2
365     2
dtype: int64

Valores NaN en cada fila:
0     28
1      8
2      8
3      5
4      2
5      3
6      4
7      1
8      1
9      1
10     2
11     1
Name: count, dtype: int64
Valores NaN en cada columna:
0     6
1     3
4     6
6     1
7     1
8     1
10    1
14    1
16    2
18    1
19    1
Name: count, dtype: int64
Comprobamos que no quedan valores NaN:
 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_test_df['rbc'].fillna(test_mean_rbc, inplace=True)


In [58]:
#-- Escalados a intervalos diferentes --#

# Escalado a intervalo unidad [0,1]:
x_test_dim_unit = scalerUnit.transform(x_test_dim)

# Escalado al máx. de los valores abs.:
x_test_dim_MaxAbs = scalerMaxAbs.transform(x_test_dim)

# Estandarizamos:
x_test_dim_std = scalerStd.transform(x_test_dim)


In [59]:
#-- FILTRADO --#

# Filtrado por varianza:
x_test_var = selector_var.transform(x_test_dim)
x_test_unit_var = selector_var.transform(x_test_dim_unit)
x_test_MaxAbs_var = selector_var.transform(x_test_dim_MaxAbs)
x_test_sdt_var = selector_var.transform(x_test_dim_std)

# Filtrado por correlación:

corr_matrix_test_var = x_test_dim.corr().abs()
upper = corr_matrix_test_var.where(np.triu(np.ones(corr_matrix_test_var.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_test_corr = x_test_dim.drop(x_test_dim[to_drop], axis=1)

x_test_dim_unit_df = pd.DataFrame(x_test_dim_unit)
corr_matrix_test_unit = x_test_dim_unit_df.corr().abs()
upper = corr_matrix_test_unit.where(np.triu(np.ones(corr_matrix_test_unit.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_test_unit_corr = x_test_dim_unit_df.drop(x_test_dim_unit_df.columns[to_drop], axis=1)

x_test_dim_MaxAbs_df = pd.DataFrame(x_test_dim_MaxAbs)
corr_matrix_test_MaxAbs = x_test_dim_MaxAbs_df.corr().abs()
upper = corr_matrix_test_MaxAbs.where(np.triu(np.ones(corr_matrix_test_MaxAbs.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_test_MaxAbs_corr = x_test_dim_MaxAbs_df.drop(x_test_dim_MaxAbs_df.columns[to_drop], axis=1)

x_test_dim_std_df = pd.DataFrame(x_test_dim_std)
corr_matrix_test_std = x_test_dim_std_df.corr().abs()
upper = corr_matrix_test_std.where(np.triu(np.ones(corr_matrix_test_std.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
x_test_std_corr = x_test_dim_std_df.drop(x_test_dim_std_df.columns[to_drop], axis=1)



In [60]:
# # Obtener las diferencias entre los nombres de características
# diferencias = set(x_test_corr.columns) - set(x_train_corr.columns)

# # Imprimir las diferencias
# print("Las siguientes características están en el primer DataFrame pero no en el segundo:")
# for caracteristica in diferencias:
#     print(caracteristica)

# # Obtener las diferencias en el otro sentido también
# diferencias_inversas = set(x_test_corr.columns) - set(x_train_corr.columns)

# # Imprimir las diferencias en el otro sentido
# print("\nLas siguientes características están en el segundo DataFrame pero no en el primero:")
# for caracteristica in diferencias_inversas:
#     print(caracteristica)


In [61]:
#-- CÁLCULO DE PROBABILIDADES --#

y_score_var = reg_model_var.predict(x_test_var)
# Cálculo de la curva ROC y AUC
fpr_var, tpr_var, threshold_var = roc_curve(y_test_df['class'], y_score_var)
roc_auc_var = auc(fpr_var, tpr_var)

y_score_unit = reg_model_unit_dim.predict(x_test_unit_var)
# Cálculo de la curva ROC y AUC
fpr_unit, tpr_unit, threshold_unit = roc_curve(y_test_df['class'], y_score_unit)
roc_auc_unit = auc(fpr_unit, tpr_unit)

y_score_MaxAbs = reg_model_MaxAbs_dim.predict(x_test_MaxAbs_var)
# Cálculo de la curva ROC y AUC
fpr_MaxAbs, tpr_MaxAbs, threshold_MaxAbs = roc_curve(y_test_df['class'], y_score_MaxAbs)
roc_auc_MaxAbs = auc(fpr_MaxAbs, tpr_MaxAbs)

y_score_std = reg_model_std_dim.predict(x_test_sdt_var)
# Cálculo de la curva ROC y AUC
fpr_std, tpr_std, threshold_std = roc_curve(y_test_df['class'], y_score_std)
roc_auc_std = auc(fpr_std, tpr_std)

y_score_corr = reg_model_corr.predict(x_test_corr)
# Cálculo de la curva ROC y AUC
fpr_corr, tpr_corr, threshold_corr = roc_curve(y_test_df['class'], y_score_corr)
roc_auc_corr = auc(fpr_corr, tpr_corr)

y_score_unit_corr = reg_model_unit_corr.predict(x_test_unit_corr)
# Cálculo de la curva ROC y AUC
fpr_unit_corr, tpr_unit_corr, threshold_unit_corr = roc_curve(y_test_df['class'], y_score_unit_corr)
roc_auc_unit_corr = auc(fpr_unit_corr, tpr_unit_corr)

y_score_MaxAbs_corr = reg_model_MaxAbs_corr.predict(x_test_MaxAbs_corr)
# Cálculo de la curva ROC y AUC
fpr_MaxAbs_corr, tpr_MaxAbs_corr, threshold_MaxAbs_corr = roc_curve(y_test_df['class'], y_score_MaxAbs_corr)
roc_auc_MaxAbs_corr = auc(fpr_MaxAbs_corr, tpr_MaxAbs_corr)

y_score_std_corr = reg_model_std_cor.predict(x_test_std_corr)
# Cálculo de la curva ROC y AUC
fpr_std_corr, tpr_std_corr, threshold_std_corr = roc_curve(y_test_df['class'], y_score_std_corr)
roc_auc_std_corr = auc(fpr_std_corr, tpr_std_corr)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- x1
- x10
- x100
- x101
- x102
- ...
Feature names seen at fit time, yet now missing:
- age
- age al
- age ane
- age appet
- age ba
- ...
