In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


#### Preprocesamiento de datos 
- Carga de datos
- Manejo de características categóricas, escalado y abordaje del desequilibrio de clases utilizando técnicas como ADASYN. 
- Ingeniería de características para potencialmente mejorar el rendimiento del modelo, como la creación de nuevas características y la agregación de variables relacionadas.


In [2]:
# Read the raw Alzheimer's dataset from disk and preview its first five rows
file_path = '../data/raw/alzheimers_disease_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [3]:
# Drop the identifier columns PatientID and DoctorInCharge, which this analysis treats
# as non-relevant variables.
# errors='ignore' keeps the cell idempotent: `df` is rebound here, so re-running the
# cell would otherwise raise KeyError for columns that have already been removed.
df = df.drop(columns=['PatientID', 'DoctorInCharge'], errors='ignore')
df.head(2)

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,7.118696,0,0,2.592424,0,0,0,0,1,0


Separar variable objetivo

In [4]:
# Target vector: the Diagnosis column. Feature matrix: every remaining column.
y = df['Diagnosis']
X = df.drop(columns='Diagnosis')

Separar variables categóricas y hacer ingeniería de características

In [7]:
# Columns treated as categorical; each one is passed through its own LabelEncoder
categorical_cols = [
    'Gender', 'Ethnicity', 'Smoking', 'EducationLevel', 'FamilyHistoryAlzheimers',
    'CardiovascularDisease', 'Diabetes', 'Depression', 'HeadInjury', 'Hypertension',
    'MemoryComplaints', 'BehavioralProblems', 'Confusion', 'Disorientation',
    'PersonalityChanges', 'DifficultyCompletingTasks', 'Forgetfulness',
]

# Fit one encoder per column and keep it in a dict keyed by column name,
# then overwrite each column with its integer-encoded values
label_encoders = {name: LabelEncoder().fit(X[name]) for name in categorical_cols}
for name, encoder in label_encoders.items():
    X[name] = encoder.transform(X[name])

# Feature engineering
# Sum of the three lifestyle measurements, shared by LifestyleScore and BMILifestyleRatio
lifestyle_total = X['PhysicalActivity'] + X['DietQuality'] + X['SleepQuality']

# Age bucket edges and their labels (pd.cut intervals are right-closed by default)
bins = [0, 30, 50, 70, 100]
labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']

# Every derived column is computed from the (encoded) source columns only, so they can
# all be built up front and then attached to X in the order listed here
engineered = {
    # Health risk factors
    'HealthScore': (X['BMI'] + X['CholesterolTotal'] + X['CholesterolLDL'] - X['CholesterolHDL']) / 4,
    'BP_Ratio': X['SystolicBP'] / X['DiastolicBP'],
    # Lifestyle factors
    'LifestyleScore': lifestyle_total / 3,
    'SmokingAlcoholInteraction': X['Smoking'] * X['AlcoholConsumption'],
    # Medical history: number of chronic conditions present
    'ChronicConditionsCount': X['CardiovascularDisease'] + X['Diabetes'] + X['Hypertension'],
    # Cognitive and functional assessment: mean of MMSE and FunctionalAssessment
    'CognitiveDeclineScore': (X['MMSE'] + X['FunctionalAssessment']) / 2,
    'MemoryBehaviorIssuesCount': X['MemoryComplaints'] + X['BehavioralProblems'],
    # Interactions between features
    'Age_BMI_Interaction': X['Age'] * X['BMI'],
    'Age_CholesterolInteraction': X['Age'] * X['CholesterolTotal'],
    # Lifestyle-to-health ratio
    'BMILifestyleRatio': X['BMI'] / lifestyle_total,
    # Binned age
    'AgeGroup': pd.cut(X['Age'], bins=bins, labels=labels),
    # Cholesterol ratios
    'CholesterolLDL_HDL_Ratio': X['CholesterolLDL'] / X['CholesterolHDL'],
    'CholesterolTriglycerides_Ratio': X['CholesterolTriglycerides'] / X['CholesterolTotal'],
}
for feature_name, values in engineered.items():
    X[feature_name] = values

In [8]:
# Names of the derived columns created by the feature-engineering step
engineered_features = [
    'HealthScore',
    'BP_Ratio',
    'LifestyleScore',
    'SmokingAlcoholInteraction',
    'ChronicConditionsCount',
    'CognitiveDeclineScore',
    'MemoryBehaviorIssuesCount',
    'Age_BMI_Interaction',
    'Age_CholesterolInteraction',
    'BMILifestyleRatio',
    'AgeGroup',
    'CholesterolLDL_HDL_Ratio',
    'CholesterolTriglycerides_Ratio',
]

# Keep only those derived columns; the bare last expression displays the resulting frame
X_engineered = X.loc[:, engineered_features]
X_engineered

Unnamed: 0,HealthScore,BP_Ratio,LifestyleScore,SmokingAlcoholInteraction,ChronicConditionsCount,CognitiveDeclineScore,MemoryBehaviorIssuesCount,Age_BMI_Interaction,Age_CholesterolInteraction,BMILifestyleRatio,AgeGroup,CholesterolLDL_HDL_Ratio,CholesterolTriglycerides_Ratio
0,71.940731,1.972222,5.566668,0.000000,1,13.991205,0,1673.725694,17692.779298,1.372919,Elderly,1.667061,0.669189
1,93.092449,1.796875,5.096648,0.000000,0,13.865981,0,2387.663626,20573.470956,1.754596,Elderly,2.447320,1.274561
2,96.382053,0.853448,6.448299,0.000000,0,6.625663,0,1299.099418,20745.275617,0.919926,Elderly,2.197473,0.294313
3,47.573051,1.026087,8.085386,12.209266,0,11.478117,1,2501.260461,11809.085731,1.393494,Elderly,0.954850,1.739400
4,73.578638,0.803419,4.234399,0.000000,0,9.781324,0,1843.810671,21146.594343,1.630847,Elderly,1.632894,1.225573
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,88.381495,1.207921,6.046937,0.000000,0,0.719928,0,2386.427170,17109.086243,2.156561,Senior,1.556706,0.836148
2145,51.500826,1.433962,4.273528,0.000000,0,7.572770,1,1339.342709,13978.832711,1.392909,Elderly,1.018804,1.974343
2146,77.272530,0.974576,7.925163,0.000000,0,9.491570,0,1191.688880,18250.890964,0.650943,Elderly,1.567718,1.243763
2147,57.174738,1.072917,5.313528,0.000000,1,4.602191,0,1193.393076,18891.380976,0.959809,Elderly,0.645697,0.599733


In [9]:
# Persist the engineered feature matrix on its own (row index omitted)
X_engineered.to_csv('../data/processed/engineered_features.csv', index=False)

# Persist a second version with the Diagnosis target appended as an extra column;
# assign() works on a copy, so X_engineered itself is left unchanged
X_engineered_with_target = X_engineered.assign(Diagnosis=y)
X_engineered_with_target.to_csv('../data/processed/engineered_features_with_target.csv', index=False)

Separar características numéricas y categóricas para imputación

In [10]:
# Split the engineered features by dtype so each group gets its own imputation strategy.
# 'number' matches every numeric width (not only float64/int64). 'category' has to be
# listed explicitly: AgeGroup is produced by pd.cut and therefore has categorical dtype,
# which select_dtypes(include=['object']) does not match, so it was silently dropped.
numeric_features = X_engineered.select_dtypes(include='number')
categorical_features = X_engineered.select_dtypes(include=['object', 'category'])

# Impute numeric features with the column mean
if not numeric_features.empty:
    numeric_imputer = SimpleImputer(strategy='mean')
    X_numeric_imputed = numeric_imputer.fit_transform(numeric_features)
else:
    X_numeric_imputed = numeric_features

# Impute categorical features with the most frequent category.
# Labels are first converted to integer category codes (missing values become NaN) so
# that the imputed block is numeric and can be fed to the StandardScaler applied to
# X_imputed afterwards; for ordered categoricals such as AgeGroup the codes follow the
# category order.
if not categorical_features.empty:
    categorical_codes = categorical_features.apply(
        lambda column: column.astype('category').cat.codes
    )
    # cat.codes marks missing values with -1; turn those back into NaN for the imputer
    categorical_codes = categorical_codes.where(categorical_codes >= 0).astype('float64')
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X_categorical_imputed = categorical_imputer.fit_transform(categorical_codes)
else:
    X_categorical_imputed = categorical_features

# Reassemble a single frame: numeric columns first, then the encoded categorical ones.
# The index of X_engineered is kept so rows stay aligned with the target `y`.
X_imputed = pd.DataFrame(
    X_numeric_imputed,
    columns=numeric_features.columns,
    index=X_engineered.index,
)
if not categorical_features.empty:
    X_categorical_frame = pd.DataFrame(
        X_categorical_imputed,
        columns=categorical_features.columns,
        index=X_engineered.index,
    )
    X_imputed = pd.concat([X_imputed, X_categorical_frame], axis=1)

Escalar datos

In [11]:
# Standardize every imputed feature with a StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# performed in a later cell, so test-set statistics influence the scaling — consider
# fitting on the training portion only.
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)

Dividir los datos preprocesados en entrenamiento y prueba (test)

In [12]:
# Hold out 30% of the rows as the test set, using a fixed seed for reproducibility.
# stratify=y keeps the Diagnosis class proportions the same in both partitions, so the
# test set reflects the class imbalance that this notebook sets out to address.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

Aumentar datos

In [13]:
from imblearn.over_sampling import ADASYN

# Oversample the training split with ADASYN (fixed seed); the test split is not resampled
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Write each split to the processed-data folder so the models can load them later
outputs = {
    '../data/processed/X_train_resampled.csv': X_train_resampled,
    '../data/processed/y_train_resampled.csv': y_train_resampled,
    '../data/processed/X_test.csv': X_test,
    '../data/processed/y_test.csv': y_test,
}
for csv_path, split in outputs.items():
    pd.DataFrame(split).to_csv(csv_path, index=False)

In [None]:

# Write a frame named `df_selected` to the processed-data folder. Unlike the other
# exports in this notebook, index=False is not passed, so the row index is written too.
# NOTE(review): `df_selected` is not defined in any cell visible in this notebook and
# this cell has no execution count, so it presumably raises NameError on a fresh
# Restart & Run All — confirm which frame is meant to be saved as alzheimer_proc.csv.
df_selected.to_csv('../data/processed/alzheimer_proc.csv')