In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%pip install seaborn
import seaborn as sns

In [None]:
# Load the dataset from data.csv, resolved relative to the current working directory.
data = pd.read_csv('./data.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Age vs. varicella scatter plot. 'Varicella' is plotted as stored at this point;
# it is only mapped to text labels in a later cell.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['Age'], data['Varicella'])
ax.set_xlabel('Edad')
ax.set_ylabel('Varicela')
ax.set_title('Edad vs Varicela')
ax.grid(True)
plt.show()



In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
data.info()

In [None]:
# Cast every column to pandas' nullable integer dtype ('Int64').
data = data.astype('Int64')

In [None]:
# Keep an independent copy of the frame as it is right now; the following cells
# overwrite several columns of `data`.
df = data.copy()

In [None]:
# Lookup tables translating the dataset's integer codes into readable labels.
gender = {
    1: 'Male',
    2: 'Female',
}
breastfeeding = {
    1: 'yes',
    2: 'no',
    3: 'unknown',
}
varicella = {
    1: 'positive',
    2: 'negative',
    3: 'unknown',
}
group = {
    1: 'CDMS',
    2: 'Non-CDMS',
}

In [None]:
# Replace the integer codes of the categorical columns with readable labels,
# using the lookup dictionaries defined in the previous cell.
#
# The codes are read from `df` (the copy taken before any mapping) rather than
# from `data` itself. Series.map turns every value that is not a dictionary key
# into NaN, so mapping `data` onto itself would wipe the columns if this cell
# were ever run a second time; reading from `df` makes the cell re-runnable.
data['Gender'] = df['Gender'].map(gender)
data['Breastfeeding'] = df['Breastfeeding'].map(breastfeeding)
data['Varicella'] = df['Varicella'].map(varicella)
data['group'] = df['group'].map(group)

In [None]:
# List the column labels currently present in the frame.
print(data.columns)

In [None]:
# Drop the 'Unnamed: 0' column. errors='ignore' keeps the cell re-runnable:
# on a second run the column is already gone, and without it drop() would
# raise a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head(6)

In [None]:
# Turn the text-labelled categorical columns back into integer codes.
# LabelEncoder numbers the distinct labels it finds from 0 upwards, so columns
# with three labels (Varicella, Breastfeeding) are not limited to 0/1.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for column in ("Varicella", "Breastfeeding"):
    # The same encoder is refitted for each column; it only remembers the last fit.
    data[column] = le.fit_transform(data[column])

# Gender is encoded with an explicit mapping: Male -> 0, Female -> 1.
gender_codes = {'Male': 0, 'Female': 1}
data['Gender'] = data['Gender'].map(gender_codes)

data.head(6)


In [None]:
# def count_plot(data, feature, title, hue=True): 
#     plt.figure(figsize=(7, 5),facecolor='#F6F5F4')
#     total = float(len(data))
#     if hue:
#         ax = sns.countplot(x=data[feature], hue=data['group'] , palette='rainbow')
        
#     else:
#         ax = sns.countplot(x=data[feature], hue=None, palette='rainbow')
    
#     ax.set_facecolor('#F6F5F4')

#     for p in ax.patches:

#         height = p.get_height()
#         ax.text(p.get_x()+p.get_width()/2.,height + 3,'{:1.1f} %'.format((height/total)*100), ha="center",
#                bbox=dict(facecolor='none', edgecolor='black', boxstyle='round', linewidth=0.5))

#     ax.set_title(title, fontsize=16, y=1.05)
#     sns.despine(right=True)
#     sns.despine(offset=5, trim=True)

In [None]:
# count_plot(data, 'group', 'both genders - gender', hue=False)

In [None]:
# Number of rows for each (Gender, group) combination.
rows_by_gender_and_group = data.groupby(['Gender', 'group'])
rows_by_gender_and_group['group'].count()

In [None]:
# data.groupby(['Periventricular_MRI', 'Infratentorial_MRI', 'Cortical_MRI', 'Spinal_Cord_MRI'])['group']\
#     .value_counts(normalize=True).mul(100).to_frame(name='Percent within group (%)')\
#     .style.background_gradient(cmap='Blues')

In [None]:
# Share of each gender code in the dataset (0 = male, 1 = female; anything
# else is counted as unknown).
total = len(data)
gender_column = data['Gender']

# Absolute counts per category
num_male = (gender_column == 0).sum()
num_female = (gender_column == 1).sum()
num_unknown = total - (num_male + num_female)

# Percentages of the total row count
percent_male, percent_female, percent_unknown = [
    (count / total) * 100 for count in (num_male, num_female, num_unknown)
]

In [None]:
# Report the gender shares computed in the previous cell, two decimals each.
gender_share_lines = (
    f"Porcentaje de hombres: {percent_male:.2f}%",
    f"Porcentaje de mujeres: {percent_female:.2f}%",
    f"Porcentaje de desconocido: {percent_unknown:.2f}%",
)
print(*gender_share_lines, sep="\n")

In [None]:
# Build the binary target column: CDMS -> 1, Non-CDMS -> 0.
#
# The encoding is spelled out explicitly and derived from the integer codes in
# `df` (the copy taken before any mapping) via the `group` lookup dictionary.
# The previous LabelEncoder + "1 - x" version depended on the encoder's label
# ordering and flipped the target every time the cell was re-run; this version
# gives the same result on every run.
data["group"] = df["group"].map(group).map({"CDMS": 1, "Non-CDMS": 0})
data

In [None]:
# Histograms (with a KDE overlay) showing the distribution of every int64/float64
# column of the dataset, one figure per column.
columnas = data.select_dtypes(include=['int64', 'float64']).columns

for columna in columnas:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data[columna], kde=True, color='blue', ax=ax)
    ax.set_title(f'Distribución de datos de {columna}')
    ax.set_xlabel(columna)
    ax.set_ylabel('Frecuencia')
    plt.show()

In [None]:
# Compute the correlation matrix of all columns.
matriz_de_correlacion = data.corr()

# Correlation of every other variable with 'group', strongest positive first.
correlations_with_group = matriz_de_correlacion['group'].drop('group').sort_values(ascending=False)

plt.figure(figsize=(10, 8))
# seaborn deprecated passing `palette` without `hue`; assigning the y variable
# to `hue` keeps one colour per bar. dodge=False keeps the bars at full width,
# and any legend seaborn adds for the redundant hue is removed below.
ax = sns.barplot(
    x=correlations_with_group.values,
    y=correlations_with_group.index,
    hue=correlations_with_group.index,
    palette='viridis',
    dodge=False,
)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title('Correlaciones de las variables con la columna group')
plt.xlabel('Correlación')
plt.ylabel('Variables')
plt.show()

In [None]:
# First model: Random Forest classifier (the estimator trained below; no logistic regression is fitted)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Split the frame into the target ('group') and the feature matrix (everything else).
target_column = 'group'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Stratified split into training (60%), validation (20%) and test (20%) sets:
# first hold out 40% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler is fitted on the training set only and
# then applied unchanged to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = [
    scaler.transform(split) for split in (X_train, X_val, X_test)
]

# Train the classifier (a Random Forest in this case) on the scaled training data.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted model on the validation set: accuracy, per-class report
# and confusion matrix.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Precisión en el conjunto de validación: {val_accuracy:.2f}')
print('Reporte de clasificación en el conjunto de validación:')
print(classification_report(y_val, y_val_pred))
print('Matriz de confusión en el conjunto de validación:')
print(confusion_matrix(y_val, y_val_pred))

# Evaluate the model on the held-out test set with the same three outputs.
# The test classification report is now actually printed under its header
# (previously the header was followed by the validation confusion matrix).
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Precisión en el conjunto de prueba: {test_accuracy:.2f}')
print('Reporte de clasificación en el conjunto de prueba:')
print(classification_report(y_test, y_test_pred))
print('Matriz de confusión en el conjunto de prueba:')
print(confusion_matrix(y_test, y_test_pred))

In [None]:
# 5-fold cross-validation of the same estimator on the full feature matrix X
# and target y: per-fold scores plus out-of-fold predictions.
cross_val_scores = cross_val_score(model, X, y, cv=5)
cross_val_predictions = cross_val_predict(model, X, y, cv=5)

mean_cv_score = cross_val_scores.mean()
print(f'Precisión media de validación cruzada: {mean_cv_score:.2f}')

print('Reporte de clasificación de validación cruzada:')
cv_report = classification_report(y, cross_val_predictions)
print(cv_report)

print('Matriz de confusión de validación cruzada:')
cv_confusion = confusion_matrix(y, cross_val_predictions)
print(cv_confusion)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Display the frame in its final, fully encoded state.
data