### 1. Load the data

In [18]:
import pandas as pd

column_types = {
    'departamento': 'category',
    'municipio': 'category',
    'sexo': 'category',
    'año_registrado': 'category', 
    'edad': 'int64',  
    'periodo': 'category',
    'etnia': 'category',
    'escolaridad': 'category',
    'ocupacion': 'category',
    'causa': 'category',
    'asistencia': 'category',
    'lugar': 'category'
}

defunciones = pd.read_csv('defunciones_clean.csv', dtype=column_types)
defunciones.head()


Unnamed: 0,departamento,municipio,sexo,año_registrado,edad,periodo,etnia,escolaridad,ocupacion,causa,asistencia,lugar
0,17.0,1703,1.0,2012,28,3.0,,1.0,,M329,5.0,6.0
1,1.0,101,2.0,2012,88,3.0,,2.0,,E142,1.0,6.0
2,1.0,101,2.0,2012,74,3.0,,2.0,,E039,1.0,1.0
3,1.0,101,2.0,2012,43,3.0,,2.0,,E149,1.0,6.0
4,1.0,101,2.0,2012,88,3.0,,2.0,,E119,1.0,6.0


### 2. Preprocessing
1. Simplify ICD10 Code
2. Use age_groups instead of age
3. Handle Missing Values

In [19]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

def load_codes(file_path):
    with open(file_path, 'r') as f:
        codes = json.load(f)
    return {entry['code']: entry['code'] for entry in codes if entry['level'] == 0}

def simplify_icd10(code, code_map):
    if pd.isna(code):
        return 'Unknown'
    code = code.split('.')[0]  
    code_prefix = code[:3]  
    for code_range in code_map:
        if '-' not in code_range:
            if code_range[:3] == code_prefix:
                return code_map[code_range]
        else: 
            start, end = code_range.split('-')
            start_prefix = start[:3]
            end_prefix = end[:3]
            
            # For example, if the code is 'B014', the prefix is 'B01' and the range is 'B00-B99', so it's a match.
            if start_prefix <= code_prefix <= end_prefix:
                return code_map[code_range]
    return 'Other'

def preprocess_data(defunciones, code_map):
    defunciones = defunciones.dropna()
    defunciones['causa_simplificada'] = defunciones['causa'].apply(lambda x: simplify_icd10(x, code_map))
    
    print('Numero de causas antes de simplificar:', defunciones['causa'].nunique())
    print('Numero de causas despues de simplificar:', defunciones['causa_simplificada'].nunique())
    
    data = defunciones.copy()
    data = data.drop(columns=['causa'])
    data = data[data['causa_simplificada'] != 'Other']
    data = data.rename(columns={'causa_simplificada': 'causa'})
    
    mode = data['ocupacion'].mode()[0]
    data['ocupacion'] = data['ocupacion'].replace(['NEOG', 'IGNORADO'], mode)
    
    data['age_group'] = pd.cut(data['edad'], bins=[0, 18, 35, 50, 65, float('inf')],
                                        labels=['0-18', '19-35', '36-50', '51-65', '65+'])
    
    data.drop(columns=['edad'], inplace=True)
    
    return data

def analyze_causa_counts(simple_causes):
    causa_counts = simple_causes['causa'].value_counts()
    
    plt.boxplot(causa_counts)
    plt.show()
    
    causa_counts_df = simple_causes['causa'].value_counts().reset_index()
    causa_counts_df.columns = ['causa', 'count']
    
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='causa', y='count', data=causa_counts_df)
    plt.xticks(rotation=90)
    plt.show()
    
    plt.hist(causa_counts, bins=20)
    plt.show()
    
    causa_counts = causa_counts[causa_counts < 10000]
    
    return causa_counts
  
icd10_codes = load_codes('codes.json')
causes = preprocess_data(defunciones, icd10_codes)
# causa_counts = analyze_causa_counts(causes)
# print(causes['age_group'].value_counts())

causes.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  defunciones['causa_simplificada'] = defunciones['causa'].apply(lambda x: simplify_icd10(x, code_map))
  data['ocupacion'] = data['ocupacion'].replace(['NEOG', 'IGNORADO'], mode)


Numero de causas antes de simplificar: 3007
Numero de causas despues de simplificar: 20


Unnamed: 0,departamento,municipio,sexo,año_registrado,periodo,etnia,escolaridad,ocupacion,asistencia,lugar,causa,age_group
70905,14.0,1415,1.0,2013,3.0,1.0,1.0,61,5.0,6.0,K00-K95,65+
70906,14.0,1411,1.0,2013,3.0,1.0,1.0,61,5.0,6.0,K00-K95,65+
70907,16.0,1601,1.0,2013,3.0,1.0,9.0,92,5.0,6.0,K00-K95,65+
70908,1.0,116,1.0,2013,3.0,9.0,1.0,61,5.0,6.0,K00-K95,65+
70909,12.0,1219,1.0,2013,3.0,4.0,1.0,92,5.0,6.0,K00-K95,65+


In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, pair_confusion_matrix
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
data = causes.copy()
data = data.sample(frac=0.1, random_state=42)

# Separate features and target
X = data.drop('causa', axis=1)
X = X.drop('edad', axis=1) # drop the age column, we will use the age_group column
y = data['causa']

# Apply Random Oversampling
X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(X, y)

# Use all available features
selected_features = X.columns.tolist()


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split( X_resampled, y_resampled, test_size=0.2, random_state=42)

# Create the preprocessing pipeline
# numeric_features = ['edad']
categorical_features = [feat for feat in selected_features if feat != 'edad' ]

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        # ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# # Create the pipeline
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                            ('model',  LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='saga'))])

# # Fit the model
# pipeline.fit(X_train, y_train)

# # Make predictions
# y_pred = pipeline.predict(X_test)

# # Calculate and print the confusion matrix and accuracy
# print("Confusion Matrix:")
# cm = confusion_matrix(y_test, y_pred)

# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
# plt.title(f"Confusion Matrix - Model 1")
# plt.xlabel('Predicted Labels')
# plt.ylabel('True Labels')
# plt.tight_layout()
# plt.show()
    
# plt.show()
# print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")

pipeline2 = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model',  DecisionTreeClassifier(random_state=42))])

# Fit the model
pipeline2.fit(X_train, y_train)

# Make predictions
y_pred = pipeline2.predict(X_test)

# Calculate and print the confusion matrix and accuracy
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.title(f"Confusion Matrix - Model 2")

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")

# show the tree


# parameters = [{'model__C': 1, 'model__l1_ratio': 0.1, 'model__penalty': 'l1', 'model__solver': 'saga'},
#               {'model__C': 1.0, 'model__penalty': 'l2'},
#               {'model__C': 10.0, 'model__penalty': 'l2'}]

# for param in parameters:
#     # Set the parameters
#     pipeline.set_params(**param)
    
  

# # Define the hyperparameter grid for GridSearchCV
# param_grid = {
#     'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'model__penalty': ['l1', 'l2', 'elasticnet'],
#     'model__C': [0.1, 1, 10],
#     'model__l1_ratio': [0.1, 0.5, 0.9]
# }

# # Perform grid search cross-validation
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Get the best model and accuracy
# best_model = grid_search.best_estimator_
# best_score = grid_search.best_score_
# best_params = grid_search.best_params_

# # Make predictions on the test set using the best model
# y_pred = best_model.predict(X_test)

# # Evaluate the model
# cm = confusion_matrix(y_test, y_pred)
# accuracy = accuracy_score(y_test, y_pred)

# print("Best Hyperparameters:", best_params)
# print("Best Accuracy:", best_score)
# print("Confusion Matrix:")
# print(cm)
# print("Test Accuracy:", accuracy)

NameError: name 'causes' is not defined

Best Hyperparameters: {'model__C': 1, 'model__l1_ratio': 0.1, 'model__penalty': 'l1', 'model__solver': 'saga'}

SVM

In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC  # Importa la clase SVC para SVM
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler

# Carga el conjunto de datos
data = causes.copy()
data = data.sample(frac=0.1, random_state=42)

# Separa características y variable objetivo
X = data.drop('causa', axis=1)
X = X.drop('edad', axis=1)  # Elimina la columna de edad, usaremos la columna de grupo de edad
y = data['causa']

# Aplica Sobremuestreo Aleatorio
X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(X, y)

# Utiliza todas las características disponibles
selected_features = X.columns.tolist()

# Divide los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Crea el preprocesador
categorical_features = [feat for feat in selected_features if feat != 'edad']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

# Crea el pipeline con SVM
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', SVC())])

# Define el grid de hiperparámetros para GridSearchCV
param_grid = {
    'model__C': [0.1, 1],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale']
}

# Realiza la búsqueda en grid con validación cruzada
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Obtiene el mejor modelo y sus parámetros
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_

# Realiza predicciones en el conjunto de prueba usando el mejor modelo
y_pred = best_model.predict(X_test)

# Evalúa el modelo
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Imprime los resultados
print("Mejores Hiperparámetros:", best_params)
print("Mejor Precisión:", best_score)
print("Matriz de Confusión:")
print(cm)
print("Precisión en el Conjunto de Prueba:", accuracy)


NameError: name 'causes' is not defined