In [46]:
# Mount Google Drive at /content/drive (Google Colab only) so that files stored
# in Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


>

# Carga de Librerías

In [1]:
# Data manipulation and numerical computing
import pandas as pd
import numpy as np
# NOTE(review): the next line only reads the option and discards the value, so
# it has no effect; assign to it (or call pd.set_option) if a column display
# limit was intended.
pd.options.display.max_columns
import scipy.stats as stats

# VIsualización de datos
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

#Manejo de avisos de Warning
import warnings
warnings.filterwarnings('ignore')


# Prepocesamiento
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


#Machine Learning
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss
from sklearn.model_selection import GridSearchCV

>

>

# Carga de datos transformados

In [2]:
# Read the already one-hot-encoded dataset from the mounted Drive folder.
# NOTE(review): the absolute Colab path ties this cell to one Drive layout.
datos_ml = pd.read_csv("/content/drive/MyDrive/EAN/proyecto VIC/df_encoded40_transformados.csv")

In [3]:
# Preview the first three rows of the loaded frame.
datos_ml.head(3)

Unnamed: 0,ano_del_hecho_2016,ano_del_hecho_2017,ano_del_hecho_2018,ano_del_hecho_2019,ano_del_hecho_2020,ano_del_hecho_2021,ano_del_hecho_2022,ano_del_hecho_2023,sexo_de_la_victima_Mujer,grupo_mayor_menor_de_edad_b) Mayores de Edad (>18 años),...,presunto_agresor_Padrastro,presunto_agresor_Padre,presunto_agresor_Pareja o Expareja,presunto_agresor_Primo (a),presunto_agresor_Profesor (a),presunto_agresor_Sin información,presunto_agresor_Sobrino (a),presunto_agresor_Suegro (a),presunto_agresor_Tío (a),presunto_agresor_Yerno
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Se realiza la carga total de los datos ya transformados con One-Hot Encoding; están listos para ser trabajados con los modelos de Machine Learning que se seleccionen.

In [18]:
# (rows, columns) of the full dataset.
datos_ml.shape

(246282, 1298)

>

>

>

# Subconjunto de datos con solo el 20%

Después de varios intentos por trabajar con todos los datos o con parciales un poco más grandes, se tomó la decisión de trabajar con un 20% de todos los datos, el cual ha probado ser factible para avanzar con el proyecto sin interrupciones por falta de potencia computacional.

In [4]:
# Keep a 20% random sample of the rows; random_state=42 makes the draw repeatable.
datos_ml2 = datos_ml.sample(frac=0.2, random_state=42)
# (rows, columns) of the sampled frame.
datos_ml2.shape

(49256, 1298)

>

# Preparación de datos para algoritmos de Machine Learning

Se decidió avanzar con un modelo de clasificación que nos prediga quién es el presunto agresor en los casos de violencia intrafamiliar. Esto es particularmente útil en aquellos casos en los que se asiste a instituciones de salud por algún tipo de herida pero no se presenta una denuncia contra el agresor. Usaremos los datos históricos que tenemos, empleando las distintas características como predictores.

>

In [5]:
# List the column names of the sampled frame.
datos_ml2.columns

Index(['ano_del_hecho_2016', 'ano_del_hecho_2017', 'ano_del_hecho_2018',
       'ano_del_hecho_2019', 'ano_del_hecho_2020', 'ano_del_hecho_2021',
       'ano_del_hecho_2022', 'ano_del_hecho_2023', 'sexo_de_la_victima_Mujer',
       'grupo_mayor_menor_de_edad_b) Mayores de Edad (>18 años)',
       ...
       'presunto_agresor_Padrastro', 'presunto_agresor_Padre',
       'presunto_agresor_Pareja o Expareja', 'presunto_agresor_Primo (a)',
       'presunto_agresor_Profesor (a)', 'presunto_agresor_Sin información',
       'presunto_agresor_Sobrino (a)', 'presunto_agresor_Suegro (a)',
       'presunto_agresor_Tío (a)', 'presunto_agresor_Yerno'],
      dtype='object', length=1298)

## Separación de etiquetas de la variable dependiente "presunto_agresor"

In [6]:
# Target columns: every indicator whose name begins with 'presunto_agresor'.
# Uses a vectorised prefix test on the column Index (column labels are strings
# because they come from the CSV header).
lista_etiquetas = datos_ml2.columns[
    datos_ml2.columns.str.startswith('presunto_agresor')
].tolist()

['presunto_agresor_Amante',
 'presunto_agresor_Compañero (a) permanente',
 'presunto_agresor_Cuñado (a)',
 'presunto_agresor_Encargado del cuidado',
 'presunto_agresor_Esposo (a)',
 'presunto_agresor_Ex Compañero (a) sentimental',
 'presunto_agresor_Ex amante',
 'presunto_agresor_Ex compañero (a) permanente',
 'presunto_agresor_Ex esposo (a)',
 'presunto_agresor_Ex novio (a)',
 'presunto_agresor_Hermanastro (a)',
 'presunto_agresor_Hermano (a)',
 'presunto_agresor_Hijastro (a)',
 'presunto_agresor_Hijo (a)',
 'presunto_agresor_Madrastra',
 'presunto_agresor_Madre',
 'presunto_agresor_Nieto (a)',
 'presunto_agresor_Novio (a)',
 'presunto_agresor_Nuera',
 'presunto_agresor_Otros familiares civiles o consanguíneos',
 'presunto_agresor_Padrastro',
 'presunto_agresor_Padre',
 'presunto_agresor_Pareja o Expareja',
 'presunto_agresor_Primo (a)',
 'presunto_agresor_Profesor (a)',
 'presunto_agresor_Sin información',
 'presunto_agresor_Sobrino (a)',
 'presunto_agresor_Suegro (a)',
 'presunto_ag

>

### Variable dependiente

In [7]:
# Target matrix: only the 'presunto_agresor*' indicator columns collected in
# lista_etiquetas, one column per label.
y = datos_ml2.loc[:, lista_etiquetas]
# Show a single row as a sanity check.
y.head(1)

Unnamed: 0,presunto_agresor_Amante,presunto_agresor_Compañero (a) permanente,presunto_agresor_Cuñado (a),presunto_agresor_Encargado del cuidado,presunto_agresor_Esposo (a),presunto_agresor_Ex Compañero (a) sentimental,presunto_agresor_Ex amante,presunto_agresor_Ex compañero (a) permanente,presunto_agresor_Ex esposo (a),presunto_agresor_Ex novio (a),...,presunto_agresor_Padrastro,presunto_agresor_Padre,presunto_agresor_Pareja o Expareja,presunto_agresor_Primo (a),presunto_agresor_Profesor (a),presunto_agresor_Sin información,presunto_agresor_Sobrino (a),presunto_agresor_Suegro (a),presunto_agresor_Tío (a),presunto_agresor_Yerno
234069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


>

### Variables Independientes

In [8]:
# Feature matrix: every column that is not one of the target indicators.
# NOTE(review): the 'sexo_del_presunto_agresor_*' columns stay in X; they
# describe the aggressor, so presumably they would be unknown in the stated
# use case (no report filed) — confirm whether they should be predictors.
X = datos_ml2.drop(columns=lista_etiquetas)
# Show a single row as a sanity check.
X.head(1)

Unnamed: 0,ano_del_hecho_2016,ano_del_hecho_2017,ano_del_hecho_2018,ano_del_hecho_2019,ano_del_hecho_2020,ano_del_hecho_2021,ano_del_hecho_2022,ano_del_hecho_2023,sexo_de_la_victima_Mujer,grupo_mayor_menor_de_edad_b) Mayores de Edad (>18 años),...,diagnostico_topografico_de_la_lesion_Trauma de abdomen,diagnostico_topografico_de_la_lesion_Trauma de cuello,diagnostico_topografico_de_la_lesion_Trauma de miembros,diagnostico_topografico_de_la_lesion_Trauma de tórax,diagnostico_topografico_de_la_lesion_Trauma facial,diagnostico_topografico_de_la_lesion_Trauma área Pélvica,sexo_del_presunto_agresor_Mujer,sexo_del_presunto_agresor_No binario,sexo_del_presunto_agresor_Sin información,sexo_del_presunto_agresor_Transgenero
234069,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


>

>

# División de datos para entrenamiento y testeo

In [19]:
# Hold out 30% of the sampled rows for testing; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the shape of each partition.
for nombre, particion in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Forma de {nombre}:", particion.shape)

Forma de X_train: (34479, 1268)
Forma de X_test: (14777, 1268)
Forma de y_train: (34479, 30)
Forma de y_test: (14777, 30)


>

>

# Selección de modelos de Machine Learning

Al tener un problema de predicción multietiqueta, decidimos trabajar
con el meta-estimador "MultiOutputClassifier" como base para modelos de clasificación ya conocidos; para ello elegimos probar los tres modelos siguientes:



>

>

# Regresión Logística

Ideal para problemas de clasificación: es versátil y computacionalmente eficiente. Teniendo en cuenta que hemos tenido considerables problemas de rendimiento computacional, resulta ideal para iniciar y probar.

In [12]:
# ------------------------- Start of code section generated with AI Gemini -------------------------

# Label columns that are constant in the training split offer nothing to learn
# (and some estimators refuse to fit a single-class target), so they are left
# out of both splits. The filtered targets get their own names instead of
# overwriting y_train / y_test, so re-running this cell never changes its inputs.
cols_to_drop = [col for col in y_train.columns if y_train[col].nunique(dropna=False) <= 1]
y_train_valid = y_train.drop(columns=cols_to_drop)
y_test_valid = y_test.drop(columns=cols_to_drop)
print(f"Dropped columns: {cols_to_drop}")
print("Shape of y_train after dropping columns:", y_train_valid.shape)
print("Shape of y_test after dropping columns:", y_test_valid.shape)

# ------------------------- End of code section generated with AI Gemini -------------------------

# Model: one L1-regularised logistic regression per label column.
# random_state pins the liblinear solver's data shuffling so reruns give the
# same coefficients and metrics.
# NOTE(review): the labels come from one-hot encoding a single column, so
# presumably each row has exactly one positive label; a single multiclass
# classifier may fit the problem better — confirm.
model = MultiOutputClassifier(
    LogisticRegression(C=0.1, penalty="l1", solver="liblinear", random_state=42)
)

# Fit on the training split and predict the held-out rows.
model.fit(X_train, y_train_valid)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)  # per-label class probabilities; not used by the metrics below

# With a multi-label indicator target, accuracy_score is the exact-match
# (subset) accuracy: a row counts only if every label is predicted correctly.
accuracy = accuracy_score(y_test_valid, y_pred)

# Micro averaging pools true/false positives over all labels before computing
# each score ('macro' and 'weighted' are the per-label alternatives).
precision = precision_score(y_test_valid, y_pred, average='micro')
recall = recall_score(y_test_valid, y_pred, average='micro')
f1 = f1_score(y_test_valid, y_pred, average='micro')
hamming = hamming_loss(y_test_valid, y_pred)

print(f'Exactitud: {accuracy:.2f}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Hamming Loss: {hamming}")

Dropped columns: []
Shape of y_train after dropping columns: (34479, 29)
Shape of y_test after dropping columns: (14777, 29)
Exactitud: 0.41
Precision: 0.6816088079991012
Recall: 0.4123003737682637
F1-score: 0.5138042005420054
Hamming Loss: 0.0267937358383135


# RandomForestClassifier

Ideal para encontrar relaciones no lineales, lo que lo hace un modelo potente para capturar relaciones que se ajustan más al comportamiento del mundo real, sin suposiciones lineales.

In [15]:
# ------------------------- Start of code section generated with AI Gemini -------------------------

# Label columns that are constant in the training split offer nothing to learn,
# so they are left out of both splits. The filtered targets get their own names
# instead of overwriting y_train / y_test, so re-running this cell never
# changes its inputs.
cols_to_drop = [col for col in y_train.columns if y_train[col].nunique(dropna=False) <= 1]
y_train_valid = y_train.drop(columns=cols_to_drop)
y_test_valid = y_test.drop(columns=cols_to_drop)
print(f"Dropped columns: {cols_to_drop}")
print("Shape of y_train after dropping columns:", y_train_valid.shape)
print("Shape of y_test after dropping columns:", y_test_valid.shape)

# ------------------------- End of code section generated with AI Gemini -------------------------

# Model: one small random forest (10 trees, unlimited depth) per label column.
# random_state fixes the bootstrap samples and feature draws; without it every
# run produces different trees and therefore different metrics.
model = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=42)
)

# Fit on the training split and predict the held-out rows.
model.fit(X_train, y_train_valid)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)  # per-label class probabilities; not used by the metrics below

# With a multi-label indicator target, accuracy_score is the exact-match
# (subset) accuracy: a row counts only if every label is predicted correctly.
accuracy = accuracy_score(y_test_valid, y_pred)

# Micro averaging pools true/false positives over all labels before computing
# each score ('macro' and 'weighted' are the per-label alternatives).
precision = precision_score(y_test_valid, y_pred, average='micro')
recall = recall_score(y_test_valid, y_pred, average='micro')
f1 = f1_score(y_test_valid, y_pred, average='micro')
hamming = hamming_loss(y_test_valid, y_pred)

print(f'Exactitud: {accuracy:.2f}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Hamming Loss: {hamming}")

Dropped columns: []
Shape of y_train after dropping columns: (34479, 29)
Shape of y_test after dropping columns: (14777, 29)
Exactitud: 0.26
Precision: 0.6594603653427182
Recall: 0.2674142031940197
F1-score: 0.3805241272604197
Hamming Loss: 0.029897347462155773


>

>

# GradientBoostingClassifier

Al igual que el RandomForestClassifier, los modelos de boosting pueden capturar relaciones no lineales y complejas entre características y etiquetas. Al tener que trabajar con menos datos de los esperados, este modelo nos podría ofrecer una mayor captura de relaciones entre las variables.

In [16]:
# ------------------------- Start of code section generated with AI Gemini -------------------------

# Label columns that are constant in the training split offer nothing to learn
# (and some estimators refuse to fit a single-class target), so they are left
# out of both splits. The filtered targets get their own names instead of
# overwriting y_train / y_test, so re-running this cell never changes its inputs.
cols_to_drop = [col for col in y_train.columns if y_train[col].nunique(dropna=False) <= 1]
y_train_valid = y_train.drop(columns=cols_to_drop)
y_test_valid = y_test.drop(columns=cols_to_drop)
print(f"Dropped columns: {cols_to_drop}")
print("Shape of y_train after dropping columns:", y_train_valid.shape)
print("Shape of y_test after dropping columns:", y_test_valid.shape)

# ------------------------- End of code section generated with AI Gemini -------------------------

# Model: one gradient-boosted ensemble (10 stages, depth-3 trees) per label column.
# random_state makes the tree construction repeatable between runs.
model = MultiOutputClassifier(
    GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, max_depth=3, random_state=42)
)

# Fit on the training split and predict the held-out rows.
model.fit(X_train, y_train_valid)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)  # per-label class probabilities; not used by the metrics below

# With a multi-label indicator target, accuracy_score is the exact-match
# (subset) accuracy: a row counts only if every label is predicted correctly.
accuracy = accuracy_score(y_test_valid, y_pred)

# Micro averaging pools true/false positives over all labels before computing
# each score ('macro' and 'weighted' are the per-label alternatives).
precision = precision_score(y_test_valid, y_pred, average='micro')
recall = recall_score(y_test_valid, y_pred, average='micro')
f1 = f1_score(y_test_valid, y_pred, average='micro')
hamming = hamming_loss(y_test_valid, y_pred)

print(f'Exactitud: {accuracy:.2f}')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Hamming Loss: {hamming}")

Dropped columns: []
Shape of y_train after dropping columns: (34479, 29)
Shape of y_test after dropping columns: (14777, 29)
Exactitud: 0.20
Precision: 0.7292455210698966
Recall: 0.1963982330954808
F1-score: 0.30945497376592784
Hamming Loss: 0.03009803212354708


>

>

# Resultados Modelos

---------------------------------------------------------

**Regresión Logística** (Mejor Modelo)

- Exactitud: 0.41
- Precision: 0.6816088079991012
- Recall: 0.4123003737682637
- F1-score: 0.5138042005420054
- Hamming Loss: 0.0267937358383135

---------------------------------------------------------

**RandomForestClassifier**

- Exactitud: 0.26
- Precision: 0.6594603653427182
- Recall: 0.2674142031940197
- F1-score: 0.3805241272604197
- Hamming Loss: 0.029897347462155773

---------------------------------------------------------

**GradientBoostingClassifier**

- Exactitud: 0.20
- Precision: 0.7292455210698966
- Recall: 0.1963982330954808
- F1-score: 0.30945497376592784
- Hamming Loss: 0.03009803212354708

---------------------------------------------------------