# Clasificación por Random Forest

## Importar las librerías

In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import RandomOverSampler

## Importar el conjunto de datos

In [23]:
data = pd.read_csv('data.csv')

data.head()

Unnamed: 0,Área_Geográfica,Asistencia_Recibida,Clase_Parto,Edad_Madre,Estado_Civil_Madre,Escolaridad_Madre,Ocupación_Madre,Semanas_Gestacion,Sexo,Sitio_Ocurrencia,Tipo_Atencion,Total_Hijos_Nacidos,Total_Hijos_Nacidos_Muertos,Total_Hijos_Vivos,Via_Parto,Nacionalidad_Madre,Grupo_Etnico_Madre,Causa_Defunción
0,99,6,1,18.0,2,99,99,34.416349,1,2,1,1.269311,2.0,1.0,99,1,4,0
1,99,1,1,36.0,2,99,1,34.416349,1,9,1,1.269311,3.0,2.0,99,1,4,0
2,99,2,1,22.0,2,99,99,34.416349,1,2,1,1.269311,3.0,2.0,99,1,4,0
3,99,2,1,26.0,2,99,99,34.416349,1,2,1,1.269311,0.0,0.0,99,1,4,0
4,99,2,1,43.0,2,99,99,34.416349,2,2,1,1.269311,10.0,6.0,99,1,4,0


In [24]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


In [26]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [27]:
param_grid = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

classifier = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)


In [28]:
best_classifier = grid_search.best_estimator_
y_pred = best_classifier.predict(X_test)
confusion_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Best parameters:", grid_search.best_params_)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)

Best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Confusion Matrix:
 [[8712   75    2]
 [  16 8862    0]
 [ 825    7    3]]
Accuracy: 0.9500054048211004


In [29]:
importance_scores = best_classifier.feature_importances_
sorted_indices = importance_scores.argsort()[::-1]

top_five_indices = sorted_indices[:5]
top_five_variables = data.columns[top_five_indices]

print("Top five indices:\n", sorted_indices)
print("Top five variables:\n", top_five_variables)

Top five indices:
 [ 3  7 13 16  5 12  9 11  8  4  1 14  0  2  6 10 15]
Top five variables:
 Index(['Edad_Madre', 'Semanas_Gestacion', 'Total_Hijos_Vivos',
       'Grupo_Etnico_Madre', 'Escolaridad_Madre'],
      dtype='object')
