<a href="https://colab.research.google.com/github/AlejoGomezQ/Practica_04_Analitica_Datos/blob/main/ModelosPredictivos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelos predictivos - Registro de vendedores de Cartagena

## Objetivo del modelo

Desarrollar modelos de clasificación para predecir el ESTADO (ACTIVO/INACTIVO) de los vendedores registrados en Cartagena.

**Variable objetivo:** ESTADO

**Variables predictoras:** Etnia, Género, Localidad, Barrio, Tipo de oferta, Tipo de venta, Actividad de oferta

## 1. Importar librerías

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides scikit-learn diagnostics (for
# example a ConvergenceWarning from MLPClassifier, if one is raised) — consider
# narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

## 2. Cargar datos limpios

In [2]:
# Read the cleaned vendor registry (path is relative to the notebook's
# working directory) and print a short structural summary.
ruta_csv = 'Registro_vendedores_limpio.csv'
df = pd.read_csv(ruta_csv)

dimensiones = df.shape
nombres_columnas = list(df.columns)
print(f"Dimensiones: {dimensiones}")
print(f"\nColumnas: {nombres_columnas}")

# Last expression of the cell: rich preview of the first rows.
df.head()

Dimensiones: (3416, 8)

Columnas: ['Actividad_Oferta', 'Localidad', 'Barrios', 'Tipo_de_Venta', 'Tipo_de_Oferta', 'Genero', 'Etnia', 'Estado']


Unnamed: 0,Actividad_Oferta,Localidad,Barrios,Tipo_de_Venta,Tipo_de_Oferta,Genero,Etnia,Estado
0,ARTESANIAS,'LOCALIDAD 3','ALAMEDA LA VICTORIA',ESTACIONARIO,BIENES,FEMENINO,NINGUNA,ACTIVO
1,AGUA/REFRESCOS,'LOCALIDAD 1',BAZURTO,AMBULANTE,SERVICIOS,MASCULINO,NINGUNA,ACTIVO
2,AGUA/REFRESCOS,'LOCALIDAD 1',BAZURTO,ESTACIONARIO,BIENES,FEMENINO,NINGUNA,ACTIVO
3,AGUA/REFRESCOS,'LOCALIDAD 1',BAZURTO,AMBULANTE,SERVICIOS,FEMENINO,NINGUNA,ACTIVO
4,AGUA/REFRESCOS,'LOCALIDAD 1',BAZURTO,ESTACIONARIO,BIENES,FEMENINO,NINGUNA,ACTIVO


## 3. Preparación de datos

In [3]:
# Locate the target column, accepting any of the three casings listed below.
posibles_nombres_estado = ['ESTADO', 'Estado', 'estado']
columna_estado = next(
    (nombre for nombre in posibles_nombres_estado if nombre in df.columns),
    None,
)

# Without this guard a missing column would surface as an opaque
# `KeyError: None` on the indexing line below; fail early with the list of
# expected and available column names instead (same exception type).
if columna_estado is None:
    raise KeyError(
        f"No se encontró la columna objetivo; se esperaba una de "
        f"{posibles_nombres_estado} pero las columnas son {list(df.columns)}"
    )

# Target series and predictor frame (every remaining column is a predictor).
y = df[columna_estado]
X = df.drop(columns=[columna_estado])

print(f"Variable objetivo: {columna_estado}")
print(f"Distribución de clases:\n{y.value_counts()}")

Variable objetivo: Estado
Distribución de clases:
Estado
ACTIVO      2136
INACTIVO    1280
Name: count, dtype: int64


### 3.1 Codificación de variables categóricas

In [4]:
# Integer-encode every non-numeric predictor. One fitted LabelEncoder per
# column is kept in `le_X` so each mapping can be inspected or inverted later.
#
# NOTE(review): LabelEncoder assigns integer codes following the sorted order
# of the labels, so the codes carry an artificial ordering. Distance- and
# gradient-based models (KNN, SVM, MLP) will treat them as magnitudes —
# consider one-hot encoding for those models.
# NOTE(review): the encoders are fitted on the full dataset, before the
# train/test split performed later in the notebook.
le_X = {}
X_encoded = X.copy()

for col in X.columns:
    # Check "not numeric" rather than `dtype == 'object'`: text columns may
    # be stored with pandas string or categorical dtypes, which do not compare
    # equal to 'object' and would otherwise reach the models as raw strings.
    if not pd.api.types.is_numeric_dtype(X[col]):
        le_X[col] = LabelEncoder()
        # astype(str) makes missing values a regular label instead of an error.
        X_encoded[col] = le_X[col].fit_transform(X[col].astype(str))

# Encode the target labels as integers as well.
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

print("Codificación completada")
X_encoded.head()

Codificación completada


Unnamed: 0,Actividad_Oferta,Localidad,Barrios,Tipo_de_Venta,Tipo_de_Oferta,Genero,Etnia
0,38,2,0,1,0,0,4
1,37,0,13,0,1,1,4
2,37,0,13,1,0,0,4
3,37,0,13,0,1,0,4
4,37,0,13,1,0,0,4


### 3.2 División de datos (70% entrenamiento, 30% prueba)

In [5]:
# Hold out 30% of the rows for evaluation. The split is stratified on the
# encoded target so both partitions keep the same class proportions, and the
# fixed seed makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=42,
)

n_entrenamiento, n_prueba = X_train.shape[0], X_test.shape[0]
print(f"Entrenamiento: {n_entrenamiento} registros")
print(f"Prueba: {n_prueba} registros")

Entrenamiento: 2391 registros
Prueba: 1025 registros


## 4. Entrenamiento y evaluación de modelos

### 4.1 Árbol de decisión

In [6]:
# Depth-limited decision tree: fit on the training split, predict the
# held-out split.
modelo_arbol = DecisionTreeClassifier(max_depth=10, random_state=42)
modelo_arbol.fit(X_train, y_train)
y_pred_arbol = modelo_arbol.predict(X_test)

# 'weighted' averages the per-class scores weighted by class support.
exactitud_arbol = accuracy_score(y_test, y_pred_arbol)
precision_arbol = precision_score(y_test, y_pred_arbol, average='weighted')
recall_arbol = recall_score(y_test, y_pred_arbol, average='weighted')
f1_arbol = f1_score(y_test, y_pred_arbol, average='weighted')
matriz_arbol = confusion_matrix(y_test, y_pred_arbol)

print("ÁRBOL DE DECISIÓN")
print(f"Accuracy: {exactitud_arbol:.4f}")
print(f"Precision: {precision_arbol:.4f}")
print(f"Recall: {recall_arbol:.4f}")
print(f"F1-Score: {f1_arbol:.4f}")
print(f"\nMatriz de Confusión:\n{matriz_arbol}")

ÁRBOL DE DECISIÓN
Accuracy: 0.7561
Precision: 0.7689
Recall: 0.7561
F1-Score: 0.7592

Matriz de Confusión:
[[482 159]
 [ 91 293]]


### 4.2 K-Nearest Neighbors

In [7]:
# 5-nearest-neighbours classifier: fit on the training split, predict the
# held-out split.
# NOTE(review): the features are unscaled integer label codes, so neighbour
# distances are dominated by the columns with the most categories — consider
# one-hot encoding and/or scaling before KNN.
modelo_knn = KNeighborsClassifier(n_neighbors=5)
modelo_knn.fit(X_train, y_train)
y_pred_knn = modelo_knn.predict(X_test)

# 'weighted' averages the per-class scores weighted by class support.
exactitud_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn, average='weighted')
recall_knn = recall_score(y_test, y_pred_knn, average='weighted')
f1_knn = f1_score(y_test, y_pred_knn, average='weighted')
matriz_knn = confusion_matrix(y_test, y_pred_knn)

print("K-NEAREST NEIGHBORS")
print(f"Accuracy: {exactitud_knn:.4f}")
print(f"Precision: {precision_knn:.4f}")
print(f"Recall: {recall_knn:.4f}")
print(f"F1-Score: {f1_knn:.4f}")
print(f"\nMatriz de Confusión:\n{matriz_knn}")

K-NEAREST NEIGHBORS
Accuracy: 0.7522
Precision: 0.7622
Recall: 0.7522
F1-Score: 0.7550

Matriz de Confusión:
[[486 155]
 [ 99 285]]


### 4.3 Red neuronal

In [8]:
# Multi-layer perceptron with two hidden layers (100 and 50 units), capped at
# 500 training iterations and seeded for reproducibility.
# NOTE(review): inputs are unscaled integer label codes; MLPs usually train
# better on standardized / one-hot inputs. Warnings are silenced globally in
# the imports cell, so a non-convergence warning would not be shown here.
modelo_mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
)
modelo_mlp.fit(X_train, y_train)
y_pred_mlp = modelo_mlp.predict(X_test)

# 'weighted' averages the per-class scores weighted by class support.
exactitud_mlp = accuracy_score(y_test, y_pred_mlp)
precision_mlp = precision_score(y_test, y_pred_mlp, average='weighted')
recall_mlp = recall_score(y_test, y_pred_mlp, average='weighted')
f1_mlp = f1_score(y_test, y_pred_mlp, average='weighted')
matriz_mlp = confusion_matrix(y_test, y_pred_mlp)

print("RED NEURONAL (MLP)")
print(f"Accuracy: {exactitud_mlp:.4f}")
print(f"Precision: {precision_mlp:.4f}")
print(f"Recall: {recall_mlp:.4f}")
print(f"F1-Score: {f1_mlp:.4f}")
print(f"\nMatriz de Confusión:\n{matriz_mlp}")

RED NEURONAL (MLP)
Accuracy: 0.7434
Precision: 0.7484
Recall: 0.7434
F1-Score: 0.7452

Matriz de Confusión:
[[494 147]
 [116 268]]


### 4.4 Support Vector Machine

In [9]:
# RBF-kernel support vector classifier: fit on the training split, predict
# the held-out split.
# NOTE(review): the RBF kernel is distance-based and the features are
# unscaled integer label codes; this may explain why SVM scores lowest in the
# comparison — consider one-hot encoding and/or scaling before SVC.
modelo_svm = SVC(kernel='rbf', random_state=42)
modelo_svm.fit(X_train, y_train)
y_pred_svm = modelo_svm.predict(X_test)

# 'weighted' averages the per-class scores weighted by class support.
exactitud_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
matriz_svm = confusion_matrix(y_test, y_pred_svm)

print("SUPPORT VECTOR MACHINE")
print(f"Accuracy: {exactitud_svm:.4f}")
print(f"Precision: {precision_svm:.4f}")
print(f"Recall: {recall_svm:.4f}")
print(f"F1-Score: {f1_svm:.4f}")
print(f"\nMatriz de Confusión:\n{matriz_svm}")

SUPPORT VECTOR MACHINE
Accuracy: 0.6624
Precision: 0.6479
Recall: 0.6624
F1-Score: 0.6444

Matriz de Confusión:
[[531 110]
 [236 148]]


### 4.5 Random Forest

In [10]:
# Random forest of 100 depth-limited trees: fit on the training split,
# predict the held-out split.
modelo_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
modelo_rf.fit(X_train, y_train)
y_pred_rf = modelo_rf.predict(X_test)

# 'weighted' averages the per-class scores weighted by class support.
exactitud_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
matriz_rf = confusion_matrix(y_test, y_pred_rf)

print("RANDOM FOREST")
print(f"Accuracy: {exactitud_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"\nMatriz de Confusión:\n{matriz_rf}")

RANDOM FOREST
Accuracy: 0.7571
Precision: 0.7672
Recall: 0.7571
F1-Score: 0.7598

Matriz de Confusión:
[[488 153]
 [ 96 288]]


## 5. Comparación de modelos

In [11]:
# Side-by-side comparison of the five models on the held-out split.
# Test-set predictions keyed by display name; insertion order fixes the row
# order before sorting.
predicciones = {
    'Árbol de Decisión': y_pred_arbol,
    'KNN': y_pred_knn,
    'Red Neuronal': y_pred_mlp,
    'SVM': y_pred_svm,
    'Random Forest': y_pred_rf,
}

# Column name -> (metric function, extra keyword arguments).
# NOTE: support-weighted recall is mathematically equal to accuracy, so the
# 'Recall' column repeats the 'Accuracy' column.
metricas = {
    'Accuracy': (accuracy_score, {}),
    'Precision': (precision_score, {'average': 'weighted'}),
    'Recall': (recall_score, {'average': 'weighted'}),
    'F1-Score': (f1_score, {'average': 'weighted'}),
}

# Build the same column-oriented dict the table is created from.
resultados = {'Modelo': list(predicciones)}
for nombre_metrica, (funcion, opciones) in metricas.items():
    resultados[nombre_metrica] = [
        funcion(y_test, y_pred, **opciones) for y_pred in predicciones.values()
    ]

# Best accuracy first.
df_resultados = pd.DataFrame(resultados).sort_values('Accuracy', ascending=False)

print("COMPARACIÓN DE MODELOS")
print("="*70)
df_resultados

COMPARACIÓN DE MODELOS


Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-Score
4,Random Forest,0.757073,0.767225,0.757073,0.759813
0,Árbol de Decisión,0.756098,0.768899,0.756098,0.759186
1,KNN,0.752195,0.762195,0.752195,0.754956
2,Red Neuronal,0.743415,0.748376,0.743415,0.745213
3,SVM,0.662439,0.647852,0.662439,0.644418
