In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

In [2]:
df = pd.read_csv("../csv/df_subconjunto_cluster.csv")

In [3]:
df.head()

Unnamed: 0,estudios,experiencia,skills,tecnologias_aptitudes,vacaciones,beneficios,salario_medio,cluster
0,1,3.0,0,8,25.6,0,37800.0,0
1,1,2.0,0,8,23.2,0,30000.0,0
2,1,3.0,0,11,23.2,0,39900.0,0
3,1,5.0,0,11,25.6,0,41100.0,0
4,1,2.0,0,10,25.6,0,30000.0,0


In [4]:
df.shape

(5359, 8)

In [5]:
df["cluster"].value_counts()

cluster
1    4003
0    1356
Name: count, dtype: int64

In [6]:
df.describe()

Unnamed: 0,estudios,experiencia,skills,tecnologias_aptitudes,vacaciones,beneficios,salario_medio,cluster
count,5359.0,5359.0,5359.0,5359.0,5359.0,5359.0,5359.0,5359.0
mean,0.253032,2.353331,1.506438,3.690987,23.923381,1.277291,44419.338683,0.746968
std,0.43479,1.918978,2.937177,2.490605,1.552377,2.813009,12093.32588,0.43479
min,0.0,0.0,0.0,0.0,10.0,0.0,850.0,0.0
25%,0.0,0.0,0.0,2.0,23.2,0.0,37500.0,0.0
50%,0.0,3.0,0.0,3.0,23.8,0.0,42600.0,1.0
75%,1.0,3.0,0.0,5.0,24.2,0.0,50000.0,1.0
max,1.0,10.0,19.0,25.0,40.0,17.0,150000.0,1.0


In [7]:
X = df.drop(columns=["cluster"])
y = df["cluster"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))


Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       271
           1       1.00      1.00      1.00       801

    accuracy                           1.00      1072
   macro avg       1.00      1.00      1.00      1072
weighted avg       1.00      1.00      1.00      1072



In [10]:
#KNN
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("KNN:")
print(classification_report(y_test, y_pred_knn))

KNN:
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       271
           1       0.97      0.97      0.97       801

    accuracy                           0.96      1072
   macro avg       0.94      0.95      0.94      1072
weighted avg       0.96      0.96      0.96      1072



In [11]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

print("Naive Bayes:")
print(classification_report(y_test, y_pred_nb))

Naive Bayes:
              precision    recall  f1-score   support

           0       0.48      1.00      0.65       271
           1       1.00      0.63      0.77       801

    accuracy                           0.72      1072
   macro avg       0.74      0.81      0.71      1072
weighted avg       0.87      0.72      0.74      1072



In [12]:
#Regrasión 
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Regresión Logística:")
print(classification_report(y_test, y_pred_lr))

Regresión Logística:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       271
           1       1.00      1.00      1.00       801

    accuracy                           1.00      1072
   macro avg       1.00      1.00      1.00      1072
weighted avg       1.00      1.00      1.00      1072



In [13]:
# Guardar modelo
joblib.dump(lr, 'modelo.pkl')

# Guardar escalador
joblib.dump(scaler, 'escalador.pkl')

['escalador.pkl']