In [1]:
# %pip install optuna
# %pip install xgboost 

import optuna
import pandas as pd
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("dataset.csv")
df = df.dropna()

X, y = df[["Time", "Current education expenditure, total", "Current health expenditure", "Domestic general government health expenditure", "Unemployment, youth total   ", "Unemployment with basic education", "Unemployment with intermediate education", "Unemployment, total   ", "Unemployment, youth total   ", "Wage and salaried workers, total"]], df["Vulnerable employment, total   "]

y = y.apply(lambda x: 1 if x > y.mean() else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [3]:
rf_clf = RandomForestClassifier(n_estimators=170, random_state=42)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.67


In [4]:
boston = fetch_openml(name="boston", version=1, as_frame=True)
X, y = boston.data, boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

y_test = y_test.apply(lambda x: 1 if x > y.mean() else 0)
y_train = y_train.apply(lambda x: 1 if x > y.mean() else 0)

In [5]:
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

y_pred = rf_reg.predict(X_test)
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")

MSE: 0.07


In [6]:
log_clf = LogisticRegression(random_state=42, solver="liblinear")
svm_clf = SVC(probability=True, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svm_clf), ('rf', rf_clf)],
    voting='soft'
)

X_test = X_test.astype(float)

voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
print(f"VotingClassifier Accuracy: {accuracy_score(y_test, y_pred):.2f}")

VotingClassifier Accuracy: 0.91


In [7]:
estimators = [
    ('lr', LogisticRegression(random_state=42, solver="liblinear")),
    ('svc', SVC(probability=True, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)

X_train = X_train.astype(float)

stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)
print(f"StackingClassifier Accuracy: {accuracy_score(y_test, y_pred):.2f}")

StackingClassifier Accuracy: 0.95


In [8]:
xgb_clf = XGBClassifier(n_estimators=100, random_state=42)
xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred):.2f}")

XGBoost Accuracy: 0.92


In [9]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print(f"Mejores hiperparámetros: {study.best_params}")

[I 2025-03-19 18:53:48,497] A new study created in memory with name: no-name-e1b996b0-5532-4adf-a230-c02aaba5d6db
[I 2025-03-19 18:53:48,523] Trial 0 finished with value: 0.8947368421052632 and parameters: {'n_estimators': 13, 'max_depth': 10}. Best is trial 0 with value: 0.8947368421052632.
[I 2025-03-19 18:53:48,652] Trial 1 finished with value: 0.9342105263157895 and parameters: {'n_estimators': 85, 'max_depth': 10}. Best is trial 1 with value: 0.9342105263157895.
[I 2025-03-19 18:53:48,842] Trial 2 finished with value: 0.8947368421052632 and parameters: {'n_estimators': 155, 'max_depth': 2}. Best is trial 1 with value: 0.9342105263157895.
[I 2025-03-19 18:53:49,034] Trial 3 finished with value: 0.9473684210526315 and parameters: {'n_estimators': 137, 'max_depth': 9}. Best is trial 3 with value: 0.9473684210526315.
[I 2025-03-19 18:53:49,223] Trial 4 finished with value: 0.9407894736842105 and parameters: {'n_estimators': 135, 'max_depth': 23}. Best is trial 3 with value: 0.94736842

Mejores hiperparámetros: {'n_estimators': 137, 'max_depth': 9}
