In [None]:
import pandas as pd

# Load the dataset: a semicolon-delimited CSV parsed with the Python engine.
# The file name really does contain a space before ".csv".
df = pd.read_csv(
    "manufacturing_defect_dataset_OK .csv",
    sep=";",
    engine="python",
)

In [None]:
from sklearn.model_selection import train_test_split

# Target is the "DefectStatus" column; every other column is used as a feature.
y = df["DefectStatus"]
X = df.drop(columns="DefectStatus")

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
pip install xgboost


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train an XGBoost model with the default hyperparameters.
# The estimator is created in this cell so that it runs on a fresh kernel; only the
# seed and the evaluation metric are set, every other hyperparameter keeps its default.
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Predictions on the held-out test set, using the model fitted just above
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation
print("Evaluación - XGBoost")
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))


In [None]:
# Baseline model: default hyperparameters, with the ensemble limited to 50 trees.
# fit() returns the estimator itself, so construction and training are chained.
xgb_default = XGBClassifier(
    n_estimators=50,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
).fit(X_train, y_train)

# Class predictions for the held-out test set
y_pred = xgb_default.predict(X_test)

# Report: confusion matrix first, then the per-class metrics
print("Matriz de confusión:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nInforme de clasificación:", classification_report(y_test, y_pred), sep="\n")

In [None]:

from sklearn.metrics import make_scorer
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# Base estimator handed to the grid search.
xgb_modelo = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

# Search space: two candidate values for each of five hyperparameters (32 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 10],
    learning_rate=[0.01, 0.03],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.5, 1.0],
)

# Metric and resampling scheme: F1 with class 0 treated as the positive label,
# evaluated on 5 shuffled, stratified folds.
scorer = make_scorer(f1_score, pos_label=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, fitted on the training split only.
grid_search = GridSearchCV(
    xgb_modelo,
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Best hyperparameters and the corresponding cross-validated score
print("Mejores hiperparámetros:", grid_search.best_params_, sep="\n")
print("Mejor F1-score para clase 0:", grid_search.best_score_)

In [None]:
# roc_auc_score is imported here because no earlier cell imports it.
from sklearn.metrics import roc_auc_score

# Define the model with the best hyperparameters found.
# The values are hard-coded (all of them lie inside param_grid) rather than read
# from grid_search.best_params_.
best_xgb = XGBClassifier(
    colsample_bytree=0.5,
    learning_rate=0.03,
    max_depth=3,
    n_estimators=200,
    subsample=1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

best_xgb.fit(X_train, y_train)

# Predict on the test set: hard labels plus the probability column for class 1
y_pred = best_xgb.predict(X_test)
y_proba_bestxgb = best_xgb.predict_proba(X_test)[:, 1]

# Evaluate
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))

print("\nInforme de clasificación:")
print(classification_report(y_test, y_pred))

print("ROC-AUC:", roc_auc_score(y_test, y_proba_bestxgb))

In [None]:

# These names are imported here because no earlier cell brings them into scope.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve of the best_xgb probabilities on the test set; the dashed diagonal is
# the reference line of a random classifier.
fpr, tpr, _ = roc_curve(y_test, y_proba_bestxgb)
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f"ROC-AUC = {roc_auc_score(y_test, y_proba_bestxgb):.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("Tasa de falsos positivos")
plt.ylabel("Tasa de verdaderos positivos")
plt.title("Curva ROC - XGBoost")
plt.legend()
plt.grid()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


# Stratified 80/20 split (same arguments as the other split cells in this notebook).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Define and train the tuned XGBoost model
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=210,
    max_depth=4,
    learning_rate=0.066,
    subsample=0.827,
    colsample_bytree=0.546,
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Predicted probabilities (column 1 of predict_proba) for the test rows
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# ROC curve and the area under it, both computed from y_prob — the probabilities of
# the model trained in this cell.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f"ROC-AUC = {roc_auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')  # reference diagonal
plt.xlabel("Tasa de falsos positivos")
plt.ylabel("Tasa de verdaderos positivos")
plt.title("Curva ROC - XGBoost")
plt.legend()
plt.grid()
plt.show()


In [None]:
# seaborn is imported here because no earlier cell imports it; matplotlib is
# re-imported so the cell does not depend on the import in another cell.
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): y_pred is reassigned by several cells, so this matrix shows the
# predictions of whichever of them ran last — confirm it is the model named in the title.
cm = confusion_matrix(y_test, y_pred)
labels = ['No defectuoso', 'Defectuoso']

# Annotated heatmap: rows are the true classes, columns the predicted classes.
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicción')
plt.ylabel('Valor real')
plt.title('Matriz de confusión del modelo XGBoost (modelo ajustado)')
plt.tight_layout()
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd



# Stratified 80/20 split with the same arguments as the other split cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample with SMOTE using the training rows only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Default XGBoost fitted on the resampled training data (fit() returns the estimator).
model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
).fit(X_train_res, y_train_res)

# Evaluate on the original, non-resampled test split.
y_pred = model.predict(X_test)

print("Matriz de confusión:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nReporte de clasificación:", classification_report(y_test, y_pred), sep="\n")

from xgboost import plot_importance
import matplotlib.pyplot as plt



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Class weight for XGBoost. scale_pos_weight multiplies the weight of the positive
# class (label 1), and the value suggested by the XGBoost documentation is
# sum(negative instances) / sum(positive instances), i.e. count(0) / count(1).
# The inverse ratio would up-weight whichever class is already the larger one.
scale = (y_train == 0).sum() / (y_train == 1).sum()  # non-defective / defective

# Define and train the model with scale_pos_weight
xgb_weighted = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=scale
)

xgb_weighted.fit(X_train, y_train)

# Evaluation on the held-out test set
y_pred = xgb_weighted.predict(X_test)
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))

print("\nReporte de clasificación:")
print(classification_report(y_test, y_pred))



In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# 5-fold stratified, shuffled cross-validation of the F1 score (scoring='f1').
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): this call originally referenced `xgb_es`, which is not defined anywhere
# in the visible notebook. The class-weighted model from the preceding cell is used
# instead — confirm that this is the estimator meant to be evaluated.
# cross_val_score refits a clone of the estimator on each fold.
scores = cross_val_score(xgb_weighted, X, y, scoring='f1', cv=cv)

print("F1-score medio:", scores.mean())



In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Sampling distributions for the randomized search.
# scipy's uniform(loc, scale) draws from [loc, loc + scale]; randint(low, high)
# draws integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.2),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.5, 0.5)
}

# 20 sampled candidates, each scored with 5-fold CV on F1.
# random_state fixes the sampled candidates so that a re-run reproduces the same search.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit on the training split only
random_search.fit(X_train, y_train)
print("Mejores parámetros encontrados:")
print(random_search.best_params_)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# roc_auc_score is imported here because no earlier cell imports it.
from sklearn.metrics import roc_auc_score

# Model built from the best parameters found by RandomizedSearchCV.
# They are read from random_search.best_params_ so the model always matches the
# search that was actually run; hard-coding the same values as best_xgb would only
# re-train the grid-search model.
best_random_model = XGBClassifier(
    **random_search.best_params_,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

best_random_model.fit(X_train, y_train)

# Hard labels and the probability column for class 1 on the test set
y_pred = best_random_model.predict(X_test)
y_proba_xgb_best = best_random_model.predict_proba(X_test)[:, 1]

# Evaluation
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))
print("\nInforme de clasificación:")
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_xgb_best))

In [None]:
!pip install optuna


In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def objective(trial):
    """Optuna objective: mean 3-fold macro-F1 of an XGBoost model on the training split.

    The score is computed on ``X_train`` / ``y_train`` only, so the rows in
    ``X_test`` never influence which hyperparameters are selected.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ``f1_macro`` score over the 3 cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        # log=True samples the learning rate on a logarithmic scale
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Fixed seed so that two trials with the same parameters get the same score.
    model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='f1_macro').mean()
    return score

# Maximize the objective over 50 trials. The TPE sampler is seeded so that the
# sequence of suggested hyperparameters is the same on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50)

print("Mejores hiperparámetros:", study.best_params)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split



# roc_auc_score is imported here because no earlier cell imports it.
from sklearn.metrics import roc_auc_score

# Stratified 80/20 split (same arguments as the other split cells in this notebook).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train the model with the best parameters from Optuna (values hard-coded here
# rather than read from study.best_params).
best_xgb_optuna = XGBClassifier(
    n_estimators=133,
    max_depth=3,
    learning_rate=0.037,
    subsample=0.976,
    colsample_bytree=0.972,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

best_xgb_optuna.fit(X_train, y_train)

# Prediction: hard labels plus the probability column for class 1
y_pred = best_xgb_optuna.predict(X_test)
y_proba_best_xgb = best_xgb_optuna.predict_proba(X_test)[:, 1]

# Evaluation
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))

print("\nInforme de clasificación:")
print(classification_report(y_test, y_pred))

print("ROC-AUC:", roc_auc_score(y_test, y_proba_best_xgb))

In [None]:

import pandas as pd
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve


# Separate the rows by class label.
# NOTE(review): the variable names assume class 0 is the smaller class; resample()
# with replace=False cannot draw more rows than df_majority contains — confirm the counts.
df_minority = df[df['DefectStatus'] == 0]
df_majority = df[df['DefectStatus'] == 1]

# Downsample class 1, without replacement, to the number of class-0 rows
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=42)

# Balanced dataset
df_balanced = pd.concat([df_minority, df_majority_downsampled])
X_bal = df_balanced.drop('DefectStatus', axis=1)
y_bal = df_balanced['DefectStatus']

# Train/test split.
# NOTE(review): the split is taken from the balanced frame, so the test set is balanced
# as well and the metrics below are not measured on the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42)

# XGBoost model with tuned hyperparameters
model = XGBClassifier(
    n_estimators=133,
    max_depth=3,
    learning_rate=0.037,
    subsample=0.976,
    colsample_bytree=0.972,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

# Evaluate: hard labels plus the probability column for class 1
y_pred = model.predict(X_test)
y_proba_ajustado = model.predict_proba(X_test)[:, 1]

# Both summaries are computed once and stored, so the names conf_matrix and report
# exist after this cell has run.
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Matriz de confusión:")
print(conf_matrix)
print("\nInforme de clasificación:")
print(report)
print("ROC-AUC:", roc_auc_score(y_test, y_proba_ajustado))
