In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the already-split feature matrices from the processed-data folder.
X_train = pd.read_csv('../data/processed/X_train_ml.csv')
X_test = pd.read_csv('../data/processed/X_test_ml.csv')

# Collapse each label frame to a Series. Squeezing only the column axis keeps
# the result a Series even if a file holds a single row; a bare squeeze()
# would return a scalar in that case and break the metric calls further down.
# NOTE(review): assumes each label file has exactly one column — confirm.
y_train = pd.read_csv('../data/processed/y_train_ml.csv').squeeze("columns")
y_test = pd.read_csv('../data/processed/y_test_ml.csv').squeeze("columns")


In [None]:
# Baseline classifier: logistic regression with balanced class weights,
# a raised iteration cap and a fixed random seed.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)


In [None]:
# Test-set predictions: hard labels, plus the second predict_proba column,
# which is treated here as the probability of the positive (diabetes) class.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Evaluation: compute each metric once, then report it.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred, digits=3)
lr_auc = roc_auc_score(y_test, y_proba)

print("Accuracy:", lr_accuracy)
print("\nClassification Report:\n", lr_report)
print("ROC-AUC Score:", lr_auc)


In [None]:
# Annotated heatmap of the logistic-regression confusion matrix on the test set.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# ROC curve of the logistic-regression model, with the AUC shown in the legend.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
lr_curve_label = 'Logistic Regression (AUC = %.3f)' % roc_auc_score(y_test, y_proba)

plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=lr_curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0,1], [0,1], 'k--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
# Pair every training column with its logistic-regression coefficient and
# order the result by absolute value, largest first (signs are kept).
# NOTE(review): coefficient magnitudes are only comparable across features if
# the inputs share a common scale — confirm against the preprocessing step.
features = X_train.columns
importances = model.coef_[0]
coef_by_feature = pd.Series(importances, index=features)
feature_importance = coef_by_feature.sort_values(key=abs, ascending=False)
print(feature_importance)

# Bar chart of the signed coefficients in the sorted order.
plt.figure(figsize=(8, 5))
sns.barplot(y=feature_importance.index, x=feature_importance.values)
plt.title('Feature Importance (Logistic Regression Coefficients)')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, balanced class weights and a fixed seed;
# n_jobs=-1 is passed through to the estimator for parallel execution.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Test-set labels and the second predict_proba column (positive class).
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, digits=3)
rf_auc = roc_auc_score(y_test, y_proba_rf)

print("Random Forest - Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)
print("Random Forest - ROC-AUC Score:", rf_auc)

# Raw confusion-matrix counts as plain text.
print(confusion_matrix(y_test, y_pred_rf))

In [None]:
from xgboost import XGBClassifier

# Weight the positive class by the negative-to-positive ratio of the
# training labels to compensate for class imbalance.
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])

# NOTE(review): use_label_encoder is a legacy XGBoost argument — confirm it is
# still accepted by the installed version.
xgb = XGBClassifier(
    scale_pos_weight=n_negative / n_positive,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
xgb.fit(X_train, y_train)

# Test-set labels and the second predict_proba column (positive class).
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb, digits=3)
xgb_auc = roc_auc_score(y_test, y_proba_xgb)

print("XGBoost - Accuracy:", xgb_accuracy)
print("\nClassification Report:\n", xgb_report)
print("XGBoost - ROC-AUC Score:", xgb_auc)
print(confusion_matrix(y_test, y_pred_xgb))

In [None]:
# --- Confusion matrices: one figure per model -------------------------------

# Random Forest
cm_rf = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(5,4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Random Forest - Confusion Matrix')
plt.show()

# XGBoost
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(5,4))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Oranges')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('XGBoost - Confusion Matrix')
plt.show()

# --- ROC curves: all models on a single figure ------------------------------

# Compute every curve first so that no plotting call is issued between the
# confusion-matrix figures. Plotting a curve before another plt.figure() call
# would leave that curve on its own figure instead of the shared comparison.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_proba_xgb)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba)

plt.figure(figsize=(6,4))
plt.plot(fpr_rf, tpr_rf, label='Random Forest (AUC = %.3f)' % roc_auc_score(y_test, y_proba_rf))
plt.plot(fpr_xgb, tpr_xgb, label='XGBoost (AUC = %.3f)' % roc_auc_score(y_test, y_proba_xgb))
plt.plot(fpr_lr, tpr_lr, label='LogReg (AUC = %.3f)' % roc_auc_score(y_test, y_proba))

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - All Models')
plt.legend()
plt.show()


In [None]:
from lightgbm import LGBMClassifier

# Class-imbalance weight: number of negatives per positive in the training
# labels (the same ratio that is passed to XGBoost earlier in the notebook).
n_neg_train = len(y_train[y_train == 0])
n_pos_train = len(y_train[y_train == 1])
scale_pos_weight = n_neg_train / n_pos_train

lgbm = LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
)
lgbm.fit(X_train, y_train)

# Test-set labels and the second predict_proba column (positive class).
y_pred_lgbm = lgbm.predict(X_test)
y_proba_lgbm = lgbm.predict_proba(X_test)[:, 1]

lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
lgbm_report = classification_report(y_test, y_pred_lgbm, digits=3)
lgbm_auc = roc_auc_score(y_test, y_proba_lgbm)

print("LightGBM - Accuracy:", lgbm_accuracy)
print("\nClassification Report:\n", lgbm_report)
print("LightGBM - ROC-AUC Score:", lgbm_auc)


In [None]:
# Confusion matrix of the untuned LightGBM model on the test set.
cm_lgbm = confusion_matrix(y_test, y_pred_lgbm)
plt.figure(figsize=(5,4))
sns.heatmap(cm_lgbm, annot=True, fmt='d', cmap='Purples')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('LightGBM - Confusion Matrix')
plt.show()

# ROC comparison on one explicit figure. The curves of the other models are
# recomputed from their probability vectors here, so this cell only needs the
# predictions themselves rather than curve arrays left over from another cell.
fpr_lgbm, tpr_lgbm, _ = roc_curve(y_test, y_proba_lgbm)

plt.figure(figsize=(6,4))
plt.plot(fpr_lgbm, tpr_lgbm, label='LightGBM (AUC = %.3f)' % roc_auc_score(y_test, y_proba_lgbm))

# Remaining models for the visual comparison. Random Forest is included so the
# figure matches its "All Models" title.
other_models = [
    ('Random Forest', y_proba_rf),
    ('XGBoost', y_proba_xgb),
    ('LogReg', y_proba),
]
for curve_name, curve_proba in other_models:
    curve_fpr, curve_tpr, _ = roc_curve(y_test, curve_proba)
    curve_auc = roc_auc_score(y_test, curve_proba)
    plt.plot(curve_fpr, curve_tpr, label='%s (AUC = %.3f)' % (curve_name, curve_auc))

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - All Models')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compute every metric for every model. The three lists below are aligned
# position by position with model_names.
model_names = ['LogReg', 'RandomForest', 'XGBoost', 'LightGBM']
model_probas = [y_proba, y_proba_rf, y_proba_xgb, y_proba_lgbm]
model_preds = [y_pred, y_pred_rf, y_pred_xgb, y_pred_lgbm]

auc_scores = [roc_auc_score(y_test, proba) for proba in model_probas]
recall_scores = [recall_score(y_test, pred) for pred in model_preds]
precision_scores = [precision_score(y_test, pred) for pred in model_preds]
f1_scores = [f1_score(y_test, pred) for pred in model_preds]

import numpy as np
import matplotlib.pyplot as plt

# Grouped bar chart: one group per model, four bars per group.
x = np.arange(len(model_names))
width = 0.2

# (offset in bar widths from the group centre, heights, legend label)
bar_series = [
    (-1.5, auc_scores, 'ROC-AUC'),
    (-0.5, recall_scores, 'Recall (diabetes)'),
    (0.5, precision_scores, 'Precision (diabetes)'),
    (1.5, f1_scores, 'F1 (diabetes)'),
]

plt.figure(figsize=(10,6))
for bar_offset, bar_heights, bar_label in bar_series:
    plt.bar(x + bar_offset*width, bar_heights, width, label=bar_label)

plt.title('Model Comparison: ROC-AUC, Recall, Precision, F1')
plt.ylabel('Score')
plt.xticks(x, model_names)
# Upper limit slightly above 1 leaves headroom over the tallest possible bar.
plt.ylim(0, 1.1)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.show()


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for LightGBM. scale_pos_weight is held at the single
# value computed earlier in the notebook, so it is fixed rather than searched.
params = dict(
    num_leaves=[15, 31, 63],
    max_depth=[3, 5, 7, -1],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200],
    scale_pos_weight=[scale_pos_weight],
)

lgbm_gs = LGBMClassifier(random_state=42, n_jobs=-1)

# Exhaustive search with 3-fold cross-validation, scored by ROC-AUC.
grid = GridSearchCV(
    estimator=lgbm_gs,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    verbose=1,
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best ROC-AUC on CV:", grid.best_score_)

# Evaluate the best estimator found by the search on the test set.
best_lgbm = grid.best_estimator_
y_pred_best = best_lgbm.predict(X_test)
y_proba_best = best_lgbm.predict_proba(X_test)[:, 1]
test_auc_best = roc_auc_score(y_test, y_proba_best)
print("Test ROC-AUC:", test_auc_best)


In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Same LightGBM grid as the previous search, except that n_estimators now
# covers larger ensembles (300 and 500 trees).
search_space = dict(
    num_leaves=[15, 31, 63],
    max_depth=[3, 5, 7, -1],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[300, 500],
    scale_pos_weight=[scale_pos_weight],
)
params = search_space

lgbm_gs = LGBMClassifier(random_state=42, n_jobs=-1)

# 3-fold cross-validated grid search scored by ROC-AUC. This rebinds `grid`,
# so results of any earlier search held under that name are replaced.
grid = GridSearchCV(
    estimator=lgbm_gs,
    param_grid=params,
    scoring='roc_auc',
    cv=3,
    verbose=1,
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best ROC-AUC on CV:", grid.best_score_)

# Evaluate the best estimator of this search on the test set.
best_lgbm = grid.best_estimator_
y_pred_best = best_lgbm.predict(X_test)
y_proba_best = best_lgbm.predict_proba(X_test)[:, 1]
tuned_test_auc = roc_auc_score(y_test, y_proba_best)
print("Test ROC-AUC:", tuned_test_auc)


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the tuned LightGBM confusion matrix on the test set.
# Note that this rebinds `cm`, which earlier held the logistic-regression matrix.
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(5,4))
tuned_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Purples')
tuned_ax.set_title('LightGBM Tunat - Confusion Matrix')
tuned_ax.set_xlabel('Predicted')
tuned_ax.set_ylabel('Actual')
plt.show()


In [None]:
import joblib

from pathlib import Path

# Save the tuned LightGBM model. The target folder is created first so the
# dump does not fail when the notebook runs in a checkout without ../models.
model_path = Path('../models/lgbm_best_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_lgbm, model_path)

# Only the model is written by this cell, so the message no longer mentions
# a scaler.
print("Modelul a fost salvat în folderul models/")
