In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from feature_engine.selection import DropCorrelatedFeatures
import joblib
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import VotingClassifier

In [2]:
def grab_cols(df):
    num_cols = list(df.select_dtypes(include="number"))
    cat_cols = [col for col in df.columns if col not in num_cols]
    num_but_cat = [col for col in num_cols if df[col].nunique()<10]
    cat_but_car = [col for col in cat_cols if df[col].nunique() >20]
    cat_cols = cat_cols + num_but_cat 
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"cat_cols = {len(cat_cols)}")
    print(f"num_cols = {len(num_cols)}")
    print(f"num_but_cat = {len(num_but_cat)}")
    print(f"cat_but_car= {len(cat_but_car)}")
    return cat_cols,num_cols,cat_but_car,num_but_cat

In [3]:
def diabetes_data_prep():
    df = pd.read_csv("diabetes.csv")
    df.columns = [col.lower() for col in df.columns]
    X = df.drop("outcome",axis=1)
    y =df["outcome"]
    cols=["glucose","bloodpressure","skinthickness","insulin","bmi"]

    for col in cols:
        X.loc[X[col]==0,col] = np.nan
    def outliers(df,variable):
        q1= df[variable].quantile(0.2)
        q3 = df[variable].quantile(0.8)
        iqr = q3 - q1
        lower_lim = q1 - 1.5*iqr
        upper_lim = q3 + 1.5*iqr
        return lower_lim,upper_lim

    def replace_outliers(X,col):
        lower_lim,upper_lim = outliers(X,col)
        X[col].clip(lower=lower_lim,upper=upper_lim,inplace=True)

    replace_outliers(X,"insulin")
    imp_missforest = IterativeImputer(
    estimator=XGBRegressor(n_estimators=300,max_depth=5),
    max_iter=30,
    initial_strategy="median",
    random_state=0
    ).set_output(transform="pandas")

    X=imp_missforest.fit_transform(X)
    def ohe(dataframe,cat_cols):
        dataframe = pd.get_dummies(dataframe,columns=cat_cols,drop_first=True,dtype=int)
        return dataframe
    X["new_glucose_cat"] = pd.cut(x=X["glucose"],bins=[-1,100,140,200],labels=["normal","prediabetes","danger"])

    X.loc[X["age"]<32,"new_age_cat"] = 0
    X.loc[(X["age"]>=32) & (X["age"]<=50),"new_age_cat"]= 1
    X.loc[X["age"]>50,"new_age_cat"] =2

    # X["new_age2"] = pd.cut(x=X["age"],bins=[-1,32,50,100],labels= [0,1,2]) # alt sınıfa dahil eder

    X["new_bmi"] = pd.cut(x=X["bmi"],bins=[-1,18.5,24.9,29.9,100],labels=["underweight","healthy","overweight","obese"])
    X["new_bloodpressure"] = pd.cut(x=X["bloodpressure"],bins=[-1,79,89,123],labels=["normal","hs1","hs2"])
    
    cat_cols,num_cols,cat_but_car,num_but_cat = grab_cols(X)
    X=ohe(X,cat_cols)
    lof = LocalOutlierFactor(n_neighbors=10,n_jobs=-1)
    lof.fit_predict(X)
    X_scores = lof.negative_outlier_factor_
    df = pd.concat([X,y],axis=1)
    df=df.drop(labels =list(df[X_scores<-1.8].index),axis=0 )
    X=df.drop("outcome",axis=1)
    y = df["outcome"]
    sc = StandardScaler().set_output(transform="pandas")
    X = sc.fit_transform(X)
    return X,y
X,y = diabetes_data_prep()



cat_cols = 4
num_cols = 8
num_but_cat = 1
cat_but_car= 0


In [4]:
rf_params={"max_depth":[3,4,5,6], 
           "min_samples_split":[15,20],
           "n_estimators":[200,300]}

xgb_params = {"booster":["gblinear","gbtree"],
              "n_estimators":[200,300],
              "reg_lambda":[0.02,0.05],
              "reg_alpha":[0.01,0.02]}

lr_params = {'C': [0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            "max_iter":[5000,1000]}


classifiers = [("rf",RandomForestClassifier(class_weight='balanced'),rf_params),
               ("xgb",XGBClassifier(objective ="binary:logistic",scale_pos_weight=1.88),xgb_params),
               ("lr",LogisticRegression(solver='liblinear',class_weight='balanced'),lr_params)]

def hyperparameter_optimization(X,y,scoring="roc_auc"):
    print("hyperparameter optimization")
    best_models ={}
    for name,classifier,params in classifiers:
        print(f"##### {name}######")
        cv_results = cross_val_score(classifier,X,y,scoring=scoring,cv=10,n_jobs=-1).mean()
        print(f"{scoring} (Before): {cv_results}")
        
        gs = GridSearchCV(classifier,params,cv=10,scoring=scoring).fit(X,y)
        final_model = classifier.set_params(**gs.best_params_)
        
        cv_results = cross_val_score(final_model,X,y,scoring=scoring,cv=10,n_jobs=-1).mean()
        print(f"{scoring} (After): {cv_results}")
        print(f"{name} best_params: {gs.best_params_}", end="\n\n")
        best_models[name] = final_model
    return best_models
    
best_models = hyperparameter_optimization(X,y,scoring="roc_auc")

hyperparameter optimization
##### rf######
roc_auc (Before): 0.8298014710157569
roc_auc (After): 0.8456871329728474
rf best_params: {'max_depth': 5, 'min_samples_split': 15, 'n_estimators': 300}

##### xgb######
roc_auc (Before): 0.789438862724577
roc_auc (After): 0.8463807779522066
xgb best_params: {'booster': 'gblinear', 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 0.05}

##### lr######
roc_auc (Before): 0.841388801674516
roc_auc (After): 0.8460068608640038
lr best_params: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1'}



In [7]:
def voting_classifier(best_models,X,y):
    voting_clf=VotingClassifier(estimators = [("lr",best_models["lr"]),
                                            ("rf",best_models["rf"]),
                                            ("xg",best_models["xgb"])],
                              voting='soft',
                            weights=[1,1,1])
    cv_results = cross_validate(voting_clf,X,y,cv=10,scoring=["accuracy","roc_auc","recall"])
    print(f"accuracy: {cv_results['test_accuracy'].mean()}")
    print(f"recall: {cv_results['test_recall'].mean()}")
    print(f"roc_auc: {cv_results['test_roc_auc'].mean()}")
    return voting_clf
  
voting_clf = voting_classifier(best_models,X,y)

accuracy: 0.7776315789473685
recall: 0.7767806267806268
roc_auc: 0.8494496773068201


In [32]:
def main():
    X,y = diabetes_data_prep()
    best_models = hyperparameter_optimization(X,y,scoring="roc_auc")
    voting_clf = voting_classifier(best_models,X,y)
    joblib.dump(voting_clf,"voting_clf.pkl")
    return voting_clf

In [34]:
if __name__=="__main__":
    main()



cat_cols = 4
num_cols = 8
num_but_cat = 1
cat_but_car= 0
hyperparameter optimization
##### rf######
roc_auc (Before): 0.8468396418396418
roc_auc (After): 0.8443767660910517
rf best_params: {'max_depth': 4, 'min_samples_split': 15, 'n_estimators': 300}

##### xgb######
roc_auc (Before): 0.8463807779522066
roc_auc (After): 0.8463807779522066
xgb best_params: {'booster': 'gblinear', 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 0.05}

##### lr######
roc_auc (Before): 0.8460824466538751
roc_auc (After): 0.8460824466538753
lr best_params: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l1'}

accuracy: 0.7763157894736842
recall: 0.7806267806267806
roc_auc: 0.8493038548752836


In [17]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score

# Stratified K-Fold oluşturma (sınıf dağılımını korur)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Metrikleri saklamak için boş listeler
accuracies = []
recalls = []
roc_aucs = []

for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # scale_pos_weight hesaplama
    ratio = float(y_train.loc[y_train == 0].shape[0]) / y_train.loc[y_train == 1].shape[0]
    
    clf = xgb.XGBClassifier(scale_pos_weight=ratio,booster="gblinear")
    
    # Modeli eğit
    clf.fit(X_train, y_train)
    
    # Test verisi üzerinde tahmin yap
    predictions = clf.predict(X_test)
    probas = clf.predict_proba(X_test)[:,1]
    
    # Metrikleri hesapla
    acc = accuracy_score(y_test, predictions)
    rec = recall_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, probas)
    
    accuracies.append(acc)
    recalls.append(rec)
    roc_aucs.append(roc_auc)

    print(f"Accuracy: {acc:.4f}, Recall: {rec:.4f}, ROC AUC: {roc_auc:.4f}, ratio: {ratio:.4f}")

# Ortalama metrik değerlerini yazdırma
print("\nAverage Metrics:")
print(f"Accuracy: {sum(accuracies)/len(accuracies):.4f}")
print(f"Recall: {sum(recalls)/len(recalls):.4f}")
print(f"ROC AUC: {sum(roc_aucs)/len(roc_aucs):.4f}")

Accuracy: 0.7763, Recall: 0.8148, ROC AUC: 0.8866, ratio: 1.8861
Accuracy: 0.7632, Recall: 0.8148, ROC AUC: 0.8322, ratio: 1.8861
Accuracy: 0.8026, Recall: 0.7778, ROC AUC: 0.8813, ratio: 1.8861
Accuracy: 0.6974, Recall: 0.7778, ROC AUC: 0.7823, ratio: 1.8861
Accuracy: 0.7500, Recall: 0.8462, ROC AUC: 0.8585, ratio: 1.8739
Accuracy: 0.8158, Recall: 0.8077, ROC AUC: 0.8515, ratio: 1.8739
Accuracy: 0.7632, Recall: 0.7692, ROC AUC: 0.8600, ratio: 1.8739
Accuracy: 0.7632, Recall: 0.7692, ROC AUC: 0.8108, ratio: 1.8739
Accuracy: 0.7895, Recall: 0.7308, ROC AUC: 0.8200, ratio: 1.8739
Accuracy: 0.7763, Recall: 0.6538, ROC AUC: 0.8400, ratio: 1.8739

Average Metrics:
Accuracy: 0.7697
Recall: 0.7762
ROC AUC: 0.8423


In [24]:
import numpy as np

# Örnek y verisi
y_train = np.array([0, 1, 2, 1, 0,0,0,0,0,1,2,])  # asıl y_train veriniz bu şekilde olacaktır.
y_train

array([0, 1, 2, 1, 0, 0, 0, 0, 0, 1, 2])

In [25]:
class_counts = np.bincount(y_train)
class_counts

array([6, 3, 2], dtype=int64)

In [26]:
min_count = min(class_counts)
min_count

2

In [27]:
weights = min_count / class_counts
weights

array([0.33333333, 0.66666667, 1.        ])

In [30]:
import numpy as np

# Örnek y verisi
y_train = np.array([0, 1, 2, 1, 0,0,0,0,0,1,2,])  # asıl y_train veriniz bu şekilde olacaktır.

# Sınıf frekanslarını hesaplama
class_counts = np.bincount(y_train)

# En az örneğe sahip sınıfın frekansı
min_count = min(class_counts)

# Sınıflar için ağırlık faktörleri
weights = min_count / class_counts

# Eğitim verisine ilgili ağırlıkları atama
sample_weights = np.array([weights[i] for i in y_train])
sample_weights

array([0.33333333, 0.66666667, 1.        , 0.66666667, 0.33333333,
       0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.66666667,
       1.        ])

In [None]:
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train, sample_weight=sample_weights)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from feature_engine.selection import DropCorrelatedFeatures
import joblib
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import VotingClassifier

def grab_cols(df):
    num_cols = list(df.select_dtypes(include="number"))
    cat_cols = [col for col in df.columns if col not in num_cols]
    num_but_cat = [col for col in num_cols if df[col].nunique()<10]
    cat_but_car = [col for col in cat_cols if df[col].nunique() >20]
    cat_cols = cat_cols + num_but_cat 
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"cat_cols = {len(cat_cols)}")
    print(f"num_cols = {len(num_cols)}")
    print(f"num_but_cat = {len(num_but_cat)}")
    print(f"cat_but_car= {len(cat_but_car)}")
    return cat_cols,num_cols,cat_but_car,num_but_cat

def diabetes_data_prep():
    df = pd.read_csv("diabetes.csv")
    df.columns = [col.lower() for col in df.columns]
    X = df.drop("outcome",axis=1)
    y =df["outcome"]
    cols=["glucose","bloodpressure","skinthickness","insulin","bmi"]

    for col in cols:
        X.loc[X[col]==0,col] = np.nan
    def outliers(df,variable):
        q1= df[variable].quantile(0.2)
        q3 = df[variable].quantile(0.8)
        iqr = q3 - q1
        lower_lim = q1 - 1.5*iqr
        upper_lim = q3 + 1.5*iqr
        return lower_lim,upper_lim

    def replace_outliers(X,col):
        lower_lim,upper_lim = outliers(X,col)
        X[col].clip(lower=lower_lim,upper=upper_lim,inplace=True)

    replace_outliers(X,"insulin")
    imp_missforest = IterativeImputer(
    estimator=XGBRegressor(n_estimators=300,max_depth=5),
    max_iter=30,
    initial_strategy="median",
    random_state=0
    ).set_output(transform="pandas")

    X=imp_missforest.fit_transform(X)
    def ohe(dataframe,cat_cols):
        dataframe = pd.get_dummies(dataframe,columns=cat_cols,drop_first=True,dtype=int)
        return dataframe
    X["new_glucose_cat"] = pd.cut(x=X["glucose"],bins=[-1,100,140,200],labels=["normal","prediabetes","danger"])

    X.loc[X["age"]<32,"new_age_cat"] = 0
    X.loc[(X["age"]>=32) & (X["age"]<=50),"new_age_cat"]= 1
    X.loc[X["age"]>50,"new_age_cat"] =2

    # X["new_age2"] = pd.cut(x=X["age"],bins=[-1,32,50,100],labels= [0,1,2]) # alt sınıfa dahil eder

    X["new_bmi"] = pd.cut(x=X["bmi"],bins=[-1,18.5,24.9,29.9,100],labels=["underweight","healthy","overweight","obese"])
    X["new_bloodpressure"] = pd.cut(x=X["bloodpressure"],bins=[-1,79,89,123],labels=["normal","hs1","hs2"])
    
    cat_cols,num_cols,cat_but_car,num_but_cat = grab_cols(X)
    X=ohe(X,cat_cols)
    lof = LocalOutlierFactor(n_neighbors=10,n_jobs=-1)
    lof.fit_predict(X)
    X_scores = lof.negative_outlier_factor_
    df = pd.concat([X,y],axis=1)
    df=df.drop(labels =list(df[X_scores<-1.8].index),axis=0 )
    X=df.drop("outcome",axis=1)
    y = df["outcome"]
    sc = StandardScaler().set_output(transform="pandas")
    X = sc.fit_transform(X)
    return X,y




def hyperparameter_optimization(X,y,scoring="roc_auc"):
    rf_params={"max_depth":[3,4,5,6], 
           "min_samples_split":[15,20],
           "n_estimators":[200,300]}

    xgb_params = {"booster":["gblinear","gbtree"],
              "n_estimators":[200,300],
              "reg_lambda":[0.02,0.05],
              "reg_alpha":[0.01,0.02]}

    lr_params = {'C': [0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            "max_iter":[5000,1000]}


    classifiers = [("rf",RandomForestClassifier(class_weight='balanced'),rf_params),
               ("xgb",XGBClassifier(objective ="binary:logistic",scale_pos_weight=1.88),xgb_params),
               ("lr",LogisticRegression(solver='liblinear',class_weight='balanced'),lr_params)]
    print("hyperparameter optimization")
    best_models ={}
    for name,classifier,params in classifiers:
        print(f"##### {name}######")
        cv_results = cross_val_score(classifier,X,y,scoring=scoring,cv=10,n_jobs=-1).mean()
        print(f"{scoring} (Before): {cv_results}")
        
        gs = GridSearchCV(classifier,params,cv=10,scoring=scoring).fit(X,y)
        final_model = classifier.set_params(**gs.best_params_)
        
        cv_results = cross_val_score(final_model,X,y,scoring=scoring,cv=10,n_jobs=-1).mean()
        print(f"{scoring} (After): {cv_results}")
        print(f"{name} best_params: {gs.best_params_}", end="\n\n")
        best_models[name] = final_model
    return best_models
