In [12]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

In [13]:
def data_prep(data):
    # Load and preprocess the dataset

    # Split the dataset into training and testing sets
    X = data.iloc[:, :-1]
    y = data["label"]
    # Apply feature scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y


In [14]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
column_names = [f"attribute_{i}" for i in range(1, 58)] + ["label"]
df = pd.read_csv(url, header=None, names=column_names)
# Split the dataset into training and testing sets
X = df.iloc[:, :-1]
y = df["label"]
# Apply feature scaling
scaler = StandardScaler()
X = scaler.fit_transform(X)
print(X)
print(y)
s=0
k = 0
for i in y:
    if i ==1:
        s+=1
    else:
        k+=1
print(s,k)

[[-3.42433707e-01  3.30884903e-01  7.12858774e-01 ... -4.52472762e-02
   4.52979198e-02 -8.72413388e-03]
 [ 3.45359395e-01  5.19091945e-02  4.35129540e-01 ... -2.44326749e-03
   2.50562832e-01  1.22832407e+00]
 [-1.45921392e-01 -1.65071912e-01  8.51723390e-01 ...  1.45920848e-01
   2.22110599e+00  3.25873251e+00]
 ...
 [ 6.40127868e-01 -1.65071912e-01  3.83734930e-02 ... -1.19382054e-01
  -2.36941335e-01 -2.72627750e-01]
 [ 2.80176333e+00 -1.65071912e-01 -5.56760578e-01 ... -1.27482666e-01
  -2.42072958e-01 -3.38603654e-01]
 [-3.42433707e-01 -1.65071912e-01  7.32696576e-01 ... -1.24236117e-01
  -2.42072958e-01 -4.01280763e-01]]
0       1
1       1
2       1
3       1
4       1
       ..
4596    0
4597    0
4598    0
4599    0
4600    0
Name: label, Length: 4601, dtype: int64
1813 2788


In [15]:
# Base Models
def base_models(X, y, scoring="roc_auc"):
    print("Base Models....")
    classifiers = [('LR', LogisticRegression()),
                   ('KNN', KNeighborsClassifier()),
                   ("SVC", SVC()),
                   ("CART", DecisionTreeClassifier()),
                   ("RF", RandomForestClassifier()),
                   ('Adaboost', AdaBoostClassifier()),
                   ('GBM', GradientBoostingClassifier()),
                   ('LightGBM', LGBMClassifier()),
                   # ('CatBoost', CatBoostClassifier(verbose=False))
                   ]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
        print(f"{scoring}: {round(cv_results['test_score'].mean(), 4)} ({name}) ")

In [16]:
knn_params = {"n_neighbors": [1, 3, 5, 7, 9]}
              #"metric":[1, 2]}


cart_params = { "min_samples_leaf": [1, 2, 5, 10, 20],
    "ccp_alpha": [0],  # Complexity parameter set to zero
    "min_samples_split": [2, 4, 10, 20, 40]}

rf_params = {"n_estimators": [500],  # J=500
    "max_features": ['sqrt', 'log2', None],  # Choose appropriate value for "m"
    "min_samples_leaf": [5]}

gbm_params = { "max_depth": [3, 4, 5],  # Depth
    "n_estimators": [50, 100, 150, 200, 250],  # Number of Trees
    "learning_rate": [0.01, 0.1]}

classifiers = [('KNN', KNeighborsClassifier(), knn_params),
               ("CART", DecisionTreeClassifier(), cart_params),
               ("RF", RandomForestClassifier(), rf_params),
               ('GBM', GradientBoostingClassifier(), gbm_params)]

In [17]:
def hyperparameter_optimization(X, y, cv=3, scoring="roc_auc"):
    print("Hyperparameter Optimization....")
    best_models = {}
    for name, classifier, params in classifiers:
        print(f"########## {name} ##########")
        cv_results = cross_validate(classifier, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (Before): {round(cv_results['test_score'].mean(), 4)}")

        gs_best = GridSearchCV(classifier, params, cv=cv, n_jobs=-1, verbose=False).fit(X, y)
        final_model = classifier.set_params(**gs_best.best_params_)

        cv_results = cross_validate(final_model, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (After): {round(cv_results['test_score'].mean(), 4)}")
        print(f"{name} best params: {gs_best.best_params_}", end="\n\n")
        best_models[name] = final_model
    return best_models

In [18]:
def voting_classifier(best_models, X, y):
    print("Voting Classifier...")
    voting_clf = VotingClassifier(estimators=[('KNN', best_models["KNN"]), ('RF', best_models["RF"]),
                                              ('GBM', best_models["GBM"])],
                                  voting='soft').fit(X, y)
    cv_results = cross_validate(voting_clf, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
    print(f"Accuracy: {cv_results['test_accuracy'].mean()}")
    print(f"F1Score: {cv_results['test_f1'].mean()}")
    print(f"ROC_AUC: {cv_results['test_roc_auc'].mean()}")
    return voting_clf

In [19]:
def main():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
    column_names = [f"attribute_{i}" for i in range(1, 58)] + ["label"]
    df = pd.read_csv(url, header=None, names=column_names)
    X, y = data_prep(df)
    base_models(X, y)
    best_models = hyperparameter_optimization(X, y)
    voting_clf = voting_classifier(best_models, X, y)
    joblib.dump(voting_clf, "voting_clf.pkl")
    return best_models

In [20]:
if __name__ == "__main__":
    print("İşlem başladı")
    main()

İşlem başladı
Base Models....
roc_auc: 0.954 (LR) 
roc_auc: 0.9243 (KNN) 
roc_auc: 0.9611 (SVC) 
roc_auc: 0.8734 (CART) 
roc_auc: 0.9716 (RF) 
roc_auc: 0.9616 (Adaboost) 
roc_auc: 0.974 (GBM) 
[LightGBM] [Info] Number of positive: 1208, number of negative: 1859
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6291
[LightGBM] [Info] Number of data points in the train set: 3067, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393870 -> initscore=-0.431073
[LightGBM] [Info] Start training from score -0.431073
[LightGBM] [Info] Number of positive: 1209, number of negative: 1858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tot