# functions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import time
import json
import copy
from itertools import product
from joblib import Parallel, delayed

from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression,SGDClassifier, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neural_network import MLPClassifier
from scipy.interpolate import interp1d
from sklearn.utils import compute_sample_weight
from lightgbm import LGBMClassifier


from sklearn.metrics import roc_curve,make_scorer,average_precision_score, matthews_corrcoef,precision_recall_curve,classification_report \
,accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, roc_auc_score,confusion_matrix, auc,brier_score_loss,\
fowlkes_mallows_score,cohen_kappa_score,jaccard_score

In [None]:
class NumpyArrayEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [None]:
def to_json(model_lite,model,model_name):
    json_array = json.dumps(model_lite, cls=NumpyArrayEncoder,indent=2)
    with open(f"{model_name}_clf_lite.json", 'w') as json_file:
        json_file.write(json_array)
    #json_array = json.dumps(model, cls=NumpyArrayEncoder,indent=2)
    #with open(f"{model_name}_clf.json", 'w') as json_file:
    #    json_file.write(json_array)

In [6]:
def scoring_metrics(y_test, y_pred, y_pred_prob):
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred,zero_division=1),
        'average_precision': average_precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'jaccard': jaccard_score(y_test, y_pred),
        'fowlkes_mallows': fowlkes_mallows_score(y_test, y_pred),
        'cohen_kappa': cohen_kappa_score(y_test, y_pred),
        'matthews_corrcoef': matthews_corrcoef(y_test, y_pred),
        'pr_auc': average_precision_score(y_test, y_pred_prob), # No zero_division for average_precision_score
        'roc_auc': roc_auc_score(y_test, y_pred_prob),
    }
    return scores


In [7]:
def grid_search(model, param_grid, X_outter, y_outter,sgd = False,class_weight = False):
    best_score = None
    best_params = None
    best_model = None
    param_comb_arr = []
    param_comb_arr_lite = []
    # Generate all possible combinations of hyperparameters
    all_params = list(product(*param_grid.values()))

    for params in all_params:
        # Create a dictionary of hyperparameter values
        param_dict = {param_name: param_value for param_name, param_value in zip(param_grid.keys(), params)}
        param_comb = {}
        param_comb_lite = {}
        param_comb['params'] = param_dict
        param_comb_lite['params'] = param_dict
        
        # Set the model hyperparameters
        model.set_params(**param_dict)
        inner_fold_arr = []
        inner_fold_arr_lite = []
        scores = []
        for fold_num, (inner_train_index, inner_test_index) in enumerate(inner_cv.split(X_outter, y_outter)):
            X_inner_train, X_inner_test = X_outter[inner_train_index], X_outter[inner_test_index]
            y_inner_train, y_inner_test = y_outter[inner_train_index], y_outter[inner_test_index]
            
            inner_fold = {}
            inner_fold_lite = {}
            inner_fold['fold_num'] = fold_num+1
            inner_fold_lite['fold_num'] = fold_num+1
            test_row = {}
            train_row = {}
            
            if(class_weight):
                unique_classes = np.unique(y_inner_train)
                class_counts = np.bincount(y_inner_train)
                class_frequencies = {class_label: count / len(y_inner_train) for class_label, count in zip(unique_classes, class_counts)}
                balanced_sample_weights = np.array([1.0 / class_frequencies[class_label] for class_label in y_inner_train])
                start_time = time.time()
                model.fit(X_inner_train, y_inner_train,sample_weight = balanced_sample_weights)
                end_time = time.time()
                train_row['fit_time']=end_time - start_time
                
            else:
                start_time = time.time()
                model.fit(X_inner_train, y_inner_train)
                end_time = time.time()
                train_row['fit_time']=end_time - start_time
            
            start_time = time.time()
            y_pred = model.predict(X_inner_train)
            end_time = time.time()
            train_row['pred_time']=end_time - start_time
            
            if(sgd):
                start_time = time.time()
                y_pred_prob = cross_val_predict(model, X_inner_train, y_inner_train, cv=3,method="decision_function")
                end_time = time.time()
                
            else:
                start_time = time.time()
                y_pred_prob = model.predict_proba(X_inner_train)[:,1]
                end_time = time.time()
            train_row['pred_proba_time']=end_time - start_time

            train_row.update(scoring_metrics(y_inner_train,y_pred, y_pred_prob))
            
            inner_fold_lite['train'] =  copy.deepcopy(train_row)
            train_row['y'] = y_inner_train
            train_row['y_pred'] = y_pred
            train_row['y_pred_prob'] = y_pred_prob
            train_row['indices'] = inner_train_index

            inner_fold['train']=train_row

            start_time = time.time()
            y_pred = model.predict(X_inner_test)
            end_time = time.time()
            test_row['pred_time']=end_time - start_time
        
            if(sgd):
                start_time = time.time()
                y_pred_prob = cross_val_predict(model, X_inner_test, y_inner_test, cv=3,method="decision_function")
                end_time = time.time()
                
            else:
                start_time = time.time()
                y_pred_prob = model.predict_proba(X_inner_test)[:,1]
                end_time = time.time()
            test_row['pred_proba_time']=end_time - start_time
            
            
            test_metrics = scoring_metrics(y_inner_test,y_pred, y_pred_prob)
            test_row.update(test_metrics)
            scores.append(test_metrics)
            
            inner_fold_lite['test']=copy.deepcopy(test_row)
            
            test_row['y'] = y_inner_test
            test_row['y_pred'] = y_pred
            test_row['y_pred_prob'] = y_pred_prob
            test_row['indices'] = inner_test_index

            inner_fold['test']=test_row
            inner_fold_arr.append(inner_fold)
            inner_fold_arr_lite.append(inner_fold_lite)
            
        total_f1 = 0.0
        for data in scores:
            total_f1 += data['f1']
        score_f1 = total_f1 / 3

        if best_score is None or score_f1 > best_score:
            best_score = score_f1
            best_params = param_dict
            best_model = copy.deepcopy(model)
        param_comb['inner_fold'] = inner_fold_arr
        param_comb_lite['inner_fold'] = inner_fold_arr_lite
        param_comb_arr.append(param_comb)
        param_comb_arr_lite.append(param_comb_lite)


    return best_model,best_params,param_comb_arr,param_comb_arr_lite

In [8]:

def outer_metric(model, params, X_out_train, y_out_train,idx_train,X_out_test,y_out_test,idx_test,outer_fold_num,sgd = False, class_weight = False):
  outer_fold = {}
  outer_fold_lite = {}
  test_row = {}
  train_row = {}
  
  outer_fold['fold_num']= int(outer_fold_num)
  outer_fold['best_params'] = params
  outer_fold_lite['fold_num']= int(outer_fold_num)
  outer_fold_lite['best_params'] = params
  
  if(class_weight):
    unique_classes = np.unique(y_out_train)
    class_counts = np.bincount(y_out_train)
    class_frequencies = {class_label: count / len(y_out_train) for class_label, count in zip(unique_classes, class_counts)}
    balanced_sample_weights = np.array([1.0 / class_frequencies[class_label] for class_label in y_out_train])
    start_time = time.time()
    model.fit(X_out_train, y_out_train,sample_weight = balanced_sample_weights)
    end_time = time.time()
    train_row['fit_time']=end_time - start_time
    
  else:
    start_time = time.time()
    model.fit(X_out_train, y_out_train)
    end_time = time.time()
    train_row['fit_time']=end_time - start_time

  start_time = time.time()
  y_pred = model.predict(X_out_train)
  end_time = time.time()
  train_row['pred_time']=end_time - start_time
  if(sgd):
    start_time = time.time()
    y_pred_prob = cross_val_predict(model, X_out_train, y_out_train, cv=3,method="decision_function")
    end_time = time.time()
    
  else:
    start_time = time.time()
    y_pred_prob = model.predict_proba(X_out_train)[:,1]
    end_time = time.time()
  train_row['pred_proba_time']=end_time - start_time

  train_row.update(scoring_metrics(y_out_train,y_pred, y_pred_prob))

  outer_fold_lite['train']=copy.deepcopy(train_row)
  
  train_row['y'] = y_out_train
  train_row['y_pred'] = y_pred
  train_row['y_pred_prob'] = y_pred_prob
  train_row['indices'] = idx_train

  outer_fold['train']=train_row

  start_time = time.time()
  y_pred = model.predict(X_out_test)
  end_time = time.time()
  test_row['pred_time']=end_time - start_time
      
  if(sgd):
    start_time = time.time()
    y_pred_prob = cross_val_predict(model, X_out_test, y_out_test, cv=3,method="decision_function")
    end_time = time.time()
    
  else:
    start_time = time.time()
    y_pred_prob = model.predict_proba(X_out_test)[:,1]
    end_time = time.time()
  test_row['pred_proba_time']=end_time - start_time

  test_row.update(scoring_metrics(y_out_test,y_pred, y_pred_prob))

  outer_fold_lite['test']=copy.deepcopy(test_row)
  
  test_row['y'] = y_out_test
  test_row['y_pred'] = y_pred
  test_row['y_pred_prob'] = y_pred_prob
  test_row['indices'] = idx_test

  outer_fold['test']=test_row
  return outer_fold,outer_fold_lite

In [9]:
def process_fold(model,fold_num, outer_train_index, outer_test_index, sgd = False,class_weight = False):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]

    best_model, best_param, param_arr, param_arr_lite = grid_search(model, param_grid, X_outer_train, y_outer_train, sgd, class_weight)
    outer_loop, outer_loop_lite = outer_metric(best_model, best_param, X_outer_train, y_outer_train, outer_train_index,
                                                X_outer_test, y_outer_test, outer_test_index, fold_num + 1,sgd, class_weight)
    outer_loop['param_comb'] = param_arr
    outer_loop_lite['param_comb'] = param_arr_lite
    print(f'Finished fold {fold_num + 1}')
    return outer_loop, outer_loop_lite

In [None]:
momental_dataset = "6"

In [None]:
class NumpyArrayEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [None]:
def to_json(model_lite,model,model_name):
    json_array = json.dumps(model_lite, cls=NumpyArrayEncoder,indent=2)
    with open(f"result_dataset_{momental_dataset}/{model_name}_reg_lite.json", 'w') as json_file:
        json_file.write(json_array)
    #json_array = json.dumps(model, cls=NumpyArrayEncoder,indent=2)
    #with open(f"result_dataset_{momental_dataset}/{model_name}_reg.json", 'w') as json_file:
    #    json_file.write(json_array)

# load the data and prepare it for training

In [2]:
df = pd.read_csv(f'./dataset{momental_dataset}/procesed_dataset_{momental_dataset}.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
X = df.drop("stroke", axis=1)
y = df["stroke"]

# Convert the features and labels to numpy arrays
Xarr = np.array(X)
yarr = np.array(y)
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [10]:
all_models = []
all_models_lite = []

# Training the models

## AdaBoost Classifier

In [6]:
ada_clf = AdaBoostClassifier(random_state = 42)
#ada_clf.get_params()

{'algorithm': 'SAMME.R',
 'base_estimator': 'deprecated',
 'estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': 42}

In [26]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 700],  # Number of weak learners (base estimators).
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],  # Shrinkage parameter to control learning rate. Smaller values reduce overfitting.
    'estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3), DecisionTreeClassifier(max_depth=7)],  # Base estimator. Simpler models can reduce overfitting.
    'algorithm': ['SAMME', 'SAMME.R'],  # Algorithm for updating weights. SAMME.R is recommended for better convergence.
}
ada_model = {}
ada_model['model']='AdaBoost_Classifier'
outer_loop_arr = []
ada_model_lite = {}
ada_model_lite['model']='AdaBoost_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ada_clf, param_grid, fold_num, outer_train_index, outer_test_index,False,True)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ada_model['outer_loop'] = outer_loop_arr
ada_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ada_model)
all_models_lite.append(ada_model_lite)
to_json(ada_model_lite,ada_model,'ada')

ending_time = time.time()
print(ending_time - starting_time)

7.93566370010376


## Bagging Classifier

In [8]:
bag_reg = BaggingClassifier(n_jobs = -1, random_state = 42)
#bag_reg.get_params()

{'base_estimator': 'deprecated',
 'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [9]:
param_grid = {
    'n_estimators': [10, 50, 100, 200, 300, 400, 500],  # Number of base estimators (bags). Larger values lead to stronger regularization.
    'base_estimator': [None, RidgeClassifier(alpha=1.0), DecisionTreeClassifier()],  # Base estimator to use. 
    'max_samples': [0.7, 0.8, 0.9, 1.0],  # Fraction of samples used for fitting each bag. Larger values lead to stronger regularization.
}
bag_model = {}
bag_model['model']='Bagging_Classifier'
outer_loop_arr = []
bag_model_lite = {}
bag_model_lite['model']='Bagging_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(bag_clf, param_grid, fold_num, outer_train_index, outer_test_index,False,True)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
bag_model['outer_loop'] = outer_loop_arr
bag_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(bag_model)
all_models_lite.append(bag_model_lite)
to_json(bag_model_lite,bag_model,'bag')

ending_time = time.time()
print(ending_time - starting_time)

## DecisionTreeClassifier

In [35]:
dt_clf = DecisionTreeClassifier(random_state = 42)

In [36]:
param_grid = {
    'criterion': ['gini', 'entropy'], # Function used to measure the quality of a split at each node.
    'max_depth': list(range(2, 35, 3)) + [None],  # Maximum depth of the tree. None means unlimited depth.
    'min_samples_split': list(range(2, 20, 2)),  # Minimum samples required to split an internal node.
    'max_features': ['log', 'sqrt']+ [0.1, 0.2, 0.25, 0.33, 0.5],  # Maximum number of features to consider when splitting a node during tree construction. None: can use all available features.
}
dt_model = {}
dt_model['model']='Decision_Tree_Classifier'
outer_loop_arr = []
dt_model_lite = {}
dt_model_lite['model']='Decision_Tree_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(dt_clf, param_grid, fold_num, outer_train_index, outer_test_index,False,True)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
dt_model['outer_loop'] = outer_loop_arr
dt_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(dt_model)
all_models_lite.append(dt_model_lite)

to_json(tree_model_lite,tree_model,'tree')

ending_time = time.time()
print(ending_time - starting_time)

0.846235990524292


## Gaussian distribution

In [19]:
gausNB_clf = GaussianNB()

In [20]:
param_grid = {}

gausNB_model = {}
gausNB_model['model']='Gaussian_distribution_Classifier'
outer_loop_arr = []
gausNB_model_lite = {}
gausNB_model_lite['model']='Gaussian_distribution_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(gausNB_clf, param_grid, X_outer_train, y_outer_train,False,True)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1,False,True)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
gausNB_model['outer_loop']= outer_loop_arr
gausNB_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(gausNB_model)
all_models_lite.append(gausNB_model_lite)

to_json(gausNB_model_lite,gausNB_model,'gausNB')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## GradientBoostingClassifier

In [7]:
gb_clf = GradientBoostingClassifier(random_state=42)
#gb_clf.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [32]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],  # Number of boosting stages.
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],  # Shrinkage parameter to control the contribution of each estimator. Small value means each tree in the ensemble has a minor impact on the final prediction lead to  gradual convergence of the algorithm.
    'max_depth': list(range(1, 10)),  # Maximum depth of individual decision trees.
    'min_samples_split': list(range(2, 21, 2)),  # Minimum samples required to split an internal node.
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Fraction of samples used for fitting the trees.
    'max_features': ['log', 'sqrt']+ [0.1, 0.2, 0.25, 0.33, 0.5],  # Maximum number of features to consider for a split.
}
gb_model = {}
gb_model['model']='Gradient_Boosting_Classifier'
outer_loop_arr = []
gb_model_lite = {}
gb_model_lite['model']='Gradient_Boosting_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(gb_clf, param_grid, fold_num, outer_train_index, outer_test_index,False,True)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
gb_model['outer_loop'] = outer_loop_arr
gb_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(gb_model)
all_models_lite.append(gb_model_lite)

to_json(gb_model_lite,gb_model,'gb')

ending_time = time.time()
print(ending_time - starting_time)
 

8.822300672531128


## KNeighbors Classifier

In [3]:
knn_clf = KNeighborsClassifier(n_jobs = -1)
#knn_clf.get_params()

In [18]:
param_grid = {
    'n_neighbors': list(range(1, 11, 2)),  # Number of neighbors to consider.
    'weights': ['uniform', 'distance'],  # Weighting of neighbors. 'uniform': all neighbors have equal weight, 'distance': closer neighbors have more influence.
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm for computing nearest neighbors.
    'p': [1, 2],  # Power parameter for Minkowski distance (1 for Manhattan, 2 for Euclidean).
}
knn_model = {}
knn_model['model']='K-Nearest_Neighbors_Classifier'
outer_loop_arr = []
knn_model_lite = {}
knn_model_lite['model']='K-Nearest_Neighbors_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(knn_clf, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
knn_model['outer_loop']= outer_loop_arr
knn_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(knn_model)
all_models_lite.append(knn_model_lite)

to_json(knn_model_lite,knn_model,'knn')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## LGBMClassifier

In [3]:
lgb_clf = LGBMClassifier(n_jobs = -1, class_weight = 'balanced',random_state = 42,force_col_wise = True)
#lgb_clf.get_params()

In [34]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 700],  # Number of boosting stages. Larger values may lead to better performance but longer training times.
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],  # Larger values shrinks the contribution of each tree, which can help prevent overfitting but may require more trees for similar predictive power.
    'max_depth': list(range(2, 10)),  # Maximum depth of individual trees. Larger values can capture more complex relationships and can lead to overfitting if too large.
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Fraction of samples used for fitting trees. A larger value means using more data for training.
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],  # Fraction of features used for fitting trees. A larger value increases diversity but may lead to overfitting if set too high.
}
lgb_model = {}
lgb_model['model']='LightGBM_Classifier'
outer_loop_arr = []
lgb_model_lite = {}
lgb_model_lite['model']='LightGBM_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(lgb_clf, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
lgb_model['outer_loop']= outer_loop_arr
lgb_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(lgb_model)
all_models_lite.append(lgb_model_lite)

to_json(lgb_model_lite,lgb_model,'lgb')

loading bar:

[LightGBM] [Info] Number of positive: 148, number of negative: 2840
[LightGBM] [Info] Total Bins 633
[LightGBM] [Info] Number of data points in the train set: 2988, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 149, number of negative: 2839
[LightGBM] [Info] Total Bins 632
[LightGBM] [Info] Number of data points in the train set: 2988, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 149, number of negative: 2839
[LightGBM] [Info] Total Bins 632
[LightGBM] [Info] Number of data points in the train set: 2988, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightG

## Logistic Regression classifier

In [15]:
logreg_clf = LogisticRegression(class_weight='balanced', n_jobs = -1, random_state=42)

In [16]:
param_grid = {
    'C': [0.1, 0.5, 1, 2, 10, 100],  # Regularization parameter. Larger values lead to weaker regularization.
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel function to use.
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (used with 'poly' kernel).
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1, 10],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels. Smaller gamma values lead to smoother decision boundaries witch can overfit the data.
    # gamma = scale => gamma = 1/n_features,  gamma = auto => gamma = 1/n_samples.
}
logreg_model = {}
logreg_model['model']='Logistic_Regression_Classifier'
outer_loop_arr = []
logreg_model_lite = {}
logreg_model_lite['model']='Logistic_Regression_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(logreg_clf, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
logreg_model['outer_loop']= outer_loop_arr
logreg_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(logreg_model)
all_models_lite.append(logreg_model_lite)

to_json(logreg_model_lite,logreg_model,'logreg')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## MLPClassifier

In [29]:
mlp_clf = MLPClassifier(random_state=42)

In [30]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,)],  # Number of neurons in each hidden layer. Larger value lead to more complex
    'activation': ['identity', 'logistic', 'tanh', 'relu'],  # Activation function for hidden layers. 'identity':  returns its input as-is, 'relu': Rectified Linear Unit
    'solver': ['lbfgs', 'sgd', 'adam'],  # Optimization algorithm.
    'alpha': np.logspace(-5, 2, 8),  # L2 regularization term. Larger value lead to stronger regularization
    'learning_rate': ['constant', 'invscaling', 'adaptive'],  # Learning rate schedule for weight updates.
}
mlp_model = {}
mlp_model['model']='Multi-layer_Perceptron_Classifier'
outer_loop_arr = []
mlp_model_lite = {}
mlp_model_lite['model']='Multi-layer_Perceptron_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(mlp_clf, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
mlp_model['outer_loop'] = outer_loop_arr
mlp_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(mlp_model)
all_models_lite.append(mlp_model_lite)

to_json(mlp_model_lite,mlp_model,'mlp')

ending_time = time.time()
print(ending_time - starting_time)

14.269916772842407


## QuadraticDiscriminantAnalysis

In [21]:
qda_clf = QuadraticDiscriminantAnalysis()

In [22]:
param_grid = {
    'priors': [None, [0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6]]  # Prior probabilities for each class.
}
qda_model = {}
qda_model['model']='Quadratic_Discriminant_Analysis_Classifier'
outer_loop_arr = []
qda_model_lite = {}
qda_model_lite['model']='Quadratic_Discriminant_Analysis_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(qda_clf, param_grid,fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
qda_model['outer_loop'] = outer_loop_arr
qda_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(qda_model)
all_models_lite.append(qda_model_lite)

to_json(qda_model_lite,qda_model,'qda')

ending_time = time.time()
print(ending_time - starting_time)

loading bar:





10%




20%




30%




40%




50%




60%




70%




80%




90%




100%




## RadiusNeighborsClassifier

In [27]:
rnc_clf = RadiusNeighborsClassifier()

In [28]:
param_grid = {
    'radius': [0.1, 0.5, 1.0, 1.5, 2.0],  # Radius within which neighbors are considered. Smaller radius considers only nearby data points.
    'weights': ['uniform', 'distance'],  # Weighting of neighbors. 'uniform': all neighbors have equal weight, 'distance': closer neighbors have more influence.
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm for computing neighbors.
    'p': [1, 2],  # Power parameter for Minkowski distance (1 for Manhattan, 2 for Euclidean). Affects distance computation.
}
rnc_model = {}
rnc_model['model']='Radius_Neighbors_Classifier'
outer_loop_arr = []
rnc_model_lite = {}
rnc_model_lite['model']='Radius_Neighbors_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(rnc_clf, param_grid,fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
rnc_model['outer_loop'] = outer_loop_arr
rnc_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(rnc_model)
all_models_lite.append(rnc_model_lite)

to_json(rnc_model_lite,rnc_model,'rnc')

ending_time = time.time()
print(ending_time - starting_time)

ValueError: No neighbors found for test samples array([   0,    1,    2, ..., 1491, 1492, 1493], dtype=int64), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.

## RandomForestClassifier

In [11]:
forest_clf = RandomForestClassifier(class_weight="balanced",random_state=42,n_jobs = -1)

In [12]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 1000],  # Number of trees in the forest. More trees usually lead to better performance. Larger values lead to stronger regularization. 
    'max_depth': list(range(2, 35, 3)) + [None],  # Maximum depth of the trees. None means no maximum depth. Deeper trees can capture more complex patterns but may overfit. Smaller values lead to stronger regularization.
    'min_samples_split': list(range(2, 20, 2)),  # Minimum samples required to split an internal node. Larger values help prevent overfitting. Larger values lead to stronger regularization.
    'max_features': ['log', 'sqrt']+ [0.1, 0.2, 0.25, 0.33, 0.5],  # Maximum number of features to consider for a split. Smaller values reduce model complexity. Smaller values lead to stronger regularization.
    'criterion': ['gini', 'entropy'],  # Criterion for measuring the quality of a split.
}
forest_model = {}
forest_model['model']='Random_Forest_Classifier'
outer_loop_arr = []
forest_model_lite = {}
forest_model_lite['model']='Random_Forest_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(forest_clf, param_grid, X_outer_train, y_outer_train)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
forest_model['outer_loop']= outer_loop_arr
forest_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(forest_model)
all_models_lite.append(forest_model_lite)

to_json(forest_model_lite,forest_model,'rf')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## Ridge Classifier

In [5]:
ridge_clf = RidgeClassifier(class_weight = 'balanced', random_state = 42)
#ridge_clf.get_params()

{'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.0001}

In [None]:
param_grid = {
    'alpha': np.logspace(-5, 2, 8),  # Regularization strength (L2 regularization). Smaller values lead to weaker regularization.
    'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],  # Algorithm for optimization.
    'max_iter': [None, 50, 100, 200, 300, 400, 500, 1000],  # Maximum number of optimization iterations. If None the model takes the default for each solver.
}
ridge_model = {}
ridge_model['model']='Ridge_Classifier'
outer_loop_arr = []
ridge_model_lite = {}
ridge_model_lite['model']='Ridge_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()

outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(ridge_reg, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
ridge_model['outer_loop'] = outer_loop_arr
ridge_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(ridge_model)
all_models_lite.append(ridge_model_lite)

to_json(ridge_model_lite,ridge_model,'ridge')

ending_time = time.time()
print(ending_time - starting_time)

## SGDClassifier

In [37]:
sgd_clf = SGDClassifier(n_jobs = -1, class_weight = 'balanced')

In [39]:
param_grid = {
    'loss': ['hinge', 'log', 'modified_huber'],  # Loss function to use for optimization.
    'penalty': ['l2', 'l1', 'elasticnet'],  # Penalty term for regularization.
    'alpha': np.logspace(-5, 2, 8),  # Regularization strength. Larger values lead to stronger regularization. 
    'max_iter': [50, 100, 200, 300, 400, 500, 1000],  # Maximum number of iterations.
}
sgd_model = {}
sgd_model['model']='SGD_Classifier'
outer_loop_arr = []
sgd_model_lite = {}
sgd_model_lite['model']='SGD_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(sgd_clf, param_grid, X_outer_train, y_outer_train, True, False)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1,True, False)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
sgd_model['outer_loop']= outer_loop_arr
sgd_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(sgd_model)
all_models_lite.append(sgd_model_lite)

to_json(sgd_model_lite,sgd_model,'sgd')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## Support Vector Classifier


In [13]:
svc_clf = SVC(class_weight='balanced', probability=True, random_state=42)

In [14]:
param_grid = {
    'C': [0.1, 0.5, 1, 2, 10, 100],  # Regularization parameter. Larger values allow for more flexible decision boundaries but may overfit.
    'kernel': ['linear', 'rbf', 'poly' , 'sigmoid'],  # Kernel function for mapping data to a higher-dimensional space. Functions: Linear, Radial basis function (RBF), Polynomial.
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (used with 'poly' kernel).
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1, 10],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels. Smaller gamma values lead to smoother decision boundaries witch can overfit the data.
    # gamma = scale => gamma = 1/n_features,  gamma = auto => gamma = 1/n_samples.
}
svc_model = {}
svc_model['model']='Support_Vector_Classifier'
outer_loop_arr = []
svc_model_lite = {}
svc_model_lite['model']='Support_Vector_Classifier'
outer_loop_arr_lite = []

starting_time = time.time()


outer_loop_results = Parallel(n_jobs=-1)(
    delayed(process_fold)(svc_clf, param_grid, fold_num, outer_train_index, outer_test_index)
    for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y))
)

# Extract the results for each fold
outer_loop_arr = [result[0] for result in outer_loop_results]
outer_loop_arr_lite = [result[1] for result in outer_loop_results]

# Continue with appending the results to your all_models and all_models_lite lists
svc_model['outer_loop'] = outer_loop_arr
svc_model_lite['outer_loop'] = outer_loop_arr_lite
all_models.append(svc_model)
all_models_lite.append(svc_model_lite)

to_json(svc_model_lite,svc_model,'svc')

ending_time = time.time()
print(ending_time - starting_time)
 

59.21150255203247


## XGBClassifier

In [5]:
xgb_clf = XGBClassifier(random_state=42, n_jobs = -1)
#xgb_clf.get_params()

In [24]:
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 700],  # Number of boosting stages. Larger values may lead to better performance but longer training times.
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0], # Larger values shrinks the contribution of each tree, which can help prevent overfitting but may require more trees for similar predictive power.
    'max_depth': list(range(1, 10)),  # Maximum depth of individual trees. Larger values can capture more complex relationships and can lead to overfitting if too large.
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Fraction of samples used for fitting trees.  Smaller values reduce overfitting risk. 
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],  # Fraction of features used for fitting trees. A larger value increases diversity but may lead to overfitting if set too high.
    'objective': ['binary:logistic'],  # Learning task and objective function for binary classification.
    'eval_metric': ['logloss', 'auc'],  # Evaluation metric to optimize. Logloss measures classification accuracy, AUC measures area under the ROC curve.
}
xgb_model = {}
xgb_model['model']='XGB_Classifier'
outer_loop_arr = []
xgb_model_lite = {}
xgb_model_lite['model']='XGB_Classifier'
outer_loop_arr_lite = []
print('loading bar:\n')
for fold_num, (outer_train_index, outer_test_index) in enumerate(outer_cv.split(X, y)):
    X_outer_train, X_outer_test = Xarr[outer_train_index], Xarr[outer_test_index]
    y_outer_train, y_outer_test = yarr[outer_train_index], yarr[outer_test_index]
    
    best_model,best_param,param_arr,param_arr_lite = grid_search(xgb_clf, param_grid, X_outer_train, y_outer_train,False,True)
    outer_loop,outer_loop_lite=outer_metric(best_model, best_param, X_outer_train, y_outer_train,outer_train_index,\
                 X_outer_test,y_outer_test,outer_test_index,fold_num + 1,False,True)
    outer_loop['param_comb']= param_arr
    outer_loop_arr.append(outer_loop)
    outer_loop_lite['param_comb']= param_arr_lite
    outer_loop_arr_lite.append(outer_loop_lite)
    print(f'{(fold_num+1)*10}%')
xgb_model['outer_loop']= outer_loop_arr
xgb_model_lite['outer_loop']= outer_loop_arr_lite
all_models.append(xgb_model)
all_models_lite.append(xgb_model_lite)

to_json(xgb_model_lite,xgb_model,'xgb')

loading bar:

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


## making the JSON file

In [40]:


# Convert the array of dictionaries to a JSON array using the custom encoder
#json_array = json.dumps(all_models, cls=NumpyArrayEncoder,indent=2)

# Save the JSON array to a file
#with open(f'result_dataset{momental_dataset}/allmodels_lite.json', 'w') as json_file:
    #json_file.write(json_array)


json_array = json.dumps(all_models_lite, cls=NumpyArrayEncoder,indent=2)

# Save the JSON array to a file
with open(f'result_dataset{momental_dataset}/allmodels_lite.json', 'w') as json_file:
    json_file.write(json_array)
