In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Load the dataset
df = pd.read_csv('pima_indians_diabetes.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
columns_with_zeros = ['plasma_concentration', 'diastolic_blood_pressure', 'triceps_skinfold_thickness', 
                      'serum_insulin', 'bmi']
df[columns_with_zeros] = df[columns_with_zeros].replace(0, np.nan)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df[columns_with_zeros] = imputer.fit_transform(df[columns_with_zeros])

# Standardize the features
scaler = StandardScaler()
features = df.drop(columns=['class'])
scaled_features = scaler.fit_transform(features)

# Combine scaled features with the target variable
df_scaled = pd.DataFrame(scaled_features, columns=features.columns)
df_scaled['class'] = df['class']

In [None]:
df_scaled.head()

In [3]:
from sklearn.model_selection import train_test_split
X_scaled=df_scaled.drop(columns='class')
y=df_scaled['class']
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [35]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
n_estimators_options = [10, 15, 20]
max_features_options = [ 'sqrt', 'log2',None]
max_depth_options = [15, 15, 20]

# Function to perform cross-validation
def cross_val_score_rf(X, y, n_estimators, max_features, max_depth, cv=10):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        rf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, random_state=42)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        accuracies.append(accuracy_score(y_val, y_pred))
        precisions.append(precision_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))

    
    return [np.mean(accuracies), np.mean(precisions), np.mean(recalls), np.mean(f1_scores) ]

# Perform cross-validation for each combination of parameters
best_score = 0
best_params = {}

for n_estimators in n_estimators_options:
    for max_features in max_features_options:
        for max_depth in max_depth_options:
            temp_accuracies,temp_precisions,temp_recalls,temp_f1_scores = cross_val_score_rf(X_scaled, y, n_estimators, max_features, max_depth, cv=10)
            print(f'n_estimators: {n_estimators}, max_features: {max_features}, max_depth: {max_depth},accuracy: {temp_accuracies}, precision: {temp_precisions}, recall: {temp_recalls}, f1_score: {temp_f1_scores}')
            if temp_f1_scores > best_score:
                best_score = temp_f1_scores
                best_params = {
                    'n_estimators': n_estimators,
                    'max_features': max_features,
                    'max_depth': max_depth

                }

print(best_score, best_params)

n_estimators: 10, max_features: sqrt, max_depth: 15,accuracy: 0.7551435406698563, precision: 0.6727917396613049, recall: 0.5727906561777529, f1_score: 0.6160042624976325
n_estimators: 10, max_features: sqrt, max_depth: 15,accuracy: 0.7551435406698563, precision: 0.6727917396613049, recall: 0.5727906561777529, f1_score: 0.6160042624976325
n_estimators: 10, max_features: sqrt, max_depth: 20,accuracy: 0.7538619275461381, precision: 0.6817583492022853, recall: 0.5482875296746265, f1_score: 0.605146055561536
n_estimators: 10, max_features: log2, max_depth: 15,accuracy: 0.7434381408065618, precision: 0.6549095975275648, recall: 0.542762758149855, f1_score: 0.5873461200409931
n_estimators: 10, max_features: log2, max_depth: 15,accuracy: 0.7434381408065618, precision: 0.6549095975275648, recall: 0.542762758149855, f1_score: 0.5873461200409931
n_estimators: 10, max_features: log2, max_depth: 20,accuracy: 0.7421394395078605, precision: 0.6590078279252509, recall: 0.5360130490775652, f1_score: 0.

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score

# Function to perform cross-validation for multiple ensemble methods
def cross_val_score_ensemble(X, y, models, cv=10):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    model_accuracies = {name: [] for name in models.keys()}
    model_precisions = {name: [] for name in models.keys()}
    model_recalls = {name: [] for name in models.keys()}
    model_f1_scores = {name: [] for name in models.keys()}
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            model_accuracies[name].append(accuracy_score(y_val, y_pred))
            model_precisions[name].append(precision_score(y_val, y_pred))
            model_recalls[name].append(recall_score(y_val, y_pred))
            model_f1_scores[name].append(f1_score(y_val, y_pred))

    
    avg_accuracies = {name: np.mean(scores) for name, scores in model_accuracies.items()}
    avg_precisions = {name: np.mean(scores) for name, scores in model_precisions.items()}
    avg_recalls = {name: np.mean(scores) for name, scores in model_recalls.items()}
    avg_f1_scores = {name: np.mean(scores) for name, scores in model_f1_scores.items()}

    return [avg_accuracies, avg_precisions, avg_recalls, avg_f1_scores]

# Define the models
models = {
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}



# Perform cross-validation for ensemble methods
ensemble_scores = cross_val_score_ensemble(X_scaled, y, models, cv=10)
print(f'Ensemble Models Accuracies: {ensemble_scores[0]}'+ '\n'+'Ensemble Models Precisions:',ensemble_scores[1],'\n'+'Ensemble Models Recalls:',ensemble_scores[2],'\n'+'Ensemble Models F1 Scores:',ensemble_scores[3])    



Ensemble Models Accuracies: {'Gradient Boosting': 0.7603212576896787, 'AdaBoost': 0.753879015721121, 'XGBoost': 0.7473513328776487}
Ensemble Models Precisions: {'Gradient Boosting': 0.6769140630412, 'AdaBoost': 0.6620417780217558, 'XGBoost': 0.6579206669289789} 
Ensemble Models Recalls: {'Gradient Boosting': 0.6203810609939643, 'AdaBoost': 0.5978312774441806, 'XGBoost': 0.6116826482955514} 
Ensemble Models F1 Scores: {'Gradient Boosting': 0.6421174304489917, 'AdaBoost': 0.624575230812355, 'XGBoost': 0.6276340177982755}
