In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import pickle
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from scipy.stats import zscore
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
from sklearn.tree import DecisionTreeRegressor

# Load the dataset
df = pd.read_csv("diabetes - DS.csv")

# Preprocessing
# Work on a deep copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

# A value of 0 in these columns is treated as a missing reading: it is turned
# into NaN here and imputed below. `np.nan` is used because the `np.NaN`
# alias no longer exists in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy[zero_as_missing] = df_copy[zero_as_missing].replace(0, np.nan)

# Impute by assigning the filled column back. Calling
# `df_copy[col].fillna(..., inplace=True)` operates on a temporary Series and
# does not update `df_copy` when pandas Copy-on-Write is active.
# NOTE(review): the mean/median are computed on the full dataset, i.e. before
# the train/test split below - confirm this leakage is acceptable.
for column in ('Glucose', 'BloodPressure'):
    df_copy[column] = df_copy[column].fillna(df_copy[column].mean())
for column in ('SkinThickness', 'Insulin', 'BMI'):
    df_copy[column] = df_copy[column].fillna(df_copy[column].median())

# Separate the label column (y) from the predictor columns (X)
y = df_copy['Outcome']
X = df_copy.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=194,
)

# Rebuild the held-out features as a DataFrame carrying the feature column names
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Put the held-out features and their labels side by side (aligned on the row index)
test_df = pd.concat((X_test_df, y_test), axis="columns")

# Write only the (unscaled) test split to disk, without the index column
test_df.to_csv("preprocessed_test_dataset.csv", index=False)

# Standardize the features using StandardScaler.
# The scaler is fitted on the training split only and then applied to the
# test split; both become NumPy arrays from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler using pickle
with open('standard_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Remove outliers using z-score: keep a training row only if every feature
# lies strictly within `threshold` standard deviations. The mask is computed
# once and applied to both features and labels so they stay in step.
z_scores = zscore(X_train)
threshold = 3
inlier_mask = (np.abs(z_scores) < threshold).all(axis=1)
X_train_no_outliers = X_train[inlier_mask]
y_train_no_outliers = y_train[inlier_mask]

# Use RandomOverSampler to handle class imbalance (training data only)
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train_no_outliers, y_train_no_outliers)

# Concatenate X_test and y_test and convert to DataFrame.
# X_test is now a plain array, so its frame gets a fresh 0..n-1 index, while
# y_test still carries the original row labels. The label index is reset so
# the two line up row by row instead of being outer-joined into NaN rows.
test_data = pd.concat(
    [
        pd.DataFrame(X_test, columns=X.columns),
        y_test.reset_index(drop=True).to_frame(name='Outcome'),
    ],
    axis=1,
)

# Save the preprocessed data to a CSV file
df_copy.to_csv('preprocessed_data.csv', index=False)

class GradientBoostingClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier built from a sequence of regression trees.

    Each tree is fitted to the current residuals of the labels; the residuals
    are then reduced by ``learning_rate`` times that tree's prediction. The
    raw score of a sample is the learning-rate-weighted sum of all tree
    outputs and is thresholded at 0.5 to produce a class.

    Assumes the labels are encoded as 0 and 1.

    Parameters:
        n_estimators: number of trees fitted by ``fit``.
        learning_rate: factor applied to every tree's contribution.
        max_depth: maximum depth passed to each ``DecisionTreeRegressor``.
    """

    def __init__(self, n_estimators=300, learning_rate=0.1, max_depth=2):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Fitted trees, in boosting order; rebuilt on every call to fit().
        self.models = []

    def fit(self, X, y):
        """Fit ``n_estimators`` trees on ``X`` / ``y`` and return ``self``."""
        # Working copy of the labels; it is updated in place below.
        residuals = np.array(y, dtype=float)

        # Start from a clean slate so that calling fit() again replaces the
        # ensemble instead of stacking new trees on top of the old ones
        # (which would inflate every score returned by predict()).
        self.models = []

        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.models.append(tree)
            residuals -= self.learning_rate * tree.predict(X)

        return self

    def _raw_score(self, X):
        """Return the learning-rate-weighted sum of all tree predictions."""
        score = np.zeros(len(X))
        for tree in self.models:
            score += self.learning_rate * tree.predict(X)
        return score

    def predict(self, X):
        """Return 1 where the raw score exceeds 0.5, otherwise 0."""
        return (self._raw_score(X) > 0.5).astype(int)

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of ``[P(class 0), P(class 1)]``.

        The trees regress directly on the 0/1 labels, so the raw score is
        already on the label scale. It is clipped to ``[0, 1]`` and used as
        P(class 1). Passing it through a sigmoid instead would map every
        non-negative score to at least 0.5 and contradict ``predict``.
        """
        proba_positive_class = np.clip(self._raw_score(X), 0.0, 1.0)
        proba_negative_class = 1.0 - proba_positive_class
        return np.column_stack((proba_negative_class, proba_positive_class))

    def score(self, X, y):
        """Return the fraction of samples in ``X`` whose prediction equals ``y``."""
        return np.mean(self.predict(X) == np.asarray(y))

class CustomKNN(BaseEstimator, ClassifierMixin):
    """k-nearest-neighbours classifier using Euclidean distance.

    Parameters:
        k: number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        """Store the training data and return ``self``.

        Both inputs are converted to NumPy arrays. This keeps neighbour
        lookups positional: indexing a pandas Series with an integer is
        label-based and returns the wrong label (or raises ``KeyError``) when
        the Series does not have a 0..n-1 index.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        return self

    def _k_nearest_labels(self, x):
        """Return the labels of the ``k`` training rows closest to ``x``."""
        # One vectorised distance computation against every training row.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]
        return self.y_train[k_indices].tolist()

    def _predict(self, x):
        """Return the most common label among the neighbours of ``x``."""
        return Counter(self._k_nearest_labels(x)).most_common(1)[0][0]

    def predict(self, X):
        """Return the predicted label for every row of ``X``."""
        # Converting first means a DataFrame is iterated row by row rather
        # than by column name.
        rows = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in rows])

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of neighbour vote shares.

        Column 0 is the share of neighbours labelled 0 and column 1 the share
        labelled 1; other labels are not reported.
        """
        probabilities = []
        for x in np.asarray(X, dtype=float):
            neighbours = self._k_nearest_labels(x)
            class_counts = Counter(neighbours)
            # Divide by the number of neighbours actually found, which is
            # smaller than k when there are fewer than k training rows.
            n_neighbours = len(neighbours)
            probabilities.append(
                [class_counts[0] / n_neighbours, class_counts[1] / n_neighbours]
            )
        return np.array(probabilities)

class CustomVotingClassifier:
    """Weighted-vote ensemble over binary (0/1) classifiers.

    Parameters:
        models: objects exposing ``fit(X, y)`` and ``predict(X)``.
        weights: optional initial weight per model; defaults to equal weights.
            ``fit`` replaces them with hold-out accuracies.
        random_state: seed forwarded to ``train_test_split`` in ``fit`` so the
            learned weights are reproducible; ``None`` keeps the split random.
    """

    def __init__(self, models, weights=None, random_state=None):
        self.models = models
        self.random_state = random_state
        if weights is None:
            self.weights = [1 / len(models)] * len(models)
        else:
            # Copy so the caller's sequence is never mutated (and so a tuple
            # is accepted as well).
            self.weights = list(weights)

    def fit(self, X, y):
        """Learn the per-model weights, fit every model, and return ``self``.

        The weights are the accuracies of throw-away copies of the models
        that are trained on 80% of the data and scored on the remaining 20%.
        Scoring models on rows they were trained on would only measure how
        well they memorise the training set. The real models are then fitted
        once on all of ``X`` / ``y``.
        """
        from copy import deepcopy

        # Plain array so that the shuffled subsets are indexed by position.
        y = np.asarray(y)
        X_fit, X_val, y_fit, y_val = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state
        )

        accuracies = []
        for model in self.models:
            probe = deepcopy(model)
            probe.fit(X_fit, y_fit)
            accuracies.append(accuracy_score(y_val, probe.predict(X_val)))

        if sum(accuracies) > 0:
            self.weights = accuracies
        else:
            # np.average cannot normalise all-zero weights; vote equally.
            self.weights = [1 / len(self.models)] * len(self.models)

        for model in self.models:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return 1 where the weighted mean of the model votes exceeds 0.5.

        A weighted mean of exactly 0.5 yields class 0.
        """
        predictions = np.array([model.predict(X) for model in self.models])
        weighted_votes = np.average(predictions, axis=0, weights=self.weights)
        return (weighted_votes > 0.5).astype(int)

# Build the two base learners and wrap them in the weighted voting ensemble
knn_model = CustomKNN()
gb_model = GradientBoostingClassifier()
voting_clf = CustomVotingClassifier(models=[knn_model, gb_model])

# Train the ensemble on the resampled training data
voting_clf.fit(X_train, y_train)

# Predict on both splits (only the test predictions are evaluated below)
voting_preds_train = voting_clf.predict(X_train)
voting_preds_test = voting_clf.predict(X_test)

# Test-set accuracy
accuracy_test = accuracy_score(y_test, voting_preds_test)
print("Accuracy on Test Set:", accuracy_test)

# Class-weighted F1 / precision / recall, plus the confusion matrix
weighted = {"average": "weighted"}
f1 = f1_score(y_test, voting_preds_test, **weighted)
precision = precision_score(y_test, voting_preds_test, **weighted)
recall = recall_score(y_test, voting_preds_test, **weighted)
conf_matrix = confusion_matrix(y_test, voting_preds_test)

report = (
    ("F1 Score:", f1),
    ("Precision:", precision),
    ("Recall:", recall),
    ("Confusion Matrix:\n", conf_matrix),
)
for label, value in report:
    print(label, value)

# Persist the fitted ensemble with pickle
with open('hybrid_model.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)

Accuracy on Test Set: 0.8441558441558441
F1 Score: 0.8459448714550755
Precision: 0.850370147247935
Recall: 0.8441558441558441
Confusion Matrix:
 [[86 15]
 [ 9 44]]
