In [207]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.utils import resample

In [208]:
class LogisticRegression:
    def __init__ (self, lr=0.0001, iter=1000, tol=1e-6):
        self.lr = lr
        self.iter = iter
        self.weights = None
        self.tol = tol
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def update_weights(self, X, y):
        y_pred = self.sigmoid(np.dot(X, self.weights))
        error = y - y_pred
        gradient = np.dot(X.T, error)
        self.weights += self.lr * gradient

    def train(self, X, y):
        n_samples, n_features = X.shape
        X = np.hstack((np.ones((n_samples, 1)), X))
        self.weights = np.zeros(n_features + 1)
        for _ in range(self.iter):
            self.update_weights(X, y)
            
    def predict(self, X, final=True):
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        if final:
            return np.round(self.sigmoid(np.dot(X, self.weights)))
        return self.sigmoid(np.dot(X, self.weights))

In [209]:
class Bagging:
    def __init__ (self, base_estimator = LogisticRegression, n_estimators=9, random_state=40, sample_size=1.0):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.sample_size = sample_size
        self.estimators_ = []
    
    def train(self, X, y):
        self.estimators_ = []
        np.random.seed(self.random_state)  
        for _ in range(self.n_estimators):
            estimator = self.base_estimator()
            X_resampled, y_resampled = resample(X, y, n_samples=int(self.sample_size * X.shape[0]))
            estimator.train(X_resampled, y_resampled)
            self.estimators_.append(estimator)
    
    def predict(self, X, final=True):
        predictions = []
        for estimator in self.estimators_:
            predictions.append(estimator.predict(X, final))
        predictions = np.array(predictions)
        return predictions
    
    def predict_majority(self, X, final=True):
        predictions = self.predict(X, final)
        majority_votes = []
        for i in range(predictions.shape[1]):
            feature_predictions = predictions[:, i].astype(int)
            majority_vote = np.argmax(np.bincount(feature_predictions))
            majority_votes.append(majority_vote)
        return np.array(majority_votes)
    
    def predict_prob(self, X):
        predictions = self.predict(X, final=False)
        return np.mean(predictions, axis=0)

In [210]:
class Stacking:
    def __init__ (self, base_estimator = LogisticRegression, meta_estimator= LogisticRegression):
        self.base_estimators = base_estimator
        self.meta_estimator = meta_estimator
        self.meta_model = None
        self.base_models = None

    def train(self, X, y, X_meta, y_meta):
        self.base_models = Bagging(self.base_estimators, n_estimators=9)
        self.base_models.train(X, y)
        meta_samples = self.base_models.predict(X_meta, False).T
        meta_samples = np.hstack([ X_meta , meta_samples])
        self.meta_model = self.meta_estimator()
        self.meta_model.train(meta_samples, y_meta)

    def predict(self, X, final=True):
        meta_samples = self.base_models.predict(X, False).T
        meta_samples = np.hstack([X, meta_samples])
        prediction = self.meta_model.predict(meta_samples, final)
        return prediction

In [211]:
df= pd.read_csv('B1.csv')
df.shape

(3000, 3)

In [212]:
df[df.isnull().any(axis=1)]

Unnamed: 0,X1,X2,y


In [213]:
df[df.duplicated(keep=False)]

Unnamed: 0,X1,X2,y


In [214]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Column '{column}' has values {unique_values}")

Column 'X1' has values [ 0.02114873  0.17373477  0.25104132 ... -0.06410917  0.77160769
 -0.79093564]
Column 'X2' has values [ 0.41824576 -0.99352424  0.44758846 ... -1.0313122  -0.79397815
 -0.37937787]
Column 'y' has values [1 0]


In [215]:
df.dtypes

X1    float64
X2    float64
y       int64
dtype: object

In [216]:
X = df[['X1', 'X2']].values

In [217]:
y = df['y'].values
y

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [218]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

def scaling(scalingmethod):
    if scalingmethod == 'minmax':
        return MinMaxScaler()
    elif scalingmethod == 'standard':
        return StandardScaler()
    else:
        raise ValueError("Invalid scaling method. Choose 'minmax' or 'standard'.")

In [219]:
scaler = scaling('standard')

In [220]:
X = scaler.fit_transform(X)

In [221]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Code from implemented LR and Stacking

## My implemented LR and Stacking

In [222]:
# y_train = y_train.to_numpy().ravel()
# y_test = y_test.to_numpy().ravel()

# from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# y_pred_prob = clf.predict_proba(X_test)[:, 1]

clf.train(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict(X_test, False)

accuracy = accuracy_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_prob)
aupr = average_precision_score(y_test, y_pred_prob)

print(f"Accuracy of Logistic Regression classifier: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")

Accuracy of Logistic Regression classifier: 0.5617
Sensitivity (Recall): 0.9118
Specificity: 0.2713
Precision: 0.5092
F1-score: 0.6535
AUROC: 0.4559
AUPR: 0.3892


In [223]:
# y_train = y_train.to_numpy().ravel()
# y_test = y_test.to_numpy().ravel()

from sklearn.linear_model import LogisticRegression

# clf = LogisticRegression()

# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# y_pred_prob = clf.predict_proba(X_test)[:, 1]

clf.train(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict(X_test, False)

accuracy = accuracy_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_prob)
aupr = average_precision_score(y_test, y_pred_prob)

print(f"Accuracy of Logistic Regression classifier: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUROC: {auroc:.4f}")
print(f"AUPR: {aupr:.4f}")

Accuracy of Logistic Regression classifier: 0.5617
Sensitivity (Recall): 0.9118
Specificity: 0.2713
Precision: 0.5092
F1-score: 0.6535
AUROC: 0.4559
AUPR: 0.3892


In [224]:
clf = Bagging()
clf.train(X_train, y_train)
y_pred = clf.predict_majority(X_test)
y_pred_prob = clf.predict_prob(X_test)

majority_accuracy = accuracy_score(y_test, y_pred)
majority_sensitivity = recall_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
majority_specificity = tn / (tn + fp)
majority_precision = precision_score(y_test, y_pred)
majority_f1 = f1_score(y_test, y_pred)
majority_auroc = roc_auc_score(y_test, y_pred_prob)
majority_aupr = average_precision_score(y_test, y_pred_prob)

print(f"Accuracy of Bagging: {majority_accuracy:.4f}")
print(f"Sensitivity (Recall): {majority_sensitivity:.4f}")
print(f"Specificity: {majority_specificity:.4f}")
print(f"Precision: {majority_precision:.4f}")
print(f"F1-score: {majority_f1:.4f}")
print(f"AUROC: {majority_auroc:.4f}")
print(f"AUPR: {majority_aupr:.4f}")

Accuracy of Bagging: 0.5367
Sensitivity (Recall): 0.9963
Specificity: 0.1555
Precision: 0.4945
F1-score: 0.6610
AUROC: 0.4563
AUPR: 0.3894


In [225]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, precision_score, f1_score, roc_auc_score, average_precision_score

# Initialize 10 BaggingClassifiers with Logistic Regression as the base estimator
classifiers = [('bag' + str(i), BaggingClassifier(estimator=LogisticRegression(), n_estimators=10)) for i in range(10)]

# Create a VotingClassifier with 10 BaggingClassifiers
voting_clf = VotingClassifier(estimators=classifiers, voting='soft')  # Use 'hard' for majority voting, 'soft' for weighted voting

# Train the VotingClassifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test)
y_pred_prob = voting_clf.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
majority_accuracy = accuracy_score(y_test, y_pred)
majority_sensitivity = recall_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
majority_specificity = tn / (tn + fp)
majority_precision = precision_score(y_test, y_pred)
majority_f1 = f1_score(y_test, y_pred)
majority_auroc = roc_auc_score(y_test, y_pred_prob)
majority_aupr = average_precision_score(y_test, y_pred_prob)

# Print evaluation metrics
print(f"Accuracy of VotingClassifier: {majority_accuracy:.4f}")
print(f"Sensitivity (Recall): {majority_sensitivity:.4f}")
print(f"Specificity: {majority_specificity:.4f}")
print(f"Precision: {majority_precision:.4f}")
print(f"F1-score: {majority_f1:.4f}")
print(f"AUROC: {majority_auroc:.4f}")
print(f"AUPR: {majority_aupr:.4f}")

Accuracy of VotingClassifier: 0.5667
Sensitivity (Recall): 0.9228
Specificity: 0.2713
Precision: 0.5122
F1-score: 0.6588
AUROC: 0.4561
AUPR: 0.3894


In [226]:

X_train, X_meta, y_train, y_meta = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier

clf = Stacking()
clf.train(X_train, y_train, X_meta, y_meta)
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict(X_test, False)

stacking_accuracy = accuracy_score(y_test, y_pred)
stacking_sensitivity = recall_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
stacking_specificity = tn / (tn + fp)
stacking_precision = precision_score(y_test, y_pred)
stacking_f1 = f1_score(y_test, y_pred)
stacking_auroc = roc_auc_score(y_test, y_pred_prob)
stacking_aupr = average_precision_score(y_test, y_pred_prob)

print(f"Accuracy of Stacking: {stacking_accuracy:.4f}")
print(f"Sensitivity (Recall): {stacking_sensitivity:.4f}")
print(f"Specificity: {stacking_specificity:.4f}")
print(f"Precision: {stacking_precision:.4f}")
print(f"F1-score: {stacking_f1:.4f}")
print(f"AUROC: {stacking_auroc:.4f}")
print(f"AUPR: {stacking_aupr:.4f}")

Accuracy of Stacking: 0.5167
Sensitivity (Recall): 0.5037
Specificity: 0.5274
Precision: 0.4692
F1-score: 0.4858
AUROC: 0.5310
AUPR: 0.4233
