In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif, SelectKBest
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
def data_scaling(features, number_type, scale_type='minmax'):
    if scale_type == 'minmax':
        scaler = MinMaxScaler()
    elif scale_type == 'standard':
        scaler = StandardScaler()

    features_scaled = scaler.fit_transform(features[number_type])
    features_scaled_df = pd.DataFrame(features_scaled, columns=number_type)
    features_nonnum = features.drop(columns=number_type)

    features_df = pd.concat([features_scaled_df, features_nonnum], axis=1)
    return features_df

In [20]:
def encode_labels(labels):
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)
    return labels

In [21]:
def encode_features(features):
    encoder = LabelEncoder()
    all_columns = features.columns
    for col in all_columns:
        if features[col].nunique() == 1:
            features = features.drop(columns=col)
    
    object_type = features.select_dtypes(include='object').columns
    features[object_type] = features[object_type].astype('category')
    object_binary = []

    for obj in object_type:
        if features[obj].nunique() == 2:
            object_binary.append(obj)

    for obj in object_binary:
        object_type = object_type.drop(obj)

    for obj in object_binary:
        features[obj] = encoder.fit_transform(features[obj])

    features = pd.get_dummies(features).astype(int)

    return features

In [22]:
def feature_selection(features, labels, method=1):
    if method == 1:
        correlation = features.corrwith(labels)
        corr_cols = correlation.abs().nlargest(20).keys().to_list()
        selected_features = features[corr_cols]

    else:
        info_gain = SelectKBest(score_func=mutual_info_classif, k=20)
        info_gain.fit(features, labels)
        top_features = info_gain.get_support(indices=True)
        selected_features = features.iloc[:, top_features]
    return selected_features

In [23]:
def normalize(df):
    col_list = df.columns.to_list()
    df_arr = np.array(df)
    normalizer = Normalizer(norm='l1').fit(df_arr)
    normalizer.transform(df_arr)
    df_ = pd.DataFrame(df_arr, columns=col_list)
    return df_
    

In [24]:
def data_preprocessing(dataset):
    test_features_s = None
    test_labels_df = None
    if dataset == 1:
        df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

        df.replace({' ': np.nan}, inplace=True)
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
        #df.dropna(subset=['TotalCharges'], inplace=True)
        df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
        df = df.drop('customerID', axis=1)
        df.drop_duplicates(inplace=True)

    elif dataset == 2:
        df = pd.read_csv('adult/adult.data', header=None)
        test_df = pd.read_csv('adult/adult.test', header=None, skiprows=1)

        df.columns = df.columns.astype(str)
        test_df.columns = test_df.columns.astype(str)

        object_type = df.select_dtypes(include='object').columns
        for col in object_type:
            df[col] = df[col].str.strip()
            test_df[col] = test_df[col].str.strip()
            
        test_df[test_df.columns[-1]] = test_df[test_df.columns[-1]].str.rstrip('.')

        df.replace({'?' : np.nan}, inplace=True)
        test_df.replace({'?' : np.nan}, inplace=True)

        df.drop_duplicates(inplace=True)

        for col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0])

        for col in test_df.columns:
            test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

        test_label_col = test_df.columns[-1]
        test_features = test_df.drop(test_label_col, axis=1)
        test_labels = test_df[test_label_col]
        test_labels = encode_labels(test_labels)
        number_type = test_features.select_dtypes(include='number').columns

        test_features = encode_features(test_features)
        test_features = data_scaling(test_features, number_type)

    elif dataset == 3:
        df_ = pd.read_csv('creditcard.csv')
        label = df_.columns[-1]
        df_1 = df_[df_[label] == 1]
        df_0_sampled = df_[df_[label] == 0].sample(n=20000, random_state=42)
        df = pd.concat([df_1, df_0_sampled])

    elif dataset == 4:
        df = pd.read_csv('A1.csv')
        df = df.fillna(df.mean(numeric_only=True))
        df.columns = df.columns.astype(str)
    
    df.reset_index(drop=True, inplace=True)

    label_col = df.columns[-1]
    features = df.drop(label_col, axis=1)
    labels = df[label_col]

    if dataset != 4:
        labels = encode_labels(labels)
    number_type = features.select_dtypes(include='number').columns
    
    features = encode_features(features)
    features = data_scaling(features, number_type)

    label_df = pd.DataFrame(labels, columns=[label_col])
    label_series = label_df[label_col]
    selected_features = feature_selection(features, label_series)
    # selected_features = normalize(selected_features)

    if dataset == 2:
        cols = selected_features.columns
        test_features_s = test_features[cols]
        test_labels_df = pd.DataFrame(test_labels, columns=[test_label_col])
        
    return selected_features, label_df, test_features_s, test_labels_df

In [25]:
class LogisticRegressionImp:
    def __init__(self, alpha=0.001, iterations=1000):
        self.alpha = alpha
        self.iterations = iterations
        self.w = None
        self.bias = 0

    def sigmoid(self, z):
        return 1/(1+np.exp(-z))

    def fit(self, features, labels):
        features_arr = np.array(features)
        labels_arr = np.array(labels)
        labels_arr = labels_arr.reshape(-1, 1)
        n = features.shape[1]   #how many columns
        self.w = np.zeros((n, 1))
        # self.bias = 0
        
        for i in range(self.iterations):
            h = self.sigmoid(np.dot(features_arr, self.w))
            dz = labels_arr - h
            
            gradient = np.dot(features_arr.T, dz)

            self.w = self.w + self.alpha*gradient
            # self.bias += (self.alpha*np.sum(dz))

    def predict(self, features):
        features_arr = np.array(features)
        probs = self.sigmoid(np.dot(features_arr, self.w))
        predictions = []
        for p in probs.flatten():
            if p >= 0.5:
                predictions.append(1)
            else:
                predictions.append(0)
        return predictions
    
    def prob_predict(self, features):
        features_arr = np.array(features)
        probs = self.sigmoid(np.dot(features_arr, self.w))
        return probs


In [26]:
def majority_voting(X_test, models):
    all_predictions = []
    # all_prob = []

    for i in range(len(models)):
        clf = models[i]
        y_pred = clf.predict(X_test)
        # y_prob = clf.prob_predict(X_test)
        y_pred_df = pd.DataFrame(y_pred)
        # y_prob_df = pd.DataFrame(y_prob)
        all_predictions.append(y_pred_df)
        # all_prob.append(y_prob_df)

    votes = pd.concat(all_predictions, axis='columns')
    # probs = pd.concat(all_prob, axis='columns')
    predictions = votes.mode(axis='columns').to_numpy()
    # probabilities = probs.mean(axis='columns').to_numpy()
    return predictions

In [31]:
class stacking:
    def __init__(self, models, meta):
        self.models = models
        self.meta = meta

    def fit(self, features, labels): #use validation set here
        meta_features = []

        for model in self.models:
            pred = model.predict(features)
            meta_features.append(pred)

        meta_features = np.array(meta_features).T
        meta_df = pd.DataFrame(meta_features)
        fpred_df = pd.concat([features, meta_df], axis=1)
        #concat meta_features with validation set
        fpred_df.columns = fpred_df.columns.astype(str)

        self.meta.fit(fpred_df, labels)

    def predict(self, X_test):
        meta_features = [model.predict(X_test) for model in self.models]
        meta_features = np.array(meta_features)
        meta_features = meta_features.T
        meta_df = pd.DataFrame(meta_features)

        fpred_df = pd.concat([X_test.reset_index(drop=True), meta_df.reset_index(drop=True)], axis=1)
        fpred_df.columns = fpred_df.columns.astype(str)
        #append test with meta featues
        
        y_pred = self.meta.predict(fpred_df)
        # y_prob = self.meta.prob_predict(fpred_df)
        return y_pred
    
    # def prob_predict(self, features):
    #     return self.meta.prob_predict(features)

In [33]:
def bagging(features, labels):
    label_col = labels.columns[0]
    bagged_sets = []
    n = len(features)
    print(n)
    print(features.shape)
    
    training_set = pd.concat([features, labels], axis=1)
    for i in range(9):
        sample = resample(training_set, n_samples=n, replace=True)
        bagged_sets.append(sample)

    models = []
    for i in range(9):
        bag = bagged_sets[i]
        bag_x = bag.drop(label_col, axis=1)
        bag_y = bag[label_col].to_list()
        clf = LogisticRegression()
        clf.fit(bag_x, bag_y)
        models.append(clf)

    return models

In [29]:
def performance(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # auroc = roc_auc_score(y_test, y_pred_prob)
    # aupr = average_precision_score(y_test, y_pred_prob)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    print(tn, fp, fn, tp)
    return {
        'Accuracy': accuracy,
        'Sensitivity': recall,
        'Specificity': specificity,
        'Precision': precision,
        'F1-score': f1
    }
    

In [38]:
dataset = int(input("Enter the dataset"))
features, labels, test_features, test_labels = data_preprocessing(dataset)

label_col = labels.columns[0]
labels_arr = labels[label_col].to_list()


if dataset == 1 or dataset == 3 or dataset == 4:
    X_train_val, X_test, y_train_val, y_test = train_test_split(features, labels_arr, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=16)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    X_val.reset_index(drop=True, inplace=True)
    y_train_df = pd.DataFrame(y_train, columns=[label_col])

elif dataset == 2:
    test_labels_arr = test_labels[label_col].to_list()
    test_labels_col = test_labels.columns[0]
    y_test = test_labels[test_labels_col].to_list()
    X_train, X_val, y_train, y_val = train_test_split(features, labels_arr, train_size=0.2, random_state=42)
    X_test = test_features
    y_test = test_labels_arr
    y_train_df = pd.DataFrame(y_train, columns=[label_col])
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    X_val.reset_index(drop=True, inplace=True)


# clf = LogisticRegressionImp()
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# y_prob = clf.prob_predict(X_test)
# metrics = performance(y_test, y_pred, y_prob)
# for key, val in metrics.items():
#     print(f"{key} : {val}")


models = bagging(X_train, y_train_df)
all_metrics = {
        'Accuracy': [],
        'Sensitivity': [],
        'Specificity': [],
        'Precision': [],
        'F1-score': []
    }

for model in models:
    y_pred = model.predict(X_test)
    # y_prob = model.prob_predict(X_test)
    metrics = performance(y_test, y_pred)
    for key, val in metrics.items():
        all_metrics[key].append(val)

print("LR avg +- stdev")
for metric, vals in all_metrics.items():
    avg = np.mean(vals)
    stdev = np.std(vals)
    print(f"{metric}: {avg} +- {stdev}")

# met_df = pd.DataFrame(all_metrics)
# df_melted = met_df.melt(var_name='Metric', value_name='Score')

# # plt.figure(figsize=(10, 6))
# # sns.violinplot(x='Metric', y='Score', data=df_melted, inner="point", palette="Set2", hue='Metric', legend=False)
# # plt.title("Violin Plot of Performance Metrics", fontsize=16)
# # plt.xlabel("Metric", fontsize=12)
# # plt.ylabel("Score", fontsize=12)
# # plt.xticks(rotation=45)
# # plt.tight_layout()
# # plt.show()

# y_pred, y_prob = majority_voting(X_test, models)
# metrics = performance(y_test, y_pred, y_prob)
# print("Voting Ensemble")
# for key, val in metrics.items():
#     print(f"{key} : {val}")
# print('\n')

meta = LogisticRegression()
stack = stacking(models, meta)
stack.fit(X_val, y_val)
y_pred= stack.predict(X_test)
metrics = performance(y_test, y_pred)
print("Stacking Ensemble")
for key, val in metrics.items():
    print(f"{key} : {val}")

1920
(1920, 2)
20 308 0 272
32 296 0 272
32 296 0 272
32 296 0 272
32 296 0 272
296 32 272 0
32 296 0 272
12 316 0 272
32 296 0 272
LR avg +- stdev
Accuracy: 0.49925925925925924 +- 0.011523221619336346
Sensitivity: 0.8888888888888888 +- 0.3142696805273545
Specificity: 0.1761517615176152 +- 0.257616550262406
Precision: 0.42275444309718924 +- 0.14956944664240762
F1-score: 0.5729744530923699 +- 0.2026420276169207
76 252 0 272
Stacking Ensemble
Accuracy : 0.58
Sensitivity : 1.0
Specificity : 0.23170731707317074
Precision : 0.5190839694656488
F1-score : 0.6834170854271356
