# Data Preprocessing


## Dataset 1

In [1]:
import pandas as pd

In [2]:
train_file_path = "Adult/adult.data"
df  = pd.read_csv(train_file_path)

In [None]:
df.shape


In [None]:
df.describe()

In [None]:
df.head(5)

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.isnull().sum().sum()

In [None]:
df.duplicated().sum()

In [None]:
df[df.duplicated()]

In [11]:
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum()

In [13]:
test_file_path = "Adult/adult.test"
df_test = pd.read_csv(test_file_path, skiprows=1)


In [None]:
df_test.isnull().sum().sum() 

In [None]:
df_test.duplicated().sum()

In [16]:
df_test.drop_duplicates(inplace=True)

In [None]:
df_test.duplicated().sum()

In [None]:
object_columns = df.select_dtypes(include=['object']).columns
object_columns

In [19]:
# Define the column headers
column_headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
                  'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
                  'hours-per-week', 'native-country', 'income']

# Assign column headers to the training DataFrame
df.columns = column_headers

# Assign column headers to the test DataFrame
df_test.columns = column_headers

In [None]:
# Concatenate df and df_test
df_combined = pd.concat([df, df_test], ignore_index=True)
df_combined.shape

In [None]:
df_combined.isna().sum()

In [None]:
df_combined.duplicated().sum()

In [None]:
df_combined.dtypes

In [None]:
object_columns = df_combined.select_dtypes(include=['object']).columns
for column in object_columns:
    df_combined[column] = df_combined[column].astype('category')

df_combined.dtypes

# df_combined['income'].value_counts()

In [25]:
# for column in object_columns:
#     print(f"Value counts for {column}:")
#     print(df_combined[column].value_counts())
#     print("\n")

In [None]:
df_combined['income']

In [None]:
df_combined['income'] = df_combined['income'].str.replace('.', '', regex=False)
df_combined['income'].value_counts()

In [28]:
# df_combined['income'] = df_combined['income'].map({'<=50K': '<=50K', '>50K': '>50k', '<=50K.': 0, '>50K.': 1})

In [None]:
df_combined['income'].value_counts()

In [30]:
from sklearn.preprocessing import LabelEncoder
feature_col = "income"

label_encoder = LabelEncoder()
df_combined[feature_col] = label_encoder.fit_transform(df_combined[feature_col])

for col in df_combined.select_dtypes(include=['category']).columns:
    if col != feature_col and df_combined[col].nunique() == 2:

        df_combined[col] = label_encoder.fit_transform(df_combined[col])

df_combined.dtypes

df_combined = pd.get_dummies(df_combined, drop_first=True)


In [None]:
# Print columns grouped by their data types
int_columns = df_combined.select_dtypes(include=['int64']).columns.tolist()
bool_columns = df_combined.select_dtypes(include=['bool']).columns.tolist()

print("Integer Columns:")
print(int_columns)
print("\nBoolean Columns:")
print(bool_columns)




In [None]:
# Convert all boolean columns to int64
df_combined[bool_columns] = df_combined[bool_columns].astype('int64')

# Verify the conversion
int_columns = df_combined.select_dtypes(include=['int64']).columns.tolist()
int_columns

In [33]:
df_combined.drop_duplicates(inplace=True)

In [None]:
df_combined.duplicated().sum()

In [None]:
df_combined.isna().sum().sum()

In [None]:
# Convert all int64 columns to float32
df_combined[int_columns] = df_combined[int_columns].astype('float32')

# Verify the conversion
df_combined.dtypes

In [None]:
df.isna().sum().sum()

In [38]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def scale_features(df, scaling_method='standard', target_column=None):
    """
    Scale all features except the target column.
    
    Parameters:
    - df: DataFrame, the input dataset.
    - scaling_method: str, 'standard' for Standard Scaling or 'minmax' for Min-Max Scaling.
    - target_column: str, name of the target column (optional).
    
    Returns:
    - df_scaled: DataFrame, DataFrame with scaled features.
    """
    if scaling_method == 'standard':
        scaler = StandardScaler()
    elif scaling_method == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid scaling method. Choose 'standard' or 'minmax'.")

    # Separate features and target variable
    if target_column:
        features = df.drop(columns=[target_column])
        target = df[target_column]
    else:
        features = df.copy()
        target = None

    # Scale all features
    features_scaled = scaler.fit_transform(features)
    df_scaled = pd.DataFrame(features_scaled, columns=features.columns, index=df.index)

    # Add the target column back if it was provided
    if target_column:
        df_scaled[target_column] = target

    return df_scaled



In [39]:
df_combined = scale_features(df_combined, 'standard', 'income')

In [None]:
df_combined.isna().sum().sum()

In [41]:
def get_top_n_correlations(df, n):
    import numpy as np
    correlations = df.corr().abs().unstack().sort_values(ascending=False)
    correlations = correlations[correlations < 1]
    return correlations[:n]

In [42]:
def get_top_n_correlations_with_target(df, target, n):
    correlations = df.corr().abs().unstack().sort_values(ascending=False)
    correlations = correlations[correlations < 1]
    return correlations[target][:n]

In [None]:
get_top_n_correlations_with_target(df_combined, 'income', 20)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Extract the top 20 features and the target variable
top_20_features = get_top_n_correlations_with_target(df_combined, 'income', 20).index.tolist()
X = df_combined[top_20_features]
y = df_combined['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [45]:
df_first_dataset_final = df_combined.copy(deep=True)

## Seconed Dataset

In [None]:
third_df_file_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df_third_dataset = pd.read_csv(third_df_file_path)
df = df_third_dataset.copy(deep=True)
df.head()


df.drop('customerID', axis=1, inplace=True)
df.dtypes


df.drop_duplicates(inplace=True)


df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].mean(), inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype('float64')

# Verify the conversion and filling
df['TotalCharges'].isna().sum()



print(f'Null values in each column:\n{df.isna().sum()}')
print(f"duplicated rows: {df.duplicated().sum()}")



# Convert all object types into category types
object_columns = df.select_dtypes(include=['object']).columns
for column in object_columns:
    df[column] = df[column].astype('category')

# Verify the conversion
df.dtypes




# Label encode the target column 'Churn' in the third dataset
df['Churn'] = label_encoder.fit_transform(df['Churn'])

# Verify the encoding
df['Churn'].value_counts()



for column in df.columns:
    print(f"Unique values and counts for {column}:")
    print(df[column].value_counts())
    print("\n")



feature_col = 'Churn'
for col in df.select_dtypes(include=['category']).columns:
    if col != feature_col and df[col].nunique() == 2:

        df[col] = label_encoder.fit_transform(df[col])

df.dtypes

df = pd.get_dummies(df).astype('float64')



for column in df.columns:
    print(f"Unique values and counts for {column}:")
    print(df[column].value_counts())
    print("\n")


df.shape

df.duplicated().sum().sum()


df.isna().sum().sum()


In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [50]:
df['Churn'] = df['Churn'].astype('int64')

In [None]:
df.dtypes

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Extract the top 20 features and the target variable
top_20_features_df = get_top_n_correlations_with_target(df, 'Churn', 20).index.tolist()
X = df[top_20_features_df]
y = df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [53]:
df_second_dataset_final = df.copy(deep=True)

## Third Dataset

In [54]:
third_df_file_path = "creditcard.csv"
df_third_dataset = pd.read_csv(third_df_file_path)

df = df_third_dataset.copy(deep=True)

In [None]:
df.shape

In [None]:
df.isna().sum().sum()

In [None]:
df.duplicated().sum().sum()

In [58]:
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum().sum()

In [None]:
df.isna().sum().sum()

In [None]:
df.dtypes

In [62]:
df = scale_features(df, 'standard', 'Class')

In [None]:
df.isna().sum().sum()

In [None]:
df.describe()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Extract the top 20 features and the target variable
top_20_features_df = get_top_n_correlations_with_target(df, 'Class', 20).index.tolist()
X = df[top_20_features_df]
y = df['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [66]:
df_third_dataset_final = df.copy(deep=True)

In [None]:
df_first_dataset_final.describe()
print(df_first_dataset_final.shape)

In [None]:
df_second_dataset_final.describe()
print(df_second_dataset_final.shape)

In [None]:
df_third_dataset_final.describe()
print(df_third_dataset_final.shape)

In [70]:
# import numpy as np

# class LogisticRegression:
#     def __init__(self, learning_rate=0.01, num_iterations=1000, regularization_strength=0.01):
#         self.learning_rate = learning_rate
#         self.num_iterations = num_iterations
#         self.regularization_strength = regularization_strength  # L2 regularization strength
    
#     def sigmoid(self, z):
#         # Clip values to avoid overflow
#         z = np.clip(z, -500, 500)
#         return 1 / (1 + np.exp(-z))
    
#     def fit(self, X, y):
#         self.m, self.n = X.shape
#         self.weights = np.zeros(self.n)
#         self.bias = 0
#         X = np.array(X)
#         y = np.array(y)
        
#         for i in range(self.num_iterations):
#             linear_model = np.dot(X, self.weights) + self.bias
#             y_predicted = self.sigmoid(linear_model)
            
#             dw = (1 / self.m) * np.dot(X.T, (y_predicted - y)) + (self.regularization_strength / self.m) * self.weights
#             db = (1 / self.m) * np.sum(y_predicted - y)
            
#             self.weights -= self.learning_rate * dw
#             self.bias -= self.learning_rate * db
    
#     def predict(self, X):
#         linear_model = np.dot(X, self.weights) + self.bias
#         y_predicted = self.sigmoid(linear_model)
#         y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
#         return y_predicted_cls
    
#     def accuracy(self, y_true, y_pred):
#         accuracy = np.sum(y_true == y_pred) / len(y_true)
#         return accuracy

# LR , Bagging

## Logistic Regression

In [71]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=1000, regularization_strength=0.01):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.regularization_strength = regularization_strength  # L2 regularization strength
    
    def sigmoid(self, z):
        # Clip values to avoid overflow
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))
    
    def fit(self, X, y):
        self.m, self.n = X.shape
        self.weights = np.zeros(self.n)
        self.bias = 0
        X = np.array(X)
        y = np.array(y)
        
        for i in range(self.num_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)
            
            dw = (1 / self.m) * np.dot(X.T, (y_predicted - y)) + (self.regularization_strength / self.m) * self.weights
            db = (1 / self.m) * np.sum(y_predicted - y)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
    
    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return y_predicted_cls
    
    def accuracy(self, y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy
    
    def compute_loss(self, X, y):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        # Clip predicted probabilities to avoid log(0)
        y_predicted = np.clip(y_predicted, 1e-15, 1 - 1e-15)
        loss = -1 / self.m * (np.dot(y, np.log(y_predicted)) + np.dot((1 - y), np.log(1 - y_predicted)))
        return loss





## Bagging

In [72]:
def bagging_logistic_regression(df, target_column, n_estimators=9, learning_rate=0.01, l2_penalty=0.01, random_state=42, test_size=0.2,iterations=1000):
    """
    Perform bagging with logistic regression on the given dataset.
    
    Parameters:
    - df: DataFrame, the input dataset.
    - target_column: str, the name of the target column.
    - n_estimators: int, the number of bootstrap samples and models.
    - learning_rate: float, the learning rate for logistic regression.
    - l2_penalty: float, the L2 regularization strength.
    
    Returns:
    - models: list, trained logistic regression models.
    - X_test: DataFrame, the test set features.
    - y_test: Series, the test set target.
    - y_pred: ndarray, aggregated predictions from all models.
    - accuracies: list, accuracies of each model.
    - losses: list, losses of each model.
    """
    # Split the dataset into features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    models = []
    predictions = []
    accuracies = []
    losses = []
    
    for i in range(n_estimators):
        # Generate a bootstrap sample
        X_resampled, y_resampled = resample(X_train, y_train, replace=True, random_state=i)
        
        # Create and train a logistic regression model
        model = LogisticRegression(learning_rate=learning_rate, num_iterations=iterations, regularization_strength=l2_penalty)
        model.fit(X_resampled, y_resampled)
        
        # Store the model
        models.append(model)
        
        # Predict on the test set
        y_pred = model.predict(X_test)
        predictions.append(y_pred)
        
        # Calculate accuracy and loss
        accuracy = model.accuracy(y_test, y_pred)
        loss = model.compute_loss(X_test, y_test)
        
        accuracies.append(accuracy)
        losses.append(loss)
        
        # Print accuracy and loss
        # print(f"Model {i+1} - Accuracy: {accuracy}, Loss: {loss}")
    
    # Aggregate predictions by majority vote
    predictions = np.array(predictions)
    y_pred_final = np.round(np.mean(predictions, axis=0)).astype(int)
    
    # Calculate and print the final accuracy after majority voting
    final_accuracy = accuracy_score(y_test, y_pred_final)
    print(f"Final Accuracy after Majority Voting: {final_accuracy}")
    
    # # Evaluate the model
    # print(confusion_matrix(y_test, y_pred_final))
    # print(classification_report(y_test, y_pred_final))
    
    return models, X_test, y_test, y_pred_final

## Violine Plot

In [73]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# def generate_violin_plot(df, target_column, n_estimators=9, learning_rate=0.01, l2_penalty=0.01, num_iterations=1000):
#     """
#     Generate a violin plot for the predictions of a bagging ensemble of logistic regression models.
    
#     Parameters:
#     - df: DataFrame, the input dataset.
#     - target_column: str, the name of the target column.
#     - n_estimators: int, the number of bootstrap samples and models.
#     - learning_rate: float, the learning rate for logistic regression.
#     - l2_penalty: float, the L2 regularization strength.
#     - num_iterations: int, the number of iterations for logistic regression.
#     """
#     # Perform bagging logistic regression
#     models, X_test, y_test, y_pred_final, predictions = bagging_logistic_regression(df, target_column, n_estimators, learning_rate, l2_penalty, num_iterations)
    
#     # Convert predictions to DataFrame for plotting
#     predictions_df = pd.DataFrame(predictions.T, columns=[f'Model_{i+1}' for i in range(n_estimators)])
#     predictions_df['True_Label'] = y_test.values
    
#     # Generate violin plot
#     plt.figure(figsize=(12, 8))
#     sns.violinplot(data=predictions_df.drop('True_Label', axis=1))
#     plt.title('Violin Plot of Predictions from Bagging Logistic Regression Models')
#     plt.xlabel('Models')
#     plt.ylabel('Predicted Probability')
#     plt.show()

# # Example usage
# generate_violin_plot(df_first_dataset_final, target_column='income', n_estimators=9, learning_rate=0.01, l2_penalty=0.01, num_iterations=1000)

## Accuracy Metrics


In [74]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix, log_loss

def calculate_metrics(y_test, y_pred):
    """
    Calculate and print the accuracy, sensitivity, specificity, precision, F1-score, AUROC, and AUPR for the given model and test data.
    
    Parameters:
    - model: trained logistic regression model.
    - X_test: DataFrame, the test set features.
    - y_test: Series, the test set target.
    
    Returns:
    - metrics: dict, containing accuracy, loss, sensitivity, specificity, precision, F1-score, AUROC, and AUPR.
    """
    
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    loss = log_loss(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auroc = roc_auc_score(y_test, y_pred)
    aupr = average_precision_score(y_test, y_pred)
    
    # Print metrics
    print(f"Accuracy: {accuracy}")
    print(f"Loss: {loss}")
    print(f"Sensitivity: {sensitivity}")
    print(f"Specificity: {specificity}")
    print(f"Precision: {precision}")
    print(f"F1-score: {f1}")
    print(f"AUROC: {auroc}")
    print(f"AUPR: {aupr}")
    
    metrics = {
        'accuracy': accuracy,
        'loss': loss,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'f1_score': f1,
        'auroc': auroc,
        'aupr': aupr
    }
    
    return metrics

# Example usage
# Assuming you have a trained model and test data
# model, X_test, y_test = ... (from your previous code)
# metrics = calculate_metrics(model, X_test, y_test)

## Evaluate Metrics

In [75]:

def evaluate_models(df, target_column, learning_rate_lr=0.01, l2_penalty_for_lr=0.01, iterations_lr = 1000,learning_rate_bagging=0.01, l2_penalty_bagging=0.01, n_estimators=9,iterations_bagging=1000,test_size=0.2, random_state=42):
    """
    Train and evaluate both a single logistic regression model and a bagging ensemble of logistic regression models on the given dataset.
    
    Parameters:
    - df: DataFrame, the input dataset.
    - target_column: str, the name of the target column.
    - learning_rate: float, the learning rate for logistic regression.
    - l2_penalty: float, the L2 regularization strength.
    
    Returns:
    - metrics_single: dict, metrics for the single logistic regression model.
    - metrics_bagging: dict, metrics for the bagging ensemble of logistic regression models.
    """
    # Split the dataset into features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Train and evaluate a single logistic regression model
    print("Metrics for Single Logistic Regression Model:")
    single_model = LogisticRegression(learning_rate=learning_rate_lr, num_iterations=iterations_lr, regularization_strength=l2_penalty_for_lr)
    single_model.fit(X_train, y_train)
    y_pred_single = single_model.predict(X_test)
    metrics_single = calculate_metrics(y_test, y_pred_single)
    
    # Train and evaluate a bagging ensemble of logistic regression models
    print("\nMetrics for Bagging Logistic Regression Model:")
    models, X_test, y_test, y_pred_final = bagging_logistic_regression(df, target_column, n_estimators=n_estimators, learning_rate=learning_rate_bagging, l2_penalty=l2_penalty_bagging)
    metrics_bagging = calculate_metrics(y_test, y_pred_final)

    # print("\nMetrics for Single Logistic Regression Model:")
    # print(metrics_single)
    # print("\nMetrics for Bagging Logistic Regression Model:")
    # print(metrics_bagging)
    
    return metrics_single, metrics_bagging



### Evaluation of each dataset

In [None]:
# Example usage
evaluate_models(df_first_dataset_final, target_column='income', learning_rate_lr=0.01, l2_penalty_for_lr=0.01, iterations_lr = 1000,learning_rate_bagging=0.01, l2_penalty_bagging=0.01, n_estimators=9,iterations_bagging=1000,test_size=0.2, random_state=42)

In [None]:
evaluate_models(df_second_dataset_final, target_column='Churn', learning_rate_lr=0.01, l2_penalty_for_lr=0.01, iterations_lr = 1000,learning_rate_bagging=0.01, l2_penalty_bagging=0.01, n_estimators=9,iterations_bagging=1000,test_size=0.2, random_state=42)

In [None]:
evaluate_models(df_third_dataset_final, target_column='Class', learning_rate_lr=0.01, l2_penalty_for_lr=0.01, iterations_lr = 1000,learning_rate_bagging=0.01, l2_penalty_bagging=0.01, n_estimators=9,iterations_bagging=1000,test_size=0.2, random_state=42)

## Accuracy Vs Learning Rate Curve

In [83]:
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

def plot_accuracy_vs_learning_rate(dataset, feature_name, learning_rates):
    """
    Plots the accuracy vs learning rate for Logistic Regression using the custom class.

    Parameters:
    - dataset: DataFrame, the input dataset.
    - feature_name: str, the name of the target feature.
    - learning_rates: list, a list of learning rates to evaluate.
    """

    # Extract features and target variable
    X = dataset.drop(columns=[feature_name])
    y = dataset[feature_name]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    accuracies = []

    for lr in learning_rates:
        # Initialize the custom logistic regression model
        custom_log_reg = LogisticRegression(learning_rate=lr, num_iterations=1000)

        # Train the model
        custom_log_reg.fit(X_train, y_train)

        # Make predictions
        y_pred_custom = custom_log_reg.predict(X_test)

        # Calculate accuracy
        accuracy_custom = custom_log_reg.accuracy(y_test, y_pred_custom)
        accuracies.append(accuracy_custom)

    # Plot the results
    plt.figure(figsize=(10, 6))
    plt.plot(learning_rates, accuracies, marker='o')
    plt.title('Accuracy vs Learning Rate')
    plt.xlabel('Learning Rate')
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.show()



# Stacking


In [93]:
def stacking_ensemble(df, target_column, learning_rate=0.01, num_iterations=1000, random_state=42,l2_penalty=0.01):
    """
    Implement stacking ensemble with logistic regression models.
    
    Parameters:
    - df: DataFrame, the input dataset.
    - target_column: str, the name of the target column.
    - learning_rate: float, the learning rate for logistic regression.
    - num_iterations: int, the number of iterations for logistic regression.
    - random_state: int, the random state for reproducibility.
    
    Returns:
    - X_test: DataFrame, the test set features.
    - y_test: Series, the test set target.
    - y_pred_final: ndarray, predictions from the stacking ensemble.
    """
    # Split the dataset into features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    
    # Further split the training set into training and validation sets
    X_train_main, X_val, y_train_main, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_state)
    
    base_learners = []
    meta_features = np.zeros((X_val.shape[0], 9))
    
    for i in range(9):
        # Generate a bootstrap sample
        X_resampled, y_resampled = resample(X_train_main, y_train_main, replace=True, random_state=i)
        
        # Create and train a logistic regression model
        model = LogisticRegression(learning_rate=learning_rate, num_iterations=num_iterations, regularization_strength=l2_penalty)
        model.fit(X_resampled, y_resampled)
        
        # Store the base learner
        base_learners.append(model)
        
        # Generate meta features using the validation set
        meta_features[:, i] = model.predict(X_val)
    
    # Train the meta classifier
    meta_classifier = LogisticRegression(learning_rate=learning_rate, num_iterations=num_iterations, regularization_strength=l2_penalty)
    meta_classifier.fit(meta_features, y_val)
    
    # Generate meta features for the test set
    meta_features_test = np.zeros((X_test.shape[0], 9))
    for i, model in enumerate(base_learners):
        meta_features_test[:, i] = model.predict(X_test)
    
    # Make final predictions using the meta classifier
    y_pred_final = meta_classifier.predict(meta_features_test)
    
    return X_test, y_test, y_pred_final



In [95]:
# Example usage
X_test, y_test, y_pred_final = stacking_ensemble(df_first_dataset_final, target_column='income', learning_rate=0.01, num_iterations=1000, random_state=42)

calculate_metrics(y_test, y_pred_final)

Accuracy: 0.8392088542734167
Loss: 5.7955003246079935
Sensitivity: 0.596989966555184
Specificity: 0.9178658702144991
Precision: 0.7024102311854402
F1-score: 0.6454237288135594
AUROC: 0.7574279183848416
AUPR: 0.5181225962299327


{'accuracy': 0.8392088542734167,
 'loss': 5.7955003246079935,
 'sensitivity': 0.596989966555184,
 'specificity': 0.9178658702144991,
 'precision': 0.7024102311854402,
 'f1_score': 0.6454237288135594,
 'auroc': 0.7574279183848416,
 'aupr': 0.5181225962299327}

In [96]:
X_test, y_test, y_pred_final = stacking_ensemble(df_second_dataset_final, target_column='Churn', learning_rate=0.01, num_iterations=1000, random_state=42)

calculate_metrics(y_test, y_pred_final)

Accuracy: 0.7594306049822064
Loss: 8.67099989005096
Sensitivity: 0.26988636363636365
Specificity: 0.9230769230769231
Precision: 0.5397727272727273
F1-score: 0.35984848484848486
AUROC: 0.5964816433566434
AUPR: 0.3285954480199112


{'accuracy': 0.7594306049822064,
 'loss': 8.67099989005096,
 'sensitivity': 0.26988636363636365,
 'specificity': 0.9230769230769231,
 'precision': 0.5397727272727273,
 'f1_score': 0.35984848484848486,
 'auroc': 0.5964816433566434,
 'aupr': 0.3285954480199112}

In [98]:
df_third_dataset_final['Class'].value_counts()

Class
0    283253
1       473
Name: count, dtype: int64

In [99]:
# Separate positive and negative samples
positive_samples = df_third_dataset_final[df_third_dataset_final['Class'] == 1]
negative_samples = df_third_dataset_final[df_third_dataset_final['Class'] == 0]

# Randomly select 20,000 negative samples
negative_samples_selected = negative_samples.sample(n=20000, random_state=42)

# Combine the positive samples with the selected negative samples
df_sampled = pd.concat([positive_samples, negative_samples_selected])

# Shuffle the combined dataframe
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

df_sampled.shape

(20473, 31)

In [100]:
X_test, y_test, y_pred_final = stacking_ensemble(df_sampled, target_column='Class', learning_rate=0.5, num_iterations=1000, random_state=42,l2_penalty=0.0005)

calculate_metrics(y_test, y_pred_final)

Accuracy: 0.9938949938949939
Loss: 0.22004672398728442
Sensitivity: 0.7967479674796748
Specificity: 1.0
Precision: 1.0
F1-score: 0.8868778280542986
AUROC: 0.8983739837398375
AUPR: 0.8028529735846809


{'accuracy': 0.9938949938949939,
 'loss': 0.22004672398728442,
 'sensitivity': 0.7967479674796748,
 'specificity': 1.0,
 'precision': 1.0,
 'f1_score': 0.8868778280542986,
 'auroc': 0.8983739837398375,
 'aupr': 0.8028529735846809}