In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from statistics import median
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


### Faster but less thorough version.

In [5]:
csv_file_path = 'renamed_reco_train.csv'
df = pd.read_csv(csv_file_path)

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=UserWarning, message=".*ConvergenceWarning.*")

# Separate features and target variables
prediction_columns = ['fixed_deposits', 'loan', 'credit_card_debit_card', 'account']
X = df.drop(columns=prediction_columns)  # First 86 columns (features)
y = df[prediction_columns]

# Normalize the feature columns
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# Create a class for model evaluation
class ModelEvaluator:
    def __init__(self):
        self.models = {}
        
    def train_models(self, X_train, y_train):
        # Define the logistic regression model
        lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear')

        # Set up hyperparameters for tuning
        param_dist = {
            'C': [0.001, 0.01, 0.1, 1],  # Reduced number of hyperparameters
            'penalty': ['l1', 'l2'],  # Only two regularization types
        }

        # Use StratifiedKFold for cross-validation
        skf = StratifiedKFold(n_splits=3)  # Reduced number of folds

        # Perform random search with cross-validation
        random_search = RandomizedSearchCV(lr_model, param_dist, n_iter=10, cv=skf, scoring='f1_macro', n_jobs=-1)

        for i in range(y_train.shape[1]):
            print(f"Training model for target column {y_train.columns[i]}")
            # Train the model
            random_search.fit(X_train, y_train.iloc[:, i])
            self.models[y_train.columns[i]] = random_search.best_estimator_
            print(f"Best parameters for {y_train.columns[i]}: {random_search.best_params_}")

    def evaluate_model(self, model, X_test, y_test, column_name):
        y_pred = model.predict(X_test)
        print(f"Confusion Matrix for {column_name}:\n", confusion_matrix(y_test[column_name], y_pred))
        print(f"Classification Report for {column_name}:\n", classification_report(y_test[column_name], y_pred))
        
    def evaluate_all_models(self, X_test, y_test):
        for column_name, model in self.models.items():
            self.evaluate_model(model, X_test, y_test, column_name)

# Create and train the models
model_evaluator = ModelEvaluator()
model_evaluator.train_models(X_train, y_train)

# Evaluate all trained models
model_evaluator.evaluate_all_models(X_test, y_test)

# After training, you can use the models to make predictions on new data
def predict_proba(new_data):
    new_data_normalized = scaler.transform(new_data)
    probabilities = {}
    for column_name, model in model_evaluator.models.items():
        probabilities[column_name] = model.predict_proba(new_data_normalized)[:, 1]  # Probability of class 1
    return probabilities


Training model for target column fixed_deposits




Best parameters for fixed_deposits: {'penalty': 'l1', 'C': 0.1}
Training model for target column loan




Best parameters for loan: {'penalty': 'l1', 'C': 1}
Training model for target column credit_card_debit_card




Best parameters for credit_card_debit_card: {'penalty': 'l2', 'C': 1}
Training model for target column account




Best parameters for account: {'penalty': 'l1', 'C': 0.001}
Confusion Matrix for fixed_deposits:
 [[1821  155]
 [   3   11]]
Classification Report for fixed_deposits:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96      1976
           1       0.07      0.79      0.12        14

    accuracy                           0.92      1990
   macro avg       0.53      0.85      0.54      1990
weighted avg       0.99      0.92      0.95      1990

Confusion Matrix for loan:
 [[1930   58]
 [   2    0]]
Classification Report for loan:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      1988
           1       0.00      0.00      0.00         2

    accuracy                           0.97      1990
   macro avg       0.50      0.49      0.49      1990
weighted avg       1.00      0.97      0.98      1990

Confusion Matrix for credit_card_debit_card:
 [[1377  447]
 [  21  145]]
Classification Rep

### This model takes a while to run, its the more thorough version. Select the code and command + / for mac to undo the # . 

In [6]:
# # Separate features and target variables
# prediction_columns = ['fixed_deposits', 'loan', 'credit_card_debit_card', 'account']
# X = df.drop(columns=prediction_columns)  # First 86 columns (features)
# y = df[prediction_columns]

# # Normalize the feature columns
# scaler = StandardScaler()
# X_normalized = scaler.fit_transform(X)

# # Should use, stratified train-test split to maintain class distribution, but errors encountered ()
# # one of the classes is has too few rows
# X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# # Create a class for model evaluation
# class ModelEvaluator:
#     def __init__(self):
#         self.models = {}
        
#     def train_models(self, X_train, y_train):
#         # Define the logistic regression model
#         lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')

#         # Set up hyperparameters for tuning
#         param_grid = {
#             'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
#             'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Regularization type
#             'solver': ['liblinear', 'saga']  # Solver to use
#         }

#         # Use StratifiedKFold for cross-validation
#         skf = StratifiedKFold(n_splits=5)

#         # Perform grid search with cross-validation
#         grid_search = GridSearchCV(lr_model, param_grid, cv=skf, scoring='f1_macro', n_jobs=-1)
        
#         for i in range(y_train.shape[1]):
#             print(f"Training model for target column {y_train.columns[i]}")
#             # Train the model
#             grid_search.fit(X_train, y_train.iloc[:, i])
#             self.models[y_train.columns[i]] = grid_search.best_estimator_
#             print(f"Best parameters for {y_train.columns[i]}: {grid_search.best_params_}")



#     def evaluate_model(self, model, X_test, y_test, column_name):
#         y_pred = model.predict(X_test)
#         print(f"Confusion Matrix for {column_name}:\n", confusion_matrix(y_test[column_name], y_pred))
#         print(f"Classification Report for {column_name}:\n", classification_report(y_test[column_name], y_pred))
        
#     def evaluate_all_models(self, X_test, y_test):
#         for column_name, model in self.models.items():
#             self.evaluate_model(model, X_test, y_test, column_name)

# # Create and train the models
# model_evaluator = ModelEvaluator()
# model_evaluator.train_models(X_train, y_train)

# # Evaluate all trained models
# model_evaluator.evaluate_all_models(X_test, y_test)

# # After training, you can use the models to make predictions on new data
# def predict_proba(new_data):
#     new_data_normalized = scaler.transform(new_data)
#     probabilities = {}
#     for column_name, model in model_evaluator.models.items():
#         probabilities[column_name] = model.predict_proba(new_data_normalized)[:, 1]  # Probability of class 1
#     return probabilities


