In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.utils import resample
import pickle


In [2]:

# Load the data
file_paths = [
    'Test data.xlsx - Query result.csv',
    'Model test data.csv',
    'Loan Status prediction data.xlsx - Query result.csv'
]

data_frames = []
for file_path in file_paths:
    df = pd.read_csv(file_path, encoding='ISO-8859-1')
    data_frames.append(df)

# Select one of the DataFrames to work with, e.g., DataFrame 1
df = data_frames[0]

# Fill missing values for simplicity, here with mode for categorical and median for numerical
for column in df.select_dtypes(include=['object']).columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

for column in df.select_dtypes(include=[np.number]).columns:
    df[column].fillna(df[column].median(), inplace=True)

# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split the data into features and target
X = df.drop(columns=['status_id'])
y = df['status_id']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are s

In [3]:

# Handle class imbalance using undersampling
train_data = pd.concat([pd.DataFrame(X_train_scaled), pd.Series(y_train, name='label')], axis=1)

# Separate the majority and minority classes
majority_class = train_data[train_data['label'] == 2]
minority_class = train_data[train_data['label'] == 5]

# Undersample the majority class
majority_class_undersampled = resample(majority_class, 
                                       replace=False,    # sample without replacement
                                       n_samples=len(minority_class),     # match minority class count
                                       random_state=42)   # reproducible results

# Combine the undersampled majority class with the minority class
undersampled_data = pd.concat([majority_class_undersampled, minority_class])

# Separate the features and labels
X_train_undersampled = undersampled_data.drop(columns='label')
y_train_undersampled = undersampled_data['label']

# Handle missing values in the undersampled dataset
for column in X_train_undersampled.select_dtypes(include=['object']).columns:
    X_train_undersampled[column].fillna(X_train_undersampled[column].mode()[0], inplace=True)

for column in X_train_undersampled.select_dtypes(include=[np.number]).columns:
    X_train_undersampled[column].fillna(X_train_undersampled[column].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_undersampled[column].fillna(X_train_undersampled[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_undersampled[column].fillna(X_train_undersampled[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace m

In [4]:

# Train the logistic regression model with undersampled data
logistic_model_undersampled = LogisticRegression(max_iter=1000)
logistic_model_undersampled.fit(X_train_undersampled, y_train_undersampled)

# Make predictions on the test set
y_pred_logistic_undersampled = logistic_model_undersampled.predict(X_test_scaled)

# Map the predictions to 'Paid' and 'Unpaid'
prediction_mapping = {2: 'Paid', 5: 'Unpaid'}
y_pred_logistic_undersampled_mapped = [prediction_mapping.get(pred, 'Unpaid') for pred in y_pred_logistic_undersampled]

# Evaluate the model
accuracy_logistic_undersampled = accuracy_score(y_test.map(prediction_mapping), y_pred_logistic_undersampled_mapped)
classification_report_logistic_undersampled = classification_report(y_test.map(prediction_mapping), y_pred_logistic_undersampled_mapped, labels=['Paid', 'Unpaid'])

print(f"Logistic Regression Model (Undersampled)
Accuracy: {accuracy_logistic_undersampled}
")
print(classification_report_logistic_undersampled)


SyntaxError: unterminated string literal (detected at line 16) (1934820898.py, line 16)

In [None]:

# Train the XGBoost model with undersampled data
xgb_model_undersampled = XGBClassifier(n_estimators=50, max_depth=2, learning_rate=0.2, use_label_encoder=False, eval_metric='mlogloss')
xgb_model_undersampled.fit(X_train_undersampled, y_train_undersampled)

# Make predictions on the test set
y_pred_xgb_undersampled = xgb_model_undersampled.predict(X_test_scaled)

# Map the predictions to 'Paid' and 'Unpaid'
y_pred_xgb_undersampled_mapped = [prediction_mapping.get(pred, 'Unpaid') for pred in y_pred_xgb_undersampled]

# Evaluate the model
accuracy_xgb_undersampled = accuracy_score(y_test.map(prediction_mapping), y_pred_xgb_undersampled_mapped)
classification_report_xgb_undersampled = classification_report(y_test.map(prediction_mapping), y_pred_xgb_undersampled_mapped, labels=['Paid', 'Unpaid'])

print(f"XGBoost Model (Undersampled)
Accuracy: {accuracy_xgb_undersampled}
")
print(classification_report_xgb_undersampled)


In [None]:

# Save the updated logistic regression model (undersampled)
logistic_model_undersampled_path = 'logistic_regression_model_undersampled.pkl'
with open(logistic_model_undersampled_path, 'wb') as file:
    pickle.dump(logistic_model_undersampled, file)

# Save the updated XGBoost model (undersampled)
xgb_model_undersampled_path = 'xgboost_model_undersampled.pkl'
with open(xgb_model_undersampled_path, 'wb') as file:
    pickle.dump(xgb_model_undersampled, file)

logistic_model_undersampled_path, xgb_model_undersampled_path
