In [4]:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Read the raw loan records from the CSV file one directory up
df = pd.read_csv(filepath_or_buffer="../data.csv")

# Clean a separate deep copy so the frame as loaded stays available in `df`
df_clean = df.copy(deep=True)

# Impute 'Gender', 'Self_Employed', and 'Dependents' with their most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# df_clean[col] selection is chained assignment, which pandas deprecates and
# which does not modify df_clean when Copy-on-Write is enabled.
cols = ['Gender', 'Self_Employed', 'Dependents']
for col in cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode().iloc[0])

# Impute 'Married' from 'CoapplicantIncome':
# a co-applicant income of exactly 0 -> 'No', anything else (including NaN) -> 'Yes'.
married_fallback = df_clean['CoapplicantIncome'].eq(0).map({True: 'No', False: 'Yes'})
df_clean['Married'] = df_clean['Married'].fillna(married_fallback)

# Median loan term for each distinct loan amount
median_loan_term_by_amount = df_clean.groupby('LoanAmount')['Loan_Amount_Term'].median()

# Impute 'Loan_Amount_Term' from the per-amount median. Series.map yields NaN
# (instead of raising KeyError) when 'LoanAmount' is itself missing, and the
# per-amount median can be NaN when every term in that group is missing, so
# fall back to the overall median of the observed terms.
overall_median_term = df_clean['Loan_Amount_Term'].median()
term_from_amount = df_clean['LoanAmount'].map(median_loan_term_by_amount)
df_clean['Loan_Amount_Term'] = (
    df_clean['Loan_Amount_Term']
    .fillna(term_from_amount)
    .fillna(overall_median_term)
)

# Mean loan amount for each distinct loan term
mean_loan_amount_by_term = df_clean.groupby('Loan_Amount_Term')['LoanAmount'].mean()

# Impute 'LoanAmount' from the per-term mean, with the overall mean of the
# observed amounts as a fallback for terms that have no observed amount.
overall_mean_amount = df_clean['LoanAmount'].mean()
amount_from_term = df_clean['Loan_Amount_Term'].map(mean_loan_amount_by_term)
df_clean['LoanAmount'] = (
    df_clean['LoanAmount']
    .fillna(amount_from_term)
    .fillna(overall_mean_amount)
)

# Impute 'Credit_History' with its most frequent value
mode_credit_history = df_clean['Credit_History'].mode()[0]
df_clean['Credit_History'] = df_clean['Credit_History'].fillna(mode_credit_history)

# Separate the features and target variable.
# map() produces a numeric target directly; replace() on a string column
# relies on silent downcasting of the result, which pandas has deprecated.
# Any label other than 'Y'/'N' becomes NaN here.
X = df_clean.drop('Loan_Status', axis=1)
y = df_clean['Loan_Status'].map({'Y': 1, 'N': 0})

# Columns fed to the model; any other column in X is not listed in the
# ColumnTransformer below and is dropped by its default remainder='drop'.
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

# Standardize numeric features and one-hot encode categorical features.
# handle_unknown='ignore' encodes a category that was not present during fit
# as all zeros instead of raising, so a cross-validation fold (or later input
# to the fitted model) containing an unseen level can still be transformed.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y; adding stratify=y would keep
# class proportions equal in both parts but would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing and classifier in one pipeline, so the scaler and encoder are
# re-fitted on the training part of every cross-validation fold.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Hyperparameter grid: L2 penalty with a range of inverse regularization strengths C
param_grid = {
    'classifier__penalty': ['l2'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# 5-fold cross-validated grid search, fitted on the training split only
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Predict on the held-out test set with the refitted best estimator
y_pred = grid_search.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)



Accuracy: 0.7837837837837838
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.42      0.57        65
           1       0.76      0.98      0.86       120

    accuracy                           0.78       185
   macro avg       0.84      0.70      0.71       185
weighted avg       0.82      0.78      0.76       185



In [5]:
# Persist the fitted search to disk. The whole GridSearchCV object is pickled,
# not only its best estimator.
with open('loan_model.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)