In [None]:
# Install the Faker library to generate fake data
!pip install faker

In [1]:
# Import necessary libraries: pandas, numpy, Faker, and random for data manipulation and generation
import pandas as pd
import numpy as np
from faker import Faker
import  random

In [2]:
# Import fsolve from scipy.optimize for solving non-linear equations
from scipy.optimize import fsolve

In [3]:
# Initialize the Faker object to generate fake data
fake = Faker()

In [4]:
# Generate Loan Data with Faker
def generate_fake_loan_data(num_records=20000):
    data = {
        "customer_id": [i for i in range(1, num_records + 1)], 
        "name": [fake.name() for _ in range(num_records)],  
        "age": [fake.random_int(min=21, max=65) for _ in range(num_records)],  
        "income": [fake.random_int(min=30000, max=150000) for _ in range(num_records)],  
        "credit_score": [fake.random_int(min=300, max=850) for _ in range(num_records)],  
        "loan_term": [np.random.choice([12, 24, 36, 48, 60]) for _ in range(num_records)],  
     
    }
    df = pd.DataFrame(data)
    
    
    
    def calculate_loan_amount(row):
        if row['credit_score'] < 580:
            max_loan = 10000
        elif row['credit_score'] < 670:
            max_loan = 20000
        elif row['credit_score'] < 740:
            max_loan = 35000
        else:
            max_loan = 50000
        
        if row['income'] < 50000:
            loan_amount = max_loan * 0.4
        elif row['income'] <= 100000:
            loan_amount = max_loan * 0.7
        else:
            loan_amount = max_loan * 1.0
        
        return loan_amount
    
    df["loan_amount"] = df.apply(calculate_loan_amount, axis=1)
    
    
    def calculate_monthly_installment(loan_amount, loan_term):
        temp_interest_rate = np.random.uniform(5, 15) / 100
        r = temp_interest_rate / 12
        n = loan_term
        emi = loan_amount * r * (1 + r)*n / ((1 + r)*n - 1)
        return emi
    
    df["monthly_installment"] = df.apply(lambda x: calculate_monthly_installment(x["loan_amount"], x["loan_term"]), axis=1)
    

    def calculate_interest_rate(row):
        def interest_function(r):
            P = row["loan_amount"]
            n = row["loan_term"]
            EMI = row["monthly_installment"]
            return EMI - (P * r * (1 + r)*n) / ((1 + r)*n - 1)
        
        r_initial_guess = 0.01
        monthly_rate = fsolve(interest_function, r_initial_guess)[0]
        annual_rate = monthly_rate * 12 * 100
        return annual_rate
    
    df["interest_rate"] = df.apply(calculate_interest_rate, axis=1)

    def calculate_repayment_status(row):
        
        if row['income'] < 50000 and row['credit_score'] < 600:
            return 0  
        
        if row['age'] > 55 and row['income'] < 80000:
            return 0  
        
        if row['loan_term'] in [48, 60] or row['interest_rate'] > 12:
            return 0
        
      
        debt_to_income_ratio = row['loan_amount'] / row['income'] * 100
        if debt_to_income_ratio > 30: 
            return 0  
        
       
        return 1  

    df["repayment_status"] = df.apply(calculate_repayment_status, axis=1)

    
    return df


loan_data = generate_fake_loan_data(num_records=20000)

In [5]:
# Display the first few rows of the loan_data DataFrame
loan_data.head()

Unnamed: 0,customer_id,name,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
0,1,Tristan Hawkins,63,117621,356,60,10000.0,116.529808,13.753158,0
1,2,Frederick Haynes,55,120798,606,60,20000.0,247.131725,14.583739,0
2,3,Makayla Murray,50,105520,678,24,35000.0,204.442696,6.719029,0
3,4,Melissa Mckinney,27,64201,436,48,7000.0,39.969669,6.709988,0
4,5,Joel Pineda,45,116904,577,24,10000.0,121.676384,13.9998,0


In [6]:
loan_data.to_csv("loan_default_data.csv",index=False)

In [35]:
loan_data = pd.read_csv("loan_default_data.csv")

In [None]:
# Check for missing values in the loan_data DataFrame and count them
loan_data.isnull().sum()

In [None]:
# Display summary information about the loan_data DataFrame, including data types and non-null counts
loan_data.info()

In [None]:
# Check the data types of each column in the loan_data DataFrame
loan_data.dtypes

In [11]:
# Display the first 5 rows of the loan_data DataFrame
loan_data.head()

Unnamed: 0,customer_id,name,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
0,1,Tristan Hawkins,63,117621,356,60,10000.0,116.529808,13.753158,0
1,2,Frederick Haynes,55,120798,606,60,20000.0,247.131725,14.583739,0
2,3,Makayla Murray,50,105520,678,24,35000.0,204.442696,6.719029,0
3,4,Melissa Mckinney,27,64201,436,48,7000.0,39.969669,6.709988,0
4,5,Joel Pineda,45,116904,577,24,10000.0,121.676384,13.9998,0


In [12]:
# Convert the 'loan_amount' column in loan_data to integers
loan_data['loan_amount']=loan_data['loan_amount'].astype(int)

In [13]:
# Round the 'monthly_installment' column to 3 decimal places and convert it to float type
loan_data['monthly_installment']=loan_data['monthly_installment'].round(3).astype(float)

In [14]:
# Round the 'interest_rate' column to 2 decimal places and convert it to float type
loan_data['interest_rate']=loan_data['interest_rate'].round(2).astype(float)

In [16]:
# Display the first 5 rows of the loan_data DataFrame after modifications
loan_data.head()

Unnamed: 0,customer_id,name,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
0,1,Tristan Hawkins,63,117621,356,60,10000,116.53,13.75,0
1,2,Frederick Haynes,55,120798,606,60,20000,247.132,14.58,0
2,3,Makayla Murray,50,105520,678,24,35000,204.443,6.72,0
3,4,Melissa Mckinney,27,64201,436,48,7000,39.97,6.71,0
4,5,Joel Pineda,45,116904,577,24,10000,121.676,14.0,0


In [17]:
# Filter and display rows where 'repayment_status' is 0 in the loan_data DataFrame
loan_data[loan_data['repayment_status']==0]

Unnamed: 0,customer_id,name,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
0,1,Tristan Hawkins,63,117621,356,60,10000,116.530,13.75,0
1,2,Frederick Haynes,55,120798,606,60,20000,247.132,14.58,0
2,3,Makayla Murray,50,105520,678,24,35000,204.443,6.72,0
3,4,Melissa Mckinney,27,64201,436,48,7000,39.970,6.71,0
4,5,Joel Pineda,45,116904,577,24,10000,121.676,14.00,0
...,...,...,...,...,...,...,...,...,...,...
19994,19995,Matthew Dunn,59,87827,492,36,7000,82.453,13.75,0
19996,19997,Patrick Mcdonald,40,117888,555,12,10000,133.425,14.69,0
19997,19998,Alexis Thomas,30,91828,556,60,7000,71.426,12.04,0
19998,19999,Tammy Brown,46,84390,553,36,7000,87.233,14.54,0


In [None]:
# Count the non-null values in each column of the loan_data DataFrame
loan_data.count()

In [None]:
# Install the imbalanced-learn library for handling imbalanced datasets
!pip install imbalanced-learn

In [None]:
# Apply SMOTE to balance the dataset by oversampling the minority class and display the class distribution before and after SMOTE
from imblearn.over_sampling import SMOTE
import pandas as pd

# Dropping non-feature columns (e.g., 'customer_id', 'name')
X = loan_data.drop(columns=['customer_id', 'name', 'repayment_status']) 
y = loan_data['repayment_status']  

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

balanced_loan_data = pd.concat([X_smote, y_smote], axis=1)

print("Before SMOTE:")
print(y.value_counts())

print("\nAfter SMOTE:")
print(y_smote.value_counts())


In [23]:
# Display the balanced loan_data DataFrame after applying SMOTE for oversampling
balanced_loan_data.head()

Unnamed: 0,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
0,63,117621,356,60,10000,116.53,13.75,0
1,55,120798,606,60,20000,247.132,14.58,0
2,50,105520,678,24,35000,204.443,6.72,0
3,27,64201,436,48,7000,39.97,6.71,0
4,45,116904,577,24,10000,121.676,14.0,0


In [24]:
# Filter and display rows where 'repayment_status' is 0 in the balanced loan_data DataFrame
balanced_loan_data[balanced_loan_data['repayment_status']==0]

Unnamed: 0,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
0,63,117621,356,60,10000,116.530,13.75,0
1,55,120798,606,60,20000,247.132,14.58,0
2,50,105520,678,24,35000,204.443,6.72,0
3,27,64201,436,48,7000,39.970,6.71,0
4,45,116904,577,24,10000,121.676,14.00,0
...,...,...,...,...,...,...,...,...
19994,59,87827,492,36,7000,82.453,13.75,0
19996,40,117888,555,12,10000,133.425,14.69,0
19997,30,91828,556,60,7000,71.426,12.04,0
19998,46,84390,553,36,7000,87.233,14.54,0


In [25]:
# Filter and display rows where 'repayment_status' is 1 in the balanced loan_data DataFrame
balanced_loan_data[balanced_loan_data['repayment_status']==1]

Unnamed: 0,age,income,credit_score,loan_term,loan_amount,monthly_installment,interest_rate,repayment_status
7,32,79822,376,24,7000,35.009000,5.750000,1
10,22,128352,524,12,10000,75.712000,8.330000,1
13,65,144492,322,24,10000,68.200000,7.850000,1
14,46,95864,676,36,24500,164.967000,7.860000,1
15,32,89831,614,12,14000,82.563000,6.490000,1
...,...,...,...,...,...,...,...,...
29819,34,143844,646,20,20000,176.748593,9.911713,1
29820,30,148025,475,22,10000,90.104237,10.150999,1
29821,60,94784,318,35,7000,54.704201,9.113197,1
29822,26,112676,382,12,10000,79.429465,8.746054,1


In [None]:
# Split the balanced dataset into training and testing sets, standardize the features, 
# and train a Logistic Regression model to predict repayment status,
#  then evaluate accuracy on training and test sets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


X = balanced_loan_data[['age', 'income', 'credit_score', 'loan_term', 'loan_amount', 'monthly_installment', 'interest_rate']]
y = balanced_loan_data['repayment_status']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)        


model = LogisticRegression()
model.fit(X_train_scaled, y_train)


print("Training Accuracy:", model.score(X_train_scaled, y_train))
print("Test Accuracy:", model.score(X_test_scaled, y_test))

Training Accuracy: 0.8448803386562722
Test Accuracy: 0.8439228834870075


In [27]:
# Training and testing accuracy
print("Training Accuracy:", model.score(X_train_scaled, y_train))
print("Test Accuracy:", model.score(X_test_scaled, y_test))


Training Accuracy: 0.8448803386562722
Test Accuracy: 0.8439228834870075


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Predictions on the test data
y_pred = model.predict(X_test_scaled)

# Precision, Recall, F1-Score
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Precision: 0.8410138248847926
Recall: 0.8508158508158508
F1-Score: 0.8458864426419467


In [30]:
# Train a Random Forest Classifier model on the scaled training data to predict repayment status
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier()
model1.fit(X_train_scaled, y_train)

In [34]:
# Predictions on the test data
y1_pred = model1.predict(X_test_scaled)

# Precision, Recall, F1-Score
print("Precision:", precision_score(y_test, y1_pred))
print("Recall:", recall_score(y_test, y1_pred))
print("F1-Score:", f1_score(y_test, y1_pred))

Precision: 1.0
Recall: 0.999000999000999
F1-Score: 0.9995002498750625


In [33]:
# Evaluate and display the training and test accuracy of the Random Forest model
print("Training Accuracy:", model1.score(X_train_scaled, y_train))
print("Test Accuracy:", model1.score(X_test_scaled, y_test))

Training Accuracy: 1.0
Test Accuracy: 0.9994970662196144


In [29]:
# Import the joblib library for saving and loading models
import joblib

In [None]:
# Save the trained Random Forest model to a .pkl file at the specified location
joblib.dump(model, r"D:\1 DS PROJECTS\DS final project 1.pkl")