# loan-prediction-project
## Author: Amal Chebbi
Date : 01/22/2025

## Import Libraries

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Load Training Data

In [4]:
# Load the dataset
training_data_path = 'training_loan_data.csv'
train_data = pd.read_csv(training_data_path, header=None, encoding='latin1')

# Set the second row as column headers
train_data.columns = train_data.iloc[1]

# Drop the first two rows
train_data = train_data[2:].reset_index(drop=True)

train_data.head()

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


  train_data = pd.read_csv(training_data_path, header=None, encoding='latin1')


1,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,10000001,11983056,7550,36 months,16.24%,3 years,RENT,28000,,debt_consolidation,...,0,17,72%,4000,,3828.953801,5759,1,99,0
1,10000002,12002921,27050,36 months,10.99%,10+ years,OWN,55000,Borrower added on 12/31/13 > Combining high ...,debt_consolidation,...,0,8,61.20%,35700,,34359.94073,114834,1,353,0
2,10000003,11983096,12000,36 months,10.99%,4 years,RENT,60000,Borrower added on 12/31/13 > I would like to...,debt_consolidation,...,1,3,24%,18100,,16416.61776,7137,1,157,0
3,10000004,12003142,28000,36 months,7.62%,5 years,MORTGAGE,325000,,debt_consolidation,...,1,3,54.60%,42200,,38014.14976,799592,1,365,0
4,10000005,11993233,12000,36 months,13.53%,10+ years,RENT,40000,,debt_consolidation,...,0,17,68.80%,7000,53.0,6471.462236,13605,1,157,0


In [5]:
#shape of the training data
train_data.shape

(199121, 23)

In [6]:
# Inspect the dataset
print("Initial Dataset Info:")
train_data.info()

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199121 entries, 0 to 199120
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   id                           199121 non-null  object
 1   member_id                    189457 non-null  object
 2   loan_amnt                    199121 non-null  object
 3   term                         189457 non-null  object
 4   int_rate                     189457 non-null  object
 5   emp_length                   181531 non-null  object
 6   home_ownership               189457 non-null  object
 7   annual_inc                   189457 non-null  object
 8   desc                         82004 non-null   object
 9   purpose                      189457 non-null  object
 10  percent_bc_gt_75             180419 non-null  object
 11  bc_util                      180333 non-null  object
 12  dti                          189457 non-null  obje

In [7]:
train_data['bad_flag'].unique()

array(['0', '1', 0, 1, nan], dtype=object)

In [8]:
# Clean 'bad_flag' column
train_data['bad_flag'] = pd.to_numeric(train_data['bad_flag'], errors='coerce')
print("Unique values in 'bad_flag' after cleaning:", train_data['bad_flag'].unique())

Unique values in 'bad_flag' after cleaning: [ 0.  1. nan]


In [9]:
def preprocess_data(df, is_test=False):
    """
    Cleans and preprocesses the dataset for modeling, addressing specific observations.
    
    Parameters:
        df (pd.DataFrame): The raw dataset to preprocess.
        is_test (bool): Flag indicating if the dataset is a test set (default: False).
    
    Returns:
        pd.DataFrame: The cleaned and preprocessed dataset.
    """
    # Drop unnecessary columns
    columns_to_drop = [
        'id', 'member_id', 'desc', 
        'mths_since_last_major_derog', 'application_approved_flag'
    ]
    df = df.drop(columns=columns_to_drop, errors='ignore')
    
    # Clean and convert 'emp_length'
    def clean_emp_length(value):
        if pd.isnull(value) or value == "n/a":
            return np.nan
        elif "<" in value:
            return 0
        elif "10+" in value:
            return 10
        else:
            try:
                return int(value.split()[0])
            except:
                return np.nan

    if 'emp_length' in df.columns:
        df['emp_length'] = df['emp_length'].apply(clean_emp_length)
    
    # Clean and convert percentage columns
    columns_to_process = ['bc_util', 'revol_util', 'int_rate']
    for col in columns_to_process:
        if col in df.columns:
            df[col] = df[col].str.rstrip('%').astype(float, errors='ignore')

    # Convert 'mths_since_recent_inq' to numeric
    if 'mths_since_recent_inq' in df.columns:
        df['mths_since_recent_inq'] = pd.to_numeric(df['mths_since_recent_inq'], errors='coerce')

    # Convert numeric columns to float
    numeric_columns = [
        'loan_amnt', 'int_rate', 'annual_inc', 'dti', 
        'inq_last_6mths', 'bc_util', 'total_bc_limit',
        'tot_hi_cred_lim', 'tot_cur_bal', 'internal_score', 
        'percent_bc_gt_75', 'mths_since_recent_inq', 'revol_util'
    ]
    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Convert categorical columns to 'category'
    categorical_columns = ['term', 'home_ownership', 'purpose']
    if not is_test:
        categorical_columns.append('bad_flag')  # Include the target column for training data
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype('category')
    
    # Handle missing values
    # Numeric columns: Fill with median
    for col in numeric_columns:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())
    
    # Categorical columns: Fill with "Unknown"
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].cat.add_categories("Unknown").fillna("Unknown")
    
    # Log preprocessing summary
    print("Preprocessing Summary:")
    print(f"Numeric Columns: {len(numeric_columns)} processed")
    print(f"Categorical Columns: {len(categorical_columns)} processed")
    print(f"Final Shape: {df.shape}")
    
    return df


In [10]:
# Apply preprocessing to training data
train_data_cleaned = preprocess_data(train_data)

# Display the cleaned data info
print("Cleaned Training Data Info:")
train_data_cleaned.info()

Preprocessing Summary:
Numeric Columns: 13 processed
Categorical Columns: 4 processed
Final Shape: (199121, 18)
Cleaned Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199121 entries, 0 to 199120
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   loan_amnt              199121 non-null  int64   
 1   term                   199121 non-null  category
 2   int_rate               199121 non-null  float64 
 3   emp_length             181531 non-null  float64 
 4   home_ownership         199121 non-null  category
 5   annual_inc             199121 non-null  float64 
 6   purpose                199121 non-null  category
 7   percent_bc_gt_75       199121 non-null  float64 
 8   bc_util                199121 non-null  float64 
 9   dti                    199121 non-null  float64 
 10  inq_last_6mths         199121 non-null  float64 
 11  mths_since_recent_inq  199121 non-null  fl