<a href="https://colab.research.google.com/github/Bhavyaveer44/ML-proj/blob/main/Loan_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the loan dataset from disk and preview its first rows
df = pd.read_csv("Loan_default.csv")  # Change filename if needed
print(df.head())

# LoanID is treated as irrelevant for modelling, so it is removed
df = df.drop(columns=['LoanID'])

# Report the number of missing values per column, then discard every row
# that still contains one (a simple placeholder strategy for now)
missing_per_column = df.isnull().sum()
print(missing_per_column)
df = df.dropna()

# Replace each categorical text column with integer codes. One fitted
# encoder per column is kept in `label_encoders` so the same mapping can be
# applied to new inputs later on.
categorical_cols = ['Education', 'EmploymentType', 'LoanPurpose']
label_encoders = {
    column: LabelEncoder().fit(df[column]) for column in categorical_cols
}
for column, encoder in label_encoders.items():
    df[column] = encoder.transform(df[column])

# Derive the regression target: the loan term expressed in years
# (LoanTerm holds months, hence the division by 12)
df['LoanTermYears'] = df['LoanTerm'].div(12)
y = df['LoanTermYears']

# Feature columns fed to the model
selected_features = [
    'Income',
    'LoanAmount',
    'CreditScore',
    'InterestRate',
    'DTIRatio',
    'Education',
    'EmploymentType',
    'LoanPurpose',
]
X = df[selected_features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler learns its statistics from the
# training rows only and is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit a baseline XGBoost regressor on the scaled training data and predict
# the held-out rows
baseline_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 500,
    'learning_rate': 0.1,
    'max_depth': 6,
}
xgb_model = xgb.XGBRegressor(**baseline_params)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Score the baseline on the test set; RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae}, RMSE: {rmse}, R2 Score: {r2}")

# Grid search over XGBoost hyperparameters using 3-fold cross-validation,
# ranking candidates by (negated) mean absolute error
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[4, 6, 8],
    learning_rate=[0.01, 0.1, 0.2],
)
grid_search = GridSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror'),
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Take the estimator selected by the search and score it on the test set
best_model = grid_search.best_estimator_
best_pred = best_model.predict(X_test)
optimized_mae = mean_absolute_error(y_test, best_pred)
print(f"Optimized MAE: {optimized_mae}")

# Example applicant (hard-coded values standing in for real user input)
income = 55000
loan_amount = 150000
credit_score = 720
interest_rate = 6.5
dti_ratio = 0.35
education = "Bachelor's"
employment_type = "Full-time"
loan_purpose = "Business"

# Map each categorical value to its integer code using the encoder that was
# fitted for the corresponding column
education_encoded, employment_encoded, loan_purpose_encoded = (
    label_encoders[column].transform([raw_value])[0]
    for column, raw_value in (
        ('Education', education),
        ('EmploymentType', employment_type),
        ('LoanPurpose', loan_purpose),
    )
)

# Single-row frame whose columns follow the same order as the training
# features
new_loan = pd.DataFrame(
    {
        'Income': income,
        'LoanAmount': loan_amount,
        'CreditScore': credit_score,
        'InterestRate': interest_rate,
        'DTIRatio': dti_ratio,
        'Education': education_encoded,
        'EmploymentType': employment_encoded,
        'LoanPurpose': loan_purpose_encoded,
    },
    index=[0],
)

# Apply the scaler fitted on the training data before predicting
new_loan_scaled = scaler.transform(new_loan)

# Predict the loan term (in years) with the tuned model
predicted_years = best_model.predict(new_loan_scaled)
print(f"Predicted Loan Term: {predicted_years[0]:.2f} years")


       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes           Yes       Other   
1      Full-time    