In [1]:
# Import all the libraries used in this notebook
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Read the credit-risk CSV into a DataFrame; the path is relative to the
# notebook's working directory.
DATA_PATH = 'credit_risk_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [9]:
# Preview the first five rows as a quick sanity check of the columns
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# Parse the target column into floats.
# Values may arrive as text such as "12.5%", so the percent sign and any
# surrounding whitespace are stripped before conversion. errors='coerce'
# turns tokens that still cannot be parsed into NaN instead of raising, so
# the dropna below can discard them together with genuinely missing targets.
rate_text = (
    df['loan_int_rate']
    .astype(str)
    .str.replace('%', '', regex=False)
    .str.strip()
)
# assign() builds a new frame rather than writing into the loaded one, which
# keeps this cell safe to re-run; astype(float) pins the dtype to float even
# when every value happens to be a whole number.
df = df.assign(
    loan_int_rate=pd.to_numeric(rate_text, errors='coerce').astype(float)
)

# Drop rows where the target is missing or could not be parsed
df = df.dropna(subset=['loan_int_rate'])

# Split features and target
# NOTE(review): every remaining column is used as a feature. Columns such as
# loan_grade and loan_status are presumably determined together with, or
# after, the interest rate -- confirm they are legitimately available at
# prediction time, otherwise the reported scores are optimistic.
X = df.drop(columns=['loan_int_rate'])
y = df['loan_int_rate']


In [4]:
# Identify numeric and categorical columns.
# ColumnTransformer drops every column that is not listed in one of its
# transformers (remainder='drop' is the default), so the selectors must cover
# all dtypes the features can have:
#   * 'number' matches every integer/float width, not only int64/float64;
#   * text columns may be stored as object, as a dedicated string dtype
#     (the default for strings in newer pandas), or as category.
num_features = X.select_dtypes(include='number').columns.tolist()
cat_features = X.select_dtypes(
    include=['object', 'string', 'category']
).columns.tolist()

# Train-test split: hold out 20% of the rows; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' lets transform() accept categories that
# were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own branch.
column_branches = [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [6]:

# Chain the preprocessing and a linear regressor into a single estimator so
# that the same transformations are applied at fit and predict time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split only
model.fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # back in the same units as the target
r2 = r2_score(y_test, y_pred)

print("Model trained")
print(f"MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f}")


Model trained
MAE: 0.789, RMSE: 0.998, R²: 0.908


In [7]:
# Run the fitted pipeline on the held-out features
y_pred = model.predict(X_test)

# Show only the first ten predictions to keep the output short
first_ten = y_pred[:10]
print("Predicted loan interest rates:")
print(first_ten)




Predicted loan interest rates:
[15.32513758 10.99838748 13.49875377 10.9133098   7.19007886  7.34813502
 13.51612608 10.87702506 11.00718038  7.19425607]


In [8]:
# Put true and predicted rates side by side. The target's original index is
# discarded (plain array) so both columns line up positionally.
comparison_columns = {
    'Actual': y_test.to_numpy(),
    'Predicted': y_pred,
}
pred_df = pd.DataFrame(comparison_columns)

# First ten rows are enough for a visual spot check
print(pred_df.head(10))


   Actual  Predicted
0   16.07  15.325138
1   10.99  10.998387
2   14.35  13.498754
3   10.59  10.913310
4    6.62   7.190079
5    7.88   7.348135
6   13.92  13.516126
7   10.65  10.877025
8   11.99  11.007180
9    5.79   7.194256
