
# DAT 613 – Programming for Data Science  
## Loan Approval Prediction using Random Forest

**Course:** DAT 613 – Programming for Data Science  
**Project Type:** Group Machine Learning Project  
**Model:** Random Forest Classifier  


In [None]:

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib


## Load Dataset

In [None]:

# Read both CSV splits from the current working directory, training file first.
# Only train_df is referenced by the cells visible below; test_df is loaded for later use.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("loan_train.csv", "loan_test.csv")
)

# Preview the first rows of the training split.
train_df.head()


## Exploratory Data Analysis

In [None]:

# Print a structural summary of the training frame (columns, non-null counts, dtypes).
train_df.info()


In [None]:

# Count missing values per column; these gaps are what the imputers handle later.
train_df.isna().sum()


### Visualization: Loan Status Distribution

In [None]:

# Bar chart of how many rows fall into each Loan_Status class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x="Loan_Status", ax=ax)
ax.set_title("Loan Approval Distribution")
plt.show()


## Data Preprocessing & Model Training

In [None]:

TARGET = "Loan_Status"

# Separate the feature matrix from the label column.
# NOTE(review): every non-target column is used as a feature. If train_df carries a
# per-row identifier column it will be one-hot encoded below — confirm and drop it.
X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]

# Route columns by dtype *family* instead of by the exact names int64/float64/object.
# ColumnTransformer drops any column that is not listed in a transformer
# (remainder="drop" is the default), so matching exact dtype names would silently
# discard features stored as e.g. int32, bool, category or the pandas string dtype.
# Selecting "number" vs. "everything else" puts every feature in exactly one branch
# and yields the same split as before for int64/float64/object-only frames.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

# Numeric features: fill gaps with the column median (no scaling; tree models do not need it).
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical features: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all-zero rows
# instead of raising at predict time.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

# Fixed random_state keeps the forest reproducible across runs;
# class_weight="balanced" reweights classes inversely to their frequency.
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced"
)

# Single estimator that preprocesses raw feature frames and then classifies them.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])


In [None]:

# Hold out 20% of the labelled rows for validation. Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and the classifier together on the training part only.
pipeline.fit(X_train, y_train)


## Model Evaluation

In [None]:

# Score the fitted pipeline on the held-out validation rows.
preds = pipeline.predict(X_val)

accuracy = accuracy_score(y_val, preds)
# Weighted F1 averages the per-class F1 scores weighted by class support.
f1 = f1_score(y_val, preds, average="weighted")

for label, value in (("Accuracy:", accuracy), ("F1 Score:", f1)):
    print(label, value)

# Per-class precision / recall / F1 table.
print(classification_report(y_val, preds))


## Save Trained Model

In [None]:

# Persist the whole fitted pipeline (preprocessing + model) to the working directory.
joblib.dump(value=pipeline, filename="model.joblib")



## Next Steps
- Build Gradio interface (`app.py`)
- Deploy model on Hugging Face Spaces
- Document Git & GitHub workflow
