In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

Check whether there are missing values `b.i`

In [None]:
# Load the raw loan records from the CSV that sits next to the notebook.
df = pd.read_csv("loan_old.csv")
# Per-column missing counts summed again give one total for the whole table.
empty = df.isna().sum().sum()
print(f"There are {empty} empty values")

Records containing missing values are removed `c.i`

In [None]:
def preprocess_df(df):
    """Return a cleaned copy of ``df`` ready for feature/target separation.

    The ``Loan_ID`` column is removed first, then every row that still has at
    least one missing value is discarded. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan records; must contain a ``Loan_ID`` column.

    Returns
    -------
    pandas.DataFrame
        The remaining columns and complete rows, keeping their original index
        labels.

    Raises
    ------
    KeyError
        If ``df`` has no ``Loan_ID`` column.
    """
    # The identifier is dropped before filtering, so a row whose only missing
    # value is its Loan_ID is kept.
    return df.drop(columns=["Loan_ID"]).dropna()

# Build the cleaned table used by the rest of the notebook; the raw `df`
# stays available because preprocess_df returns a new frame.
clean_df = preprocess_df(df)
# Render the cleaned table for visual inspection.
display(clean_df)

Check the type of each feature and the scale of the numerical features (this requires separating the features from the targets first)
`b.ii`, `b.iii`, `c.ii`

In [None]:
# Split the cleaned table into model inputs and the two prediction targets.
features_df = clean_df.drop(["Max_Loan_Amount", "Loan_Status"], axis=1)
targets_df = clean_df.loc[:, ["Max_Loan_Amount", "Loan_Status"]]

# Object-typed columns are treated as categorical, everything else as numerical.
categorical_features_df = features_df.select_dtypes(include="object")
numerical_features_df = features_df.select_dtypes(exclude="object")

print("Categorical features:")
for col in categorical_features_df:
    print(f"\t- {col}")
print("Numerical features:")
# Report each numerical column together with its observed value range.
for col, values in numerical_features_df.items():
    print(f"\t- {col} ({values.min()} - {values.max()})")

Visualize a pairplot between numerical columns `b.iv`

In [None]:
# Plot every non-object column of the cleaned data against every other one.
# clean_df still holds the target columns, so any non-object target is drawn too.
clean_df.select_dtypes(exclude="object").pipe(sns.pairplot)
plt.show()

The data is shuffled and split into training and testing sets `c.iii`

In [None]:
test_size = 0.2
train_size = 1 - test_size

# One call splits the features and both targets together, so all three share
# the same shuffled row assignment; the fixed random_state makes it repeatable.
features_train, features_test, *target_splits = train_test_split(
    features_df,
    targets_df["Max_Loan_Amount"],
    targets_df["Loan_Status"],
    test_size=test_size,
    train_size=train_size,
    random_state=30,
)
max_loan_train, max_loan_test, loan_status_train, loan_status_test = target_splits

# Show each piece of the split under its own caption; the target Series are
# turned into one-column frames so they render as tables like the features.
for caption, subset in (
    ("Features training set", features_train),
    ("Features testing set", features_test),
    ("Max loan (target) training", max_loan_train.to_frame()),
    ("Max loan (target) testing", max_loan_test.to_frame()),
    ("Loan status (target) training", loan_status_train.to_frame()),
    ("Loan status (target) testing", loan_status_test.to_frame()),
):
    print(caption)
    display(subset)

Training data:
- Categorical features are encoded `c.iv`
- Numerical features are standardized `c.vi`
- Categorical targets are encoded `c.v`

In [None]:
# Fitted transformers are kept per column so the exact same mapping / scaling
# can be re-applied to other data (e.g. the test set) later on.
label_encoders = {}
standard_scalers = {}
# Start from the training index so every processed row keeps the same label
# as its counterpart in max_loan_train / loan_status_train. An empty
# pd.DataFrame() would silently renumber the rows 0..n-1.
processed_features_train = pd.DataFrame(index=features_train.index)

for col in features_train.columns:
    if features_train[col].dtype == "object":
        # Categorical feature: map each distinct label to an integer code.
        # NOTE(review): scikit-learn documents LabelEncoder as a target
        # encoder; the integer codes impose an ordering on nominal features.
        # Kept here because later cells call label_encoders[col].transform().
        print(f"Encoding {col}")
        label_encoders[col] = LabelEncoder()
        processed_features_train[col] = label_encoders[col].fit_transform(features_train[col])
        print(f"\t- Before: {label_encoders[col].classes_}")
        print(f"\t- After: {np.unique(processed_features_train[col])}")
    else:
        # Numerical feature: standardize using statistics of the training data only.
        print(f"Standardizing {col}")
        print(
            f"\t- Before: {np.min(features_train[col])} to {np.max(features_train[col])}"
        )
        standard_scalers[col] = StandardScaler()
        # The scaler needs 2-D input and returns an (n, 1) array; flatten it
        # so a plain 1-D column is stored.
        processed_features_train[col] = (
            standard_scalers[col].fit_transform(features_train[[col]]).ravel()
        )
        print(
            f"\t- After: {np.min(processed_features_train[col])} to {np.max(processed_features_train[col])}"
        )

print("Encoding training Loan_Status")
loan_status_encoder = LabelEncoder()
# Keep the original index here as well so the encoded target stays aligned
# with processed_features_train by label, not only by position.
processed_loan_status_train = pd.Series(
    loan_status_encoder.fit_transform(loan_status_train),
    index=loan_status_train.index,
    name=loan_status_train.name,
)
print(f"\t- Before: {loan_status_encoder.classes_}")
print(f"\t- After: {np.unique(processed_loan_status_train)}")


Fit a linear regression model to the data to predict the loan amount. `d`

In [None]:
# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(processed_features_train, max_loan_train)
# Show the feature names seen during fitting, then the learned coefficients.
print(linear_model.feature_names_in_, linear_model.coef_, sep="\n")

Evaluate the linear regression model on the test set using sklearn's $R^2$ score. `e`

In [None]:
# Start from the test index so every processed row keeps the same label as
# its counterpart in max_loan_test. An empty pd.DataFrame() would silently
# renumber the rows 0..n-1.
processed_features_test = pd.DataFrame(index=features_test.index)

# Only transform() is used below: the encoders and scalers were fitted on the
# training data and must not be re-fitted on the test set.
for col in features_test.columns:
    if features_test[col].dtype == "object":
        print(f"Encoding {col}")
        # Raises ValueError if the test set holds a label never seen in training.
        processed_features_test[col] = label_encoders[col].transform(features_test[col])
        print(f"\t- Before: {label_encoders[col].classes_}")
        print(f"\t- After: {np.unique(processed_features_test[col])}")
    else:
        print(f"Standardizing {col}")
        print(
            f"\t- Before: {np.min(features_test[col])} to {np.max(features_test[col])}"
        )
        # The scaler returns an (n, 1) array; flatten it so a plain 1-D
        # column is stored.
        processed_features_test[col] = (
            standard_scalers[col].transform(features_test[[col]]).ravel()
        )
        print(
            f"\t- After: {np.min(processed_features_test[col])} to {np.max(processed_features_test[col])}"
        )

# LinearRegression.score returns the coefficient of determination (R^2).
print(f"R^2 score: {linear_model.score(processed_features_test, max_loan_test)}")
