In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Load the dataset
file_path = 'credit_score.csv'
credit_data = pd.read_csv(file_path)

# Feature Engineering
# Assuming the following mappings for the variables:
# P -> R_DEBT_INCOME (debt to income ratio)
# U -> R_SAVINGS_INCOME (savings to income ratio, as proxy for utilization ratio)
# L -> T_CLOTHING_12 (years of account history as proxy, assuming it relates to length)
# M -> CAT_CREDIT_CARD and CAT_MORTGAGE (Credit mix, combined into a binary feature)
# N -> DEFAULT (as a proxy for new credit inquiries)

credit_data['P'] = credit_data['R_DEBT_INCOME']
credit_data['U'] = credit_data['R_SAVINGS_INCOME']
credit_data['L'] = credit_data['T_CLOTHING_12']  # Using T_CLOTHING_12 as proxy for length of history
credit_data['M'] = credit_data['CAT_CREDIT_CARD'] + credit_data['CAT_MORTGAGE']  # Sum of binary credit mix indicators
credit_data['N'] = credit_data['DEFAULT']  # Using DEFAULT as a proxy for new credit inquiries

# Normalize the features to the range 0-1
scaler = StandardScaler()
credit_data[['P', 'U', 'L', 'M', 'N']] = scaler.fit_transform(credit_data[['P', 'U', 'L', 'M', 'N']])

# Calculate the score using the given formula
min_score = 300
max_score = 900
credit_data['Predicted_Score'] = min_score + (max_score - min_score) * (0.35 * credit_data['P'] + 0.30 * (1 - credit_data['U']) + 0.15 * credit_data['L'] + 0.10 * credit_data['M'] + 0.10 * (1 - credit_data['N']))

# Model Selection and Training
X = credit_data[['P', 'U', 'L', 'M', 'N']]
y = credit_data['CREDIT_SCORE']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict with the linear regression model
y_pred_lr = lr_model.predict(X_test)

# Model evaluation for Linear Regression
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Train a Random Forest Regressor Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict with the random forest model
y_pred_rf = rf_model.predict(X_test)

# Model evaluation for Random Forest
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Cross-validation for Random Forest model
cv_rf = cross_val_score(rf_model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()

# Grid Search for Hyperparameter Tuning of Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

# Evaluation of the best Random Forest model
mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)

# Displaying Results
print("Linear Regression Model:")
print(f"MAE: {mae_lr:.2f}, R²: {r2_lr:.2f}")

print("\nRandom Forest Model:")
print(f"MAE: {mae_rf:.2f}, R²: {r2_rf:.2f}")

print("\nBest Random Forest Model after Hyperparameter Tuning:")
print(f"MAE: {mae_best_rf:.2f}, R²: {r2_best_rf:.2f}")
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Display cross-validation score for Random Forest
print(f"\nCross-Validation MAE for Random Forest: {cv_rf:.2f}")


Linear Regression Model:
MAE: 23.39, R²: 0.75

Random Forest Model:
MAE: 24.25, R²: 0.74

Best Random Forest Model after Hyperparameter Tuning:
MAE: 22.78, R²: 0.75
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}

Cross-Validation MAE for Random Forest: -26.40
