## Data loading

### Subtask:
Load the credit risk dataset.


**Reasoning**:
Load the credit risk dataset into a pandas DataFrame and display its first few rows and shape.



In [2]:
import pandas as pd

try:
    df = pd.read_csv('credit_risk_dataset.csv')
    display(df.head())
    print(df.shape)
except FileNotFoundError:
    print("Error: 'credit_risk_dataset.csv' not found.")
    df = None
except Exception as e:
    print(f"An error occurred: {e}")
    df = None

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


(32581, 12)


## Data cleaning

### Subtask:
Clean the data by handling missing values and outliers, and converting categorical features to numerical representations.


In [3]:
# Fill missing values
df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)

# Outlier treatment
df = df[df['person_age'] <= 100]

# Convert categorical features to numerical using one-hot encoding
categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Data validation
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 32576 entries, 0 to 32580
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   person_age                   32576 non-null  int64  
 1   person_income                32576 non-null  int64  
 2   person_emp_length            32576 non-null  float64
 3   loan_amnt                    32576 non-null  int64  
 4   loan_int_rate                32576 non-null  float64
 5   loan_status                  32576 non-null  int64  
 6   loan_percent_income          32576 non-null  float64
 7   cb_person_cred_hist_length   32576 non-null  int64  
 8   person_home_ownership_OTHER  32576 non-null  bool   
 9   person_home_ownership_OWN    32576 non-null  bool   
 10  person_home_ownership_RENT   32576 non-null  bool   
 11  loan_intent_EDUCATION        32576 non-null  bool   
 12  loan_intent_HOMEIMPROVEMENT  32576 non-null  bool   
 13  loan_intent_MEDICAL  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)


## Data splitting

### Subtask:
Split the cleaned dataset into training and testing sets.


**Reasoning**:
Split the data into training and testing sets using train_test_split.



In [4]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target variable (y)
X = df.drop('loan_status', axis=1)
y = df['loan_status']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Data preparation

### Subtask:
Scale the numerical features in the training and testing datasets.


**Reasoning**:
Scale the numerical features of the training and testing datasets using StandardScaler.



In [5]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns
numerical_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

# Initialize StandardScaler
scaler = StandardScaler()

# Fit on training data
scaler.fit(X_train[numerical_cols])

# Transform training and testing data
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

## Model training

### Subtask:
Train a Ridge Regression model on the prepared training data.


**Reasoning**:
Train a Ridge Regression model on the prepared training data.



In [6]:
from sklearn.linear_model import Ridge

# Initialize and train the Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

## Model evaluation

### Subtask:
Evaluate the trained Ridge Regression model's performance on the test set.


**Reasoning**:
Evaluate the trained Ridge Regression model's performance on the test set using the provided metrics.



In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

try:
    # Make predictions on the test set
    y_pred = ridge_model.predict(X_test)

    # Convert probabilities to binary predictions (assuming 0.5 threshold)
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    auc_roc = roc_auc_score(y_test, y_pred)

    # Print the metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print(f"AUC-ROC: {auc_roc}")

except Exception as e:
    print(f"An error occurred during evaluation: {e}")

Accuracy: 0.8577348066298343
Precision: 0.7482447342026078
Recall: 0.5246132208157525
F1-score: 0.6167837949565936
AUC-ROC: 0.8621382150590005


In [8]:
# prompt: write a python code to create the model file\

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

try:
    df = pd.read_csv('credit_risk_dataset.csv')
    # ... (rest of your existing code for data loading, exploration, cleaning, splitting, and preparation)

    # Initialize and train the Ridge Regression model
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_train, y_train)

    # Save the trained model to a file
    joblib.dump(ridge_model, 'ridge_regression_model.joblib')

    print("Model saved to ridge_regression_model.joblib")

except FileNotFoundError:
    print("Error: 'credit_risk_dataset.csv' not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Model saved to ridge_regression_model.joblib


In [9]:
# prompt: write a script to test the the model with the split data and log the generated value vs expected value in a csv file

import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the trained model
try:
    ridge_model = joblib.load('ridge_regression_model.joblib')
    print("Model loaded successfully.")
except FileNotFoundError:
    print("Error: 'ridge_regression_model.joblib' not found. Please train the model first.")
    exit()
except Exception as e:
    print(f"An error occurred while loading the model: {e}")
    exit()

# Assuming X_test and y_test are already defined from the previous code
try:
    # Make predictions on the test set
    y_pred = ridge_model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Create a DataFrame for logging
    results_df = pd.DataFrame({'Predicted': y_pred_binary, 'Actual': y_test})
    results_df['Difference'] = results_df['Predicted'] - results_df['Actual']

    #Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    auc_roc = roc_auc_score(y_test, y_pred)


    # Save the results to a CSV file
    results_df.to_csv('model_predictions.csv', index=False)
    print("Predictions logged to 'model_predictions.csv'")

    #Print Metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print(f"AUC-ROC: {auc_roc}")


except NameError:
    print("Error: X_test or y_test not found. Please run the data splitting and preparation steps first.")
except Exception as e:
    print(f"An error occurred during prediction or logging: {e}")


Model loaded successfully.
Predictions logged to 'model_predictions.csv'
Accuracy: 0.8577348066298343
Precision: 0.7482447342026078
Recall: 0.5246132208157525
F1-score: 0.6167837949565936
AUC-ROC: 0.8621382150590005
