## CRISP-DM Phase 4. Modelling

In this phase, various modeling techniques are selected and applied, and their parameters are calibrated to optimal values. Typically, there are several techniques for the same data mining problem type. Some techniques have specific requirements on the form of data. Therefore, going back to the data preparation phase is often necessary.

In [3]:
# Dependencies
import sys
sys.path.append('../src')
import utils
import data_processor
import cross_validator
import model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pickle as pkl
from sklearn.model_selection import train_test_split
import warnings 
warnings.filterwarnings('ignore')

# Load data
data: pd.DataFrame = utils.load_preprocessed_data("../data/processed/financial_data_processed.pkl")

# Data preprocessing object
processor = data_processor.DataProcessor(data)
X, y = processor.create_feature_matrix_and_target_vector(target_column="Financial Distress")

In [4]:
# Train-test split (70-30) without shuffling (preserve time series order)
X_train, _ , y_train, _ = train_test_split(X, y, test_size=0.3, shuffle=False)

# TimeSeriesCrossValidator object 
tscv = cross_validator.TimeSeriesCrossValidator(X_train, y_train)

### Model Building

In [None]:
# get multiple ML models to test
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# only models that handle missing values
models = [RandomForestClassifier(), SVC(), KNeighborsClassifier(), LogisticRegression()]

# Evaluate models with loop to get all metrics and store them in lists
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
auc_list = []
model_names = []

for model in models:
    accuracy, precision, recall, f1, auc = tscv.evaluate(model)
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    auc_list.append(auc)
    model_names.append(model.__class__.__name__)

# Create dataframe with all metrics
metrics_df = pd.DataFrame({'Model': model_names, 'Accuracy': accuracy_list, 'Precision': precision_list, 'Recall': recall_list, 'F1': f1_list, 'AUC': auc_list})
metrics_df

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Create a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train-test split (70-30) without shuffling (preserve time series order)
X_train, _ , y_train, _ = train_test_split(X, y, test_size=0.3, shuffle=False)

# TimeSeriesCrossValidator object 
tscv = cross_validator.TimeSeriesCrossValidator(X_train, y_train)

# Evaluate the model
scores = tscv.evaluate(rf)

print('Scores:', scores)
print('Mean Score:', np.mean(scores))

# Create a confusion matrix
y_pred = rf.predict(X_test)
confusion_matrix(y_test, y_pred)

Scores: [0.9732888146911519, 0.9717813051146384, 0.957169459962756, 0.9599198396793587, 0.967391304347826]
Mean Score: 0.9659101447591463


array([[1046,    9],
       [  41,    6]], dtype=int64)

In [22]:
# import roc_auc_score
from sklearn.metrics import roc_auc_score
print('ROC AUC score:')
print(roc_auc_score(y_test, y_pred))

ROC AUC score:
0.5595643843904407
