In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# XGBoost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, accuracy_score
# CatBoost
from catboost import CatBoostClassifier
# SVM
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
collected_data = pd.read_csv('data/collected_data.csv') # Read the CSV file
data = collected_data.copy()
# Setting column 'date' as index.
data = data.set_index('date', drop=True)
del collected_data

data

Unnamed: 0_level_0,sahm,indpro,sp500,tr10,t10yff,unrate,pcepi,payems,houst,recession
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1962-02-01,-0.17,0.016229,0.016139,-0.043737,1.650556,0.055,0.042,0.827913,0.396825,0
1962-03-01,-0.17,0.005350,-0.005878,-0.108990,1.078182,0.056,0.021,0.819544,0.478671,0
1962-04-01,-0.10,0.002130,-0.063973,-0.087455,1.043000,0.056,0.018,0.829109,0.518849,0
1962-05-01,-0.07,-0.001066,-0.089914,0.030636,1.441818,0.055,0.010,0.817113,0.498512,0
1962-06-01,0.00,-0.002132,-0.085381,0.035411,1.206667,0.055,0.010,0.816714,0.459325,0
...,...,...,...,...,...,...,...,...,...,...
2024-04-01,0.37,-0.000690,-0.042506,0.330591,-0.790909,0.039,0.322,0.820381,0.445933,0
2024-05-01,0.37,0.007466,0.046904,-0.056818,-0.847727,0.040,-0.010,0.824685,0.415179,0
2024-06-01,0.43,0.000622,0.034082,-0.177010,-1.024737,0.041,0.145,0.820780,0.422123,0
2024-07-01,0.53,-0.009475,0.011258,-0.056627,-1.081364,0.043,0.189,0.819624,0.376488,0


In [3]:
# get X and y 
X = data.drop(['recession'], axis=1)
y = data['recession']

# We define the training period.
X_train, y_train = X.loc["1962-02-01":"2012-12-01"], y.loc["1962-02-01":"2012-12-01"]
# We define the test period.
# X_test, y_test = X.loc["2013-01-01":], y.loc["2013-01-01":]
# y_train.describe()

### Metrics (repeat from previous file)

Here are the main metrics we can use to get the **final score** for each **model**:

1. **Precision**: It measures the proportion of true positive predictions relative to all positive predictions. It is **important** when you **want to minimize false positives**.<br>Precision = TP/(TP+FP)
   
2. **Recall** (Sensitivity): It measures the proportion of true positive predictions to all actual positive cases. It is useful when you want to minimize false negatives.<br>Recall = TP/(TP+FN)
   
3. **F1-score**: Is a metric that balances precision and recall. It is calculated as the harmonic mean of precision and recall. F1 Score is useful when seeking a balance between high precision and high recall, as it penalizes extreme negative values of either component.<br>F1 = 2\*Precision\*Recall/(Precision+Recall)
   
4. **Confusion Matrix**: Visualizes true and predicted classes, which can help better understand model performance.
    
5. **Specificity**: It measures the proportion of true negative predictions relative to all actual negative cases. It is useful when you want to **minimize false positives**.<br>Specificity = TN/(TN+FP)
    
6. **Accuracy**: It measures the proportion of correctly predicted cases (both positive and negative) relative to the total number of cases. Accuracy **can be misleading with unbalanced data**, as it can be high even if the model does not predict the small class well.

### 5. XGBoost

XGBoost has the **scale_pos_weight** parameter which can correct the imbalance between classes. This makes it one of the best models for unbalanced data. One of the most suitable models **to deal with severe imbalance**.

In [6]:
# Дефиниране на параметрите за търсене
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'scale_pos_weight': [1, 3, 5]  # За балансиране на класовете
}

# Създаване на XGBoost класификатор
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss'
)

# Дефиниране на метрики за оценка
scoring = {
    'f1': make_scorer(f1_score),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
    'accuracy': make_scorer(accuracy_score)
}

# Създаване на GridSearchCV обект
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',  # Оптимизираме спрямо F1-score
    cv=5,
    verbose=0,
    n_jobs=-1
)

# Изпълнение на търсенето
grid_search.fit(X_train, y_train)

# Извеждане на най-добрите параметри и резултати
print("Най-добри параметри:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

print("\nНай-добри резултати:")
print(f"F1 Score: {grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]:.4f}")
print(f"Recall: {grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]:.4f}")
print(f"Precision: {grid_search.cv_results_['mean_test_precision'][grid_search.best_index_]:.4f}")
print(f"Accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")

Най-добри параметри:
colsample_bytree: 0.9
gamma: 0.1
learning_rate: 0.05
max_depth: 6
min_child_weight: 3
n_estimators: 100
scale_pos_weight: 1
subsample: 1.0

Най-добри резултати:
F1 Score: 0.8557
Recall: 0.8088
Precision: 0.9267
Accuracy: 0.9657


### 6. CatBoost

CatBoost also supports imbalance correction parameters (**class_weights**) and can automatically detect imbalance in data. Good **for dealing with unbalanced classes**.

In [8]:
# Дефиниране на параметрите за търсене
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128],
    'bagging_temperature': [0, 1],
    'random_strength': [1, 10],
    # Автоматично балансиране на класовете
    'auto_class_weights': ['Balanced'],
    # Можете да използвате и конкретни тегла, например:
    # 'class_weights': [[1, 2], [1, 3], [1, 4]],
}

# Създаване на CatBoost класификатор
cat_classifier = CatBoostClassifier(
    eval_metric='F1',
    random_seed=42,
    verbose=False,  # Изключваме verbose output
    thread_count=-1  # Използване на всички налични процесори
)

# Дефиниране на метрики за оценка
scoring = {
    'f1': make_scorer(f1_score),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
    'accuracy': make_scorer(accuracy_score)
}

# Създаване на GridSearchCV обект
grid_search = GridSearchCV(
    estimator=cat_classifier,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',  # Оптимизираме спрямо F1-score
    cv=5,
    verbose=0,
    n_jobs=1  # CatBoost има собствена паралелизация
)

# Изпълнение на търсенето
grid_search.fit(X_train, y_train)

# Извеждане на най-добрите параметри и резултати
print("Най-добри параметри:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

print("\nНай-добри резултати:")
print(f"F1 Score: {grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]:.4f}")
print(f"Recall: {grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]:.4f}")
print(f"Precision: {grid_search.cv_results_['mean_test_precision'][grid_search.best_index_]:.4f}")
print(f"Accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")

Най-добри параметри:
auto_class_weights: Balanced
bagging_temperature: 0
border_count: 32
depth: 8
iterations: 200
l2_leaf_reg: 1
learning_rate: 0.05
random_strength: 10

Най-добри резултати:
F1 Score: 0.8675
Recall: 0.9051
Precision: 0.8575
Accuracy: 0.9624


### 7. SVM

SVM can handle unbalanced classes by using the **class_weight='balanced'** parameter. However, with highly unbalanced data, SVM may not be the best solution. It works well with moderate imbalance, but **may struggle with more imbalance**.

In [10]:
# Стандартизация на данните (важно за SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Дефиниране на параметрите за търсене
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto', 0.1, 0.01],
    'class_weight': ['balanced', {0:1, 1:2}, {0:1, 1:3}],  # Различни опции за балансиране
    'random_state': [42]
}

# Създаване на SVM класификатор
svm_classifier = SVC(
    probability=True,  # Нужно за някои метрики
    max_iter=1000,    # Увеличаваме максималния брой итерации
)

# Дефиниране на метрики за оценка
scoring = {
    'f1': make_scorer(f1_score),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
    'accuracy': make_scorer(accuracy_score)
}

# Създаване на GridSearchCV обект
grid_search = GridSearchCV(
    estimator=svm_classifier,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',      # Оптимизираме спрямо F1-score
    cv=5,
    verbose=0,
    n_jobs=-1        # Използване на всички налични процесори
)

# Изпълнение на търсенето
grid_search.fit(X_train_scaled, y_train)

# Извеждане на най-добрите параметри и резултати
print("Най-добри параметри:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

print("\nНай-добри резултати:")
print(f"F1 Score: {grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]:.4f}")
print(f"Recall: {grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]:.4f}")
print(f"Precision: {grid_search.cv_results_['mean_test_precision'][grid_search.best_index_]:.4f}")
print(f"Accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")

# Извеждане на допълнителна информация за дисбаланса в данните
unique, counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("\nРазпределение на класовете в тренировъчните данни:")
for class_label, count in class_distribution.items():
    print(f"Клас {class_label}: {count} примера ({count/len(y_train)*100:.2f}%)")

Най-добри параметри:
C: 10
class_weight: {0: 1, 1: 2}
gamma: 0.01
kernel: rbf
random_state: 42

Най-добри резултати:
F1 Score: 0.7677
Recall: 0.7978
Precision: 0.7810
Accuracy: 0.9330

Разпределение на класовете в тренировъчните данни:
Клас 0: 528 примера (86.42%)
Клас 1: 83 примера (13.58%)
