In [31]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import (
    KFold,
    ShuffleSplit,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    median_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
    make_scorer
)
from tqdm import tqdm
from scipy.stats import uniform

# Part 1: Data preparation

In [47]:
df_train = pd.read_csv('train_fe.csv')
print(df_train.shape)
print(df_train.columns)
print(df_train.head(10))
df_train.info()

(3993, 116)
Index(['Unnamed: 0', 'Ticker', 'Nazwa2', 'rok', 'ta', 'txt', 'pi', 'str',
       'xrd', 'ni',
       ...
       'intan_ma', 'ppe_ma', 'sale_ma', 'cash_holdings_ma', 'roa_past',
       'lev_past', 'intan_past', 'ppe_past', 'sale_past',
       'cash_holdings_past'],
      dtype='object', length=116)
   Unnamed: 0         Ticker             Nazwa2   rok         ta      txt  \
0           0  11B PW Equity  11 bit studios SA  2005  21.127613  1.24185   
1           1  11B PW Equity  11 bit studios SA  2006  21.127613  1.24185   
2           2  11B PW Equity  11 bit studios SA  2007  21.127613  1.24185   
3           3  11B PW Equity  11 bit studios SA  2008  21.127613  1.24185   
4           4  11B PW Equity  11 bit studios SA  2009  21.127613  1.24185   
5           5  11B PW Equity  11 bit studios SA  2010   1.352400 -0.05370   
6           6  11B PW Equity  11 bit studios SA  2011   2.986200  0.29240   
7           7  11B PW Equity  11 bit studios SA  2012   7.336500  0.34030

In [33]:
df_test = pd.read_csv('test_fe.csv')
print(df_test.shape)
print(df_test.head(10))

(363, 116)
   Unnamed: 0         Ticker                        Nazwa2   rok  \
0          11  11B PW Equity             11 bit studios SA  2016   
1          23  1AT PW Equity                Atal SA/Poland  2016   
2          35  4FM PW Equity                 4Fun Media SA  2016   
3          47  AAL LN Equity            Anglo American PLC  2016   
4          59  ABC PW Equity                   ABC Data SA  2016   
5          71  ABE PW Equity                         AB SA  2016   
6          83  ABF LN Equity  Associated British Foods PLC  2016   
7          95  ABS PW Equity  Asseco Business Solutions SA  2016   
8         107   AC FP Equity                      Accor SA  2016   
9         119  ACG PW Equity                         Ac SA  2016   

             ta          txt           pi     str     xrd            ni  ...  \
0  4.564940e+01     2.801500    15.730800  0.1900     0.0     12.929300  ...   
1  1.513553e+03    15.915000   109.042999  0.1900     0.0     89.442001  ...   


In [48]:
conditions = [
    df_train['etr'] > 0.25,
    df_train['etr'] <= 0.15
]

choices = [0, 2]

df_train['etr_classification'] = np.select(conditions, choices, default=1)

In [49]:
etr_summary = df_train.groupby('etr_classification').size()
etr_summary_pct = (etr_summary / len(df_train) * 100).round(2)

print("Classification Summary:")
for class_val, count in etr_summary.items():
    pct = etr_summary_pct[class_val]
    print(f"Class {class_val}: {count} records ({pct}%)")

Classification Summary:
Class 0: 1267 records (31.73%)
Class 1: 1668 records (41.77%)
Class 2: 1058 records (26.5%)


There exists imbalance in the dataset, with class1 having significantly more samples than class2.

In [50]:
conditions = [
    df_test['etr'] > 0.25,
    df_test['etr'] <= 0.15
]

choices = [0, 2]

df_test['etr_classification'] = np.select(conditions, choices, default=1)

In [51]:
from sklearn.feature_selection import f_classif
X_train = df_train.drop(columns=['etr', 'etr_classification','Ticker','Nazwa2',"Unnamed: 0"])
y_train = df_train['etr_classification']

f_scores, p_values = f_classif(X_train, y_train)

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'f_score': f_scores,
    'p_value': p_values
}).sort_values('f_score', ascending=False)

print("Feature importance for multi-class target (ANOVA F-value):")
print(feature_importance)

significant_features = feature_importance[feature_importance['p_value'] < 0.05]['feature'].tolist()
print(f"\nSignificant features (p < 0.05): {significant_features}")

top_k_features = feature_importance.head(10)['feature'].tolist()
print(f"Top 10 features: {top_k_features}")

Feature importance for multi-class target (ANOVA F-value):
                     feature     f_score        p_value
98                  etr_y_ma  473.525179  2.972696e-185
63   str_cat_(0.0875, 0.192]  454.959863  1.032622e-178
32                  WB_GDPpc  408.775301  3.182863e-162
4                        str  380.545761  5.471431e-152
65       str_cat_(0.28, inf]  364.313099  4.773705e-146
..                       ...         ...            ...
38             sektor_energy    0.526236   5.908658e-01
48                  gielda_5    0.510716   6.001050e-01
74  dltt_cat_(39.38, 327.85]    0.428391   6.515865e-01
93        cash_holdings_sqrt    0.350097   7.046412e-01
30            y_BR_Democracy         NaN            NaN

[112 rows x 3 columns]

Significant features (p < 0.05): ['etr_y_ma', 'str_cat_(0.0875, 0.192]', 'WB_GDPpc', 'str', 'str_cat_(0.28, inf]', 'cfc', 'etr_y_past', 'revenue_cat_(0.174, 1248.817]', 'xrd_exists', 'ta_log', 'cce_cat_(63.321, inf]', 'txt_cat_(0.488, 24.415]',

  f = msb / msw


In [52]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X_train, y_train, random_state=42)

mi_importance = pd.DataFrame({
    'feature': X_train.columns,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print("\nTop features by Mutual Information:")
print(mi_importance.head(15))

top_features_mi = mi_importance.head(20)['feature'].tolist()


Top features by Mutual Information:
              feature  mi_score
97         etr_y_past  0.293676
98           etr_y_ma  0.283371
99            diff_ma  0.178419
2                 txt  0.172439
102          intan_ma  0.169218
70        intant_sqrt  0.167975
15               diff  0.167096
8              intant  0.166209
6                  ni  0.165523
3                  pi  0.163908
104           sale_ma  0.144568
105  cash_holdings_ma  0.138192
103            ppe_ma  0.138118
1                  ta  0.136832
12            revenue  0.136394


In [77]:
def combined_feature_ranking(X_train, y_train, top_k=15):
    f_scores, p_values = f_classif(X_train, y_train)

    mi_scores = mutual_info_classif(X_train, y_train, random_state=42)

    combined_df = pd.DataFrame({
        'feature': X_train.columns,
        'f_score': f_scores,
        'f_rank': pd.Series(f_scores).rank(ascending=False),
        'mi_score': mi_scores,
        'mi_rank': pd.Series(mi_scores).rank(ascending=False),
        'p_value': p_values
    })

    combined_df['combined_rank'] = combined_df['f_rank'] + combined_df['mi_rank']
    combined_df = combined_df.sort_values('combined_rank')

    significant_combined = combined_df[
        (combined_df['p_value'] < 0.05) & 
        (combined_df['combined_rank'] <= top_k * 2)  
    ].sort_values('combined_rank')
    
    return significant_combined.head(top_k)

selected_features_df = combined_feature_ranking(X_train, y_train, top_k=25)
selected_features = selected_features_df['feature'].tolist()
print(f"Selected features: {selected_features}")

  f = msb / msw


Selected features: ['etr_y_ma', 'etr_y_past', 'str', 'intant_sqrt', 'ta_log', 'txt', 'intant', 'str_cat_(0.0875, 0.192]', 'WB_GDPpc', 'intan_ma', 'ppe_ma', 'str_cat_(0.28, inf]', 'cfc']


In [78]:
selected_X = X_train[selected_features]
corr_matrix = selected_X.corr().abs()

high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] > 0.8:  
            high_corr_pairs.append((
                corr_matrix.columns[i],
                corr_matrix.columns[j],
                corr_matrix.iloc[i, j]
            ))

print(f"Highly correlated feature pairs: {len(high_corr_pairs)}")
for pair in high_corr_pairs:
    print(f"  {pair[0]} - {pair[1]}: {pair[2]:.3f}")

Highly correlated feature pairs: 3
  str - str_cat_(0.28, inf]: 0.847
  intant_sqrt - intant: 0.923
  str_cat_(0.0875, 0.192] - WB_GDPpc: 0.943


As the score of str_cat_(0.0875, 0.192] is higher than WB_GDPpc, I will keep str_cat_(0.0875, 0.192] and drop WB_GDPpc.

In [84]:
selected_features_final = ['etr_y_ma', 'etr_y_past', 'ta_log', 'txt', 'intant', 'str_cat_(0.0875, 0.192]', 'intan_ma', 'ppe_ma', 'cfc','str_cat_(0.28, inf]']

X_train_selected = df_train[selected_features_final]

In [85]:
X_test_selected = df_test[selected_features_final]
y_test = df_test['etr_classification']

# Part 2: Model training

## Logistic Regression

In [86]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

In [87]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
multi_lr = LogisticRegression(
    multi_class='multinomial',  
    solver='lbfgs',            
    max_iter=1000,             
    random_state=42,
    C=1.0                      
)

multi_lr.fit(X_train_scaled, y_train)

y_pred = multi_lr.predict(X_test_scaled)
y_pred_proba = multi_lr.predict_proba(X_test_scaled)

print("Logistic regression result:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Logistic regression result:
Accuracy: 0.5813

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.63      0.56       103
           1       0.73      0.59      0.65       195
           2       0.41      0.48      0.44        65

    accuracy                           0.58       363
   macro avg       0.55      0.57      0.55       363
weighted avg       0.61      0.58      0.59       363


Confusion Matrix:
[[ 65  25  13]
 [ 49 115  31]
 [ 17  17  31]]




Class 1 performs the best in terms of precision and F1-score. The overall accuray is 0.5813, which is not very high. The model seems to struggle with class 0 and class 2.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.model_selection import TimeSeriesSplit, cross_val_score

def complete_modeling_pipeline_timeseries(X_train_selected, y_train, X_test_selected, y_test, n_splits=5):

    tscv = TimeSeriesSplit(n_splits=n_splits)

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_selected)):
        print(f"  Fold {fold+1}: Train Sample {len(train_idx)}, Validation Sample {len(val_idx)}")
    
    # 1. Hyperparameter tuning with Time Series CV
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(multi_class='multinomial', random_state=42, max_iter=1000))
    ])
    
    param_grid = {
        'logreg__C': [0.01, 0.1, 1, 10],
        'logreg__solver': ['lbfgs', 'newton-cg'],
        'logreg__class_weight': [None, 'balanced']
    }
    
    grid_search = GridSearchCV(
        pipeline, 
        param_grid, 
        cv=tscv,  
        scoring='accuracy', 
        n_jobs=-1,
        verbose=1
    )
    
    grid_search.fit(X_train_selected, y_train)
    
    print(f"   Best parameter: {grid_search.best_params_}")
    print(f"   Best time series CV score: {grid_search.best_score_:.4f}")

    best_lr = grid_search.best_estimator_

    cv_scores = cross_val_score(best_lr, X_train_selected, y_train, cv=tscv)
    print(f"   Time series CV accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

    for i, score in enumerate(cv_scores):
        print(f"     Fold {i+1}: {score:.4f}")
    
    # 2. Final evaluation on the test set
    print("Performance on Test Set:")

    y_pred = best_lr.predict(X_test_selected)

    print(f"Accuracy:                {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision (weighted):    {precision_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"Recall (weighted):       {recall_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"F1-Score (weighted):     {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    return best_model, grid_search, cv_scores

best_model, grid_search, cv_scores = complete_modeling_pipeline_timeseries(
    X_train_selected, y_train, X_test_selected, y_test, n_splits=5
)

  Fold 1: Train Sample 668, Validation Sample 665
  Fold 2: Train Sample 1333, Validation Sample 665
  Fold 3: Train Sample 1998, Validation Sample 665
  Fold 4: Train Sample 2663, Validation Sample 665
  Fold 5: Train Sample 3328, Validation Sample 665
Fitting 5 folds for each of 16 candidates, totalling 80 fits
   Best parameter: {'logreg__C': 0.01, 'logreg__class_weight': None, 'logreg__solver': 'newton-cg'}
   Best time series CV score: 0.6081
   Time series CV accuracy: 0.6081 (±0.0232)
     Fold 1: 0.6060
     Fold 2: 0.6406
     Fold 3: 0.6105
     Fold 4: 0.5684
     Fold 5: 0.6150
Performance on Test Set:
Accuracy:                0.5868
Precision (weighted):    0.6109
Recall (weighted):       0.5868
F1-Score (weighted):     0.5926

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.63      0.55       103
           1       0.73      0.60      0.66       195
           2       0.45      0.48      0.46        65

    accu

The model improves slightly after hyperparameter tuning, but the overall performance is still not very satisfactory.

## K-Nearest Neighbors Classifier

In [96]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=8)  

knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
y_pred_proba = knn.predict_proba(X_test_scaled)

print("K-Nearest Neighbors result:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

K-Nearest Neighbors result:
Accuracy: 0.5950

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.56      0.56       103
           1       0.69      0.65      0.67       195
           2       0.43      0.49      0.46        65

    accuracy                           0.60       363
   macro avg       0.56      0.57      0.56       363
weighted avg       0.60      0.60      0.60       363


Confusion Matrix:
[[ 58  32  13]
 [ 40 126  29]
 [  8  25  32]]


This model performs slightly better than logistic regression in terms of accuracy, but still struggles with class 0 and class 2.

In [99]:
pip install imblearn

Collecting imblearnNote: you may need to restart the kernel to use updated packages.

  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.14.0 imblearn-0.0


In [103]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Step 1: Create imbalanced data handling strategies

# Strategy 1: SMOTE (Synthetic Minority Over-sampling Technique)
print("\nStrategy 1: Using SMOTE for oversampling minority classes...")
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print(f"Original training set size: {X_train_scaled.shape[0]}")
print(f"After SMOTE: {X_train_smote.shape[0]}")
print("Class distribution after SMOTE:")
unique, counts = np.unique(y_train_smote, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"  Class {cls}: {count} records ({count/len(y_train_smote)*100:.2f}%)")

# Strategy 2: Combination of SMOTE and undersampling
print("\n\nStrategy 2: Using SMOTE + Random Undersampling...")
# First oversample minority classes to match majority class (1668)
sampling_strategy_oversample = {0: 1668, 2: 1668}
# Then undersample majority class
sampling_strategy_undersample = {1: 1400, 0: 1400, 2: 1400}
smote_combined = SMOTE(sampling_strategy=sampling_strategy_oversample, random_state=42)
under = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=42)
X_train_combined, y_train_combined = smote_combined.fit_resample(X_train_scaled, y_train)
X_train_combined, y_train_combined = under.fit_resample(X_train_combined, y_train_combined)
print(f"After SMOTE + Undersampling: {X_train_combined.shape[0]}")
print("Class distribution after combined resampling:")
unique, counts = np.unique(y_train_combined, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"  Class {cls}: {count} records ({count/len(y_train_combined)*100:.2f}%)")

# Step 2: Define hyperparameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Step 3: Use GridSearchCV with TimeSeriesSplit for hyperparameter tuning
tscv = TimeSeriesSplit(n_splits=5)
print("Hyperparameter Tuning with SMOTE-resampled Data")

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=tscv,
    scoring='f1_weighted',  # Using F1-weighted score for imbalanced data
    n_jobs=-1,
    verbose=1
)

print("\nPerforming hyperparameter tuning...")
grid_search.fit(X_train_smote, y_train_smote)

# Print best parameters
print("\n" + "="*60)
print("Best Hyperparameters:")
print(grid_search.best_params_)
print(f"Best cross-validation F1-weighted score: {grid_search.best_score_:.4f}")
print("="*60 + "\n")

# Step 4: Get best model
best_knn = grid_search.best_estimator_

# Step 5: Detailed cross-validation evaluation on training set
print("Cross-Validation Results on Training Set (TimeSeriesSplit, 5 folds):")
cv_results = cross_validate(
    best_knn,
    X_train_smote,
    y_train_smote,
    cv=tscv,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    return_train_score=True
)

print(f"Training Accuracy:      {cv_results['train_accuracy'].mean():.4f} (+/- {cv_results['train_accuracy'].std():.4f})")
print(f"Validation Accuracy:    {cv_results['test_accuracy'].mean():.4f} (+/- {cv_results['test_accuracy'].std():.4f})")
print(f"Validation Precision:   {cv_results['test_precision_weighted'].mean():.4f} (+/- {cv_results['test_precision_weighted'].std():.4f})")
print(f"Validation Recall:      {cv_results['test_recall_weighted'].mean():.4f} (+/- {cv_results['test_recall_weighted'].std():.4f})")
print(f"Validation F1-Score:    {cv_results['test_f1_weighted'].mean():.4f} (+/- {cv_results['test_f1_weighted'].std():.4f})\n")

# Step 6: Evaluate on test set
print("Performance on Test Set:")

y_pred = best_knn.predict(X_test_scaled)
y_pred_proba = best_knn.predict_proba(X_test_scaled)

print(f"Accuracy:                {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision (weighted):    {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall (weighted):       {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1-Score (weighted):     {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 7: Print top 10 GridSearchCV results
print("Top 10 GridSearchCV Results:")
results_df = pd.DataFrame(grid_search.cv_results_)
top_results = results_df[['param_n_neighbors', 'param_weights', 'param_metric', 'mean_test_score', 'std_test_score']].head(10)
print(top_results.to_string(index=False))

# Step 8: Compare models trained on different resampled datasets
print("Comparison: Different Resampling Strategies")

# Train on SMOTE data
y_pred_smote = best_knn.predict(X_test_scaled)
f1_smote = f1_score(y_test, y_pred_smote, average='weighted')
print(f"\nSMOTE only - F1-weighted: {f1_smote:.4f}")

# Retrain on combined resampling
grid_search_combined = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=tscv,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=0
)
grid_search_combined.fit(X_train_combined, y_train_combined)
best_knn_combined = grid_search_combined.best_estimator_
y_pred_combined = best_knn_combined.predict(X_test_scaled)
f1_combined = f1_score(y_test, y_pred_combined, average='weighted')
print(f"SMOTE + Undersampling - F1-weighted: {f1_combined:.4f}")



Strategy 1: Using SMOTE for oversampling minority classes...
Original training set size: 3993
After SMOTE: 5004
Class distribution after SMOTE:
  Class 0: 1668 records (33.33%)
  Class 1: 1668 records (33.33%)
  Class 2: 1668 records (33.33%)


Strategy 2: Using SMOTE + Random Undersampling...
After SMOTE + Undersampling: 4200
Class distribution after combined resampling:
  Class 0: 1400 records (33.33%)
  Class 1: 1400 records (33.33%)
  Class 2: 1400 records (33.33%)
Hyperparameter Tuning with SMOTE-resampled Data

Performing hyperparameter tuning...
Fitting 5 folds for each of 42 candidates, totalling 210 fits

Best Hyperparameters:
{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
Best cross-validation F1-weighted score: 0.6787

Cross-Validation Results on Training Set (TimeSeriesSplit, 5 folds):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training Accuracy:      0.9890 (+/- 0.0007)
Validation Accuracy:    0.6705 (+/- 0.0608)
Validation Precision:   0.7043 (+/- 0.1118)
Validation Recall:      0.6705 (+/- 0.0608)
Validation F1-Score:    0.6787 (+/- 0.0861)

Performance on Test Set:
Accuracy:                0.5978
Precision (weighted):    0.6342
Recall (weighted):       0.5978
F1-Score (weighted):     0.6078

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.56      0.58       103
           1       0.73      0.61      0.67       195
           2       0.38      0.62      0.47        65

    accuracy                           0.60       363
   macro avg       0.57      0.60      0.57       363
weighted avg       0.63      0.60      0.61       363

Confusion Matrix:
[[ 58  25  20]
 [ 31 119  45]
 [  7  18  40]]
Top 10 GridSearchCV Results:
 param_n_neighbors param_weights param_metric  mean_test_score  std_test_score
                 3       uniform    euclidean    

KNN also improves after hyperparameter tuning.

## SVC

In [107]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Step 1: Train a basic SVC model
svc = SVC(kernel='rbf', random_state=42, probability=True)
svc.fit(X_train_scaled, y_train)

# Step 2: Make predictions
y_pred = svc.predict(X_test_scaled)
y_pred_proba = svc.predict_proba(X_test_scaled)

print("SVC Performance on Test Set:")

print(f"Accuracy:                {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision (weighted):    {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall (weighted):       {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1-Score (weighted):     {f1_score(y_test, y_pred, average='weighted'):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

SVC Performance on Test Set:
Accuracy:                0.6364
Precision (weighted):    0.6561
Recall (weighted):       0.6364
F1-Score (weighted):     0.6435

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.60      0.61       103
           1       0.76      0.69      0.72       195
           2       0.41      0.54      0.46        65

    accuracy                           0.64       363
   macro avg       0.59      0.61      0.60       363
weighted avg       0.66      0.64      0.64       363

Confusion Matrix:
[[ 62  22  19]
 [ 29 134  32]
 [ 10  20  35]]


In [110]:
# Step 1: Handle class imbalance with SMOTE
print("Handling Class Imbalance with SMOTE")

smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"Original training set size: {X_train_scaled.shape[0]}")
print(f"After SMOTE: {X_train_smote.shape[0]}")
print("Class distribution after SMOTE:")
unique, counts = np.unique(y_train_smote, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"  Class {cls}: {count} records ({count/len(y_train_smote)*100:.2f}%)")

# Step 2: Define hyperparameter grid for SVC
param_grid = {
    'C': [1,10],
    'kernel': ['rbf', 'poly'],
    'gamma': ['scale'],
}

# Step 3: Use GridSearchCV with TimeSeriesSplit for hyperparameter tuning
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    SVC(random_state=42, probability=True),
    param_grid,
    cv=tscv,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_smote, y_train_smote)

# Print best parameters
print("Best Hyperparameters:")
print(grid_search.best_params_)
print(f"Best cross-validation F1-weighted score: {grid_search.best_score_:.4f}")

# Step 4: Get best model
best_svc = grid_search.best_estimator_

# Step 5: Detailed cross-validation evaluation on training set
print("Cross-Validation Results on Training Set")
print("(TimeSeriesSplit, 5 folds)")

cv_results = cross_validate(
    best_svc,
    X_train_smote,
    y_train_smote,
    cv=tscv,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    return_train_score=True
)

print("\nTraining Set Performance:")
print(f"  Accuracy:   {cv_results['train_accuracy'].mean():.4f} (+/- {cv_results['train_accuracy'].std():.4f})")
print(f"  Precision:  {cv_results['train_precision_weighted'].mean():.4f} (+/- {cv_results['train_precision_weighted'].std():.4f})")
print(f"  Recall:     {cv_results['train_recall_weighted'].mean():.4f} (+/- {cv_results['train_recall_weighted'].std():.4f})")
print(f"  F1-Score:   {cv_results['train_f1_weighted'].mean():.4f} (+/- {cv_results['train_f1_weighted'].std():.4f})")

print("\nValidation Set Performance:")
print(f"  Accuracy:   {cv_results['test_accuracy'].mean():.4f} (+/- {cv_results['test_accuracy'].std():.4f})")
print(f"  Precision:  {cv_results['test_precision_weighted'].mean():.4f} (+/- {cv_results['test_precision_weighted'].std():.4f})")
print(f"  Recall:     {cv_results['test_recall_weighted'].mean():.4f} (+/- {cv_results['test_recall_weighted'].std():.4f})")
print(f"  F1-Score:   {cv_results['test_f1_weighted'].mean():.4f} (+/- {cv_results['test_f1_weighted'].std():.4f})")

# Step 6: Evaluate on test set
print("Performance on Test Set:")

y_pred = best_svc.predict(X_test_scaled)
y_pred_proba = best_svc.predict_proba(X_test_scaled)

print(f"Accuracy:                {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision (weighted):    {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall (weighted):       {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1-Score (weighted):     {f1_score(y_test, y_pred, average='weighted'):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 7: Print top 10 GridSearchCV results
print("Top 10 GridSearchCV Results:")
results_df = pd.DataFrame(grid_search.cv_results_)
top_results = results_df[['param_C', 'param_kernel', 'param_gamma', 'mean_test_score', 'std_test_score']].head(10)
print(top_results.to_string(index=False))

Handling Class Imbalance with SMOTE
Original training set size: 3993
After SMOTE: 5004
Class distribution after SMOTE:
  Class 0: 1668 records (33.33%)
  Class 1: 1668 records (33.33%)
  Class 2: 1668 records (33.33%)
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Hyperparameters:
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation F1-weighted score: 0.6543
Cross-Validation Results on Training Set
(TimeSeriesSplit, 5 folds)

Training Set Performance:
  Accuracy:   0.7471 (+/- 0.0226)
  Precision:  0.7486 (+/- 0.0232)
  Recall:     0.7471 (+/- 0.0226)
  F1-Score:   0.7457 (+/- 0.0224)

Validation Set Performance:
  Accuracy:   0.6391 (+/- 0.0432)
  Precision:  0.6873 (+/- 0.1198)
  Recall:     0.6391 (+/- 0.0432)
  F1-Score:   0.6543 (+/- 0.0697)
Performance on Test Set:
Accuracy:                0.6253
Precision (weighted):    0.6625
Recall (weighted):       0.6253
F1-Score (weighted):     0.6359

Classification Report:
              precision    recall 

# 3. Final Evaluation

In [118]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model_comparison = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

# Logistic Regression
y_pred_lr = best_model.predict(X_test_selected)
model_comparison.loc[len(model_comparison)] = [
    'Logistic Regression',
    accuracy_score(y_test, y_pred_lr),
    precision_score(y_test, y_pred_lr, average='weighted'),
    recall_score(y_test, y_pred_lr, average='weighted'),
    f1_score(y_test, y_pred_lr, average='weighted')
]

# KNN
y_pred_knn = best_knn.predict(X_test_scaled)
model_comparison.loc[len(model_comparison)] = [
    'KNN',
    accuracy_score(y_test, y_pred_knn),
    precision_score(y_test, y_pred_knn, average='weighted'),
    recall_score(y_test, y_pred_knn, average='weighted'),
    f1_score(y_test, y_pred_knn, average='weighted')
]
# SVC

y_pred_svc = best_svc.predict(X_test_scaled)
model_comparison.loc[len(model_comparison)] = [
    'SVC',
    accuracy_score(y_test, y_pred_svc),
    precision_score(y_test, y_pred_svc, average='weighted'),
    recall_score(y_test, y_pred_svc, average='weighted'),
    f1_score(y_test, y_pred_svc, average='weighted')
]


print("\nModel Performance Comparison on Test Set\n")
print(model_comparison.to_string(index=False))

print("\nSorted by F1-Score (descending)\n")
print(model_comparison.sort_values(by='F1-Score', ascending=False).to_string(index=False))



Model Performance Comparison on Test Set

              Model  Accuracy  Precision   Recall  F1-Score
Logistic Regression  0.586777   0.610907 0.586777  0.592583
                KNN  0.597796   0.634247 0.597796  0.607792
                SVC  0.625344   0.662451 0.625344  0.635870

Sorted by F1-Score (descending)

              Model  Accuracy  Precision   Recall  F1-Score
                SVC  0.625344   0.662451 0.625344  0.635870
                KNN  0.597796   0.634247 0.597796  0.607792
Logistic Regression  0.586777   0.610907 0.586777  0.592583


SVC performs the best among the three models, with an accuracy of 0.6253 and better precision and F1-scores across all classes.

In [119]:
import pickle
import os

os.makedirs("model", exist_ok=True)

with open("model/best_svc.pkl", "wb") as f:
    pickle.dump(best_svc, f)