In [1]:
import pandas as pd

In [2]:
data_path = '../final_data.csv'
df = pd.read_csv(data_path)

## Data preparation:

### Splitting data into train and test:

In [3]:
def split_data(df, year_split=2022):
    """
    Split the data into training and test sets based on the year column.
    
    Parameters:
    - df: DataFrame containing the full dataset.
    - target_column: The name of the target variable column.
    - year_split: The year to split on (default is 2022).
    
    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.
    """
    train_data = df[df['Time'] <= year_split]
    test_data = df[df['Time'] > year_split]
    
    return train_data, test_data

In [4]:
train_data, test_data = split_data(df)
print(f"train_data's shape: {train_data.shape}")
print(f"test_data's shape: {test_data.shape}")

train_data's shape: (324, 33)
test_data's shape: (72, 33)


### Scaling data using StandardScaler:

In [5]:
import joblib

def scale_data(data, scaler_path):
    scaler = joblib.load(scaler_path)
    
    scaled_col = data.drop(['Company name', 'TSR', 'Time'], axis=1)
    
    data_scaled = scaler.transform(scaled_col)
    scaled_df = pd.DataFrame(data_scaled, columns=scaled_col.columns, index=data.index)
    final_data_scaled = data[['Company name', 'TSR', 'Time']].join(scaled_df)
    
    return final_data_scaled

In [6]:
scaler_path = './scaler_folder/standard_scaler.joblib'

In [7]:
train_data_scaled = scale_data(train_data, scaler_path)
test_data_scaled = scale_data(test_data, scaler_path)

## Models defining:

### Support Vector Regression definition:

In [8]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import r2_score
# Define
def svr(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    
    param_grid = {
        'kernel': ['rbf', 'linear'],  
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.2, 0.5],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
    }
    X_train = train_data[feature_columns]
    y_train = train_data[target_column]
    
    svr = SVR()
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Calculate R² on the training data
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    return best_model, best_params, r2_train

### Recurrent Support Vector Machine Regression definition:

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Use rnn model to generate representations for each timestep
def create_rnn(input_shape):
    model = Sequential()
    model.add(LSTM(units=50, activation='relu', return_sequences=False, input_shape=input_shape))
    model.add(Dense(units=1, activation='linear')) 
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model



In [10]:
def rsvm(train_data, target_column, feature_columns, n_splits=5, scoring='neg_root_mean_squared_error'):
    # Convert data for sequence input
    X_train_seq = train_data[feature_columns].values.reshape((train_data.shape[0], 1, len(feature_columns)))
    y_train_seq = train_data[target_column].values
    
    # Train RNN
    rnn = create_rnn(input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]))
    rnn_model = rnn.fit(X_train_seq, y_train_seq, epochs=3, batch_size=16, verbose=2)

    # Extract representations from rnn model -> use as input for svr
    train_features = rnn.predict(X_train_seq)  
    print("Train features for SVR shape:", train_features.shape)
    
    # Set up params for svr
    param_grid = {
        'kernel': ['rbf', 'linear'],  
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.01, 0.1, 0.2, 0.5],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
    }
    svr = SVR()
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=tscv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(train_features, y_train_seq)
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Calculate R² on training data
    y_train_pred = best_model.predict(train_features)
    r2_train = r2_score(y_train_seq, y_train_pred)

    return best_model, best_params, r2_train, rnn


## Model Evaluation definition:

In [11]:
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def evaluate_model(model, test_data, target_column, feature_columns):
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]
    y_pred = model.predict(X_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse, f"The RMSE of the {model} is {rmse}"

In [12]:
def evaluate_rsvm_model(model, rnn, test_data, target_column, feature_columns):
    # Preprocess the test data to match the training format
    X_test_seq = test_data[feature_columns].values.reshape((test_data.shape[0], 1, len(feature_columns)))
    y_test = test_data[target_column].values
    
    # Get features from the RNN model
    test_features = rnn.predict(X_test_seq)  # Using the trained rnn model to get features
    
    # Evaluate the SVR model using the test features
    y_pred = model.predict(test_features)
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return rmse, f"The RMSE of the {model} is {rmse}"


### Model's dumping:

In [16]:
import joblib
def save_file(scaler, filename):
    joblib.dump(scaler, filename)
    return f"Saving {filename} successfully executed"

## Model training:

In [13]:
feature_columns = [col for col in train_data_scaled.columns if col != "TSR"]

### Support Vector Regression:

In [18]:
best_model, best_params, r2_train = svr(train_data_scaled, target_column="TSR", feature_columns=feature_columns)
# evaluation
rmse = evaluate_model(best_model, test_data_scaled, target_column="TSR", feature_columns=feature_columns)

print(f"\n Best Hyperparameters: {best_params}"),
print(f"\n R2 on Train-set: {r2_train}"),
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

Fitting 5 folds for each of 192 candidates, totalling 960 fits

 Best Hyperparameters: {'C': 1, 'epsilon': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}

 R2 on Train-set: -0.001394812972826065

 Root Mean Squared Error (RMSE) on Test Set: (0.09091483235003991, 'The RMSE of the SVR(C=1, gamma=0.001) is 0.09091483235003991')




In [23]:
svr_path = "./model_folder/svr.joblib"
save_file(best_model, svr_path)

'Saving ./model_folder/svr.joblib successfully executed'

### Recurrent Support Vector Machine Regression:

In [14]:
best_model, best_params, r2_train, rnn_model = rsvm(train_data_scaled, target_column="TSR", feature_columns=feature_columns)

# Evaluation
rmse, rmse_message = evaluate_rsvm_model(best_model, rnn_model, test_data_scaled, target_column="TSR", feature_columns=feature_columns)

# Output results
print(f"\n Best Hyperparameters: {best_params}")
print(f"\n R2 on Train-set: {r2_train}")
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")
print(rmse_message)

Epoch 1/3


  super().__init__(**kwargs)


21/21 - 2s - 104ms/step - loss: 451.3327
Epoch 2/3
21/21 - 0s - 5ms/step - loss: 14.9226
Epoch 3/3
21/21 - 0s - 5ms/step - loss: 10.9368
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Train features for SVR shape: (324, 1)
Fitting 5 folds for each of 192 candidates, totalling 960 fits
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

 Best Hyperparameters: {'C': 100, 'epsilon': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}

 R2 on Train-set: -0.001613857272398933

 Root Mean Squared Error (RMSE) on Test Set: 0.09182651751960685
The RMSE of the SVR(C=100, gamma=0.1) is 0.09182651751960685




In [17]:
rsvm_path = "./model_folder/rsvm.joblib"
save_file(best_model, rsvm_path)

'Saving ./model_folder/rsvm.joblib successfully executed'