In [5]:
## Run with Python 3.12.1

## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR



In [None]:
# Load the training and evaluation feature tables from CSV.
# index_col=0: the first column of each file becomes the DataFrame index.
TRAIN_CSV = 'train_data_features_extracted_filtered.csv'
EVALUATION_CSV = 'evaluation_data_features_extracted_filtered.csv'

train_data = pd.read_csv(TRAIN_CSV, index_col=0)
evaluation_data = pd.read_csv(EVALUATION_CSV, index_col=0)

In [7]:
# Define the Support Vector Regressor (its hyper-parameters are set by the grid search)
svr = SVR()

# Create a pipeline: MinMax-scale the features, then fit the SVR.
# The step names ('scaler', 'svr') are the prefixes used in the parameter grid below.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # MinMax scaling for features
    ('svr', svr)                 # Support Vector Regressor
])

# Define the parameter grid for tuning.
# `degree` is only used by the polynomial kernel and is ignored by 'rbf', so the
# grid is split per kernel: crossing `degree` with 'rbf' would refit every RBF
# candidate three times with identical results.
C_VALUES = [1, 5, 8, 10, 20, 30]   # Regularization parameter
EPSILON_VALUES = [4, 6, 8]         # Epsilon parameter for the SVR

param_grid = [
    {
        'svr__kernel': ['rbf'],
        'svr__C': C_VALUES,
        'svr__epsilon': EPSILON_VALUES,
    },
    {
        'svr__kernel': ['poly'],
        'svr__C': C_VALUES,
        'svr__epsilon': EPSILON_VALUES,
        'svr__degree': [2, 3, 4],  # Polynomial degree (poly kernel only)
    },
]

In [None]:
# Random state used for the train/test split
random_state = 42

# Features are every column except the target column 'age'
X = train_data.drop(columns=['age'])
y = train_data['age']

# Hold out 20% of the samples as a test set (single split, fixed random state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Since the train_data is not balanced, weight each training sample according to its age group.
age_bins = [0, 20, 30, 45, 101]  # edges of the age groups
# right=False: the intervals are left-closed, right-open: [0, 20), [20, 30), [30, 45), [45, 101)
y_train_bins = pd.cut(y_train, bins=age_bins, right=False)

# pd.cut returns NaN for values outside the bin range (and for missing values);
# those would turn into NaN sample weights, so fail early with a clear message.
if y_train_bins.isna().any():
    n_unbinned = int(y_train_bins.isna().sum())
    raise ValueError(
        f"{n_unbinned} training sample(s) have an age that is missing or outside "
        f"[{age_bins[0]}, {age_bins[-1]}); cannot assign sample weights."
    )

# Fraction of training samples in each age group (Series indexed by age group)
group_counts = y_train_bins.value_counts() / len(y_train)
# Weight of a group = 1 - its relative frequency, so rarer groups get larger weights
group_weights = 1 - (group_counts / sum(group_counts))

# Map the group weight onto each sample; cast to float because mapping a
# categorical Series can itself return a categorical-dtype Series.
train_sample_weights = y_train_bins.map(group_weights).astype(float)

# Create and fit the GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,                              # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # Metric for evaluation
    n_jobs=-1                          # Use all available cores
)

# The 'svr__' prefix targets the fit of the pipeline step named 'svr'
grid_search.fit(X_train, y_train, svr__sample_weight=train_sample_weights)

# Get the best model and make predictions on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate RMSE on the test set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best Parameters: {grid_search.best_params_}")
print(f"RMSE: {rmse}\n")


In [None]:
# Scatter the predicted ages against the true ages (y_test vs. y_pred)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed red diagonal: points on this line are predicted exactly right
ax.plot([0, 100], [0, 100], color='red', linestyle='--')

ax.set_xlabel('True Age', fontsize=22)
ax.set_ylabel('Predicted Age', fontsize=22)
ax.tick_params(axis='both', labelsize=18)
ax.grid(True)

# Write the figure to disk before showing it
fig.savefig('SVR.png', dpi=300)
plt.show()

In [None]:
## Evaluation data

# Predict on the evaluation features with the best pipeline found by the grid search,
# then wrap the predictions in a single-column DataFrame.
eval_predictions = best_model.predict(evaluation_data)
y_eval = pd.DataFrame({'Predicted': eval_predictions})
display(y_eval)

# Save the predictions to a CSV file; the frame's index is written as the 'Id' column
y_eval.to_csv('evaluation_predictions.csv', index_label='Id', index=True)