In [1]:
## Run using Python 3.12.1

## Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import os

In [2]:
DATA_DIR = "../datasets"

# Locations of the pre-extracted feature tables, relative to DATA_DIR.
train_csv_path = os.path.join(DATA_DIR, "train_data_features_extracted_filtered.csv")
evaluation_csv_path = os.path.join(DATA_DIR, "evaluation_data_features_extracted_filtered.csv")

# The first CSV column is used as the DataFrame index for both tables.
train_data = pd.read_csv(train_csv_path, index_col=0)
evaluation_data = pd.read_csv(evaluation_csv_path, index_col=0)

In [5]:
# Model under tuning: a k-nearest-neighbors regressor.
# https://scikit-learn.org/1.6/modules/generated/sklearn.neighbors.KNeighborsRegressor.html
knn_regressor = KNeighborsRegressor()

# KNN is distance based, so features are standardized first; PCA then
# (optionally) reduces dimensionality before the neighbor search.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('knn', knn_regressor),
]
pipeline = Pipeline(steps=pipeline_steps)

# Search space for the grid search. Keys follow the
# "<pipeline step name>__<parameter name>" convention.
param_grid = dict(
    knn__n_neighbors=list(range(10, 36, 5)),            # 10, 15, ..., 35
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],                                      # 1 = Manhattan, 2 = Euclidean
    pca__n_components=[0.5, 0.65, 0.75, 0.85, None],    # explained-variance fraction; None keeps all components
)

In [None]:
# Seed shared by every randomized step in this cell.
random_state = 42

# Separate the regression target ('age') from the feature columns.
y = train_data['age']
X = train_data.drop(columns=['age'])

# Hold out 20% of the labeled rows; they are never seen by the grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Exhaustive search over param_grid, scored by (negated) MSE.
search_options = {
    'cv': 5,                               # 5-fold cross-validation
    'scoring': 'neg_mean_squared_error',   # higher (closer to 0) is better
    'verbose': 1,
    'n_jobs': -1,                          # use all available cores
}
grid_search = GridSearchCV(pipeline, param_grid, **search_options)
grid_search.fit(X_train, y_train)

# Score the best pipeline found by the search on the hold-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Parameters: {grid_search.best_params_}")
print(f"RMSE: {rmse}\n")

In [None]:
# Plot predicted vs true age for the held-out test split.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Identity line (perfect prediction). Its end points are taken from the data
# so the reference always spans every plotted point, instead of assuming a
# fixed 0-100 age range.
age_min = min(np.min(y_test), np.min(y_pred))
age_max = max(np.max(y_test), np.max(y_pred))
ax.plot([age_min, age_max], [age_min, age_max], color='red', linestyle='--')

ax.set_xlabel('True Age')
ax.set_ylabel('Predicted Age')
ax.set_title('Predicted vs True Age')
plt.show()

In [None]:
## Evaluation data

# Predict with the tuned pipeline on the evaluation features.
eval_predictions = best_model.predict(evaluation_data)

# Build the result on evaluation_data's own index (loaded from the CSV's first
# column) so every prediction is stored under the identifier of the row it was
# computed from, rather than under a fresh 0..n-1 position.
# NOTE(review): assumes that index is the submission Id -- confirm against the
# expected output format.
y_eval = pd.DataFrame({'Predicted': eval_predictions}, index=evaluation_data.index)
display(y_eval)

# Save the predictions to a CSV file; the index is written as the 'Id' column.
y_eval.to_csv('evaluation_predictions.csv', index=True, index_label='Id')