In [39]:
import pandas as pd
import os
import pickle
import numpy as np
import joblib

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
import pickle
import joblib
import os

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Load data

In [40]:
# Columns that will be one-hot encoded
hot_cols = ['room_type', 'city', 'period']

# Numeric columns earmarked for normalization
norm_cols = [
    'person_capacity',
    'bedrooms',
    'dist',
    'metro_dist',
    'attr_index',
    'GDP',
]

# Read the CSV and keep only the target plus the chosen feature columns,
# in exactly this order (the encoded column order downstream depends on it)
data = pd.read_csv('../data/clean_data.csv').loc[:, [
    'realSum',
    'room_type',
    'person_capacity',
    'biz',
    'bedrooms',
    'dist',
    'metro_dist',
    'city',
    'period',
    'attr_index',
    'GDP',
]]

# Split data

In [41]:
# Features: every selected column except the target
X = data.drop(columns=['realSum'])
# Target, kept as a single-column DataFrame (2-D) rather than a Series
y = data.loc[:, ['realSum']]



# Encoding and scaling

In [42]:
def one_hot_encode(df, categorical_cols):
    """Return a new DataFrame in which each listed column is replaced by dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    categorical_cols : list of str
        Names of the columns to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by one ``<column>_<value>`` indicator
        column per distinct value of every encoded column.
    """
    encoded = pd.get_dummies(data=df, columns=categorical_cols)
    return encoded

def preprocess_data(X, categorical_cols, numerical_cols, y, drop_cols=('period_Weekdays',)):
    """One-hot encode the features and build min-max-scaled versions of features and target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    categorical_cols : list of str
        Columns of ``X`` to one-hot encode.
    numerical_cols : list of str
        Currently unused: the feature scaler is fitted on *every* column of the
        encoded table, not only on these. Kept so existing calls keep working.
    y : pandas.DataFrame
        Target values. Expected to be 2-D (e.g. a single-column DataFrame),
        since ``MinMaxScaler`` rejects 1-D input.
    drop_cols : str or iterable of str, optional
        Encoded columns to remove after one-hot encoding. Defaults to
        ``('period_Weekdays',)`` (presumably dropped because it is redundant
        with the remaining ``period_*`` dummy -- TODO confirm). Pass ``()`` to
        keep every dummy column.

    Returns
    -------
    hot : tuple
        ``(X_hot, y)``: the encoded feature DataFrame (boolean columns cast to
        int) and the target exactly as passed in.
    hot_norm : tuple
        ``(X_hot_norm, y_hot_norm)``: the same data after min-max scaling, as
        returned by ``MinMaxScaler.fit_transform``.
    scaler_X, scaler_y : MinMaxScaler
        The fitted scalers, e.g. for ``inverse_transform`` on predictions.

    Raises
    ------
    KeyError
        If a name in ``drop_cols`` is not a column of the encoded table.
    """
    # get_dummies returns a new frame, so the caller's X is left untouched
    X_hot = one_hot_encode(X, categorical_cols)

    # A bare string would otherwise be split into single characters by list()
    if isinstance(drop_cols, str):
        drop_cols = [drop_cols]
    X_hot = X_hot.drop(columns=list(drop_cols))

    # Cast boolean dummy columns to integers (0 and 1)
    bool_cols = [col for col in X_hot.columns if X_hot[col].dtype == bool]
    if bool_cols:
        X_hot = X_hot.astype({col: int for col in bool_cols})

    hot = (X_hot, y)

    # NOTE(review): both scalers are fitted on all rows passed in. If the
    # train/test split happens after this call, held-out rows contribute to
    # the min/max used for scaling.
    scaler_X = MinMaxScaler()
    X_hot_norm = scaler_X.fit_transform(X_hot)

    scaler_y = MinMaxScaler()
    y_hot_norm = scaler_y.fit_transform(y)

    hot_norm = (X_hot_norm, y_hot_norm)

    return hot, hot_norm, scaler_X, scaler_y




In [43]:


# Run the encoding + scaling pipeline on the features and the target
hot, hot_norm, scaler_X, scaler_y = preprocess_data(
    X=X,
    categorical_cols=hot_cols,
    numerical_cols=norm_cols,
    y=y,
)



In [44]:
# Inspect the min-max-scaled feature matrix (first element of the hot_norm tuple)
hot_norm[0]

array([[0.5, 1. , 0.1, ..., 0. , 0. , 0. ],
       [0. , 0. , 0.1, ..., 0. , 0. , 0. ],
       [0.5, 1. , 0.1, ..., 0. , 0. , 0. ],
       ...,
       [1. , 0. , 0.3, ..., 1. , 0. , 1. ],
       [0.5, 0. , 0.1, ..., 1. , 0. , 1. ],
       [1. , 1. , 0.2, ..., 1. , 0. , 1. ]])

In [45]:
# Hold out 15% of all samples for testing
X_trainval, X_test, y_trainval, y_test = train_test_split(hot_norm[0],
                                                          hot_norm[1],
                                                          test_size=0.15,
                                                          random_state=42)

# Carve the validation set out of the remaining samples. The fraction is chosen
# so that the validation set is (about) as large as the test set, i.e. roughly
# 15% of the full dataset rather than 15% of the remainder.
X_train, X_val, y_train, y_val = train_test_split(X_trainval,
                                                  y_trainval,
                                                  test_size=X_test.shape[0] / X_trainval.shape[0],
                                                  random_state=42)

# Sanity check: the three splits partition the full dataset
assert X_train.shape[0] + X_val.shape[0] + X_test.shape[0] == hot_norm[0].shape[0]

In [46]:
# Per-split results, with predictions and targets back in the original scale
original = []

# Baseline model: k-nearest-neighbours regressor with k = 5
knn = KNeighborsRegressor(n_neighbors=5)
# Fit on the training split only
knn.fit(X_train, y_train)

# Evaluate on every split. The loop variables are deliberately not called
# `x` / `y`: the notebook-level `y` holds the target DataFrame, and rebinding
# it here would make a later re-run of the preprocessing cell use the wrong target.
for X_split, y_split, nsplit in zip([X_train, X_val, X_test],
                                    [y_train, y_val, y_test],
                                    ['train', 'val', 'test']):
    # Predictions in the normalized target space
    preds = knn.predict(X_split)
    # Map predictions and true values back to the original target scale
    preds_original = scaler_y.inverse_transform(preds.reshape(-1, 1)).flatten()
    y_original = scaler_y.inverse_transform(y_split.reshape(-1, 1)).flatten()
    # Mean absolute error in original units
    mae = mean_absolute_error(y_original, preds_original)
    original.append({
        'model': 'KNN-5',
        'split': nsplit,
        # np.round works whether the metric returns a NumPy scalar or a plain
        # Python float (which has no .round method)
        'mae': np.round(mae, 4),
        'preds': preds_original,
        'y': y_original
    })

In [47]:
# Models and splits to report, in display order
models_of_interest = ['KNN-5']
splits_of_interest = ['train', 'val', 'test']

# Table header
print(f"{'Model':<10} | {'Split':<5} | {'MAE':>5}")

# One row per (model, split) pair; models vary slowest, splits fastest
for model, split in itertools.product(models_of_interest, splits_of_interest):
    # First recorded result for this pair, or None when nothing was recorded
    matches = [item for item in original if item['model'] == model and item['split'] == split]
    entry = matches[0] if matches else None
    if entry is None:
        # No result stored for this combination
        print(f"{model:<10} | {split:<5} | {'N/A':>5}")
    else:
        print(f"{model:<10} | {split:<5} | {entry['mae']:>5.4f}")


Model      | Split |   MAE
KNN-5      | train | 55.3467
KNN-5      | val   | 69.3309
KNN-5      | test  | 69.8080
