# **Advanced Machine Learning - Assignment 1**

Let's define the training dataset import function, fetching directly from the Github repo.

In [63]:
import pandas as pd

def merge_training_dataset(feature_set, label_set):
    merged = pd.merge(feature_set, label_set, on='id')
    return merged

def import_training_dataset():
  url_features = 'https://github.com/AlbezJelt/AML_Assignment1/raw/main/data/X_train.csv'
  url_labels = 'https://github.com/AlbezJelt/AML_Assignment1/raw/main/data/y_train.csv'
  feature_set = pd.read_csv(url_features).rename(columns={'Unnamed: 0':'id'})
  label_set = pd.read_csv(url_labels).rename(columns={'Unnamed: 0':'id'})
  train_set = merge_training_dataset(feature_set, label_set)
  train_set.drop('id', axis=1, inplace=True)
  return train_set

train_set = import_training_dataset()
test_set = train_set.sample(frac=0.2, random_state=42)
train_set = train_set.drop(test_set.index)


Now we can explore the pandas datasets:

In [11]:
import seaborn as sns

train_stats = train_set.describe()
train_stats.pop("latitude")
train_stats.pop("longitude")
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
minimum_nights,27107.0,7.166562,19.936023,1.0,1.0,3.0,5.0,999.0
number_of_reviews,27107.0,23.31051,44.342583,0.0,1.0,5.0,23.0,629.0
reviews_per_month,27107.0,1.093855,1.618335,0.0,0.04,0.37,1.59,58.5
calculated_host_listings_count,27107.0,7.251743,33.307034,1.0,1.0,1.0,2.0,327.0
availability_365,27107.0,112.831925,131.551035,0.0,0.0,45.0,226.0,365.0
Private_room,27107.0,0.455307,0.498008,0.0,0.0,0.0,1.0,1.0
Entire_home/apt,27107.0,0.521194,0.49956,0.0,0.0,1.0,1.0,1.0
price,27107.0,153.695724,253.309864,0.0,69.0,105.0,176.0,10000.0


In [None]:
sns.pairplot(train_set[["minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365"]], diag_kind="kde")

Now we begin with the model creation. First of all we sepair the labels (in this case the price column) from the train set. 

In [64]:
import numpy as np

def prepare_data(train_set, test_set):
    Y_train = train_set.copy().pop("price").to_numpy(dtype=np.float32)
    Y_test = test_set.copy().pop("price").to_numpy(dtype=np.float32)
    return train_set.copy().to_numpy(dtype=np.float32), Y_train, test_set.copy().to_numpy(dtype=np.float32), Y_test

X_train, Y_train, X_test, Y_test = prepare_data(train_set, test_set)

Then we proceed to normalize data. Remember to normalize the validation set too.

In [73]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

def preprocess_data(X : np.ndarray, scaler=None):
    if not scaler:
        scaler = RobustScaler()
    if X.ndim == 1:
        X = np.squeeze(scaler.fit_transform(X.reshape(-1, 1)))
    else:
        X = scaler.fit_transform(X)      
    return X

X_train = preprocess_data(X_train, StandardScaler())
Y_train = preprocess_data(Y_train, StandardScaler())

(27107, 10)


In [76]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

def build_model(train_dataset):
  model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1], )),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = keras.optimizers.RMSprop(0.001)

  model.compile(loss=root_mean_squared_error,
                optimizer=optimizer,
                metrics=[root_mean_squared_error, 'mse'])
  return model

model = build_model(X_train)
model.summary()


Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 64)                704       
_________________________________________________________________
dense_17 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 65        
Total params: 4,929
Trainable params: 4,929
Non-trainable params: 0
_________________________________________________________________


In [79]:
from sklearn.model_selection import KFold

# Define the K-fold Cross Validator
kfold = KFold(n_splits=10, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(X_train, Y_train):
    model.fit(X_train[train], Y_train[train], epochs=25, verbose=False)
    scores = model.evaluate(X_train[train], Y_train[train], verbose=False)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}')
    fold_no = fold_no + 1

Score for fold 1: loss of 0.01549906563013792; root_mean_squared_error of 0.015496984124183655
Score for fold 2: loss of 0.005724536720663309; root_mean_squared_error of 0.005723513197153807
Score for fold 3: loss of 0.009551148861646652; root_mean_squared_error of 0.009549764916300774
Score for fold 4: loss of 0.011755700223147869; root_mean_squared_error of 0.011749790981411934
Score for fold 5: loss of 0.005230107344686985; root_mean_squared_error of 0.005230754613876343
Score for fold 6: loss of 0.005272132810205221; root_mean_squared_error of 0.005271930247545242
Score for fold 7: loss of 0.00594754982739687; root_mean_squared_error of 0.005946459248661995
Score for fold 8: loss of 0.006926438305526972; root_mean_squared_error of 0.006927221082150936
Score for fold 9: loss of 0.005190389230847359; root_mean_squared_error of 0.005190315190702677
Score for fold 10: loss of 0.011581392958760262; root_mean_squared_error of 0.01158023253083229
