1. Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import make_pipeline
from sklearn.kernel_approximation import RBFSampler

import torch
from torch import nn
from torch.optim import Adam

In [3]:
df = pd.read_csv('FLIR_groups1and2_train.csv', skiprows=2)
df = df.dropna(axis=1, how='all')
y_train = df.loc[:, 'aveOralM']

2. Data Preprocessing

In [4]:
# Taking out common features out which are independent of rounds for each subject
columns = ['Gender', 'Age', 'Ethnicity', 'T_atm', 'Humidity', 'Distance']
common_features = df[columns]

In [5]:
common_features.loc[:, 'Age'] = [math.ceil((int(x.split('-')[1]) + int(x.split('-')[0]))/2) if '-' in x else int(x.strip('>')) for x in common_features['Age']]
common_features

Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Distance
0,Male,46,White,24.0,28.0,0.80
1,Female,36,Black or African-American,24.0,26.0,0.80
2,Female,26,White,24.0,26.0,0.80
3,Female,26,Black or African-American,24.0,27.0,0.80
4,Male,19,White,24.0,27.0,0.80
...,...,...,...,...,...,...
705,Female,19,White,24.4,13.5,0.60
706,Female,23,Asian,24.4,14.7,0.63
707,Male,23,Multiracial,22.0,30.0,0.60
708,Male,19,White,22.0,30.0,0.60


In [6]:
print(min(common_features['Age']))

19


In [7]:
print(max(common_features['Age']))

60


In [8]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categorical columns
encoded_data = encoder.fit_transform(common_features[['Gender', 'Ethnicity']])

# Create a DataFrame from the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Gender', 'Ethnicity']))

In [9]:
common_features = common_features.drop(columns=['Gender', 'Ethnicity'])
common_features = pd.concat([common_features, encoded_df], axis=1)

common_features 

Unnamed: 0,Age,T_atm,Humidity,Distance,Gender_Female,Gender_Male,Ethnicity_American Indian or Alaskan Native,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White
0,46,24.0,28.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,36,24.0,26.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,26,24.0,26.0,0.80,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,26,24.0,27.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,19,24.0,27.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
705,19,24.4,13.5,0.60,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
706,23,24.4,14.7,0.63,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
707,23,22.0,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
708,19,22.0,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Column names
columns_set = {
    
    'T_offset_' : [1,2,3,4],
    'Max1R13_': [1,2,3,4],
    'Max1L13_': [1,2,3,4],
    'aveAllR13_':  [1,2,3,4],
    'aveAllL13_': [1,2,3,4],
    'T_RC_' : [1,2,3,4],
    'T_RC_Dry_': [1,2,3,4],
    'T_RC_Wet_': [1,2,3,4],
    'T_RC_Max_': [1,2,3,4],
    'T_LC_': [1,2,3,4],
    'T_LC_Dry_': [1,2,3,4],
    'T_LC_Wet_': [1,2,3,4],
    'T_LC_Max_': [1,2,3,4],
    'RCC_': [1,2,3,4],
    'LCC_': [1,2,3,4],
    'canthiMax_': [1,2,3,4],
    'canthi4Max_': [1,2,3,4],
    'T_FHCC_': [1,2,3,4],
    'T_FHRC_': [1,2,3,4],
    'T_FHLC_': [1,2,3,4],
    'T_FHBC_': [1,2,3,4],
    'T_FHTC_': [1,2,3,4],
    'T_FH_Max_': [1,2,3,4],
    'T_FHC_Max_': [1,2,3,4],
    'T_Max_': [1,2,3,4],
    'T_OR_': [1,2,3,4],
    'T_OR_Max_': [1,2,3,4]
}

In [11]:
# Rows with NaN values filled with means for thermal data
df_filled = pd.DataFrame()
column_names = []
for header, rounds in columns_set.items():
    for roundd in rounds:
        column_names.append(f'{header}{roundd}')
    
thermal_info = df[column_names]
thermal_info = thermal_info.fillna(thermal_info.mean())

In [12]:
#Column wise mean of 4 rounds
new_mean_dataframe = pd.DataFrame()

for header, rounds in columns_set.items():
    column_names = [f'{header}{roundd}' for roundd in rounds]
    new_mean_dataframe[f'{header}mean'] = thermal_info[column_names].mean(axis=1)
    


Final Dataset

In [13]:
X_train = pd.concat([new_mean_dataframe, common_features], axis=1)
X_train

Unnamed: 0,T_offset_mean,Max1R13_mean,Max1L13_mean,aveAllR13_mean,aveAllL13_mean,T_RC_mean,T_RC_Dry_mean,T_RC_Wet_mean,T_RC_Max_mean,T_LC_mean,...,Humidity,Distance,Gender_Female,Gender_Male,Ethnicity_American Indian or Alaskan Native,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White
0,0.7025,35.0300,35.3775,34.4000,34.9175,34.9850,34.9850,34.7625,35.0325,35.3375,...,28.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.7800,34.5500,34.5200,33.9300,34.2250,34.7100,34.6325,34.6400,34.7425,34.5600,...,26.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.8625,35.6525,35.5175,34.2775,34.8000,35.6850,35.6675,35.6150,35.7175,35.5025,...,26.0,0.80,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.9300,35.2225,35.6125,34.3850,35.2475,35.2075,35.2000,35.1175,35.2250,35.5950,...,27.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.8950,35.5450,35.6650,34.9100,35.3675,35.6025,35.4750,35.5700,35.6400,35.6400,...,27.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,0.9325,35.4800,35.5300,34.9000,34.9900,35.5650,35.5650,35.1350,35.6300,35.5325,...,13.5,0.60,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
706,0.8550,35.6550,35.5325,35.1925,35.2075,35.6125,35.6000,35.4850,35.6550,35.5275,...,14.7,0.63,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
707,0.9700,36.7325,36.4600,36.2225,36.1150,36.7175,36.7150,36.6400,36.7350,36.4350,...,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
708,1.0725,36.9450,37.0675,36.3825,36.4825,36.9250,36.9200,36.8200,36.9475,37.0500,...,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


3. Data Normalization

In [14]:
scaler = MinMaxScaler()
X_train_original = scaler.fit_transform(X_train)

In [15]:
y_train_original = y_train.iloc[:].values.astype(float)

4. Model Creation and training

In [16]:
class MeanTrivialSystem:
    def __init__(self):
        self_output_value = None
        
    def fit(self, X, y):
        self.output_value = sum(y) / len(y)
            
    def predict(self, X):
        return [self.output_value] * len(X)       
        

In [None]:
# 1NN

class NearestNeighbor1(object):
    def __init__(self, p_val: int = 2) -> None:
        self.knn_reg = KNeighborsRegressor(
            n_neighbors=1,
            weights='uniform',
            algorithm='auto',
            leaf_size=30,
            p=p_val,
            metric='minkowski',
            n_jobs=None
        )

        self.cv_scores = None

    def fit(self, X, y) -> None:
        self.knn_reg.fit(X, y)

    def predict(self, X):
        return self.knn_reg.predict(X)

    def score(self, X, y):
        return self.knn_reg.score(X, y)
    
    def cross_validate(self, X, y, n_splits=5, shuffle=True, random_state=42):
        cross_val = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
        self.cv_scores = cross_val_score(self.knn_reg, X, y, cv=cross_val)
        return self.cv_scores

In [None]:
# 1NN 20-fold cross validation (on normalized dataset)

num_splits = 20
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

mae_trains = []
mse_trains = []
rmse_trains = []

mae_vals = []
mse_vals = []
rmse_vals = []

for train_index, val_index in kf.split(X_train_original):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    NN = NearestNeighbor1()
    NN.fit(X_train_fold, y_train_fold)
    
    train_pred = NN.predict(X_train_fold)
    val_pred = NN.predict(X_val_fold)

    # training metrics
    mae_train = mean_absolute_error(y_train_fold, train_pred)
    mae_trains.append(mae_train)
    mse_train = mean_squared_error(y_train_fold, train_pred, squared=True)
    mse_trains.append(mse_train)
    rmse_train = mean_squared_error(y_train_fold, train_pred, squared=False)
    rmse_trains.append(rmse_train)
    
    # validation metrics
    mae_val = mean_absolute_error(y_val_fold, val_pred)
    mae_vals.append(mae_val)
    mse_val = mean_squared_error(y_val_fold, val_pred, squared=True)
    mse_vals.append(mse_val)
    rmse_val = mean_squared_error(y_val_fold, val_pred, squared=False)
    rmse_vals.append(rmse_val)
    
    print(f'validation rmse: {rmse_val}')

mse_train_mean = sum(mse_trains)/len(mse_trains)
rmse_train_mean = sum(rmse_trains)/len(rmse_trains)
mae_train_mean = sum(mae_trains)/len(mae_trains) 

mse_val_mean = sum(mse_vals)/len(mse_vals)
rmse_val_mean = sum(rmse_vals)/len(rmse_vals)
mae_val_mean = sum(mae_vals)/len(mae_vals)

print(f'mae_trains: {mae_trains}')
print(f'mean training mae: {mae_train_mean}')
print(f'mse_trains: {mse_trains}')
print(f'mean training mse: {mse_train_mean}')
print(f'rmse_trains: {rmse_trains}')
print(f'mean training rmse: {rmse_train_mean}')

print(f'mae_vals: {mae_vals}')
print(f'mean validation mae: {mae_val_mean}')
print(f'mse_vals: {mse_vals}')
print(f'mean validation mse: {mse_val_mean}')
print(f'rmse_vals: {rmse_vals}')
print(f'mean validation rmse: {rmse_val_mean}')

In [17]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
# Trivial System

train_rmse = []
train_mse = []
train_mae = []

val_rmse = []
val_mse = []
val_mae = []

for train_index, val_index in kf.split(X_train_original):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]
    

    trivial = MeanTrivialSystem()
    trivial.fit(X_train_fold, y_train_fold)

    # Make predictions on the training set
    train_predictions = trivial.predict(X_train_fold)
    
    mse_train = mean_squared_error(y_train_fold, train_predictions)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(y_train_fold, train_predictions)
    
    train_mse.append(mse_train)
    train_rmse.append(rmse_train)
    train_mae.append(mae_train)
    
    val_predictions = trivial.predict(X_val_fold)

    mse_val = mean_squared_error(y_val_fold, val_predictions)
    rmse_val = np.sqrt(mse_val)
    mae_val = mean_absolute_error(y_val_fold, val_predictions)
    
    
    val_mse.append(mse_val)
    val_rmse.append(rmse_val)
    val_mae.append(mae_val)

print(np.mean(train_mse))       
print(np.mean(train_rmse))   
print(np.mean(train_mae))   

print(np.mean(val_mse))       
print(np.mean(val_rmse))   
print(np.mean(val_mae))    

0.2376469760464194
0.48723527683216333
0.31586001041461065
0.23845573174965323
0.4839243646729258
0.3160925411624781


In [19]:
# Linear Regression

train_rmse = []
train_mse = []
train_mae = []

val_rmse = []
val_mse = []
val_mae = []

for train_index, val_index in kf.split(X_train_original):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]
    

    linear_reg = LinearRegression()
    linear_reg.fit(X_train_fold, y_train_fold)

    train_predictions = linear_reg.predict(X_train_fold)
    
    mse_train = mean_squared_error(y_train_fold, train_predictions)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(y_train_fold, train_predictions)
    
    train_mse.append(mse_train)
    train_rmse.append(rmse_train)
    train_mae.append(mae_train)
    
    val_predictions = linear_reg.predict(X_val_fold)

    mse_val = mean_squared_error(y_val_fold, val_predictions)
    rmse_val = np.sqrt(mse_val)
    mae_val = mean_absolute_error(y_val_fold, val_predictions)
    
    
    val_mse.append(mse_val)
    val_rmse.append(rmse_val)
    val_mae.append(mae_val)

print(np.mean(train_mse))       
print(np.mean(train_rmse))   
print(np.mean(train_mae))   

print(np.mean(val_mse))       
print(np.mean(val_rmse))   
print(np.mean(val_mae))         

0.057964544703018775
0.24072384941072983
0.18853265961815263
0.06709574189571246
0.2584805662355625
0.20122475247383892


In [20]:
# Support Vector Regressor

def supportvectorreg(X_train_original, y_train_original, kernel='linear', C=10, gamma='auto'):
    
    train_rmse = []
    train_mse = []
    train_mae = []

    val_rmse = []
    val_mse = []
    val_mae = []
    for train_index, val_index in kf.split(X_train_original):

        X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
        y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]
        
        if kernel == 'linear':
            clf = svm.SVR(kernel=kernel, C=C)
        elif kernel == 'rbf':
            clf = svm.SVR(kernel=kernel, C=C, gamma=gamma)
        else:
            raise ValueError("Invalid kernel type.")

        clf.fit(X_train_fold, y_train_fold)

        # Make predictions on the training set
        train_predictions = clf.predict(X_train_fold)
        
        mse_train = mean_squared_error(y_train_fold, train_predictions)
        rmse_train = np.sqrt(mse_train)
        mae_train = mean_absolute_error(y_train_fold, train_predictions)
        
        train_mse.append(mse_train)
        train_rmse.append(rmse_train)
        train_mae.append(mae_train)
        
        val_predictions = clf.predict(X_val_fold)

        mse_val = mean_squared_error(y_val_fold, val_predictions)
        rmse_val = np.sqrt(mse_val)
        mae_val = mean_absolute_error(y_val_fold, val_predictions)
        
        
        val_mse.append(mse_val)
        val_rmse.append(rmse_val)
        val_mae.append(mae_val)
        
    # Get the number of support vectors
    n_support_vectors = np.sum(clf.n_support_)
    degrees_of_freedom = len(X_train_original) - n_support_vectors
    
    print("Number of support vectors:", n_support_vectors)
    print("Degrees of freedom:", degrees_of_freedom)
    print(np.mean(train_mse))       
    print(np.mean(train_rmse))   
    print(np.mean(train_mae))   

    print(np.mean(val_mse))       
    print(np.mean(val_rmse))   
    print(np.mean(val_mae))  

supportvectorreg(X_train_original, y_train_original, kernel='rbf')   

0.012565801378451919
0.11202671500425414
0.09341474212901964
0.09405117305786188
0.3036622932239355
0.22595910142900894


In [21]:
#Polynomial Regression

def polynomial_regression(X_train,y_train, orders):
  
    for order in range(1,orders+1):
      
        train_rmse = []
        train_mse = []
        train_mae = []

        val_rmse = []
        val_mse = []
        val_mae = []
        
        polynomial = PolynomialFeatures(degree=order, include_bias=True)
        X_train_polynomial = polynomial.fit_transform(X_train)
        print(f'Degree of freedom for order: {order} = {X_train_polynomial.shape[1]}')
        
        for train_index, val_index in kf.split(X_train):

            X_train_fold, X_val_fold = X_train_polynomial[train_index], X_train_polynomial[val_index]
            y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]

            
            model = LinearRegression()
            model.fit(X_train_fold, y_train_fold)

            # Make predictions on the training set
            train_predictions = model.predict(X_train_fold)
            
            mse_train = mean_squared_error(y_train_fold, train_predictions)
            rmse_train = np.sqrt(mse_train)
            mae_train = mean_absolute_error(y_train_fold, train_predictions)
            
            train_mse.append(mse_train)
            train_rmse.append(rmse_train)
            train_mae.append(mae_train)
            
            val_predictions = model.predict(X_val_fold)

            mse_val = mean_squared_error(y_val_fold, val_predictions)
            rmse_val = np.sqrt(mse_val)
            mae_val = mean_absolute_error(y_val_fold, val_predictions)
        
        
            val_mse.append(mse_val)
            val_rmse.append(rmse_val)
            val_mae.append(mae_val)

        print(np.mean(train_mse))       
        print(np.mean(train_rmse))   
        print(np.mean(train_mae))   

        print(np.mean(val_mse))       
        print(np.mean(val_rmse))   
        print(np.mean(val_mae))  
        

polynomial_regression(X_train_original, y_train_original, 4)

          
    

0.07609151434928177
0.27271755420694455
0.21473075245227982
0.10485347510888317
0.31469365673701716
0.23388680049074115
1.0130947286372543e-26
9.914282947283874e-14
6.50797100091258e-14
1000.8246761809976
16.24518176999448
2.662029392032099
3.62760464945126e-28
1.7736302637693773e-14
1.0753213655975036e-14
1.142467820891807
0.9791730062987117
0.5167789256304653
2.2294266921801808e-26
1.3061698507419393e-13
6.294057606872128e-14
6.76959292841119
2.347155976161025
0.7717735897141195


In [23]:
# Random Forest

def random_forest(X_train,y_train):
  
      
        train_rmse = []
        train_mse = []
        train_mae = []

        val_rmse = []
        val_mse = []
        val_mae = []
        
        
        
        
        for train_index, val_index in kf.split(X_train):
            
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_fold, y_train_fold)

            train_predictions = model.predict(X_train_fold)
            
            mse_train = mean_squared_error(y_train_fold, train_predictions)
            rmse_train = np.sqrt(mse_train)
            mae_train = mean_absolute_error(y_train_fold, train_predictions)
            
            train_mse.append(mse_train)
            train_rmse.append(rmse_train)
            train_mae.append(mae_train)
            
            val_predictions = model.predict(X_val_fold)

            mse_val = mean_squared_error(y_val_fold, val_predictions)
            rmse_val = np.sqrt(mse_val)
            mae_val = mean_absolute_error(y_val_fold, val_predictions)
        
        
            val_mse.append(mse_val)
            val_rmse.append(rmse_val)
            val_mae.append(mae_val)

        print(np.mean(train_mse))       
        print(np.mean(train_rmse))   
        print(np.mean(train_mae))   

        print(np.mean(val_mse))       
        print(np.mean(val_rmse))   
        print(np.mean(val_mae)) 
        

random_forest(X_train_original, y_train_original)

          
    

0.00950937966549311
0.09747098607868424
0.0739843309859161
0.06733510809859153
0.25880492268764443
0.19771338028169


RBF Neural Network (RBF Sampler)

In [None]:
rbf_feature_linearreg = RBFSampler(gamma=1, random_state=1)
regressor_linearreg = LinearRegression()

rbf_model_linearreg_sampler = make_pipeline(rbf_feature_linearreg, regressor_linearreg)

In [None]:
rbf_feature_svc = RBFSampler(gamma=1, random_state=1)
regressor_svc = svm.SVR(kernel='linear', C=1.0, gamma='auto')

rbf_model_svc_rbf_sampler = make_pipeline(rbf_feature_svc, regressor_svc)

In [None]:
# RBF Neural Network 20-fold cross validation
# --> linear reg with RBF Sampler (normalized)

num_splits = 20
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
mae_trains = []
mae_vals = []
mse_trains = []
mse_vals = []
rmse_trains = []
rmse_vals = []

for train_index, val_index in kf.split(X_train):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
     
    rbf_model_linearreg_sampler.fit(X_train_fold, y_train_fold)

    # training set predictions
    train_pred = rbf_model_linearreg_sampler.predict(X_train_fold)
    
    mae_train = mean_absolute_error(y_train_fold, train_pred)
    mae_trains.append(mae_train)
    mse_train = mean_squared_error(y_train_fold, train_pred, squared=True)
    mse_trains.append(mse_train)
    rmse_train = mean_squared_error(y_train_fold, train_pred, squared=False)
    rmse_trains.append(rmse_train)
    # mse_train = mean_squared_error(y_train_fold, train_predictions)
    
    print(f'training rmse: {rmse_train}')
    # print(f'training mse: {mse_train}')
    
    # predictions on the validation set
    val_pred = rbf_model_linearreg_sampler.predict(X_val_fold)

    mae_val = mean_absolute_error(y_val_fold, val_pred)
    mae_vals.append(mae_val)
    mse_val = mean_squared_error(y_val_fold, val_pred, squared=True)
    mse_vals.append(mse_val)
    rmse_val = mean_squared_error(y_val_fold, val_pred, squared=False)
    rmse_vals.append(rmse_val)
    # mse_val = mean_squared_error(y_val_fold, val_pred)
    
    print(f'validation rmse: {rmse_val}')
    # print(f'validation mse: {mse_val}')

mae_train_mean = sum(mae_trains)/len(mae_trains)
mse_train_mean = sum(mse_trains)/len(mse_trains)
rmse_train_mean = sum(rmse_trains)/len(rmse_trains)

mae_val_mean = sum(mae_vals)/len(mae_vals)
mse_val_mean = sum(mse_vals)/len(mse_vals)
rmse_val_mean = sum(rmse_vals)/len(rmse_vals)

print(f'mean training mae: {mae_train_mean}')
print(f'mean training mse: {mse_train_mean}')
print(f'mean training rmse: {rmse_train_mean}')
print(f'mean validation mae: {mae_val_mean}')
print(f'mean validation mse: {mse_val_mean}')
print(f'mean validation rmse: {rmse_val_mean}')

In [None]:
# RBF Neural Network 20-fold cross validation
# --> svc with RBF Sampler (normalized)

num_splits = 20
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)
mae_trains = []
mae_vals = []
mse_trains = []
mse_vals = []
rmse_trains = []
rmse_vals = []

for train_index, val_index in kf.split(X_train):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
     
    rbf_model_svc_rbf_sampler.fit(X_train_fold, y_train_fold)

    # training set predictions
    train_pred = rbf_model_svc_rbf_sampler.predict(X_train_fold)
    
    mae_train = mean_absolute_error(y_train_fold, train_pred)
    mae_trains.append(mae_train)
    mse_train = mean_squared_error(y_train_fold, train_pred, squared=True)
    mse_trains.append(mse_train)
    rmse_train = mean_squared_error(y_train_fold, train_pred, squared=False)
    rmse_trains.append(rmse_train)
    # mse_train = mean_squared_error(y_train_fold, train_predictions)
    
    print(f'training rmse: {rmse_train}')
    # print(f'training mse: {mse_train}')
    
    # predictions on the validation set
    val_pred = rbf_model_svc_rbf_sampler.predict(X_val_fold)

    mae_val = mean_absolute_error(y_val_fold, val_pred)
    mae_vals.append(mae_val)
    mse_val = mean_squared_error(y_val_fold, val_pred, squared=True)
    mse_vals.append(mse_val)
    rmse_val = mean_squared_error(y_val_fold, val_pred, squared=False)
    rmse_vals.append(rmse_val)
    # mse_val = mean_squared_error(y_val_fold, val_pred)
    
    print(f'validation rmse: {rmse_val}')
    # print(f'validation mse: {mse_val}')

mae_train_mean = sum(mae_trains)/len(mae_trains)
mse_train_mean = sum(mse_trains)/len(mse_trains)
rmse_train_mean = sum(rmse_trains)/len(rmse_trains)

mae_val_mean = sum(mae_vals)/len(mae_vals)
mse_val_mean = sum(mse_vals)/len(mse_vals)
rmse_val_mean = sum(rmse_vals)/len(rmse_vals)

print(f'mean training mae: {mae_train_mean}')
print(f'mean training mse: {mse_train_mean}')
print(f'mean training rmse: {rmse_train_mean}')
print(f'mean validation mae: {mae_val_mean}')
print(f'mean validation mse: {mse_val_mean}')
print(f'mean validation rmse: {rmse_val_mean}')

RBF Network (K Means Clustering for RBF Centers)

In [None]:
n_centers = 10 
kmeans = KMeans(n_clusters=n_centers, random_state=0)
kmeans.fit(X_train_original) # change for normalized part
centers = kmeans.cluster_centers_
gamma = 1.0  

In [None]:
X_rbf = rbf_kernel(X_train, centers, gamma=gamma)
print(X_rbf)

X_rbf = rbf_kernel(X_train_original, centers, gamma=gamma)
print(X_rbf)

In [None]:
model = LinearRegression()
kf = KFold(n_splits=20, shuffle=True, random_state=42)

In [None]:
# K-Fold Cross Validation (unnormalized)
mae_trains = []
mse_trains = []
rmse_trains = []

mae_vals = []
mse_vals = []
rmse_vals = []

for train_index, val_index in kf.split(X_rbf):
    X_train_fold, X_val_fold = X_rbf[train_index], X_rbf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    model.fit(X_train_fold, y_train_fold)
    train_pred = model.predict(X_train_fold)
    val_pred = model.predict(X_val_fold)

    mae_train = mean_absolute_error(y_train_fold, train_pred)
    mae_trains.append(mae_train)
    mse_train = mean_squared_error(y_train_fold, train_pred, squared=True)
    mse_trains.append(mse_train)
    rmse_train = mean_squared_error(y_train_fold, train_pred, squared=False)
    rmse_trains.append(rmse_train)

    mae_val = mean_absolute_error(y_val_fold, val_pred)
    mae_vals.append(mae_val)
    mse_val = mean_squared_error(y_val_fold, val_pred, squared=True)
    mse_vals.append(mse_val)
    rmse_val = mean_squared_error(y_val_fold, val_pred, squared=False)
    rmse_vals.append(rmse_val)

    print(f'Training RMSE: {rmse_train}, Validation RMSE: {rmse_val}')
    
mean_mae_train = sum(mae_trains) / len(mae_trains)
mean_mse_train = sum(mse_trains) / len(mse_trains)
mean_rmse_train = sum(rmse_trains) / len(rmse_trains)
mean_mae_val = sum(mae_vals) / len(mae_vals)
mean_mse_val = sum(mse_vals) / len(mse_vals)
mean_rmse_val = sum(rmse_vals) / len(rmse_vals)
    
# average RMSE
print(f'Average Training MAE: {mean_mae_train}')
print(f'Average Training MSE: {mean_mse_train}')
print(f'Average Training RMSE: {mean_rmse_train}')
print(f'Average Validation MAE: {mean_mae_val}')
print(f'Average Validation MSE: {mean_mse_val}')
print(f'Average Validation RMSE: {mean_rmse_val}')

In [None]:
# K-Fold Cross Validation (normalized)
mae_trains = []
mse_trains = []
rmse_trains = []

mae_vals = []
mse_vals = []
rmse_vals = []

for train_index, val_index in kf.split(X_rbf):
    X_train_fold, X_val_fold = X_rbf[train_index], X_rbf[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

    model.fit(X_train_fold, y_train_fold)
    train_pred = model.predict(X_train_fold)
    val_pred = model.predict(X_val_fold)

    mae_train = mean_absolute_error(y_train_fold, train_pred)
    mae_trains.append(mae_train)
    mse_train = mean_squared_error(y_train_fold, train_pred, squared=True)
    mse_trains.append(mse_train)
    rmse_train = mean_squared_error(y_train_fold, train_pred, squared=False)
    rmse_trains.append(rmse_train)

    mae_val = mean_absolute_error(y_val_fold, val_pred)
    mae_vals.append(mae_val)
    mse_val = mean_squared_error(y_val_fold, val_pred, squared=True)
    mse_vals.append(mse_val)
    rmse_val = mean_squared_error(y_val_fold, val_pred, squared=False)
    rmse_vals.append(rmse_val)

    print(f'Training RMSE: {rmse_train}, Validation RMSE: {rmse_val}')
    
mean_mae_train = sum(mae_trains) / len(mae_trains)
mean_mse_train = sum(mse_trains) / len(mse_trains)
mean_rmse_train = sum(rmse_trains) / len(rmse_trains)
mean_mae_val = sum(mae_vals) / len(mae_vals)
mean_mse_val = sum(mse_vals) / len(mse_vals)
mean_rmse_val = sum(rmse_vals) / len(rmse_vals)
    
# average RMSE
print(f'Average Training MAE: {mean_mae_train}')
print(f'Average Training MSE: {mean_mse_train}')
print(f'Average Training RMSE: {mean_rmse_train}')
print(f'Average Validation MAE: {mean_mae_val}')
print(f'Average Validation MSE: {mean_mse_val}')
print(f'Average Validation RMSE: {mean_rmse_val}')

RBF Network (ANN Implementation)

In [None]:
# for normalized dataset
X_train_np = X_train_original 
y_train_np = y_train.values

In [None]:
X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32)

In [None]:
class RBFNetwork(nn.Module):
    def __init__(self, centers, num_outputs):
        super(RBFNetwork, self).__init__()
        self.num_centers = centers.shape[0]
        self.centers = nn.Parameter(torch.tensor(centers, dtype=torch.float32), requires_grad=True)
        self.spreads = nn.Parameter(torch.ones(self.num_centers), requires_grad=True)
        self.linear = nn.Linear(self.num_centers, num_outputs)

    def forward(self, x):
        x = x.unsqueeze(1) - self.centers.unsqueeze(0)
        rbf_activations = torch.exp(-self.spreads.unsqueeze(0).unsqueeze(0) * (x ** 2).sum(2))
        out = self.linear(rbf_activations)
        return out.squeeze()

In [None]:
## 10 fold RBF Network ANN Implementation

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_train)
centers = kmeans.cluster_centers_

num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# RBF network parameters
num_outputs = 1
centers_initial = kmeans.cluster_centers_ 

# 5 fold mean rmse
fold_rmse_val = []
fold_rmse_train = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_tensor)):
    print(f'Fold {fold + 1}/{num_folds}')
    
    X_train, y_train = X_train_tensor[train_idx], y_train_tensor[train_idx]
    X_val, y_val = X_train_tensor[val_idx], y_train_tensor[val_idx]

    model = RBFNetwork(centers=centers_initial, num_outputs=num_outputs)
    criterion = nn.MSELoss()
    optimizer = Adam(model.parameters(), lr=0.001)

    # training loop
    num_epochs = 10000 
    for epoch in range(num_epochs):
        # model.train()
        optimizer.zero_grad()
        outputs = model.forward(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        # print loss every 10 epochs
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

    model.eval()  #  evaluation mode 
    with torch.no_grad():
        train_outputs_post = model(X_train)
        train_loss_post = criterion(train_outputs_post, y_train)

    # validation
    model.eval()  
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
    
    print(f'MSE Validation Loss for Fold {fold + 1}: {val_loss.item()}')
    print(f'RMSE Validation Loss for Fold {fold + 1}: {math.sqrt(val_loss.item())}')
    fold_rmse_val.append(math.sqrt(val_loss.item()))
    fold_rmse_train.append(math.sqrt(train_loss_post.item()))

    # reset the centers (re-initialized each fold)
    model.centers = nn.Parameter(torch.tensor(kmeans.cluster_centers_, dtype=torch.float32), requires_grad=True)

In [None]:
fold_rmse_mean_val = sum(fold_rmse_val) / len(fold_rmse_val)
fold_rmse_mean_train = sum(fold_rmse_train) / len(fold_rmse_train)
print(f'10 fold mean rmse val: {fold_rmse_mean_val}')
print(f'10 fold mean rmse train: {fold_rmse_mean_train}')

On an earlier iteration: 
Best Results:-
- lr = 0.001, epochs = 10000
- [0.25580558502633516, 0.4197435754564935, 0.4785440864662346, 0.3611876343480811, 0.44053788981142666, 0.25008861640104796, 0.5463839097074379, 0.426668659735783, 0.19947390018180974, 0.2637490336233216]
- 10 fold mean rmse: 0.3642182890757971

5. Testing

In [50]:
df = pd.read_csv('FLIR_groups1and2_test.csv', skiprows=2)
df = df.dropna(axis=1, how='all')
X_test = df.iloc[:, :-1]
y_test = df.loc[:, 'aveOralM']

In [51]:
# Taking out common features out which are independent of rounds for each subject
columns = ['Gender', 'Age', 'Ethnicity', 'T_atm', 'Humidity', 'Distance']
common_features = df[columns]

In [52]:
common_features.loc[:, 'Age'] = [math.ceil((int(x.split('-')[1]) + int(x.split('-')[0]))/2) if '-' in x else int(x.strip('>')) for x in common_features['Age']]
common_features

Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Distance
0,Female,19,Asian,22.0,30.0,0.60
1,Female,19,White,24.1,15.6,0.62
2,Male,19,Asian,24.1,15.6,0.62
3,Male,23,Asian,24.1,15.6,0.66
4,Male,46,White,24.1,18.0,0.60
...,...,...,...,...,...,...
305,Female,23,Asian,25.7,50.8,0.60
306,Female,23,White,25.7,50.8,0.60
307,Female,19,Black or African-American,28.0,24.3,0.60
308,Male,28,Hispanic/Latino,25.0,39.8,0.60


In [53]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categorical columns
encoded_data = encoder.fit_transform(common_features[['Gender', 'Ethnicity']])

# Create a DataFrame from the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Gender', 'Ethnicity']))

In [54]:
common_features = common_features.drop(columns=['Gender', 'Ethnicity'])
common_features = pd.concat([common_features, encoded_df], axis=1)

remaning_columns = ['T_atm', 'Humidity', 'Distance'] 
remaning_features = X_test[remaning_columns].fillna(X_test[remaning_columns].mean())
common_features = common_features.drop(columns=remaning_columns)
common_features = pd.concat([common_features, remaning_features], axis=1)

common_features['Ethnicity_American Indian or Alaskan Native'] = 0.0
common_features

Unnamed: 0,Age,Gender_Female,Gender_Male,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White,T_atm,Humidity,Distance,Ethnicity_American Indian or Alaskan Native
0,19,1.0,0.0,1.0,0.0,0.0,0.0,0.0,22.0,30.0,0.60,0.0
1,19,1.0,0.0,0.0,0.0,0.0,0.0,1.0,24.1,15.6,0.62,0.0
2,19,0.0,1.0,1.0,0.0,0.0,0.0,0.0,24.1,15.6,0.62,0.0
3,23,0.0,1.0,1.0,0.0,0.0,0.0,0.0,24.1,15.6,0.66,0.0
4,46,0.0,1.0,0.0,0.0,0.0,0.0,1.0,24.1,18.0,0.60,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
305,23,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25.7,50.8,0.60,0.0
306,23,1.0,0.0,0.0,0.0,0.0,0.0,1.0,25.7,50.8,0.60,0.0
307,19,1.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0,24.3,0.60,0.0
308,28,0.0,1.0,0.0,0.0,1.0,0.0,0.0,25.0,39.8,0.60,0.0


In [55]:
# Column names
columns_set = {
    
    'T_offset_' : [1,2,3,4],
    'Max1R13_': [1,2,3,4],
    'Max1L13_': [1,2,3,4],
    'aveAllR13_':  [1,2,3,4],
    'aveAllL13_': [1,2,3,4],
    'T_RC_' : [1,2,3,4],
    'T_RC_Dry_': [1,2,3,4],
    'T_RC_Wet_': [1,2,3,4],
    'T_RC_Max_': [1,2,3,4],
    'T_LC_': [1,2,3,4],
    'T_LC_Dry_': [1,2,3,4],
    'T_LC_Wet_': [1,2,3,4],
    'T_LC_Max_': [1,2,3,4],
    'RCC_': [1,2,3,4],
    'LCC_': [1,2,3,4],
    'canthiMax_': [1,2,3,4],
    'canthi4Max_': [1,2,3,4],
    'T_FHCC_': [1,2,3,4],
    'T_FHRC_': [1,2,3,4],
    'T_FHLC_': [1,2,3,4],
    'T_FHBC_': [1,2,3,4],
    'T_FHTC_': [1,2,3,4],
    'T_FH_Max_': [1,2,3,4],
    'T_FHC_Max_': [1,2,3,4],
    'T_Max_': [1,2,3,4],
    'T_OR_': [1,2,3,4],
    'T_OR_Max_': [1,2,3,4]
}

In [56]:
# Rows with NaN values filled with means for thermal data
df_filled = pd.DataFrame()
column_names = []
for header, rounds in columns_set.items():
    for roundd in rounds:
        column_names.append(f'{header}{roundd}')
    
thermal_info = df[column_names]
thermal_info = thermal_info.fillna(thermal_info.mean())

In [57]:
#Column wise mean of 4 rounds
new_mean_dataframe = pd.DataFrame()

for header, rounds in columns_set.items():
    column_names = [f'{header}{roundd}' for roundd in rounds]
    new_mean_dataframe[f'{header}mean'] = thermal_info[column_names].mean(axis=1)
    


In [58]:
X_test = pd.concat([new_mean_dataframe, common_features], axis=1)

In [59]:
X_test

Unnamed: 0,T_offset_mean,Max1R13_mean,Max1L13_mean,aveAllR13_mean,aveAllL13_mean,T_RC_mean,T_RC_Dry_mean,T_RC_Wet_mean,T_RC_Max_mean,T_LC_mean,...,Gender_Male,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White,T_atm,Humidity,Distance,Ethnicity_American Indian or Alaskan Native
0,1.067500,35.600000,35.332500,35.305000,35.002500,35.585000,35.585000,35.395000,35.600000,35.390000,...,0.0,1.0,0.0,0.0,0.0,0.0,22.0,30.0,0.60,0.0
1,0.555000,36.112500,36.237500,35.535000,35.810000,36.187500,36.187500,35.637500,36.200000,36.205000,...,0.0,0.0,0.0,0.0,0.0,1.0,24.1,15.6,0.62,0.0
2,0.767500,37.620000,37.130000,37.162500,36.530000,37.695000,37.567500,37.690000,37.740000,37.140000,...,1.0,1.0,0.0,0.0,0.0,0.0,24.1,15.6,0.62,0.0
3,0.850000,35.490000,35.627500,34.865000,34.962500,35.490000,35.482500,35.300000,35.517500,35.622500,...,1.0,1.0,0.0,0.0,0.0,0.0,24.1,15.6,0.66,0.0
4,0.877500,34.735000,34.660000,34.102500,34.185000,34.747500,34.747500,34.377500,34.790000,34.672500,...,1.0,0.0,0.0,0.0,0.0,1.0,24.1,18.0,0.60,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,1.222500,35.642500,35.652500,34.857500,35.005000,35.752500,35.657500,35.737500,35.775000,35.802500,...,0.0,1.0,0.0,0.0,0.0,0.0,25.7,50.8,0.60,0.0
306,1.467500,35.982500,35.757500,35.427500,35.197500,35.970000,35.950000,35.862500,36.007500,35.825000,...,0.0,0.0,0.0,0.0,0.0,1.0,25.7,50.8,0.60,0.0
307,0.130000,36.407500,36.340000,35.870000,35.960000,36.410000,36.362500,36.365000,36.447500,36.302500,...,0.0,0.0,1.0,0.0,0.0,0.0,28.0,24.3,0.60,0.0
308,1.138141,35.764716,35.639493,34.641506,34.575435,35.783421,35.697344,35.732854,35.820831,35.792977,...,1.0,0.0,0.0,1.0,0.0,0.0,25.0,39.8,0.60,0.0


In [60]:
scaler = MinMaxScaler()
X_test_original = scaler.fit_transform(X_test)

In [61]:
y_test_original = y_test.iloc[:].values.astype(float)

In [62]:
def train_and_evaluate(X_train_original, y_train_original, X_test_original, y_test_original, kernel='linear', C=10, gamma='auto'):
    
    if kernel == 'linear':
            clf = svm.SVR(kernel=kernel, C=C)
    elif kernel == 'rbf':
            clf = svm.SVR(kernel=kernel, C=C, gamma=gamma)
    else:
            raise ValueError("Invalid kernel type.")

    clf.fit(X_train_original, y_train_original)

    # Make predictions on the training set
    train_predictions = clf.predict(X_train_original)
        
    train_mse = mean_squared_error(y_train_original, train_predictions)
    train_rmse = np.sqrt(train_mse)
    train_mae = mean_absolute_error(y_train_original, train_predictions)
    
    test_predictions = clf.predict(X_test_original)

    test_mse = mean_squared_error(y_test_original, test_predictions)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test_original, test_predictions)
        
    print(train_mse)       
    print(train_rmse)   
    print(train_mae)
    
    print(test_mse)       
    print(test_rmse)   
    print(test_mae) 

train_and_evaluate(X_train_original, y_train_original, X_test_original, y_test_original, kernel='rbf')   

0.013305234793008313
0.11534831941995649
0.09480329696073524
0.22329070085905017
0.4725364545292248
0.32053458457244977
