1. Import Libraries

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor



In [69]:
df = pd.read_csv('FLIR_groups1and2_train.csv', skiprows=2)
df = df.dropna(axis=1, how='all')
y_train = df.loc[:, 'aveOralM']

2. Data Preprocessing

In [70]:
# Taking out common features out which are independent of rounds for each subject
columns = ['Gender', 'Age', 'Ethnicity', 'T_atm', 'Humidity', 'Distance']
common_features = df[columns]

In [71]:
common_features.loc[:, 'Age'] = [math.ceil((int(x.split('-')[1]) + int(x.split('-')[0]))/2) if '-' in x else int(x.strip('>')) for x in common_features['Age']]
common_features

Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Distance
0,Male,46,White,24.0,28.0,0.80
1,Female,36,Black or African-American,24.0,26.0,0.80
2,Female,26,White,24.0,26.0,0.80
3,Female,26,Black or African-American,24.0,27.0,0.80
4,Male,19,White,24.0,27.0,0.80
...,...,...,...,...,...,...
705,Female,19,White,24.4,13.5,0.60
706,Female,23,Asian,24.4,14.7,0.63
707,Male,23,Multiracial,22.0,30.0,0.60
708,Male,19,White,22.0,30.0,0.60


In [72]:
print(min(common_features['Age']))

19


In [73]:
print(max(common_features['Age']))

60


In [74]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the categorical columns
encoded_data = encoder.fit_transform(common_features[['Gender', 'Ethnicity']])

# Create a DataFrame from the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Gender', 'Ethnicity']))

In [75]:
common_features = common_features.drop(columns=['Gender', 'Ethnicity'])
common_features = pd.concat([common_features, encoded_df], axis=1)

common_features 

Unnamed: 0,Age,T_atm,Humidity,Distance,Gender_Female,Gender_Male,Ethnicity_American Indian or Alaskan Native,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White
0,46,24.0,28.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,36,24.0,26.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,26,24.0,26.0,0.80,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,26,24.0,27.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,19,24.0,27.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
705,19,24.4,13.5,0.60,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
706,23,24.4,14.7,0.63,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
707,23,22.0,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
708,19,22.0,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [76]:
# Column names
columns_set = {
    
    'T_offset_' : [1,2,3,4],
    'Max1R13_': [1,2,3,4],
    'Max1L13_': [1,2,3,4],
    'aveAllR13_':  [1,2,3,4],
    'aveAllL13_': [1,2,3,4],
    'T_RC_' : [1,2,3,4],
    'T_RC_Dry_': [1,2,3,4],
    'T_RC_Wet_': [1,2,3,4],
    'T_RC_Max_': [1,2,3,4],
    'T_LC_': [1,2,3,4],
    'T_LC_Dry_': [1,2,3,4],
    'T_LC_Wet_': [1,2,3,4],
    'T_LC_Max_': [1,2,3,4],
    'RCC_': [1,2,3,4],
    'LCC_': [1,2,3,4],
    'canthiMax_': [1,2,3,4],
    'canthi4Max_': [1,2,3,4],
    'T_FHCC_': [1,2,3,4],
    'T_FHRC_': [1,2,3,4],
    'T_FHLC_': [1,2,3,4],
    'T_FHBC_': [1,2,3,4],
    'T_FHTC_': [1,2,3,4],
    'T_FH_Max_': [1,2,3,4],
    'T_FHC_Max_': [1,2,3,4],
    'T_Max_': [1,2,3,4],
    'T_OR_': [1,2,3,4],
    'T_OR_Max_': [1,2,3,4]
}

In [77]:
# Rows with NaN values filled with means for thermal data
df_filled = pd.DataFrame()
column_names = []
for header, rounds in columns_set.items():
    for roundd in rounds:
        column_names.append(f'{header}{roundd}')
    
thermal_info = df[column_names]
thermal_info = thermal_info.fillna(thermal_info.mean())

In [78]:
#Column wise mean of 4 rounds
new_mean_dataframe = pd.DataFrame()

for header, rounds in columns_set.items():
    column_names = [f'{header}{roundd}' for roundd in rounds]
    new_mean_dataframe[f'{header}mean'] = thermal_info[column_names].mean(axis=1)
    


#Final Dataset

In [79]:
X_train = pd.concat([new_mean_dataframe, common_features], axis=1)
X_train

Unnamed: 0,T_offset_mean,Max1R13_mean,Max1L13_mean,aveAllR13_mean,aveAllL13_mean,T_RC_mean,T_RC_Dry_mean,T_RC_Wet_mean,T_RC_Max_mean,T_LC_mean,...,Humidity,Distance,Gender_Female,Gender_Male,Ethnicity_American Indian or Alaskan Native,Ethnicity_Asian,Ethnicity_Black or African-American,Ethnicity_Hispanic/Latino,Ethnicity_Multiracial,Ethnicity_White
0,0.7025,35.0300,35.3775,34.4000,34.9175,34.9850,34.9850,34.7625,35.0325,35.3375,...,28.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.7800,34.5500,34.5200,33.9300,34.2250,34.7100,34.6325,34.6400,34.7425,34.5600,...,26.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.8625,35.6525,35.5175,34.2775,34.8000,35.6850,35.6675,35.6150,35.7175,35.5025,...,26.0,0.80,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.9300,35.2225,35.6125,34.3850,35.2475,35.2075,35.2000,35.1175,35.2250,35.5950,...,27.0,0.80,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.8950,35.5450,35.6650,34.9100,35.3675,35.6025,35.4750,35.5700,35.6400,35.6400,...,27.0,0.80,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,0.9325,35.4800,35.5300,34.9000,34.9900,35.5650,35.5650,35.1350,35.6300,35.5325,...,13.5,0.60,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
706,0.8550,35.6550,35.5325,35.1925,35.2075,35.6125,35.6000,35.4850,35.6550,35.5275,...,14.7,0.63,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
707,0.9700,36.7325,36.4600,36.2225,36.1150,36.7175,36.7150,36.6400,36.7350,36.4350,...,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
708,1.0725,36.9450,37.0675,36.3825,36.4825,36.9250,36.9200,36.8200,36.9475,37.0500,...,30.0,0.60,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [80]:
X_train.columns

Index(['T_offset_mean', 'Max1R13_mean', 'Max1L13_mean', 'aveAllR13_mean',
       'aveAllL13_mean', 'T_RC_mean', 'T_RC_Dry_mean', 'T_RC_Wet_mean',
       'T_RC_Max_mean', 'T_LC_mean', 'T_LC_Dry_mean', 'T_LC_Wet_mean',
       'T_LC_Max_mean', 'RCC_mean', 'LCC_mean', 'canthiMax_mean',
       'canthi4Max_mean', 'T_FHCC_mean', 'T_FHRC_mean', 'T_FHLC_mean',
       'T_FHBC_mean', 'T_FHTC_mean', 'T_FH_Max_mean', 'T_FHC_Max_mean',
       'T_Max_mean', 'T_OR_mean', 'T_OR_Max_mean', 'Age', 'T_atm', 'Humidity',
       'Distance', 'Gender_Female', 'Gender_Male',
       'Ethnicity_American Indian or Alaskan Native', 'Ethnicity_Asian',
       'Ethnicity_Black or African-American', 'Ethnicity_Hispanic/Latino',
       'Ethnicity_Multiracial', 'Ethnicity_White'],
      dtype='object')

In [81]:
columns = ['aveAllL13_mean', 'T_offset_mean', 'T_FH_Max_mean', 'T_Max_mean', 'T_OR_Max_mean', 'T_RC_mean', 'T_atm', 'Max1R13_mean', 'Max1L13_mean', 'T_FHLC_mean', 'T_LC_Dry_mean', 'canthi4Max_mean', 'T_FHC_Max_mean', 'Distance', 'T_FHBC_mean']
X_train_selected_features = X_train[columns]


3. Data Normalization

In [82]:
scaler = MinMaxScaler()
X_train_original = scaler.fit_transform(X_train_selected_features)
X_train_original = pd.DataFrame(X_train_original, columns=X_train_selected_features.columns)

pca = PCA(n_components = 5)
X_train_original = pca.fit_transform(X_train_original)

In [83]:
#X_train_original = X_train.iloc[:, 1:].values.astype(float)
#y_train_original = y_train.iloc[:].values.astype(float)
y_train_original = y_train

4. Model creation and Training

Models

In [84]:
class MeanTrivialSystem:
    def __init__(self):
        self_output_value = None
        
    def fit(self, X, y):
        self.output_value = sum(y) / len(y)
            
    def predict(self, X):
        return [self.output_value] * len(X)       
      
        

In [85]:
# 1NN

class NearestNeighbor1(object):
    def __init__(self, p_val: int = 2) -> None:
        self.knn_reg = KNeighborsRegressor(
            n_neighbors=1,
            weights='uniform',
            algorithm='auto',
            leaf_size=30,
            p=p_val,
            metric='minkowski',
            n_jobs=None
        )

        self.cv_scores = None

    def fit(self, X, y) -> None:
        self.knn_reg.fit(X, y)

    def predict(self, X):
        return self.knn_reg.predict(X)

    def score(self, X, y):
        return self.knn_reg.score(X, y)
    
    def cross_validate(self, X, y, n_splits=5, shuffle=True, random_state=42):
        cross_val = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
        self.cv_scores = cross_val_score(self.knn_reg, X, y, cv=cross_val)
        return self.cv_scores

In [86]:
# 1NN 20-fold cross validation (on normalized dataset)

num_splits = 20
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

mae_trains = []
mse_trains = []
rmse_trains = []

mae_vals = []
mse_vals = []
rmse_vals = []

for train_index, val_index in kf.split(X_train_original):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    NN = NearestNeighbor1()
    NN.fit(X_train_fold, y_train_fold)
    
    train_pred = NN.predict(X_train_fold)
    val_pred = NN.predict(X_val_fold)

    # training metrics
    mae_train = mean_absolute_error(y_train_fold, train_pred)
    mae_trains.append(mae_train)
    mse_train = mean_squared_error(y_train_fold, train_pred, squared=True)
    mse_trains.append(mse_train)
    rmse_train = mean_squared_error(y_train_fold, train_pred, squared=False)
    rmse_trains.append(rmse_train)
    
    # validation metrics
    mae_val = mean_absolute_error(y_val_fold, val_pred)
    mae_vals.append(mae_val)
    mse_val = mean_squared_error(y_val_fold, val_pred, squared=True)
    mse_vals.append(mse_val)
    rmse_val = mean_squared_error(y_val_fold, val_pred, squared=False)
    rmse_vals.append(rmse_val)
    
    print(f'validation rmse: {rmse_val}')

mse_train_mean = sum(mse_trains)/len(mse_trains)
rmse_train_mean = sum(rmse_trains)/len(rmse_trains)
mae_train_mean = sum(mae_trains)/len(mae_trains) 

mse_val_mean = sum(mse_vals)/len(mse_vals)
rmse_val_mean = sum(rmse_vals)/len(rmse_vals)
mae_val_mean = sum(mae_vals)/len(mae_vals)

print(f'mae_trains: {mae_trains}')
print(f'mean training mae: {mae_train_mean}')
print(f'mse_trains: {mse_trains}')
print(f'mean training mse: {mse_train_mean}')
print(f'rmse_trains: {rmse_trains}')
print(f'mean training rmse: {rmse_train_mean}')

print(f'mae_vals: {mae_vals}')
print(f'mean validation mae: {mae_val_mean}')
print(f'mse_vals: {mse_vals}')
print(f'mean validation mse: {mse_val_mean}')
print(f'rmse_vals: {rmse_vals}')
print(f'mean validation rmse: {rmse_val_mean}')

validation rmse: 0.2830881291910502
validation rmse: 0.3675179511866662
validation rmse: 0.3929058411375423
validation rmse: 0.38756719847444976
validation rmse: 0.33082388735465307
validation rmse: 0.2787621447279623
validation rmse: 0.37481476906748373
validation rmse: 0.3767551518485774
validation rmse: 0.39973949850704904
validation rmse: 0.33767999315591435
validation rmse: 0.38739053753470637
validation rmse: 0.32260103622187636
validation rmse: 0.32315409857925614
validation rmse: 0.43907370028927223
validation rmse: 0.3350906060839584
validation rmse: 0.359960315272988
validation rmse: 0.2937443008565684
validation rmse: 0.3153229636873459
validation rmse: 0.36974894957833915
validation rmse: 0.3149829927381569
mae_trains: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
mean training mae: 0.0
mse_trains: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
mean training mse: 0.0

In [88]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [89]:
# Trivial System

train_rmse = []
train_mse = []
train_mae = []

val_rmse = []
val_mse = []
val_mae = []

for train_index, val_index in kf.split(X_train_original):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]
    

    trivial = MeanTrivialSystem()
    trivial.fit(X_train_fold, y_train_fold)

    # Make predictions on the training set
    train_predictions = trivial.predict(X_train_fold)
    
    mse_train = mean_squared_error(y_train_fold, train_predictions)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(y_train_fold, train_predictions)
    
    train_mse.append(mse_train)
    train_rmse.append(rmse_train)
    train_mae.append(mae_train)
    
    val_predictions = trivial.predict(X_val_fold)

    mse_val = mean_squared_error(y_val_fold, val_predictions)
    rmse_val = np.sqrt(mse_val)
    mae_val = mean_absolute_error(y_val_fold, val_predictions)
    
    
    val_mse.append(mse_val)
    val_rmse.append(rmse_val)
    val_mae.append(mae_val)

print(np.mean(train_mse))       
print(np.mean(train_rmse))   
print(np.mean(train_mae))   

print(np.mean(val_mse))       
print(np.mean(val_rmse))   
print(np.mean(val_mae))    

0.2376469760464194
0.48723527683216333
0.31586001041461065
0.23845573174965323
0.4839243646729258
0.3160925411624781


In [90]:
# Linear Regression

train_rmse = []
train_mse = []
train_mae = []

val_rmse = []
val_mse = []
val_mae = []

for train_index, val_index in kf.split(X_train_original):

    X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
    y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]
    linear_reg = LinearRegression()
    linear_reg.fit(X_train_fold, y_train_fold)

    train_predictions = linear_reg.predict(X_train_fold)
    
    mse_train = mean_squared_error(y_train_fold, train_predictions)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(y_train_fold, train_predictions)
    
    train_mse.append(mse_train)
    train_rmse.append(rmse_train)
    train_mae.append(mae_train)
    
    val_predictions = linear_reg.predict(X_val_fold)

    mse_val = mean_squared_error(y_val_fold, val_predictions)
    rmse_val = np.sqrt(mse_val)
    mae_val = mean_absolute_error(y_val_fold, val_predictions)
    
    
    val_mse.append(mse_val)
    val_rmse.append(rmse_val)
    val_mae.append(mae_val)

print(np.mean(train_mse))       
print(np.mean(train_rmse))   
print(np.mean(train_mae))   

print(np.mean(val_mse))       
print(np.mean(val_rmse))   
print(np.mean(val_mae))   

0.07040407830673309
0.26531675103543173
0.20676974809391013
0.07273528501550414
0.2693804343266899
0.20947244945350615


In [91]:
# Support Vector Regressor

def supportvectorreg(X_train_original, y_train_original, kernel='linear', C=10, gamma='auto'):
    
    train_rmse = []
    train_mse = []
    train_mae = []

    val_rmse = []
    val_mse = []
    val_mae = []
    

    
    for train_index, val_index in kf.split(X_train_original):

        X_train_fold, X_val_fold = X_train_original[train_index], X_train_original[val_index]
        y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]
        
        if kernel == 'linear':
            clf = svm.SVR(kernel=kernel, C=C)
        elif kernel == 'rbf':
            clf = svm.SVR(kernel=kernel, C=C, gamma=gamma)
        else:
            raise ValueError("Invalid kernel type.")

        clf.fit(X_train_fold, y_train_fold)

        train_predictions = clf.predict(X_train_fold)
        
        mse_train = mean_squared_error(y_train_fold, train_predictions)
        rmse_train = np.sqrt(mse_train)
        mae_train = mean_absolute_error(y_train_fold, train_predictions)
        
        train_mse.append(mse_train)
        train_rmse.append(rmse_train)
        train_mae.append(mae_train)
        
        val_predictions = clf.predict(X_val_fold)

        mse_val = mean_squared_error(y_val_fold, val_predictions)
        rmse_val = np.sqrt(mse_val)
        mae_val = mean_absolute_error(y_val_fold, val_predictions)
        
        
        val_mse.append(mse_val)
        val_rmse.append(rmse_val)
        val_mae.append(mae_val)
        
    # Get the number of support vectors
    n_support_vectors = np.sum(clf.n_support_)
    degrees_of_freedom = len(X_train_original) - n_support_vectors
    
    print("Number of support vectors:", n_support_vectors)
    print("Degrees of freedom:", degrees_of_freedom)
    print(np.mean(train_mse))       
    print(np.mean(train_rmse))   
    print(np.mean(train_mae))   

    print(np.mean(val_mse))       
    print(np.mean(val_rmse))   
    print(np.mean(val_mae))  

supportvectorreg(X_train_original, y_train_original, kernel='rbf')   

Number of support vectors: 390
Degrees of freedom: 320
0.05739353657831085
0.23949265460285413
0.1829166110362336
0.06178969594761895
0.24741739639183083
0.18915654176095456


In [92]:
# Polynomial Regression

def polynomial_regression(X_train,y_train, orders):
  
    for order in range(1,orders+1):
      
        train_rmse = []
        train_mse = []
        train_mae = []

        val_rmse = []
        val_mse = []
        val_mae = []
        
        X_train_norm = scaler.fit_transform(X_train)
        polynomial = PolynomialFeatures(degree=order, include_bias=True)
        X_train_polynomial = polynomial.fit_transform(X_train_norm)
        
        pca = PCA(n_components = 10)
        X_train_polynomial = pca.fit_transform(X_train_polynomial) 
        
        print(f'Degree of freedom for order: {order} = {X_train_polynomial.shape[1]}')
        
        
        for train_index, val_index in kf.split(X_train_original):
            
            X_train_fold, X_val_fold = X_train_polynomial[train_index], X_train_polynomial[val_index]
            y_train_fold, y_val_fold = y_train_original[train_index], y_train_original[val_index]

            
            model = LinearRegression()
            model.fit(X_train_fold, y_train_fold)

            
            train_predictions = model.predict(X_train_fold)
            
            mse_train = mean_squared_error(y_train_fold, train_predictions)
            rmse_train = np.sqrt(mse_train)
            mae_train = mean_absolute_error(y_train_fold, train_predictions)
            
            train_mse.append(mse_train)
            train_rmse.append(rmse_train)
            train_mae.append(mae_train)
            
            val_predictions = model.predict(X_val_fold)

            mse_val = mean_squared_error(y_val_fold, val_predictions)
            rmse_val = np.sqrt(mse_val)
            mae_val = mean_absolute_error(y_val_fold, val_predictions)
        
        
            val_mse.append(mse_val)
            val_rmse.append(rmse_val)
            val_mae.append(mae_val)

        print(np.mean(train_mse))       
        print(np.mean(train_rmse))   
        print(np.mean(train_mae))   

        print(np.mean(val_mse))       
        print(np.mean(val_rmse))   
        print(np.mean(val_mae))  
        

polynomial_regression(X_train_selected_features, y_train, 4)

          
    

Degree of freedom for order: 1 = 10
0.06661452139045329
0.2580657322184529
0.20024218169534858
0.412626664741128
0.4788858834699166
0.2260451077938827
Degree of freedom for order: 2 = 10
0.05761386062014502
0.23996118844829115
0.18589156663607237
0.5345316981497183
0.5090686301000777
0.2153596803874734
Degree of freedom for order: 3 = 10
0.05778229062937549
0.2402936163114368
0.18621425917287582
0.09008869883497302
0.28880141966402223
0.1972114672049303
Degree of freedom for order: 4 = 10
0.059297962982124905
0.2434155542974974
0.18849058861757134
0.06585634985371161
0.254938002930977
0.19530079519516005


In [93]:
# Random Forest

def random_forest(X_train,y_train):
  
      
        train_rmse = []
        train_mse = []
        train_mae = []

        val_rmse = []
        val_mse = []
        val_mae = []
        
        
        for train_index, val_index in kf.split(X_train):
            
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train_fold, y_train_fold)

            train_predictions = model.predict(X_train_fold)
            
            mse_train = mean_squared_error(y_train_fold, train_predictions)
            rmse_train = np.sqrt(mse_train)
            mae_train = mean_absolute_error(y_train_fold, train_predictions)
            
            train_mse.append(mse_train)
            train_rmse.append(rmse_train)
            train_mae.append(mae_train)
            
            val_predictions = model.predict(X_val_fold)

            mse_val = mean_squared_error(y_val_fold, val_predictions)
            rmse_val = np.sqrt(mse_val)
            mae_val = mean_absolute_error(y_val_fold, val_predictions)
        
        
            val_mse.append(mse_val)
            val_rmse.append(rmse_val)
            val_mae.append(mae_val)

        print(np.mean(train_mse))       
        print(np.mean(train_rmse))   
        print(np.mean(train_mae))   

        print(np.mean(val_mse))       
        print(np.mean(val_rmse))   
        print(np.mean(val_mae)) 
        

random_forest(X_train_original, y_train_original)

          
    

0.010393141461267793
0.10191212353502728
0.07567411971831038
0.07130551443661963
0.2660787210189588
0.2014049295774641


5. Testing

In [95]:
df = pd.read_csv('FLIR_groups1and2_test.csv', skiprows=2)
df = df.dropna(axis=1, how='all')
X_test = df.iloc[:, :-1]
y_test_original = df.loc[:, 'aveOralM']

In [96]:
# Column names
columns_set = {
    
    'T_offset_' : [1,2,3,4],
    'Max1R13_': [1,2,3,4],
    'Max1L13_': [1,2,3,4],
    'aveAllR13_':  [1,2,3,4],
    'aveAllL13_': [1,2,3,4],
    'T_RC_' : [1,2,3,4],
    'T_RC_Dry_': [1,2,3,4],
    'T_RC_Wet_': [1,2,3,4],
    'T_RC_Max_': [1,2,3,4],
    'T_LC_': [1,2,3,4],
    'T_LC_Dry_': [1,2,3,4],
    'T_LC_Wet_': [1,2,3,4],
    'T_LC_Max_': [1,2,3,4],
    'RCC_': [1,2,3,4],
    'LCC_': [1,2,3,4],
    'canthiMax_': [1,2,3,4],
    'canthi4Max_': [1,2,3,4],
    'T_FHCC_': [1,2,3,4],
    'T_FHRC_': [1,2,3,4],
    'T_FHLC_': [1,2,3,4],
    'T_FHBC_': [1,2,3,4],
    'T_FHTC_': [1,2,3,4],
    'T_FH_Max_': [1,2,3,4],
    'T_FHC_Max_': [1,2,3,4],
    'T_Max_': [1,2,3,4],
    'T_OR_': [1,2,3,4],
    'T_OR_Max_': [1,2,3,4]
}

In [97]:
# Rows with NaN values filled with means for thermal data
df_filled = pd.DataFrame()
column_names = []
for header, rounds in columns_set.items():
    for roundd in rounds:
        column_names.append(f'{header}{roundd}')
    
thermal_info = df[column_names]
thermal_info = thermal_info.fillna(thermal_info.mean())

In [98]:
#Column wise mean of 4 rounds
new_mean_dataframe = pd.DataFrame()

for header, rounds in columns_set.items():
    column_names = [f'{header}{roundd}' for roundd in rounds]
    new_mean_dataframe[f'{header}mean'] = thermal_info[column_names].mean(axis=1)
    


In [99]:
remaning_columns = ['T_atm', 'Humidity', 'Distance']    
remaning_features = X_test[remaning_columns].fillna(X_test[remaning_columns].mean())
X_test = pd.concat([new_mean_dataframe, remaning_features], axis=1)
X_test

Unnamed: 0,T_offset_mean,Max1R13_mean,Max1L13_mean,aveAllR13_mean,aveAllL13_mean,T_RC_mean,T_RC_Dry_mean,T_RC_Wet_mean,T_RC_Max_mean,T_LC_mean,...,T_FHBC_mean,T_FHTC_mean,T_FH_Max_mean,T_FHC_Max_mean,T_Max_mean,T_OR_mean,T_OR_Max_mean,T_atm,Humidity,Distance
0,1.067500,35.600000,35.332500,35.305000,35.002500,35.585000,35.585000,35.395000,35.600000,35.390000,...,34.79250,34.837500,35.582500,35.3775,36.072500,35.970000,36.017500,22.0,30.0,0.60
1,0.555000,36.112500,36.237500,35.535000,35.810000,36.187500,36.187500,35.637500,36.200000,36.205000,...,35.42250,35.307500,36.187500,35.6525,36.720000,36.677500,36.720000,24.1,15.6,0.62
2,0.767500,37.620000,37.130000,37.162500,36.530000,37.695000,37.567500,37.690000,37.740000,37.140000,...,36.53000,35.420000,37.510000,36.9975,37.975000,37.595000,37.645000,24.1,15.6,0.62
3,0.850000,35.490000,35.627500,34.865000,34.962500,35.490000,35.482500,35.300000,35.517500,35.622500,...,34.17750,33.882500,34.830000,34.5625,36.107500,35.897500,35.970000,24.1,15.6,0.66
4,0.877500,34.735000,34.660000,34.102500,34.185000,34.747500,34.747500,34.377500,34.790000,34.672500,...,33.55750,33.200000,34.565000,34.5650,35.552500,35.395000,35.427500,24.1,18.0,0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,1.222500,35.642500,35.652500,34.857500,35.005000,35.752500,35.657500,35.737500,35.775000,35.802500,...,35.13750,35.275000,35.852500,35.7475,36.067500,35.677500,35.710000,25.7,50.8,0.60
306,1.467500,35.982500,35.757500,35.427500,35.197500,35.970000,35.950000,35.862500,36.007500,35.825000,...,35.20750,35.070000,35.765000,35.5525,36.500000,36.452500,36.490000,25.7,50.8,0.60
307,0.130000,36.407500,36.340000,35.870000,35.960000,36.410000,36.362500,36.365000,36.447500,36.302500,...,35.36750,35.342500,36.375000,35.7100,36.535000,35.965000,35.997500,28.0,24.3,0.60
308,1.138141,35.764716,35.639493,34.641506,34.575435,35.783421,35.697344,35.732854,35.820831,35.792977,...,34.70051,34.690765,35.490428,35.2869,36.091883,35.781776,35.807031,25.0,39.8,0.60


In [100]:
columns = ['aveAllL13_mean', 'T_offset_mean', 'T_FH_Max_mean', 'T_Max_mean', 'T_OR_Max_mean', 'T_RC_mean', 'T_atm', 'Max1R13_mean', 'Max1L13_mean', 'T_FHLC_mean', 'T_LC_Dry_mean', 'canthi4Max_mean', 'T_FHC_Max_mean', 'Distance', 'T_FHBC_mean']
X_test_selected_features = X_test[columns]


In [101]:
X_test_selected_features.describe()

Unnamed: 0,aveAllL13_mean,T_offset_mean,T_FH_Max_mean,T_Max_mean,T_OR_Max_mean,T_RC_mean,T_atm,Max1R13_mean,Max1L13_mean,T_FHLC_mean,T_LC_Dry_mean,canthi4Max_mean,T_FHC_Max_mean,Distance,T_FHBC_mean
count,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0
mean,35.095968,1.022227,35.542366,36.154939,35.896333,35.761603,24.574194,35.68154,35.71582,34.709807,35.714964,35.863587,35.233094,0.628214,34.621615
std,0.641707,0.401755,0.518969,0.51567,0.596062,0.596181,1.159642,0.618556,0.584568,0.671719,0.575979,0.563694,0.548472,0.0435,0.662395
min,33.3975,-0.325,34.465,35.175,34.0975,34.0,22.0,33.8975,34.1225,32.1225,34.105,34.495,33.7375,0.54,32.84
25%,34.740625,0.823499,35.245625,35.82375,35.52125,35.377977,24.0,35.27375,35.338125,34.328668,35.338125,35.48625,34.86375,0.6,34.274375
50%,35.05375,1.000748,35.5,36.07,35.87375,35.68875,24.2,35.6125,35.6325,34.73875,35.62375,35.757357,35.20125,0.6,34.625
75%,35.436875,1.212286,35.8,36.32625,36.164375,36.055,24.9,35.98125,35.99,35.08375,35.97625,36.113302,35.514375,0.66,34.96125
max,37.68,2.555,38.0025,38.4175,37.9025,38.385,29.1,38.405,38.0425,37.165,38.0375,38.3825,37.6325,0.72,37.2125


In [102]:
scaler = MinMaxScaler()
X_test_original = scaler.fit_transform(X_test_selected_features)
X_test_original = pd.DataFrame(X_test_original, columns=X_test_selected_features.columns)

pca = PCA(n_components = 5)
X_test_original = pca.fit_transform(X_test_original)

In [103]:
def train_and_evaluate(X_train_original, y_train_original, X_test_original, y_test_original, kernel='linear', C=10, gamma='auto'):
    
    if kernel == 'linear':
            clf = svm.SVR(kernel=kernel, C=C)
    elif kernel == 'rbf':
            clf = svm.SVR(kernel=kernel, C=C, gamma=gamma)
    else:
            raise ValueError("Invalid kernel type.")

    clf.fit(X_train_original, y_train_original)

    # Make predictions on the training set
    train_predictions = clf.predict(X_train_original)
        
    train_mse = mean_squared_error(y_train_original, train_predictions)
    train_rmse = np.sqrt(train_mse)
    train_mae = mean_absolute_error(y_train_original, train_predictions)
    
    test_predictions = clf.predict(X_test_original)

    test_mse = mean_squared_error(y_test_original, test_predictions)
    test_rmse = np.sqrt(test_mse)
    test_mae = mean_absolute_error(y_test_original, test_predictions)
        
    print(train_mse)       
    print(train_rmse)   
    print(train_mae)
    
    print(test_mse)       
    print(test_rmse)   
    print(test_mae) 

train_and_evaluate(X_train_original, y_train_original, X_test_original, y_test_original, kernel='rbf')   

0.057948344736137185
0.24072462428288716
0.18316093939952338
0.09822285795662443
0.31340526153308984
0.23424274713212573
