In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector


In [21]:
df = pd.read_csv('FLIR_groups1and2_train.csv', skiprows=2)
df = df.dropna(axis=1, how='all')
y_train = df.loc[:, 'aveOralM']
X_train = df.iloc[:, :-1]

In [22]:
# Column names
columns_set = {
    
    'T_offset_' : [1,2,3,4],
    'Max1R13_': [1,2,3,4],
    'Max1L13_': [1,2,3,4],
    'aveAllR13_':  [1,2,3,4],
    'aveAllL13_': [1,2,3,4],
    'T_RC_' : [1,2,3,4],
    'T_RC_Dry_': [1,2,3,4],
    'T_RC_Wet_': [1,2,3,4],
    'T_RC_Max_': [1,2,3,4],
    'T_LC_': [1,2,3,4],
    'T_LC_Dry_': [1,2,3,4],
    'T_LC_Wet_': [1,2,3,4],
    'T_LC_Max_': [1,2,3,4],
    'RCC_': [1,2,3,4],
    'LCC_': [1,2,3,4],
    'canthiMax_': [1,2,3,4],
    'canthi4Max_': [1,2,3,4],
    'T_FHCC_': [1,2,3,4],
    'T_FHRC_': [1,2,3,4],
    'T_FHLC_': [1,2,3,4],
    'T_FHBC_': [1,2,3,4],
    'T_FHTC_': [1,2,3,4],
    'T_FH_Max_': [1,2,3,4],
    'T_FHC_Max_': [1,2,3,4],
    'T_Max_': [1,2,3,4],
    'T_OR_': [1,2,3,4],
    'T_OR_Max_': [1,2,3,4]
}

In [23]:
# Rows with NaN values filled with means for thermal data
df_filled = pd.DataFrame()
column_names = []
for header, rounds in columns_set.items():
    for roundd in rounds:
        column_names.append(f'{header}{roundd}')
    
thermal_info = df[column_names]
thermal_info = thermal_info.fillna(thermal_info.mean())

In [24]:
#Column wise mean of 4 rounds
new_mean_dataframe = pd.DataFrame()

for header, rounds in columns_set.items():
    column_names = [f'{header}{roundd}' for roundd in rounds]
    new_mean_dataframe[f'{header}mean'] = thermal_info[column_names].mean(axis=1)
  
remaning_columns = ['T_atm', 'Humidity', 'Distance']    
X_train = pd.concat([new_mean_dataframe, X_train[remaning_columns]], axis=1)

In [25]:
X_train

Unnamed: 0,T_offset_mean,Max1R13_mean,Max1L13_mean,aveAllR13_mean,aveAllL13_mean,T_RC_mean,T_RC_Dry_mean,T_RC_Wet_mean,T_RC_Max_mean,T_LC_mean,...,T_FHBC_mean,T_FHTC_mean,T_FH_Max_mean,T_FHC_Max_mean,T_Max_mean,T_OR_mean,T_OR_Max_mean,T_atm,Humidity,Distance
0,0.7025,35.0300,35.3775,34.4000,34.9175,34.9850,34.9850,34.7625,35.0325,35.3375,...,33.4925,33.0025,34.5300,34.0075,35.6925,35.6350,35.6525,24.0,28.0,0.80
1,0.7800,34.5500,34.5200,33.9300,34.2250,34.7100,34.6325,34.6400,34.7425,34.5600,...,33.9700,34.0025,34.6825,34.6600,35.1750,35.0925,35.1075,24.0,26.0,0.80
2,0.8625,35.6525,35.5175,34.2775,34.8000,35.6850,35.6675,35.6150,35.7175,35.5025,...,34.8200,34.6700,35.3450,35.2225,35.9125,35.8600,35.8850,24.0,26.0,0.80
3,0.9300,35.2225,35.6125,34.3850,35.2475,35.2075,35.2000,35.1175,35.2250,35.5950,...,34.3025,34.9175,35.6025,35.3150,35.7200,34.9650,34.9825,24.0,27.0,0.80
4,0.8950,35.5450,35.6650,34.9100,35.3675,35.6025,35.4750,35.5700,35.6400,35.6400,...,34.6700,33.8275,35.4175,35.3725,35.8950,35.5875,35.6175,24.0,27.0,0.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,0.9325,35.4800,35.5300,34.9000,34.9900,35.5650,35.5650,35.1350,35.6300,35.5325,...,34.6475,34.6600,35.1550,35.1175,35.9825,35.7600,35.8100,24.4,13.5,0.60
706,0.8550,35.6550,35.5325,35.1925,35.2075,35.6125,35.6000,35.4850,35.6550,35.5275,...,34.9825,34.9075,35.6025,35.1875,36.3650,36.1725,36.1950,24.4,14.7,0.63
707,0.9700,36.7325,36.4600,36.2225,36.1150,36.7175,36.7150,36.6400,36.7350,36.4350,...,34.6775,34.9600,36.2075,35.3000,36.9325,36.7825,36.8325,22.0,30.0,0.60
708,1.0725,36.9450,37.0675,36.3825,36.4825,36.9250,36.9200,36.8200,36.9475,37.0500,...,35.4225,35.4425,36.1950,35.8175,37.1275,36.7275,36.7425,22.0,30.0,0.60


In [26]:
scaler = StandardScaler()
X_train_original = scaler.fit_transform(X_train)
# pca = PCA(n_components = 18)
# X_train_original = pca.fit_transform(X_train)
X_train_original = pd.DataFrame(X_train_original, columns=X_train.columns)

In [27]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [28]:
def train_and_evaluate(X_train_original, y_train_original, column_names, kernel='linear', C=1.0, gamma='auto'):
    selected_features = set()  # Initialize an empty set to store selected features
    num_samples, num_features = X_train_original.shape
    
    
    for _ in range(num_features):
        best_feature = None
        best_score = float('inf')
        
        # Iterate over each feature not yet added to the feature set
        for feature_name in column_names:
            if feature_name not in selected_features:
                # Add the feature to the current feature set
                candidate_features = list(selected_features) + [feature_name]
                # Train and evaluate model with the candidate feature set
                score = train_and_evaluate_with_features(X_train_original, y_train_original, candidate_features, kernel, C, gamma)
                
                # Update the best feature and score
                if score < best_score:
                    best_score = score
                    best_feature = feature_name
        
        # Add the best feature to the selected features
        selected_features.add(best_feature)
        
        # Check if desired number of features is achieved
        if len(selected_features) == num_features:
            break
    
    return selected_features


def train_and_evaluate_with_features(X_train_original, y_train_original, selected_features, kernel='linear', C=1.0, gamma='auto'):    

    mse = []
    X_train_original = X_train_original[selected_features]
    for train_index, val_index in kf.split(X_train_original):

        X_train_fold, X_val_fold = X_train_original.iloc[train_index], X_train_original.iloc[val_index]
        y_train_fold, y_val_fold = y_train_original.iloc[train_index], y_train_original.iloc[val_index]
        
        if kernel == 'linear':
            clf = svm.SVR(kernel=kernel, C=C)
        elif kernel == 'rbf':
            clf = svm.SVR(kernel=kernel, C=C, gamma=gamma)
        else:
            raise ValueError("Invalid kernel type.")

        clf.fit(X_train_fold, y_train_fold)

        # Make predictions on the training set
        train_predictions = clf.predict(X_train_fold)
        
        rmse_train = np.sqrt(mean_squared_error(y_train_fold, train_predictions))
        
        #print(rmse_train)
        
        # Make predictions on the training set
        val_predictions = clf.predict(X_val_fold)

        mse_val = mean_squared_error(y_val_fold, val_predictions)
        mse.append(mse_val)

    return np.mean(mse)        
    

scaler = StandardScaler()
X_train_original = scaler.fit_transform(X_train)

X_train_original = pd.DataFrame(X_train_original, columns=X_train.columns)
selected_features = train_and_evaluate(X_train_original, y_train, X_train_original.columns.tolist(), 'rbf')
print(selected_features)


{'aveAllL13_mean', 'T_offset_mean', 'T_FH_Max_mean', 'T_Max_mean', 'T_OR_Max_mean', 'T_RC_mean', 'T_atm', 'Max1R13_mean', 'Max1L13_mean', 'T_FHLC_mean', 'T_LC_Dry_mean', 'canthi4Max_mean', 'T_FHC_Max_mean', 'Distance', 'T_FHBC_mean', 'T_RC_Max_mean', 'T_FHRC_mean', 'aveAllR13_mean', 'T_RC_Dry_mean', 'T_LC_Wet_mean', 'T_LC_mean', 'T_LC_Max_mean', 'RCC_mean', 'Humidity', 'LCC_mean', 'T_RC_Wet_mean', 'T_OR_mean', 'canthiMax_mean', 'T_FHCC_mean', 'T_FHTC_mean'}
