In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Folder that is scanned for the "*_a.csv" input files
local_folder = '/content/drive/MyDrive/CapstoneData/Geisha'

# Per-file metric accumulators: one (RMSE, MAE, accuracy) triple of lists per
# model. Each list literal is a separate object, so the lists never alias.
rmse_pls_list, mae_pls_list, accuracy_pls_list = [], [], []
rmse_rf_list, mae_rf_list, accuracy_rf_list = [], [], []
rmse_svm_list, mae_svm_list, accuracy_svm_list = [], [], []

# Number of files processed so far; the processing loop stops once it reaches 15
file_counter = 0

# Helper shared by the three models trained in the processing loop below
def _score_model(model, X_train, X_test, y_train, y_test, y_range):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test, y_train, y_test : the arrays produced by ``train_test_split``.
    y_range : spread (max - min) of the target, used to normalise the RMSE.

    Returns
    -------
    tuple ``(y_pred, rmse, mae, accuracy)`` where ``accuracy`` is
    ``(1 - rmse / y_range) * 100``. This is a range-normalised RMSE expressed
    as a percentage, not a classification accuracy.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    accuracy = (1 - rmse / y_range) * 100
    return y_pred, rmse, mae, accuracy


# Fewest rows (after outlier removal) for which the 80/20 split below still
# leaves at least four training rows and one test row.
min_rows = 5

# Loop to process files in the folder.
# os.listdir() returns names in arbitrary order, so they are sorted here;
# otherwise the subset of (at most 15) files that is processed, and therefore
# the averaged metrics, could differ from one run to the next.
for file_name in sorted(os.listdir(local_folder)):
    if file_counter >= 15:
        break  # Exit the loop once 15 files have been processed

    if not file_name.endswith("_a.csv"):
        continue

    # Read the CSV file and skip the first 28 rows
    # (presumably the instrument's metadata header - TODO confirm)
    full_path = os.path.join(local_folder, file_name)
    df = pd.read_csv(full_path, skiprows=28)

    # Outliers detection (1.5 * IQR rule on the absorbance column)
    Q1 = df['Absorbance (AU)'].quantile(0.25)
    Q3 = df['Absorbance (AU)'].quantile(0.75)
    IQR = Q3 - Q1
    outliers = ((df['Absorbance (AU)'] < (Q1 - 1.5 * IQR)) | (df['Absorbance (AU)'] > (Q3 + 1.5 * IQR)))

    # Option 1: Remove outliers (this is the frame the models are trained on)
    df_without_outliers = df[~outliers]

    # Option 2: Replace outliers by the median.
    # NOTE(review): df_adjusted is not used by anything in this cell; it is
    # kept only so that the name stays available in the notebook namespace.
    df_adjusted = df.copy()
    df_adjusted.loc[outliers, 'Absorbance (AU)'] = df['Absorbance (AU)'].median()

    # Single feature (wavelength) -> target (absorbance)
    X = df_without_outliers['Wavelength (nm)'].values.reshape(-1, 1)
    y = df_without_outliers['Absorbance (AU)'].values

    # Guard: an empty or near-empty file would make train_test_split (or
    # np.max on an empty array) raise and abort the whole run.
    if len(y) < min_rows:
        print(f"Skipping {file_name}: only {len(y)} rows left after outlier removal")
        continue

    # Guard: with a constant target the range-normalised accuracy divides by zero.
    y_range = np.max(y) - np.min(y)
    if y_range == 0:
        print(f"Skipping {file_name}: absorbance is constant, accuracy is undefined")
        continue

    # Split data into training and testing sets (80-20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # PLS with 1 component (X has a single column)
    pls_model = PLSRegression(n_components=1)
    y_pred_pls, rmse_pls, mae_pls, accuracy_pls = _score_model(
        pls_model, X_train, X_test, y_train, y_test, y_range)

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    y_pred_rf, rmse_rf, mae_rf, accuracy_rf = _score_model(
        rf_model, X_train, X_test, y_train, y_test, y_range)

    # Support Vector Machine (SVM) with a linear kernel
    svm_model = SVR(kernel='linear')
    y_pred_svm, rmse_svm, mae_svm, accuracy_svm = _score_model(
        svm_model, X_train, X_test, y_train, y_test, y_range)

    # Store metrics only after all three models were scored, so the nine
    # lists always stay the same length even if one model fails.
    rmse_pls_list.append(rmse_pls)
    mae_pls_list.append(mae_pls)
    accuracy_pls_list.append(accuracy_pls)

    rmse_rf_list.append(rmse_rf)
    mae_rf_list.append(mae_rf)
    accuracy_rf_list.append(accuracy_rf)

    rmse_svm_list.append(rmse_svm)
    mae_svm_list.append(mae_svm)
    accuracy_svm_list.append(accuracy_svm)

    # Print metrics
    print(f"File: {file_name}")
    print(f"RMSE (PLS): {rmse_pls}, MAE (PLS): {mae_pls}")
    print(f"Accuracy (PLS): {accuracy_pls}%")
    print(f"RMSE (Random Forest): {rmse_rf}, MAE (Random Forest): {mae_rf}")
    print(f"Accuracy (Random Forest): {accuracy_rf}%")
    print(f"RMSE (SVM): {rmse_svm}, MAE (SVM): {mae_svm}")
    print(f"Accuracy (SVM): {accuracy_svm}%")
    print("------------------------------------------------------------------------------")

    # Save the outlier-free DataFrame next to the input file. The output name
    # ends with "_a_clean.csv", so it does not match the "_a.csv" filter above
    # and is not picked up again on a later run. splitext() only touches the
    # real extension, unlike str.replace() which rewrites every ".csv".
    stem, ext = os.path.splitext(file_name)
    df_without_outliers.to_csv(os.path.join(local_folder, f"{stem}_clean{ext}"), index=False)

    file_counter += 1  # Count only files that were fully processed

# Averages of the per-file metrics collected by the processing loop
def _average(values):
    """Return the arithmetic mean of ``values``, or NaN when ``values`` is empty.

    The empty case occurs when no file was processed; a plain
    ``sum(values) / len(values)`` would then raise ZeroDivisionError.
    """
    return sum(values) / len(values) if values else float("nan")


# Tell the reader why the averages below are NaN instead of failing outright
if not rmse_pls_list:
    print("No files were processed; average metrics are undefined (nan).")

# Calculate the average metrics for PLS
average_rmse_pls = _average(rmse_pls_list)
average_mae_pls = _average(mae_pls_list)
average_accuracy_pls = _average(accuracy_pls_list)

print(f"Average RMSE (PLS): {average_rmse_pls}, Average MAE (PLS): {average_mae_pls}, Average accuracy (PLS): {average_accuracy_pls}%")

# Calculate the average metrics for Random Forest
average_rmse_rf = _average(rmse_rf_list)
average_mae_rf = _average(mae_rf_list)
average_accuracy_rf = _average(accuracy_rf_list)

print(f"Average RMSE (Random Forest): {average_rmse_rf}, Average MAE (Random Forest): {average_mae_rf}, Average accuracy (Random Forest): {average_accuracy_rf}%")

# Calculate the average metrics for SVM
average_rmse_svm = _average(rmse_svm_list)
average_mae_svm = _average(mae_svm_list)
average_accuracy_svm = _average(accuracy_svm_list)

print(f"Average RMSE (SVM): {average_rmse_svm}, Average MAE (SVM): {average_mae_svm}, Average accuracy (SVM): {average_accuracy_svm}%")


File: Hadamard 1_20230916_153715_a.csv
RMSE (PLS): 0.055310846812090156, MAE (PLS): 0.049795194483757285
Accuracy (PLS): 70.42723658263759%
RMSE (Random Forest): 0.002911597905710347, MAE (Random Forest): 0.0016652447605203084
Accuracy (Random Forest): 98.44327105812383%
RMSE (SVM): 0.06186210810314238, MAE (SVM): 0.054538477001924913
Accuracy (SVM): 66.92450770734466%
------------------------------------------------------------------------------
File: Hadamard 1_20230916_153735_a.csv
RMSE (PLS): 0.05314547077204601, MAE (PLS): 0.0482539444470051
Accuracy (PLS): 70.19324570182017%
RMSE (Random Forest): 0.004872565530760233, MAE (Random Forest): 0.0023978036335355222
Accuracy (Random Forest): 97.26721089366015%
RMSE (SVM): 0.05908019846468923, MAE (SVM): 0.05150268474938904
Accuracy (SVM): 66.86474060831999%
------------------------------------------------------------------------------
File: Hadamard 1_20230916_153808_a.csv
RMSE (PLS): 0.05632977512537188, MAE (PLS): 0.05257661883463990