In [194]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.tensorboard import SummaryWriter

In [125]:
folder_x_train = Path("./data_rosa/X_train")
folder_x_test = Path("./data_rosa/X_test")
folder_others = Path("./data")

paths_x_train = list(folder_x_train.glob("./*.txt"))
paths_x_test = list(folder_x_test.glob("./*.txt"))
paths_others = list(folder_others.glob("./*.txt"))

In [263]:
y_path = "./data_rosa/Y/concentration_withDummy.xlsx"

df_y = pd.read_excel(y_path, usecols=range(1, 10), nrows=16)
df_y = df_y.fillna(0)
df_y.iloc[:, 1:] = df_y.iloc[:, 1:] / 100
df_y["others"] = df_y.iloc[:, 1:].apply(lambda x: 1 - sum(x), axis=1)

df_y.head()

Unnamed: 0,sample,Li,Na,Mg,K,Ca,Al,Si,Fe,others
0,RM_NIST183,0.01914,0.001484,0.0,0.066412,0.0,0.0,0.0,0.0,0.912964
1,RM_OREAS751,0.00468,0.0247,0.00293,0.0239,0.00772,0.084048,0.333894,0.016787,0.501341
2,RM_OREAS752,0.00707,0.027,0.00047,0.021,0.00215,0.086271,0.34128,0.008743,0.506016
3,RM_OREAS753,0.0102,0.0216,0.00011,0.0195,0.00109,0.0868,0.344505,0.008743,0.507452
4,RM_OREAS999,0.0267,0.00693,0.00473,0.00522,0.00481,0.123796,0.300238,0.017416,0.510159


In [155]:
def create_df_for_whole_dataset(path_list):
    df = pd.DataFrame()
    
    for i, p in enumerate(path_list):
        for sample_name in df_y["sample"]:
            if p.name.startswith(sample_name):
                col_1 = []
                col_2 = []
                col_3 = []
                col_4 = []
                col_5 = []
                col_6 = []

                with open(p, "r") as f:
                    for idx, line in enumerate(f):
                        if idx > 31:
                            tmp_nums = line.split()
                            
                            col_1.append(float(tmp_nums[0]))
                            col_2.append(float(tmp_nums[1]))
                            col_3.append(float(tmp_nums[2]))
                            col_4.append(float(tmp_nums[3]))
                            col_5.append(float(tmp_nums[4]))
                            col_6.append(float(tmp_nums[5]))

                assert len(col_1) == len(col_2) == len(col_3) == len(col_4) == len(col_5) == len(col_6)

                tmp_df = pd.DataFrame(
                    {
                        "intensity_1": col_2,
                        "intensity_2": col_3,
                        "intensity_3": col_4,
                        "intensity_4": col_5,
                        "intensity_5": col_6,
                    },
                    index=col_1
                )
                tmp_df["set_num"] = i
                tmp_df["target_file_name"] = sample_name
                tmp_df["data_file_name"] = p.name
                
                df = pd.concat([df, tmp_df], axis=0)
                break
                
    df = df.reset_index()
    df = df.rename(columns={"index": "wavelength"})
    return df

In [156]:
df_train = create_df_for_whole_dataset(paths_x_train)
df_test = create_df_for_whole_dataset(paths_x_test)

In [170]:
USE_COLS = ["intensity_1", "intensity_2", "intensity_3", "intensity_4", "intensity_5"]

In [173]:
scaler = StandardScaler()
scaler.fit(df_train[USE_COLS])
X_train = scaler.transform(df_train[USE_COLS])
X_test = scaler.transform(df_test[USE_COLS])

In [292]:
X_train[:, [1,3,4]].mean(axis=1).reshape(1, -1).shape

(1, 61440)

In [297]:
# data pipeline
def convert_data_to_X_and_y(param_df, param_X, param_df_y, use_channel=None):
    X_list = []
    y_list = []

    for set_num in param_df["set_num"].unique():
        # tmp mask
        tmp_mask = param_df["set_num"] == set_num
        
        # get specific X data
        X_tmp = param_X[tmp_mask]
        if isinstance(use_channel, list):
            X_tmp = X_tmp[:, use_channel].mean(axis=1).reshape(1, -1)
        elif isinstance(use_channel, int):
            X_tmp = X_tmp[:, use_channel].reshape(1, -1)
        else:
            X_tmp
        
        # get specific y data
        tmp_df = param_df[tmp_mask]
        tmp_sample = tmp_df["target_file_name"].iloc[0]
        tmp_y = param_df_y[param_df_y["sample"] == tmp_sample]
        y_tmp = tmp_y.iloc[0, 1:].astype(float).values
        
        # gather
        X_list.append(X_tmp)
        y_list.append(y_tmp)
        
    return np.array(X_list), np.array(y_list)

In [315]:
Xs_train, ys_train = convert_data_to_X_and_y(df_train, X_train, df_y, use_channel=0)
Xs_test, ys_test = convert_data_to_X_and_y(df_test, X_test, df_y, use_channel=0)

#### Dataset and DataLoader

In [317]:
class SensorDataset(torch.utils.data.Dataset):
    def __init__(self, Xs, ys):
        super().__init__()
        self.Xs = Xs
        self.ys = ys
        
    def __len__(self):
        return len(self.Xs)
    
    def __getitem__(self, idx):
        return torch.Tensor(self.Xs[idx]), torch.Tensor(self.ys[idx])

In [318]:
ds_train = SensorDataset(Xs_train, ys_train)
ds_test = SensorDataset(Xs_test, ys_test)

In [319]:
dl_train = torch.utils.data.DataLoader(
    dataset=ds_train,
    batch_size=1,
    shuffle=True
)

dl_test = torch.utils.data.DataLoader(
    dataset=ds_test,
    batch_size=1,
    shuffle=True
)

In [397]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [487]:
import torch.nn as nn
import torch.nn.functional as F

# PyTorch models inherit from torch.nn.Module
class DrugClassifierCNN(nn.Module):
    def __init__(self):
        super(DrugClassifierCNN, self).__init__()
        self.pool = nn.MaxPool1d(2, 2)
        self.conv1 = nn.Conv1d(1, 1, 3, padding=0)
        self.conv2 = nn.Conv1d(1, 1, 3, padding=0)
        self.conv3 = nn.Conv1d(1, 1, 3, padding=0)
        self.conv4 = nn.Conv1d(1, 1, 3, padding=0)
        self.fc1 = nn.Linear(766, 128)
        self.fc2 = nn.Linear(128, 9)

    def forward(self, x):
        x = self.pool(F.leaky_relu(self.conv1(x)))
        x = self.pool(F.leaky_relu(self.conv2(x)))
        x = self.pool(F.leaky_relu(self.conv3(x)))
        x = self.pool(F.leaky_relu(self.conv4(x)))
        x = self.fc1(x)
        x = self.fc2(x)
        x = x.squeeze(1)
        x = F.softmax(x, dim=1)
        return x

model = DrugClassifierCNN()

In [488]:
count_parameters(model)

99353

#### Training setup

In [489]:
loss_fn = torch.nn.MSELoss()
optimiser = torch.optim.AdamW(model.parameters(), lr=0.00001, weight_decay=0.05)

In [490]:
EPOCHS = 5000
TOLERANCE_VALUE = 10
TOLERANCE = TOLERANCE_VALUE

In [491]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_dir = f"runs/drug_detection_{timestamp}"
writer = SummaryWriter(log_dir)

best_vloss = 1_000_000.
running_loss = 0.
last_loss = 0.
step_interval_to_record = 1

for epoch in range(EPOCHS):
    print(f"EPOCH {epoch+1:3d}", end="\t")
    
    model.train(True)
    
    for i, data in enumerate(dl_train):
        inputs, labels = data
        
        optimiser.zero_grad()
        outputs = model(inputs)
        
        loss = loss_fn(outputs, labels)
        loss.backward()
        
        optimiser.step()
        
        running_loss += loss.item()
        if i % step_interval_to_record == step_interval_to_record - 1:
            last_loss = running_loss / step_interval_to_record
            # print(f"   step: {i+1:3d} | loss: {last_loss:4f}")
            tb_x = epoch * len(dl_train) + i + 1
            writer.add_scalar("Loss/train", last_loss, tb_x)
            running_loss = 0.0
            
    avg_last_loss = last_loss
    
    # evaluation
    running_vloss = 0.0
    model.eval()
    
    with torch.no_grad():
        for vi, vdata in enumerate(dl_test):
            vinputs, vlabels = vdata
            vouputs = model(vinputs)
            vloss = loss_fn(vouputs, vlabels)
            running_vloss += vloss
            
    avg_v_loss = running_vloss / (vi + 1)
    print(f"Loss train {avg_last_loss:.4f} | valid {avg_v_loss:.4f}")
    
    writer.add_scalars(
        "Training vs Validation Loss",
        {
            "Training": avg_last_loss,
            "Validation": avg_v_loss,
        },
        epoch + 1
    )
    writer.flush()
    
    # best performance
    if avg_v_loss < best_vloss:
        best_vloss = avg_v_loss
        model_path = f"{log_dir}/best.pt"
        torch.save(model.state_dict(), model_path)
        TOLERANCE = TOLERANCE_VALUE
    else:
        TOLERANCE -= 1
    
    if TOLERANCE == 0:
        break

EPOCH   1	Loss train 0.0277 | valid 0.0847
EPOCH   2	Loss train 0.0677 | valid 0.0827
EPOCH   3	Loss train 0.0255 | valid 0.0806
EPOCH   4	Loss train 0.0626 | valid 0.0783
EPOCH   5	Loss train 0.0617 | valid 0.0759
EPOCH   6	Loss train 0.0582 | valid 0.0733
EPOCH   7	Loss train 0.0592 | valid 0.0705
EPOCH   8	Loss train 0.0533 | valid 0.0676
EPOCH   9	Loss train 0.0506 | valid 0.0645
EPOCH  10	Loss train 0.0479 | valid 0.0613
EPOCH  11	Loss train 0.0461 | valid 0.0579
EPOCH  12	Loss train 0.0142 | valid 0.0544
EPOCH  13	Loss train 0.0391 | valid 0.0509
EPOCH  14	Loss train 0.0361 | valid 0.0474
EPOCH  15	Loss train 0.0408 | valid 0.0440
EPOCH  16	Loss train 0.0100 | valid 0.0404
EPOCH  17	Loss train 0.0275 | valid 0.0372
EPOCH  18	Loss train 0.0313 | valid 0.0340
EPOCH  19	Loss train 0.0232 | valid 0.0310
EPOCH  20	Loss train 0.0209 | valid 0.0282
EPOCH  21	Loss train 0.0198 | valid 0.0256
EPOCH  22	Loss train 0.0079 | valid 0.0231
EPOCH  23	Loss train 0.0189 | valid 0.0210
EPOCH  24	L

#### Evaluation

In [492]:
from IPython.display import display

with torch.no_grad():
    model.eval()
    test_loss = 0.0
    for d in dl_test:
        tmp_x_test, y_true = d
        y_pred = model(tmp_x_test)
        test_loss += loss_fn(y_pred, y_true)
        
        tmp_df = pd.DataFrame(
            {
                "y_true": y_true.squeeze().numpy(),
                "y_pred": y_pred.squeeze().numpy()
            },
        ).T
        tmp_df.columns = df_y.columns[1:]
        display(tmp_df)
        # print("------------------------------------------------------------------------------------")
        # print(f"{y_true = }")
        # print(f"{y_pred = }")
        # print()
print(f'Test Loss: {test_loss / len(dl_test):.4f}')
        

Unnamed: 0,Li,Na,Mg,K,Ca,Al,Si,Fe,others
y_true,0.0244,0.0068,0.0065,0.0106,0.0097,0.0,0.0,0.0,0.942
y_pred,0.027229,0.023428,0.02117,0.025226,0.014682,0.031997,0.052396,0.015188,0.788684


Unnamed: 0,Li,Na,Mg,K,Ca,Al,Si,Fe,others
y_true,0.0244,0.0068,0.0065,0.0106,0.0097,0.0,0.0,0.0,0.942
y_pred,0.027229,0.023427,0.02117,0.025226,0.014682,0.031997,0.052396,0.015189,0.788682


Unnamed: 0,Li,Na,Mg,K,Ca,Al,Si,Fe,others
y_true,0.0244,0.0068,0.0065,0.0106,0.0097,0.0,0.0,0.0,0.942
y_pred,0.027229,0.023429,0.021169,0.025225,0.014682,0.031995,0.052396,0.015187,0.788687


Test Loss: 0.0031
