In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# ingredient_mappings = {
#     "ambient": 0,
#     "alcohol": 1,
#     "apple_juice": 2,
#     "balsamic_vinegar": 3,
#     "basil": 4,
#     "black_pepper": 5,
#     "cayenne": 6,
#     "chili_powder": 7,
#     "cinnamon": 8,
#     "cloves": 9,
#     "coffee_beans": 10,
#     "cumin": 11,
#     "garlic_powder": 12,
#     "matcha": 13,
#     "mediterranean_blend": 14,
#     "mint_leaves": 15,
#     "nutmeg": 16,
#     "olive_oil": 17,
#     "onion_powder": 18,
#     "oregano": 19,
#     "paprika": 20,
#     "rosemary": 21,
#     "soybean_oil": 22,
#     "vanilla_extract": 23
# }

ingredient_mappings = {
    "ambient": 0,
    "alcohol": 1,
    "coffee_beans": 2,
}

In [3]:
import os

directory = "/Users/derre/Documents/workspace/smell_sensor/smell_data_switch"

paths = []

for file in os.listdir(directory):
    file_path = os.path.join(directory, file)
    if os.path.isfile(file_path) and ("ambient" in file_path or "alcohol" in file_path or "coffee_beans" in file_path):  # Check if it's a file
        paths.append(os.path.join(directory, file))

In [4]:
def create_state_average_df(df):
    df['Group'] = (df['State'] != df['State'].shift()).cumsum()

    averaged_df = df.groupby('Group').mean().reset_index()
    
    averaged_df['State'] = df.groupby('Group')['State'].first().values
    
    averaged_df = averaged_df.drop(columns=['Group'])

    averaged_df = averaged_df[averaged_df["State"] < 2]
    averaged_df.reset_index(drop=True)
    return averaged_df

In [5]:
def calculate_state_difference(df):
    if len(df) % 2 != 0:
        df = df[:-1]

    odd_rows = df.iloc[1::2].reset_index(drop=True)
    even_rows = df.iloc[0::2].reset_index(drop=True)

    result = odd_rows - even_rows
    return result

In [6]:
from collections import defaultdict
import re

ingredient_df = []

for path in paths:
    ingredient_name = re.split(r'[./]', path)[-3]
        
    dataframe = pd.read_csv(path)
    dataframe.drop(columns="timestamp", inplace=True)
    avg_ingredient_df = create_state_average_df(dataframe)
    diff_ingredient_df = calculate_state_difference(avg_ingredient_df)
    diff_ingredient_df["label"] = ingredient_mappings[ingredient_name]
    ingredient_df.append(diff_ingredient_df)

In [7]:
print(ingredient_df)

[        NO2    C2H50H       VOC        CO  Alcohol       LPG  Benzene  \
0 -0.576271 -0.574529 -0.647077 -1.015999      0.0 -0.473943      0.0   
1  0.239407  0.170763  0.394068 -0.444068      0.0  0.172034      0.0   
2  1.050847  0.644068  0.966102  0.440678      0.0 -0.118644      0.0   
3  0.079825 -0.031165 -0.030891 -0.812739      0.0  0.097594      0.0   
4 -0.158470 -0.033607 -0.233060  1.226503      0.0 -0.022951      0.0   
5 -0.247849 -0.269355 -0.438710 -1.971505      0.0 -1.090860      0.0   
6 -0.201244 -0.088468 -0.302996  1.271340      0.0  1.309497      0.0   
7  1.299435  0.670056  0.991243  0.659887      0.0 -0.712571      0.0   
8  0.069186  0.073076 -0.055571 -0.051959      0.0  0.019172      0.0   
9 -0.472359 -0.212917 -0.209086 -0.292556      0.0 -0.521346      0.0   

   Temperature  Pressure  Humidity  Gas_Resistance  Altitude  State  label  
0    -0.032275  0.048918 -0.096794       12.270976 -0.410477      1      0  
1    -0.013750 -0.003263  0.083534       

In [8]:
def plot_difference(df1, df2, variable):
    df1_list = df1[variable].tolist()
    df2_list = df2[variable].tolist()

    max_length = max(len(df1_list), len(df2_list))
    df1_list_padded = np.pad(df1_list, (0, max_length - len(df1_list)), constant_values=np.nan)
    df2_list_padded = np.pad(df2_list, (0, max_length - len(df2_list)), constant_values=np.nan)
    
    # Plot
    plt.plot(df1_list_padded, label='df1 list (padded)')
    plt.plot(df2_list_padded, label='df2 list (padded)')
    plt.legend()
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Padded Plot')
    plt.show()

In [9]:
combined_df = pd.concat(ingredient_df, axis=0, ignore_index=True)

In [10]:
print(combined_df)

           NO2     C2H50H        VOC          CO  Alcohol         LPG  \
0    -0.576271  -0.574529  -0.647077   -1.015999      0.0   -0.473943   
1     0.239407   0.170763   0.394068   -0.444068      0.0    0.172034   
2     1.050847   0.644068   0.966102    0.440678      0.0   -0.118644   
3     0.079825  -0.031165  -0.030891   -0.812739      0.0    0.097594   
4    -0.158470  -0.033607  -0.233060    1.226503      0.0   -0.022951   
..         ...        ...        ...         ...      ...         ...   
111 -13.815437 -31.212039 -35.384660  -95.123398      0.0 -398.708350   
112  -4.939583 -11.745833 -14.416667  -66.779167      0.0 -394.452083   
113  -5.455289 -27.201636 -15.362653  -76.680304      0.0 -593.844828   
114  -5.443416 -26.197779 -20.506610  -78.217874      0.0 -682.791645   
115 -11.363360 -47.171231 -42.233922 -104.510651      0.0 -562.660783   

     Benzene  Temperature  Pressure  Humidity  Gas_Resistance  Altitude  \
0        0.0    -0.032275  0.048918 -0.096794   

In [11]:
X = combined_df.drop('label', axis=1).values  # Features
y = combined_df['label'].values  # Labels

In [12]:
from collections import Counter

print(Counter(y))

Counter({0: 48, 1: 39, 2: 29})


In [13]:
print(X.shape)

(116, 13)


In [14]:
print(y.shape)

(116,)


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X).astype(np.float32)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y, dtype=torch.long)

# Split data into training (80%) and testing (20%)
train_size = int(0.8 * len(X_tensor))
test_size = len(X_tensor) - train_size
train_dataset, test_dataset = random_split(TensorDataset(X_tensor, y_tensor), [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define the neural network
class ClassifierNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(ClassifierNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)  # No softmax here because CrossEntropyLoss applies it automatically
        return x

# Initialize model, loss, and optimizer
model = ClassifierNN(input_size=13, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluation
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        y_true.extend(batch_y.numpy())
        y_pred.extend(predicted.numpy())

# Compute accuracy and classification report
accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_true, y_pred))


Epoch [10/100], Loss: 0.0406
Epoch [20/100], Loss: 0.0008
Epoch [30/100], Loss: 0.1936
Epoch [40/100], Loss: 0.0002
Epoch [50/100], Loss: 0.0001
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Test Accuracy: 0.8333
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.86      0.75         7
           1       1.00      1.00      1.00        10
           2       0.80      0.57      0.67         7

    accuracy                           0.83        24
   macro avg       0.82      0.81      0.81        24
weighted avg       0.84      0.83      0.83        24



In [16]:
print(y_true, y_pred)

[0, 2, 1, 0, 2, 1, 1, 0, 2, 1, 1, 2, 0, 2, 1, 1, 1, 2, 2, 1, 0, 0, 0, 1] [0, 2, 1, 0, 2, 1, 1, 0, 0, 1, 1, 2, 0, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 1]


In [17]:
torch.save(model.state_dict(), "model.pth")
print("Model saved successfully!")

Model saved successfully!


In [22]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Apply PCA
pca = PCA(n_components=9)  # Choose the number of components
X_pca = pca.fit_transform(X_scaled)

In [15]:
# Assume `X_pca` contains PCA-transformed features and `y` contains labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

In [16]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize features
    ('pca', PCA(n_components=9)),  # Step 2: Apply PCA (adjust n_components as needed)
    ('classifier', RandomForestClassifier(random_state=42))  # Step 3: Train classifier
])

pipeline.fit(X_train, y_train)

joblib.dump(pipeline, 'pca_model_pipeline.pkl')

print("Pipeline saved successfully!")

Pipeline saved successfully!


In [16]:
# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Confusion Matrix:
 [[7 0]
 [0 3]]


In [17]:
combined_df

Unnamed: 0,NO2,C2H50H,VOC,CO,Alcohol,LPG,Benzene,Temperature,Pressure,Humidity,Gas_Resistance,Altitude,State,label
0,0.182429,21.003101,25.555297,-66.417313,0.0,-10.80336,0.0,0.025328,-0.029243,-0.05708,-29.786217,0.234395,-1,0
1,-12.37229,-26.473552,-15.015705,-88.997688,0.0,-548.353,0.0,0.03238,-0.026236,0.020205,13.249803,0.227667,-1,0
2,-20.226739,-73.033374,-55.937905,-114.438815,0.0,-290.6618,0.0,0.00251,-0.041561,0.028256,69.030967,0.351515,-1,0
3,-13.815437,-31.212039,-35.38466,-95.123398,0.0,-398.7083,0.0,0.014082,-0.011274,0.021436,-13.834609,0.10197,-1,0
4,-4.939583,-11.745833,-14.416667,-66.779167,0.0,-394.4521,0.0,0.006375,-0.044813,0.01425,37.904146,0.370896,-1,0
5,-5.455289,-27.201636,-15.362653,-76.680304,0.0,-593.8448,0.0,0.006271,-0.025941,0.078101,-4.00742,0.217101,-1,0
6,-5.443416,-26.197779,-20.50661,-78.217874,0.0,-682.7916,0.0,0.008226,-0.00991,0.004104,32.730849,0.086729,-1,0
7,-11.36336,-47.171231,-42.233922,-104.510651,0.0,-562.6608,0.0,0.005472,-0.028939,-0.018326,4.143208,0.236752,-1,0
8,-10.475712,-15.595802,-15.606147,-82.061469,0.344528,267.0007,0.0,-0.05204,0.037346,0.002717,14.768546,-0.294123,-1,0
9,-12.975683,-47.259016,-41.267486,-61.840164,0.004098,-5626.919,0.0,-0.023355,0.04026,0.028842,-16.101462,-0.336216,-1,0


In [None]:
def create_state_average_df(df):
    df['Group'] = (df['State'] != df['State'].shift()).cumsum()

    averaged_df = df.groupby('Group').mean().reset_index()
    
    averaged_df['State'] = df.groupby('Group')['State'].first().values
    
    averaged_df = averaged_df.drop(columns=['Group'])

    averaged_df = averaged_df[averaged_df["State"] < 2]
    averaged_df.reset_index(drop=True)
    return averaged_df


def calculate_state_difference(df):
    if len(df) % 2 != 0:
        df = df[:-1]

    odd_rows = df.iloc[1::2].reset_index(drop=True)
    even_rows = df.iloc[0::2].reset_index(drop=True)

    result = odd_rows - even_rows
    return result


substance_path = "" # csv path

substance_df = pd.read_csv(substance_path)

substance_df.drop(columns="timestamp", inplace=True)

avg_substance_df = create_state_average_df(substance_df)

diff_substance_df = calculate_state_difference(avg_substance_df)

substance_values = diff_substance_df.values
