In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
alcohol_1 = "/Users/derre/Documents/workspace/smell_sensor/alcohol.5ba5479f2e29.csv"
alcohol_2 = "/Users/derre/Documents/workspace/smell_sensor/alcohol.8a6c0767c2d0.csv"

coffee_1 = "/Users/derre/Documents/workspace/smell_sensor/coffee_beans.19e5779bc861.csv"
coffee_2 = "/Users/derre/Documents/workspace/smell_sensor/coffee_beans.cff4c1517704.csv"

In [4]:
alcohol_df1 = pd.read_csv(alcohol_1)
alcohol_df2 = pd.read_csv(alcohol_2)

coffee_df1 = pd.read_csv(coffee_1)
coffee_df2 = pd.read_csv(coffee_2)

In [5]:
alcohol_df1.drop(columns="timestamp", inplace=True)
alcohol_df2.drop(columns="timestamp", inplace=True)

coffee_df1.drop(columns="timestamp", inplace=True)
coffee_df2.drop(columns="timestamp", inplace=True)

In [6]:
def create_state_average_df(df):
    df['Group'] = (df['State'] != df['State'].shift()).cumsum()

    averaged_df = df.groupby('Group').mean().reset_index()
    
    averaged_df['State'] = df.groupby('Group')['State'].first().values
    
    averaged_df = averaged_df.drop(columns=['Group'])

    averaged_df = averaged_df[averaged_df["State"] < 2]
    averaged_df.reset_index(drop=True)
    return averaged_df

In [7]:
avg_alcohol_df1 = create_state_average_df(alcohol_df1)
avg_alcohol_df2 = create_state_average_df(alcohol_df2)

avg_coffee_df1 = create_state_average_df(coffee_df1)
avg_coffee_df2 = create_state_average_df(coffee_df2)

In [8]:
def calculate_state_difference(df):
    if len(df) % 2 != 0:
        df = df[:-1]

    odd_rows = df.iloc[1::2].reset_index(drop=True)
    even_rows = df.iloc[0::2].reset_index(drop=True)

    result = odd_rows - even_rows
    return result
    

In [9]:
diff_alcohol_df1 = calculate_state_difference(avg_alcohol_df1)
diff_alcohol_df2 = calculate_state_difference(avg_alcohol_df2)

diff_coffee_df1 = calculate_state_difference(avg_coffee_df1)
diff_coffee_df2 = calculate_state_difference(avg_coffee_df2)

In [9]:
def plot_difference(df1, df2, variable):
    df1_list = df1[variable].tolist()
    df2_list = df2[variable].tolist()

    max_length = max(len(df1_list), len(df2_list))
    df1_list_padded = np.pad(df1_list, (0, max_length - len(df1_list)), constant_values=np.nan)
    df2_list_padded = np.pad(df2_list, (0, max_length - len(df2_list)), constant_values=np.nan)
    
    # Plot
    plt.plot(df1_list_padded, label='df1 list (padded)')
    plt.plot(df2_list_padded, label='df2 list (padded)')
    plt.legend()
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Padded Plot')
    plt.show()

In [10]:
# 0 means alcohol
# 1 means coffee
diff_alcohol_df1["label"] = 0
diff_alcohol_df2["label"] = 0
diff_coffee_df1["label"] = 1
diff_coffee_df2["label"] = 1

In [11]:
combined_df = pd.concat([diff_alcohol_df1, diff_alcohol_df2, diff_coffee_df1, diff_coffee_df2], axis=0, ignore_index=True)

In [12]:
X = combined_df.drop('label', axis=1).values  # Features
y = combined_df['label'].values  # Labels

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Apply PCA
pca = PCA(n_components=9)  # Choose the number of components
X_pca = pca.fit_transform(X_scaled)

In [15]:
# Assume `X_pca` contains PCA-transformed features and `y` contains labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

In [16]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize features
    ('pca', PCA(n_components=9)),  # Step 2: Apply PCA (adjust n_components as needed)
    ('classifier', RandomForestClassifier(random_state=42))  # Step 3: Train classifier
])

pipeline.fit(X_train, y_train)

joblib.dump(pipeline, 'pca_model_pipeline.pkl')

print("Pipeline saved successfully!")

Pipeline saved successfully!


In [16]:
# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Confusion Matrix:
 [[7 0]
 [0 3]]


In [17]:
combined_df

Unnamed: 0,NO2,C2H50H,VOC,CO,Alcohol,LPG,Benzene,Temperature,Pressure,Humidity,Gas_Resistance,Altitude,State,label
0,0.182429,21.003101,25.555297,-66.417313,0.0,-10.80336,0.0,0.025328,-0.029243,-0.05708,-29.786217,0.234395,-1,0
1,-12.37229,-26.473552,-15.015705,-88.997688,0.0,-548.353,0.0,0.03238,-0.026236,0.020205,13.249803,0.227667,-1,0
2,-20.226739,-73.033374,-55.937905,-114.438815,0.0,-290.6618,0.0,0.00251,-0.041561,0.028256,69.030967,0.351515,-1,0
3,-13.815437,-31.212039,-35.38466,-95.123398,0.0,-398.7083,0.0,0.014082,-0.011274,0.021436,-13.834609,0.10197,-1,0
4,-4.939583,-11.745833,-14.416667,-66.779167,0.0,-394.4521,0.0,0.006375,-0.044813,0.01425,37.904146,0.370896,-1,0
5,-5.455289,-27.201636,-15.362653,-76.680304,0.0,-593.8448,0.0,0.006271,-0.025941,0.078101,-4.00742,0.217101,-1,0
6,-5.443416,-26.197779,-20.50661,-78.217874,0.0,-682.7916,0.0,0.008226,-0.00991,0.004104,32.730849,0.086729,-1,0
7,-11.36336,-47.171231,-42.233922,-104.510651,0.0,-562.6608,0.0,0.005472,-0.028939,-0.018326,4.143208,0.236752,-1,0
8,-10.475712,-15.595802,-15.606147,-82.061469,0.344528,267.0007,0.0,-0.05204,0.037346,0.002717,14.768546,-0.294123,-1,0
9,-12.975683,-47.259016,-41.267486,-61.840164,0.004098,-5626.919,0.0,-0.023355,0.04026,0.028842,-16.101462,-0.336216,-1,0


In [None]:
def create_state_average_df(df):
    df['Group'] = (df['State'] != df['State'].shift()).cumsum()

    averaged_df = df.groupby('Group').mean().reset_index()
    
    averaged_df['State'] = df.groupby('Group')['State'].first().values
    
    averaged_df = averaged_df.drop(columns=['Group'])

    averaged_df = averaged_df[averaged_df["State"] < 2]
    averaged_df.reset_index(drop=True)
    return averaged_df


def calculate_state_difference(df):
    if len(df) % 2 != 0:
        df = df[:-1]

    odd_rows = df.iloc[1::2].reset_index(drop=True)
    even_rows = df.iloc[0::2].reset_index(drop=True)

    result = odd_rows - even_rows
    return result


substance_path = "" # csv path

substance_df = pd.read_csv(substance_path)

substance_df.drop(columns="timestamp", inplace=True)

avg_substance_df = create_state_average_df(substance_df)

diff_substance_df = calculate_state_difference(avg_substance_df)

substance_values = diff_substance_df.values
