In [7]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [8]:
df_control = pd.read_csv('data_sensor_01.csv')
df_exp = pd.read_csv('data_sensor_02.csv')

In [9]:
df_control.describe()

Unnamed: 0,Temperature (°C),Humidity (%),Pressure (kPa),Gas Resistance (KOhm)
count,31599.0,31599.0,31599.0,31599.0
mean,19.362706,65.757044,100.906309,151.508433
std,1.266416,3.148382,0.99618,12.209348
min,18.37,49.99,74.38,0.0
25%,18.8,64.52,100.86,149.13
50%,19.12,66.11,100.94,152.96
75%,19.68,67.48,101.04,156.69
max,33.03,100.0,101.13,177.9


In [10]:
df_exp.describe()

Unnamed: 0,Timestamp,Temperature (°C),Humidity (%),Pressure (kPa),Gas Resistance (KOhm)
count,6496,6496.0,6496.0,6496.0,6496.0
unique,6496,413.0,1345.0,94.0,1147.0
top,2025-11-14 18:21:41,18.76,50.28,100.83,36.28
freq,1,91.0,97.0,220.0,43.0


In [11]:
df_control = df_control[::5]
df_control.describe()

Unnamed: 0,Temperature (°C),Humidity (%),Pressure (kPa),Gas Resistance (KOhm)
count,6320.0,6320.0,6320.0,6320.0
mean,19.362345,65.750693,100.909728,151.51137
std,1.261369,3.129675,0.950456,12.149382
min,18.38,50.0,74.38,0.0
25%,18.8,64.51,100.86,149.13
50%,19.12,66.11,100.94,152.96
75%,19.68,67.47,101.04,156.69
max,33.03,100.0,101.13,176.03


In [12]:
# Independent two-sample t-test on 'gas resistance' between control and experimental groups
import scipy.stats as stats
# Extract numeric series and drop missing values
control = pd.to_numeric(df_control['Gas Resistance (KOhm)'], errors='coerce').dropna()
exp = pd.to_numeric(df_exp['Gas Resistance (KOhm)'], errors='coerce').dropna()
# Basic summary
n1 = control.size
n2 = exp.size
m1 = control.mean()
m2 = exp.mean()
s1 = control.std(ddof=1)
s2 = exp.std(ddof=1)
print(f"Control: n={n1}, mean={m1:.4f}, sd={s1:.4f}")
print(f"Experiment: n={n2}, mean={m2:.4f}, sd={s2:.4f}")
# Levene's test for equal variances
lev_stat, lev_p = stats.levene(control, exp)
equal_var = lev_p > 0.05
print(f"Levene test: stat={lev_stat:.4f}, p={lev_p:.4f} -> equal_var={equal_var}")
# Two-sample t-test (uses equal_var depending on Levene)
t_stat, p_val = stats.ttest_ind(control, exp, equal_var=equal_var)
print(f"t-statistic={t_stat:.4f}, p-value={p_val:.4f}")
# 95% CI for the difference in means (Welch approximation)
diff = m1 - m2
se = (s1**2 / n1 + s2**2 / n2) ** 0.5
# Welch-Satterthwaite degrees of freedom
num = (s1**2 / n1 + s2**2 / n2)**2
den = (s1**4) / (n1**2 * (n1 - 1)) if n1 > 1 else 0
den += (s2**4) / (n2**2 * (n2 - 1)) if n2 > 1 else 0
df_welch = num / den if den != 0 else min(n1, n2) - 1
from math import isnan
if se == 0 or isnan(se):
    print("Cannot compute CI (zero or invalid standard error).")
else:
    t_crit = stats.t.ppf(1 - 0.025, df=df_welch)
    ci_low = diff - t_crit * se
    ci_high = diff + t_crit * se
    print(f"Difference in means (control - experiment) = {diff:.4f}")
    print(f"95% CI = [{ci_low:.4f}, {ci_high:.4f}] (df~{df_welch:.2f})")

Control: n=6320, mean=151.5114, sd=12.1494
Experiment: n=6495, mean=48.4817, sd=21.1052
Levene test: stat=2823.3689, p=0.0000 -> equal_var=False
t-statistic=339.7970, p-value=0.0000
Difference in means (control - experiment) = 103.0297
95% CI = [102.4353, 103.6240] (df~10427.47)


## Naive Bayes Classifier

Train a Gaussian Naive Bayes classifier to predict whether a gas resistance reading comes from the control or experimental group.

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Prepare the data
# Control group: label = 0
# Experimental group: label = 1
X_control = control.values.reshape(-1, 1)
y_control = np.zeros(len(control))

X_exp = exp.values.reshape(-1, 1)
y_exp = np.ones(len(exp))

# Combine the data
X = np.vstack([X_control, X_exp])
y = np.concatenate([y_control, y_exp])

# Split into train and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Make predictions
y_pred = gnb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Control', 'Experimental']))
print(f"Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9910 (99.10%)

Classification Report:
              precision    recall  f1-score   support

     Control       1.00      0.98      0.99      1264
Experimental       0.98      1.00      0.99      1299

    accuracy                           0.99      2563
   macro avg       0.99      0.99      0.99      2563
weighted avg       0.99      0.99      0.99      2563

Confusion Matrix:
[[1241   23]
 [   0 1299]]
