In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [43]:
# 1. Load the generated data
data = pd.read_csv("sensor_data/csv/air_quality_readings_errors.csv")

# Print the total number of samples for verification
print("Total number of samples: ", len(data))

# Print the number of samples for each category
print("Number of samples for each category:")
print(data['health_environment'].value_counts())

Total number of samples:  1333
Number of samples for each category:
health_environment
noticeable    603
stinky        483
fresh         247
Name: count, dtype: int64


In [44]:
# 2. Preprocess the data
# Drop non-essential columns (sensor_id and timestamp)
data.drop(columns=['sensor_id', 'timestamp'], inplace=True)

# Check for missing values
print("Missing values in each column:")
missing_values = data.isnull().sum()
print(missing_values)

if missing_values.sum() > 0:
    missing_columns = missing_values[missing_values > 0].index.tolist()
    print("\nColumns with missing values:", missing_columns)
    data = data.dropna()  # Dropping rows with missing values
    print("\nDropped rows with missing values.")

Missing values in each column:
nh3                     0
h2s                     0
voc                     0
no2                   248
health_environment      0
dtype: int64

Columns with missing values: ['no2']

Dropped rows with missing values.


In [45]:
# 3. Encode the target variable (health_environment)
le = LabelEncoder()
data['health_environment_encoded'] = le.fit_transform(data['health_environment'])

# Print the encoded health environment column
print(data['health_environment_encoded'].head())

0    1
1    0
2    2
5    2
6    1
Name: health_environment_encoded, dtype: int64


In [46]:
#4. Define features and target
X = data[['nh3', 'h2s', 'voc', 'no2']]
y = data['health_environment_encoded']
print(X)
print(y)

       nh3     h2s     voc     no2
0     2.68  0.0058  198.11  243.08
1     2.98  0.0030   54.63  273.27
2     6.02  0.0051  435.33  172.12
5     9.21  0.0055  356.79  176.04
6     8.96  0.0017  164.31  298.60
...    ...     ...     ...     ...
1328  0.85  0.0058  481.89  133.91
1329  6.45  0.0018  493.29  265.38
1330  1.49  0.0100  459.53  125.20
1331  5.19  0.0045  373.01  233.22
1332  5.19  0.0045  373.01  233.22

[1085 rows x 4 columns]
0       1
1       0
2       2
5       2
6       1
       ..
1328    2
1329    2
1330    2
1331    1
1332    1
Name: health_environment_encoded, Length: 1085, dtype: int64


In [47]:
# 5. Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(X_scaled)

[[-0.76101445  0.31360281 -0.79140874  1.03578058]
 [-0.6556816  -0.66495649 -1.69465606  1.38208764]
 [ 0.41169125  0.06896298  0.7019585   0.22180413]
 ...
 [-1.17883474  1.78144175  0.85430437 -0.31641141]
 [ 0.12027037 -0.14072829  0.30963641  0.92267732]
 [ 0.12027037 -0.14072829  0.30963641  0.92267732]]


In [48]:
# 6. Split into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
print(X_train)
print(y_train)
print(X_test)
print(y_test)

Training set size: 868
Testing set size: 217
[[-7.22392404e-01  8.72779550e-01 -1.51133740e+00  2.56514077e-02]
 [-8.97947149e-01  5.93191179e-01 -3.17624630e-03 -7.15943541e-01]
 [-4.06393863e-01  8.02882457e-01  7.86944830e-01 -7.18696562e-01]
 ...
 [ 4.22224532e-01  1.38860077e-01  7.49613797e-01  5.25439372e-01]
 [ 2.89819040e-02 -1.01444195e+00 -1.11385318e+00  4.43020657e-03]
 [ 8.40044825e-01 -9.34108612e-04  7.47095683e-01 -1.70616026e-01]]
1271    1
1076    1
1155    1
867     1
266     1
       ..
1089    1
611     2
397     2
901     0
445     1
Name: health_environment_encoded, Length: 868, dtype: int64
[[ 7.34711978e-01  1.74649321e+00 -5.65596906e-01  1.41966532e-01]
 [ 1.04017723e+00 -1.15423614e+00  5.66421037e-01 -1.09138674e+00]
 [-1.10861284e+00 -1.08433905e+00  7.73095206e-01 -1.42633759e+00]
 [ 5.69690518e-01  2.08757169e-01 -4.01352950e-01  8.63372662e-01]
 [-9.15502623e-01  8.02882457e-01  9.01959666e-01  1.54795714e+00]
 [-9.92746711e-01 -1.60856724e+00 -6.50646

In [49]:
# 7. Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
print(clf)

clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [50]:
# 8. Evaluate the model on test data
y_pred = clf.predict(X_test)
print(y_pred)
print("classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

[2 1 0 2 2 0 2 2 2 1 1 2 0 1 1 0 1 2 2 1 0 0 2 1 0 0 2 0 1 2 1 2 2 2 0 0 0
 2 1 2 1 1 2 0 2 0 2 1 0 1 1 2 1 1 1 0 1 1 1 2 0 2 2 1 1 1 2 1 2 1 2 2 1 2
 2 0 1 2 2 1 1 1 2 0 0 2 1 1 2 2 0 2 2 2 2 2 2 1 1 1 0 1 2 2 1 0 2 1 1 1 1
 2 1 1 1 0 1 1 1 2 1 0 2 0 1 2 1 1 1 1 2 0 2 1 1 1 1 2 1 1 0 2 0 1 1 1 1 1
 1 1 2 2 2 2 1 0 1 0 1 1 2 2 1 1 0 2 1 1 1 1 2 1 2 1 0 2 2 2 0 2 1 1 1 2 2
 0 2 2 1 1 0 1 0 0 0 0 0 1 1 1 2 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 0]
classification Report:
              precision    recall  f1-score   support

       fresh       1.00      1.00      1.00        42
  noticeable       1.00      1.00      1.00        98
      stinky       1.00      1.00      1.00        77

    accuracy                           1.00       217
   macro avg       1.00      1.00      1.00       217
weighted avg       1.00      1.00      1.00       217



In [51]:
# New input as floats
new_data = np.array([[3.5, 0.004, 420.0, 180.0]])  # NH3, H2S, VOC, NO2

# Scale using the same scaler used during training
new_data_scaled = scaler.transform(new_data)

# Predict using the trained classifier
y_pred = clf.predict(new_data_scaled)

# Decode the predicted label back to the original category
predicted_category = le.inverse_transform(y_pred)

# Print the result
print("Based on the given air parameters the quality is:", predicted_category[0])

Based on the given air parameters the quality is: fresh
