In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [12]:
# 1. Load the generated data
data = pd.read_csv("sensor_data/csv/air_quality_readings_errors.csv")

# Print the total number of samples for verification
print("Total number of samples: ", len(data))

# Print the number of samples for each category
print("Number of samples for each category:")
print(data['health_environment'].value_counts())

Total number of samples:  1333
Number of samples for each category:
health_environment
Fresh         464
Stinky        436
Noticeable    433
Name: count, dtype: int64


In [13]:
# 2. Preprocess the data
# Drop non-essential columns (sensor_id and timestamp)
data.drop(columns=['sensor_id', 'timestamp'], inplace=True)

# Check for missing values
print("Missing values in each column:")
missing_values = data.isnull().sum()
print(missing_values)

if missing_values.sum() > 0:
    missing_columns = missing_values[missing_values > 0].index.tolist()
    print("\nColumns with missing values:", missing_columns)
    data = data.dropna()  # Dropping rows with missing values
    print("\nDropped rows with missing values.")

Missing values in each column:
nh3                     0
h2s                     0
voc                     0
no2                   257
health_environment      0
dtype: int64

Columns with missing values: ['no2']

Dropped rows with missing values.


In [14]:
# 3. Encode the target variable (health_environment)
le = LabelEncoder()
data['health_environment_encoded'] = le.fit_transform(data['health_environment'])

# Print the encoded health environment column
print(data['health_environment_encoded'])

1       2
2       0
5       2
6       1
9       2
       ..
1326    1
1327    0
1328    0
1329    0
1330    0
Name: health_environment_encoded, Length: 1076, dtype: int64


In [15]:
#4. Define features and target
X = data[['nh3', 'h2s', 'voc', 'no2']]
y = data['health_environment_encoded']
print(X)
print(y)

       nh3     h2s     voc     no2
1     7.46  0.0078  544.98   52.74
2     4.52  0.0031  544.37  205.33
5     8.66  0.0051  188.49  131.40
6     7.80  0.0046  412.86   40.43
9     6.20  0.0092   59.76   25.55
...    ...     ...     ...     ...
1326  5.46  0.0009  300.95  272.01
1327  5.60  0.0054  192.04   85.86
1328  5.60  0.0054  192.04   85.86
1329  0.79  0.0009   64.94  281.22
1330  3.81  0.0006  585.85   80.15

[1076 rows x 4 columns]
1       2
2       0
5       2
6       1
9       2
       ..
1326    1
1327    0
1328    0
1329    0
1330    0
Name: health_environment_encoded, Length: 1076, dtype: int64


In [16]:
# 5. Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(X_scaled)

[[ 0.912402    0.98199382  1.32849486 -1.13644493]
 [-0.1266195  -0.66646238  1.32477761  0.61269935]
 [ 1.3364924   0.03500834 -0.84390403 -0.23476271]
 ...
 [ 0.25506187  0.14022895 -0.82227084 -0.75678926]
 [-1.44483384 -1.43808017 -1.5968      1.48262897]
 [-0.37753965 -1.54330078  1.57755077 -0.82224318]]


In [17]:
# 6. Split into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
print(X_train)
print(y_train)
print(X_test)
print(y_test)

Training set size: 860
Testing set size: 216
[[ 1.55913987  1.2625821   0.57468481  0.5153782 ]
 [ 0.75690218  0.24544956  0.97011516  1.50509651]
 [-1.35648167 -1.22763895  1.55171282  0.31569216]
 ...
 [-0.58251668  1.05214089  0.57803644 -1.36753968]
 [ 0.06422119 -0.24557994  0.02940654  1.21703048]
 [ 0.5130502  -0.84183006  1.15421056  1.25096106]]
403     0
1030    2
1149    0
809     1
1017    1
       ..
956     0
978     1
79      2
1172    0
316     0
Name: health_environment_encoded, Length: 860, dtype: int64
[[ 5.02447940e-01  1.19243503e+00  4.79559715e-01  5.10563725e-01]
 [-1.28933402e+00  8.06626136e-01 -8.39882085e-01  2.33387575e-01]
 [-7.83959624e-01 -1.40359336e-01  1.42937746e-02  9.32059467e-01]
 [ 1.29761745e+00 -6.51924461e-05  4.86994220e-01 -1.65789832e+00]
 [-1.44483384e+00  7.00818796e-02  5.78648483e-02 -5.61667335e-03]
 [-1.49784514e+00 -5.26168233e-01 -1.49247317e+00  8.10436682e-01]
 [-5.82516681e-01  1.05214089e+00  5.78036436e-01 -1.36753968e+00]
 [ 1

In [18]:
# 7. Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
print(clf)

clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# 8. Evaluate the model on test data
y_pred = clf.predict(X_test)
print(y_pred)
print("classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

[2 0 1 2 2 0 2 1 0 1 1 0 2 0 0 2 2 0 2 2 0 0 0 0 1 0 2 2 1 0 2 0 0 2 2 0 0
 1 0 2 0 2 1 1 1 0 2 2 1 0 2 0 1 2 0 0 2 0 1 2 2 2 0 1 2 1 1 1 1 0 1 2 2 2
 1 1 2 0 1 2 2 0 1 1 0 0 1 1 1 2 2 2 1 2 1 0 0 0 0 1 1 0 2 2 1 0 0 1 2 1 1
 1 0 1 1 0 2 0 1 0 1 1 0 0 0 0 2 1 0 0 2 2 0 0 0 2 2 0 0 0 0 1 1 1 0 1 0 2
 1 2 1 0 2 0 0 1 0 0 1 2 1 1 1 0 2 2 2 1 0 2 1 2 1 0 0 1 0 0 2 0 1 0 2 2 2
 2 1 0 2 2 1 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 2 1 2 2 0 1 0 0]
classification Report:
              precision    recall  f1-score   support

       Fresh       0.57      0.68      0.62        72
  Noticeable       0.64      0.61      0.62        71
      Stinky       0.70      0.60      0.65        73

    accuracy                           0.63       216
   macro avg       0.64      0.63      0.63       216
weighted avg       0.64      0.63      0.63       216



In [20]:
# New input as floats
new_data = np.array([[3.5, 0.004, 420.0, 180.0]])  # NH3, H2S, VOC, NO2

# Scale using the same scaler used during training
new_data_scaled = scaler.transform(new_data)

# Predict using the trained classifier
y_pred = clf.predict(new_data_scaled)

# Decode the predicted label back to the original category
predicted_category = le.inverse_transform(y_pred)

# Print the result
print("Based on the given air parameters the quality is:", predicted_category[0])

Based on the given air parameters the quality is: Noticeable
