In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [21]:
# 1. Load the generated data
data = pd.read_csv("sensor_data/csv/air_quality_readings_errors.csv")

# Print the total number of samples for verification
print("Total number of samples: ", len(data))

# Print the number of samples for each category
print("Number of samples for each category:")
print(data['health_environment'].value_counts())

Total number of samples:  10666
Number of samples for each category:
health_environment
noticeable    4582
stinky        4152
fresh         1932
Name: count, dtype: int64


In [22]:
# 2. Preprocess the data
# Drop non-essential columns (sensor_id and timestamp)
data.drop(columns=['sensor_id', 'timestamp'], inplace=True)

# Check for missing values
print("Missing values in each column:")
missing_values = data.isnull().sum()
print(missing_values)

if missing_values.sum() > 0:
    missing_columns = missing_values[missing_values > 0].index.tolist()
    print("\nColumns with missing values:", missing_columns)
    data = data.dropna()  # Dropping rows with missing values
    print("\nDropped rows with missing values.")

Missing values in each column:
nh3                      0
h2s                      0
voc                      0
no2                   2085
health_environment       0
dtype: int64

Columns with missing values: ['no2']

Dropped rows with missing values.


In [23]:
# 3. Encode the target variable (health_environment)
le = LabelEncoder()
data['health_environment_encoded'] = le.fit_transform(data['health_environment'])

# Print the encoded health environment column
print(data['health_environment_encoded'].head())

1    1
2    2
3    2
4    2
5    0
Name: health_environment_encoded, dtype: int64


In [24]:
#4. Define features and target
X = data[['nh3', 'h2s', 'voc', 'no2']]
y = data['health_environment_encoded']
print(X)
print(y)

        nh3     h2s     voc     no2
1      0.87  0.0073  227.45  193.68
2      6.63  0.0076  396.80  241.98
3      9.78  0.0088  195.47  275.68
4      9.78  0.0088  195.47  275.68
5      0.35  0.0044  252.93   12.86
...     ...     ...     ...     ...
10660  5.86  0.0068  427.69  166.22
10661  1.68  0.0094  188.34  192.17
10662  4.01  0.0036  147.31  153.70
10663  3.74  0.0015   81.54   16.07
10664  3.74  0.0015   81.54   16.07

[8581 rows x 4 columns]
1        1
2        2
3        2
4        2
5        0
        ..
10660    2
10661    1
10662    0
10663    0
10664    0
Name: health_environment_encoded, Length: 8581, dtype: int64


In [25]:
# 5. Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(X_scaled)

[[-1.43002474  0.81382835 -0.62105576  0.51096602]
 [ 0.57351296  0.91842651  0.43896886  1.06896798]
 [ 1.66919763  1.33681914 -0.82123047  1.45829854]
 ...
 [-0.33781842 -0.47621559 -1.12268182  0.04908365]
 [-0.43173425 -1.20840269 -1.53436071 -1.54093311]
 [-0.43173425 -1.20840269 -1.53436071 -1.54093311]]


In [26]:
# 6. Split into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
print(X_train)
print(y_train)
print(X_test)
print(y_test)

Training set size: 6864
Testing set size: 1717
[[ 1.5752818  -0.33675138 -0.87042914 -1.66154472]
 [ 0.18741455 -0.37161743 -0.69072257  0.12186652]
 [-0.44912607  0.11650731  0.43721623  1.46719422]
 ...
 [ 1.0048301  -1.4873311   0.93083029 -0.22425644]
 [ 0.51438077 -1.59192926  0.58268403  0.05058552]
 [-1.54828912  1.23222098  0.46482006 -0.74228559]]
5987    1
161     1
2422    1
8625    2
8353    0
       ..
7413    1
9383    1
9479    2
2322    1
4345    1
Name: health_environment_encoded, Length: 6864, dtype: int64
[[-1.19697435  1.16248888 -0.91925224 -1.45278498]
 [-0.56739045  0.29083757  0.00763554  0.88088782]
 [-0.64739283  1.26708703 -1.22708814 -0.5145792 ]
 ...
 [ 0.23263328 -1.38273295  1.30007064 -0.70104197]
 [-0.9882725  -1.66166137  1.36892373  0.83664045]
 [ 1.43266888  1.47628335  0.28436237 -1.56577171]]
7450    1
1002    1
4601    1
9962    1
1733    1
       ..
1410    2
4411    0
9518    2
7194    1
3742    2
Name: health_environment_encoded, Length: 1717, 

In [27]:
# 7. Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42)
print(clf)

clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)


In [28]:
# 8. Evaluate the model on test data
y_pred = clf.predict(X_test)
print(y_pred)
print("classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

[1 1 1 ... 2 1 2]
classification Report:
              precision    recall  f1-score   support

       fresh       1.00      1.00      1.00       309
  noticeable       1.00      1.00      1.00       740
      stinky       1.00      1.00      1.00       668

    accuracy                           1.00      1717
   macro avg       1.00      1.00      1.00      1717
weighted avg       1.00      1.00      1.00      1717



In [29]:
# New input as floats
new_data = np.array([[3.5, 0.004, 420.0, 180.0]])  # NH3, H2S, VOC, NO2

# Scale using the same scaler used during training
new_data_scaled = scaler.transform(new_data)

# Predict using the trained classifier
y_pred = clf.predict(new_data_scaled)

# Decode the predicted label back to the original category
predicted_category = le.inverse_transform(y_pred)

# Print the result
print(predicted_category[0])

fresh
