In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle


In [38]:
# Read the air-pollution CSV into a DataFrame
file_path = "global air pollution dataset.csv"  # Replace with your actual CSV file path
data = pd.read_csv(filepath_or_buffer=file_path)
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category
0,Russian Federation,Praskoveya,51,Moderate,1,Good,36,Good,0,Good,51,Moderate
1,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,41,Good
2,Italy,Priolo Gargallo,66,Moderate,1,Good,39,Good,2,Good,66,Moderate
3,Poland,Przasnysz,34,Good,1,Good,34,Good,0,Good,20,Good
4,France,Punaauia,22,Good,0,Good,22,Good,0,Good,6,Good
...,...,...,...,...,...,...,...,...,...,...,...,...
23458,India,Gursahaiganj,184,Unhealthy,3,Good,154,Unhealthy,2,Good,184,Unhealthy
23459,France,Sceaux,50,Good,1,Good,20,Good,5,Good,50,Good
23460,India,Mormugao,50,Good,1,Good,22,Good,1,Good,50,Good
23461,United States of America,Westerville,71,Moderate,1,Good,44,Good,2,Good,71,Moderate


In [39]:
# Show the first five rows of the dataset as plain text
print(data.head(n=5))

              Country              City  AQI Value AQI Category  CO AQI Value  \
0  Russian Federation        Praskoveya         51     Moderate             1   
1              Brazil  Presidente Dutra         41         Good             1   
2               Italy   Priolo Gargallo         66     Moderate             1   
3              Poland         Przasnysz         34         Good             1   
4              France          Punaauia         22         Good             0   

  CO AQI Category  Ozone AQI Value Ozone AQI Category  NO2 AQI Value  \
0            Good               36               Good              0   
1            Good                5               Good              1   
2            Good               39               Good              2   
3            Good               34               Good              0   
4            Good               22               Good              0   

  NO2 AQI Category  PM2.5 AQI Value PM2.5 AQI Category  
0             Good     

In [40]:
# Inspect the dtype pandas assigned to each column right after loading.
raw_column_types = data.dtypes
print(raw_column_types)

Country               object
City                  object
AQI Value              int64
AQI Category          object
CO AQI Value           int64
CO AQI Category       object
Ozone AQI Value        int64
Ozone AQI Category    object
NO2 AQI Value          int64
NO2 AQI Category      object
PM2.5 AQI Value        int64
PM2.5 AQI Category    object
dtype: object


In [41]:
# Handle missing values column by column:
#   * numeric columns are filled with the column mean,
#   * every other column (object, string, categorical, ...) is filled with
#     its most frequent value (the mode).
# The numeric test uses pandas' dtype helper instead of comparing against
# 'object', so string/categorical dtypes are not mistakenly sent to `.mean()`.
for col in data.columns:
    # Nothing to fill: leave the column (and its dtype) exactly as it is.
    if not data[col].isna().any():
        continue

    if pd.api.types.is_numeric_dtype(data[col]):  # Numeric
        data[col] = data[col].fillna(data[col].mean())
    else:  # Categorical
        modes = data[col].mode()
        # mode() is empty when the column contains only missing values;
        # there is no value to fill with, so such a column is left untouched
        # rather than raising on `modes[0]`.
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])

In [42]:
# Columns holding text labels that must become integer codes.
categorical_columns = [
    'Country',
    'City',
    'AQI Category',
    'CO AQI Category',
    'Ozone AQI Category',
    'NO2 AQI Category',
    'PM2.5 AQI Category',
]

# Fit one encoder per column and keep it, so codes can be decoded later.
label_encoders = {name: LabelEncoder().fit(data[name]) for name in categorical_columns}

# Replace each text column with its integer codes.
for name, encoder in label_encoders.items():
    data[name] = encoder.transform(data[name])

In [43]:

# Re-check the column dtypes now that the categorical columns have been label-encoded.
encoded_column_types = data.dtypes
print(encoded_column_types)

Country               int64
City                  int64
AQI Value             int64
AQI Category          int64
CO AQI Value          int64
CO AQI Category       int64
Ozone AQI Value       int64
Ozone AQI Category    int64
NO2 AQI Value         int64
NO2 AQI Category      int64
PM2.5 AQI Value       int64
PM2.5 AQI Category    int64
dtype: object


In [44]:
# Input columns fed to the model: the four per-pollutant AQI values.
features = [
    'CO AQI Value',
    'Ozone AQI Value',
    'NO2 AQI Value',
    'PM2.5 AQI Value',
]
# Column the model is trained to predict.
target = 'AQI Category'

In [45]:
# Split the frame into the feature matrix (X) and the label column (y).
X, y = data[features], data[target]

In [46]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Learn the scaling statistics from the training split only,
# then apply the same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [48]:
# Fit a random forest on the scaled training data (fixed seed for repeatable results)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
# `fit` returns the estimator itself, so this displays the same fitted model.
model

In [49]:
# Predict the encoded AQI category for every row of the test split
y_pred = model.predict(X=X_test)

In [50]:
# Compare the test-split predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy}")
print(f"\nClassification Report:\n {report}")


Accuracy: 0.9985084167909652

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2014
           1       0.97      0.90      0.94        42
           2       1.00      1.00      1.00      1837
           3       1.00      1.00      1.00       419
           4       1.00      1.00      1.00       321
           5       0.94      0.97      0.95        60

    accuracy                           1.00      4693
   macro avg       0.98      0.98      0.98      4693
weighted avg       1.00      1.00      1.00      4693



In [51]:
# `model` is a classifier, so `score` returns mean accuracy (the fraction of
# correctly classified rows), not an R^2 value.
train_score = model.score(X_train, y_train)  # accuracy on the training set
test_score = model.score(X_test, y_test)     # accuracy on the testing set

# Print the scores; a large gap between the two would point to overfitting.
print(f"Training Set Score (Accuracy): {train_score:.2f}")
print(f"Testing Set Score (Accuracy): {test_score:.2f}")

Training Set Score (R^2): 1.00
Testing Set Score (R^2): 1.00


In [52]:
# Bundle the trained model, the fitted scaler and the label encoders,
# then write the bundle to a single pickle file.
artifacts = {"model": model, "scaler": scaler, "encoders": label_encoders}
with open("air_quality_model.pkl", "wb") as model_file:
    pickle.dump(artifacts, model_file)

print("Model saved as 'air_quality_model.pkl'.")

Model saved as 'air_quality_model.pkl'.


In [53]:
# Read the saved bundle back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open("air_quality_model.pkl", "rb") as model_file:
    saved_data = pickle.load(model_file)

# Pull the individual pieces out of the bundle once the file is closed.
saved_model = saved_data["model"]
saved_scaler = saved_data["scaler"]
saved_encoders = saved_data["encoders"]

In [62]:
def _read_int(prompt):
    """Prompt until the user types a valid whole number and return it as an int.

    A non-integer entry (for example an empty line or "12.5") prints a short
    message and asks again instead of aborting the cell with a ValueError.
    """
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            print(f"Invalid input {raw!r}: please enter a whole number.")


# Take input from the user
print("\nEnter input values for prediction:")
co_aqi = _read_int("CO AQI Value: ")
ozone_aqi = _read_int("Ozone AQI Value: ")
no2_aqi = _read_int("NO2 AQI Value: ")
pm25_aqi = _read_int("PM2.5 AQI Value: ")


Enter input values for prediction:


In [63]:
# Prepare the input data.
# The scaler was fitted on a DataFrame, so it remembers the training column
# names. Passing a DataFrame with the same names, in the same order as during
# training, makes the column mapping explicit and avoids scikit-learn's
# "X does not have valid feature names" warning that a bare list triggers.
input_data = pd.DataFrame(
    [[co_aqi, ozone_aqi, no2_aqi, pm25_aqi]],
    columns=['CO AQI Value', 'Ozone AQI Value', 'NO2 AQI Value', 'PM2.5 AQI Value'],
)
input_data_scaled = saved_scaler.transform(input_data)



In [64]:

# Predict the encoded AQI category for the scaled user input (one row in, one code out).
predicted_category_encoded = saved_model.predict(X=input_data_scaled)

In [65]:
# Map the predicted integer code back to its original AQI category label
aqi_category_encoder = saved_encoders['AQI Category']
first_prediction = predicted_category_encoded[0]
predicted_category_decoded = aqi_category_encoder.inverse_transform([first_prediction])
print("\nPredicted AQI Category (Decoded):", predicted_category_decoded[0])



Predicted AQI Category (Decoded): Moderate
