# New Section

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [4]:
df = pd.read_csv("/content/city_day.csv")
print(df.head())
print(df.tail())

        City      Date  PM2.5  PM10     NO    NO2    NOx  NH3     CO    SO2  \
0  Ahmedabad  1/1/2015    NaN   NaN   0.92  18.22  17.15  NaN   0.92  27.64   
1  Ahmedabad  1/2/2015    NaN   NaN   0.97  15.69  16.46  NaN   0.97  24.55   
2  Ahmedabad  1/3/2015    NaN   NaN  17.40  19.30  29.70  NaN  17.40  29.07   
3  Ahmedabad  1/4/2015    NaN   NaN   1.70  18.48  17.97  NaN   1.70  18.59   
4  Ahmedabad  1/5/2015    NaN   NaN  22.10  21.42  37.76  NaN  22.10  39.33   

       O3  Unnamed: 11  Unnamed: 12  Unnamed: 13  Unnamed: 14 AQI_Bucket  
0  133.36          NaN          NaN          NaN          NaN        NaN  
1   34.06          NaN          NaN          NaN          NaN        NaN  
2   30.70          NaN          NaN          NaN          NaN        NaN  
3   36.08          NaN          NaN          NaN          NaN        NaN  
4   39.31          NaN          NaN          NaN          NaN        NaN  
           City       Date   PM2.5    PM10     NO    NO2    NOx  NH3    CO 

In [5]:
# Clean column names first (important if any column names have leading/trailing spaces)
df.columns = df.columns.str.strip()

# Updated list – only numerical columns
cols_with_zeros = ['PM2.5', 'PM10', 'NO', 'NO2','NOx', 'NH3', 'CO', 'SO2']

# Replace 0 with NaN for these columns
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

# Fill NaNs with median of each column
df[cols_with_zeros] = df[cols_with_zeros].fillna(df[cols_with_zeros].median())
print(df.head())
print(df.tail())


        City      Date   PM2.5   PM10     NO    NO2    NOx     NH3     CO  \
0  Ahmedabad  1/1/2015  48.455  98.67   0.92  18.22  17.15  24.385   0.92   
1  Ahmedabad  1/2/2015  48.455  98.67   0.97  15.69  16.46  24.385   0.97   
2  Ahmedabad  1/3/2015  48.455  98.67  17.40  19.30  29.70  24.385  17.40   
3  Ahmedabad  1/4/2015  48.455  98.67   1.70  18.48  17.97  24.385   1.70   
4  Ahmedabad  1/5/2015  48.455  98.67  22.10  21.42  37.76  24.385  22.10   

     SO2      O3  Unnamed: 11  Unnamed: 12  Unnamed: 13  Unnamed: 14  \
0  27.64  133.36          NaN          NaN          NaN          NaN   
1  24.55   34.06          NaN          NaN          NaN          NaN   
2  29.07   30.70          NaN          NaN          NaN          NaN   
3  18.59   36.08          NaN          NaN          NaN          NaN   
4  39.33   39.31          NaN          NaN          NaN          NaN   

  AQI_Bucket  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  
           Cit

In [6]:
df = df.dropna(subset=['AQI_Bucket'])


In [17]:
import pandas as pd

# Load the dataset
df = pd.read_csv('city_day.csv')

# List of important pollutant columns
columns_to_check = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3"]

# Fill missing values with the mean of each column
for col in columns_to_check:
    df[col].fillna(df[col].mean(), inplace=True)

# Describe statistics: min, max, mean, std
summary = df[columns_to_check].describe().loc[['min', 'max', 'mean', 'std']]

print("✅ Summary Statistics for Input Factors:")
print(summary)


✅ Summary Statistics for Input Factors:
           PM2.5         PM10         NO         NO2         NOx         NH3  \
min     0.040000     0.010000    0.02000    0.010000    0.000000    0.010000   
max   949.990000  1000.000000  390.68000  362.210000  467.630000  352.890000   
mean   67.450578   118.127103   17.57473   28.560659   32.309123   23.483476   
std    59.414476    71.500953   21.35922   22.941051   29.317936   20.711370   

              CO         SO2          O3  
min     0.000000    0.010000    0.010000  
max   175.810000  193.860000  257.730000  
mean    2.248598   14.531977   34.491430  
std     6.715753   16.909088   20.163443  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [19]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/content/city_day.csv')  # Replace with your actual file path

# List of input factor columns
input_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']

# Update all values < 1 to 1 in these columns
for col in input_columns:
    df[col] = df[col].apply(lambda x: max(x, 1))

# Optional: Save the updated dataset to a new file
df.to_csv('updated_dataset.csv', index=False)

print("✅ All minimum values are now set to 1.")

summary = df[columns_to_check].describe().loc[['min', 'max', 'mean', 'std']]

print("✅ Summary Statistics for Input Factors:")
print(summary)


✅ All minimum values are now set to 1.
✅ Summary Statistics for Input Factors:
           PM2.5         PM10          NO         NO2         NOx         NH3  \
min     1.000000     1.000000    1.000000    1.000000    1.000000    1.000000   
max   949.990000  1000.000000  390.680000  362.210000  467.630000  352.890000   
mean   67.450711   118.128515   17.580945   28.566132   32.340674   23.487037   
std    64.661312    90.603278   22.781262   24.468497   31.614275   25.681104   

              CO         SO2          O3  
min     1.000000    1.000000    1.000000  
max   175.810000  193.860000  257.730000  
mean    2.521637   14.533109   34.493676  
std     6.894697   18.132918   21.691423  


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['AQI_Bucket'] = le.fit_transform(df['AQI_Bucket'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['AQI_Bucket'] = le.fit_transform(df['AQI_Bucket'])


In [8]:
X = df[['PM2.5', 'PM10', 'NO', 'NO2','NOx', 'NH3', 'CO', 'SO2']]
y = df['AQI_Bucket']


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.688981288981289
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.65      0.61       122
           1       0.68      0.70      0.69       762
           2       0.58      0.55      0.57       286
           3       0.75      0.74      0.75       812
           4       0.74      0.78      0.76       196
           5       0.64      0.59      0.62       227

    accuracy                           0.69      2405
   macro avg       0.66      0.67      0.67      2405
weighted avg       0.69      0.69      0.69      2405

Confusion Matrix:
 [[ 79   7   2  34   0   0]
 [  3 532  61 158   3   5]
 [  1  79 158   3   4  41]
 [ 52 155   4 600   0   1]
 [  0   4  11   0 153  28]
 [  0  10  35   1  46 135]]


In [11]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.688981288981289
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.65      0.61       122
           1       0.68      0.70      0.69       762
           2       0.58      0.55      0.57       286
           3       0.75      0.74      0.75       812
           4       0.74      0.78      0.76       196
           5       0.64      0.59      0.62       227

    accuracy                           0.69      2405
   macro avg       0.66      0.67      0.67      2405
weighted avg       0.69      0.69      0.69      2405

Confusion Matrix:
 [[ 79   7   2  34   0   0]
 [  3 532  61 158   3   5]
 [  1  79 158   3   4  41]
 [ 52 155   4 600   0   1]
 [  0   4  11   0 153  28]
 [  0  10  35   1  46 135]]


In [12]:

# SVM
svm = SVC()
svm.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.688981288981289
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.65      0.61       122
           1       0.68      0.70      0.69       762
           2       0.58      0.55      0.57       286
           3       0.75      0.74      0.75       812
           4       0.74      0.78      0.76       196
           5       0.64      0.59      0.62       227

    accuracy                           0.69      2405
   macro avg       0.66      0.67      0.67      2405
weighted avg       0.69      0.69      0.69      2405

Confusion Matrix:
 [[ 79   7   2  34   0   0]
 [  3 532  61 158   3   5]
 [  1  79 158   3   4  41]
 [ 52 155   4 600   0   1]
 [  0   4  11   0 153  28]
 [  0  10  35   1  46 135]]


In [13]:
# KNN
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.688981288981289
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.65      0.61       122
           1       0.68      0.70      0.69       762
           2       0.58      0.55      0.57       286
           3       0.75      0.74      0.75       812
           4       0.74      0.78      0.76       196
           5       0.64      0.59      0.62       227

    accuracy                           0.69      2405
   macro avg       0.66      0.67      0.67      2405
weighted avg       0.69      0.69      0.69      2405

Confusion Matrix:
 [[ 79   7   2  34   0   0]
 [  3 532  61 158   3   5]
 [  1  79 158   3   4  41]
 [ 52 155   4 600   0   1]
 [  0   4  11   0 153  28]
 [  0  10  35   1  46 135]]


In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Scale your features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X = your features like PM2.5, PM10, etc.

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 3: Use Logistic Regression with increased max_iter and alternate solver
logreg = LogisticRegression(max_iter=3000, solver='liblinear')  # 'liblinear' is good for small datasets
logreg.fit(X_train, y_train)

# Step 4: Evaluate
y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.6228690228690229
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       122
           1       0.55      0.64      0.59       762
           2       0.37      0.06      0.11       286
           3       0.66      0.91      0.76       812
           4       0.77      0.78      0.77       196
           5       0.71      0.45      0.55       227

    accuracy                           0.62      2405
   macro avg       0.51      0.47      0.46      2405
weighted avg       0.57      0.62      0.57      2405

Confusion Matrix:
 [[  0   2   0 120   0   0]
 [  0 490   7 260   5   0]
 [  0 246  18   2   5  15]
 [  0  77   0 735   0   0]
 [  0   8   9   0 152  27]
 [  0  74  15   0  35 103]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Create a DataFrame with X_test and predicted results
result_df = pd.DataFrame(X_test, columns=['PM2.5', 'PM10', 'NO', 'NO2','NOx', 'NH3', 'CO', 'SO2'])
result_df['Actual'] = y_test.values
result_df['Predicted'] = y_pred

# Save to CSV
result_df.to_csv('prediction_results.csv', index=False)


In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib

# 1. Load the CSV file
df = pd.read_csv('city_day.csv')

# 2. Extract features and target
X = df[['PM2.5', 'PM10', 'NO', 'NO2', 'NH3', 'CO', 'SO2']]
y = df['AQI_Bucket']  # Make sure this is your label column

# 3. Train a model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# 4. Save the model to a file
joblib.dump(model, 'ml_model.pkl')

print("Model trained and saved as ml_model.pkl")


Model trained and saved as ml_model.pkl
