In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, cohen_kappa_score
from sklearn.impute import SimpleImputer

# Load data
df = pd.read_csv("C:/Users/ASUS/MINI PROJECT/data.csv", encoding='cp1252')

# Define breakpoints for pollutants
breakpoints = {
    'so2': [
        (0, 50, 0, 50),    # Good
        (51, 100, 51, 100),  # Moderate
        (101, 200, 101, 150), # Unhealthy for Sensitive Groups
        (201, 300, 151, 200), # Unhealthy
        (301, 400, 201, 300), # Very Unhealthy
        (401, float('inf'), 301, 500) # Hazardous
    ],
    'no2': [
        (0, 40, 0, 50),
        (41, 80, 51, 100),
        (81, 180, 101, 150),
        (181, 280, 151, 200),
        (281, 400, 201, 300),
        (401, float('inf'), 301, 500)
    ],
    'rspm': [
        (0, 30, 0, 50),
        (31, 60, 51, 100),
        (61, 90, 101, 150),
        (91, 120, 151, 200),
        (121, 250, 201, 300),
        (251, float('inf'), 301, 500)
    ],
    'spm': [
        (0, 50, 0, 50),
        (51, 100, 51, 100),
        (101, 250, 101, 150),
        (251, 350, 151, 200),
        (351, 500, 201, 300),
        (501, float('inf'), 301, 500)
    ]
}

# Impute missing values
imputer_X = SimpleImputer(strategy='mean')
X_value_imputed = imputer_X.fit_transform(df[['so2', 'no2', 'rspm', 'spm']])

def calculate_individual_aqi(pollutant_value, breakpoints):
    for bp_lo, bp_hi, i_lo, i_hi in breakpoints:
        if bp_lo <= pollutant_value <= bp_hi:
            return ((i_hi - i_lo) / (bp_hi - bp_lo)) * (pollutant_value - bp_lo) + i_lo
    return None

def calculate_aqi(df):
    aqi_values = []
    for i, row in df.iterrows():
        so2_aqi = calculate_individual_aqi(row['so2'], breakpoints['so2'])
        no2_aqi = calculate_individual_aqi(row['no2'], breakpoints['no2'])
        rspm_aqi = calculate_individual_aqi(row['rspm'], breakpoints['rspm'])
        spm_aqi = calculate_individual_aqi(row['spm'], breakpoints['spm'])

        pollutants_aqi = [so2_aqi, no2_aqi, rspm_aqi, spm_aqi]
        pollutants_aqi = [aqi if aqi is not None else 0 for aqi in pollutants_aqi]
        
        overall_aqi = max(pollutants_aqi)
        aqi_values.append(overall_aqi)
    
    df['AQI'] = aqi_values
    return df

df = calculate_aqi(df)

# Clean the DataFrame
df_cleaned = df.dropna(subset=['AQI'])

Y_value = df_cleaned['AQI']

# Match indices for imputed values
X_value_imputed_clean = X_value_imputed[df_cleaned.index]

X_train_value, X_test_value, Y_train_value, Y_test_value = train_test_split(X_value_imputed_clean, Y_value, test_size=0.2, random_state=42)

# Train KNN regressor
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train_value, Y_train_value)

y_pred_value_train = knn_regressor.predict(X_train_value)
y_pred_value_test = knn_regressor.predict(X_test_value)

# Calculate MSE
mse_train = mean_squared_error(Y_train_value, y_pred_value_train)
mse_test = mean_squared_error(Y_test_value, y_pred_value_test)

print("MSE on training data (AQI prediction):", mse_train)
print("MSE on test data (AQI prediction):", mse_test)

def categorize_aqi(aqi_value):
    if aqi_value <= 50:
        return 'Good'
    elif 51 <= aqi_value <= 100:
        return 'Moderate'
    elif 101 <= aqi_value <= 200:
        return 'Unhealthy for Sensitive Groups'
    elif 201 <= aqi_value <= 300:
        return 'Unhealthy'
    elif 301 <= aqi_value <= 400:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

df_cleaned['AQI_Range'] = df_cleaned['AQI'].apply(categorize_aqi)

X_range = X_value_imputed_clean
Y_range = df_cleaned['AQI_Range'] 

# Clean for classification
df_cleaned_range = df_cleaned.dropna(subset=['AQI_Range'])

# Match indices for imputed values
X_range = X_value_imputed_clean[df_cleaned_range.index]

X_train_range, X_test_range, Y_train_range, Y_test_range = train_test_split(X_range, Y_range, test_size=0.33, random_state=42)

# Train KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_range, Y_train_range)

y_pred_range_train = knn_classifier.predict(X_train_range)
y_pred_range_test = knn_classifier.predict(X_test_range)

# Calculate accuracy and Kappa score
accuracy_train = accuracy_score(Y_train_range, y_pred_range_train)
accuracy_test = accuracy_score(Y_test_range, y_pred_range_test)
kappa_score = cohen_kappa_score(Y_test_range, y_pred_range_test)

print("Accuracy on training data (AQI range prediction):", accuracy_train)
print("Accuracy on test data (AQI range prediction):", accuracy_test)
print("Kappa Score on test data (AQI range prediction):", kappa_score)


  df = pd.read_csv("C:/Users/ASUS/MINI PROJECT/data.csv", encoding='cp1252')


MSE on training data (AQI prediction): 61.246921040952536
MSE on test data (AQI prediction): 98.54510967016805
Accuracy on training data (AQI range prediction): 0.9927897871874004
Accuracy on test data (AQI range prediction): 0.9883375638930422
Kappa Score on test data (AQI range prediction): 0.9834766352843834


In [9]:
import numpy as np
import pandas as pd

# Assuming the KNN model (knn_regressor) is already trained

def predict_future_aqi(date, location, df, knn_regressor, imputer_X):
    # Filter historical data for the location
    location_data = df[df['location'] == location]

    if location_data.empty:
        return f"No historical data available for {location}.", None

    # Use the average pollutant values from historical data of the location
    avg_pollutants = location_data[['so2', 'no2', 'rspm', 'spm']].mean().values.reshape(1, -1)
    
    # Impute missing values (if any) using the imputer that was trained earlier
    avg_pollutants_imputed = imputer_X.transform(avg_pollutants)
    
    # Predict AQI for the future date
    predicted_aqi = knn_regressor.predict(avg_pollutants_imputed)
    
    # Categorize AQI range
    predicted_aqi_range = categorize_aqi(predicted_aqi[0])
    
    return predicted_aqi[0], predicted_aqi_range

# Get date and location input from the user
date_to_predict = input("Enter the date (YYYY-MM-DD) to predict AQI: ")
location_to_predict = input("Enter the location for AQI prediction: ")

# Predict AQI and AQI range for the future date
predicted_aqi, predicted_aqi_range = predict_future_aqi(date_to_predict, location_to_predict, df, knn_regressor, imputer_X)

print(f"Predicted AQI value for {date_to_predict} at {location_to_predict}: {predicted_aqi}")
print(f"Predicted AQI range for {date_to_predict} at {location_to_predict}: {predicted_aqi_range}")


Enter the date (YYYY-MM-DD) to predict AQI:  2024-12-12
Enter the location for AQI prediction:  Chennai


Predicted AQI value for 2024-12-12 at Chennai: 114.48322147651007
Predicted AQI range for 2024-12-12 at Chennai: Unhealthy for Sensitive Groups




In [2]:
def predict_aqi_for_date(df, date, location):
    # Filter the DataFrame for the specific date and location
    date_location_data = df[(df['sampling_date'] == date) & (df['location'] == location)]
    
    if date_location_data.empty:
        return f"No data available for {date} at {location}.", None  # Return None as the second value
    
    # Impute the features
    features = date_location_data[['so2', 'no2', 'rspm', 'spm']]
    features_imputed = imputer_X.transform(features)

    # Predict the AQI value
    predicted_aqi = knn_regressor.predict(features_imputed)

    # Calculate AQI range
    predicted_aqi_range = [categorize_aqi(aqi) for aqi in predicted_aqi]

    return predicted_aqi, predicted_aqi_range  # Return the predicted values


In [3]:
date_to_predict = '2023-10-01'  # Replace with the date you want to check
location_to_predict = 'Akola'  # Replace with the desired location

predicted_aqi, predicted_aqi_range = predict_aqi_for_date(df, date_to_predict, location_to_predict)

if predicted_aqi is not None:
    print(f"Predicted AQI values for {date_to_predict} at {location_to_predict}: {predicted_aqi}")
    print(f"Predicted AQI ranges for {date_to_predict} at {location_to_predict}: {predicted_aqi_range}")
else:
    print(f"No predictions available for {date_to_predict} at {location_to_predict}.")


Predicted AQI values for 2023-10-01 at Akola: No data available for 2023-10-01 at Akola.
Predicted AQI ranges for 2023-10-01 at Akola: None
