In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Load dataset
file_path = "AQI_of_different_Regions_Final.csv"  # Update with your actual file path
df=pd.read_csv(r'C:\Users\ASUS\Desktop\Climate1\AQI.csv')

def train_and_forecast(city, future_date):
    """
    Trains the Holt-Winters model on AQI data for a given city and predicts AQI for a future date.

    Parameters:
    - city (str): Name of the city.
    - future_date (str): Future date in 'YYYY-MM-DD' format.

    Returns:
    - Predicted AQI value and confidence probability.
    """
    # Filter data for the selected city
    df_city = df[df['City'] == city].copy()
    
    if df_city.empty:
        return f"City '{city}' not found in the dataset."
    
    # Convert date column to datetime and set as index
    df_city['date'] = pd.to_datetime(df_city['date'])
    df_city.set_index('date', inplace=True)
    
    # Ensure daily frequency
    df_city = df_city.asfreq('D')
    
    # Fill missing AQI values using interpolation
    df_city['Index Value'] = df_city['Index Value'].interpolate()

    # Define seasonal period (assumed yearly seasonality)
    seasonal_period = 365 if len(df_city) > 365 else 30  # Use 30 if dataset is small
    
    # Train Holt-Winters model
    model = ExponentialSmoothing(
        df_city['Index Value'],
        trend='add',
        seasonal='add',
        seasonal_periods=seasonal_period
    ).fit()
    
    # Convert future_date to datetime
    future_date = pd.to_datetime(future_date)
    
    # Predict AQI for the given future date
    days_ahead = (future_date - df_city.index[-1]).days
    if days_ahead < 1:
        return "Future date must be beyond the last recorded date."
    
    predicted_aqi = model.forecast(steps=days_ahead).iloc[-1]

    # Estimate prediction confidence using past error
    past_predictions = model.fittedvalues
    actual_values = df_city['Index Value'].iloc[:len(past_predictions)]
    error = np.abs(past_predictions - actual_values)
    
    # Calculate mean absolute error (MAE) as confidence indicator
    mae = np.mean(error)
    confidence_prob = max(0, 1 - (mae / predicted_aqi))  # Normalize confidence

    return {
        "City": city,
        "Future Date": future_date.strftime("%Y-%m-%d"),
        "Predicted AQI": round(predicted_aqi, 2),
        "Prediction Confidence": round(confidence_prob * 100, 2)  # Percentage
    }

# Example usage
if __name__ == "__main__":
    city_name = input("Enter city name: ")
    future_date = input("Enter future date (YYYY-MM-DD): ")
    
    result = train_and_forecast(city_name, future_date)
    print("\nAQI Prediction Result:")
    print(result)



AQI Prediction Result:
{'City': 'Agra', 'Future Date': '2024-06-10', 'Predicted AQI': -572.74, 'Prediction Confidence': 105.4}


In [8]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Load dataset
file_path = "AQI_of_different_Regions_Final.csv"  # Update with your actual file path


def train_and_forecast(city, future_date):
    """
    Trains the Holt-Winters model on AQI data for a given city and predicts AQI for a future date.

    Parameters:
    - city (str): Name of the city.
    - future_date (str): Future date in 'YYYY-MM-DD' format.

    Returns:
    - Predicted AQI value and confidence probability.
    """
    # Filter data for the selected city
    df_city = df[df['City'] == city].copy()
    
    if df_city.empty:
        return f"City '{city}' not found in the dataset."
    
    # Convert date column to datetime and set as index
    df_city['date'] = pd.to_datetime(df_city['date'])
    df_city.set_index('date', inplace=True)
    
    # Ensure daily frequency
    df_city = df_city.asfreq('D')
    
    # Fill missing AQI values using interpolation
    df_city['Index Value'] = df_city['Index Value'].interpolate()

    # Define seasonal period (assumed yearly seasonality)
    seasonal_period = 365 if len(df_city) > 365 else 30  # Use 30 if dataset is small
    
    # Train Holt-Winters model (use multiplicative trend to prevent negative AQI)
    model = ExponentialSmoothing(
        df_city['Index Value'],
        trend='mul',  # Changed from 'add' to 'mul'
        seasonal='mul',  # Changed from 'add' to 'mul'
        seasonal_periods=seasonal_period
    ).fit()
    
    # Convert future_date to datetime
    future_date = pd.to_datetime(future_date)
    
    # Predict AQI for the given future date
    days_ahead = (future_date - df_city.index[-1]).days
    if days_ahead < 1:
        return "Future date must be beyond the last recorded date."
    
    predicted_aqi = model.forecast(steps=days_ahead).iloc[-1]

    # Clip negative AQI values to zero
    predicted_aqi = max(predicted_aqi, 0)

    # Estimate prediction confidence using past error
    past_predictions = model.fittedvalues
    actual_values = df_city['Index Value'].iloc[:len(past_predictions)]
    error = np.abs(past_predictions - actual_values)

    # Compute MAPE (Mean Absolute Percentage Error) for confidence estimation
    mape = np.mean((error / actual_values.replace(0, np.nan)).dropna())  # Avoid divide by zero

    # Convert MAPE to confidence score (higher MAPE means lower confidence)
    confidence_prob = max(0, 1 - mape) * 100  # Normalize confidence to percentage

    return {
        "City": city,
        "Future Date": future_date.strftime("%Y-%m-%d"),
        "Predicted AQI": round(predicted_aqi, 2),
        "Prediction Confidence": round(confidence_prob, 2)  # Percentage
    }

# Example usage
if __name__ == "__main__":
    city_name = input("Enter city name: ")
    future_date = input("Enter future date (YYYY-MM-DD): ")
    
    result = train_and_forecast(city_name, future_date)
    print("\nAQI Prediction Result:")
    print(result)



AQI Prediction Result:
{'City': 'Agra', 'Future Date': '2025-02-25', 'Predicted AQI': 377.54, 'Prediction Confidence': 73.67}




In [10]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load dataset


def train_hybrid_model(city, future_date):
    """
    Trains a hybrid AQI prediction model using Holt-Winters and XGBoost.
    
    Parameters:
    - city (str): City name
    - future_date (str): Future date in 'YYYY-MM-DD' format
    
    Returns:
    - Predicted AQI value and confidence range
    """
    # Filter data for the selected city
    df_city = df[df['City'] == city].copy()
    
    if df_city.empty:
        return f"City '{city}' not found in the dataset."

    # Convert date column to datetime and set as index
    df_city['date'] = pd.to_datetime(df_city['date'])
    df_city.set_index('date', inplace=True)
    
    # Ensure daily frequency
    df_city = df_city.asfreq('D')
    
    # Fill missing AQI values using rolling mean
    df_city['Index Value'] = df_city['Index Value'].fillna(df_city['Index Value'].rolling(7, min_periods=1).mean())

    # Train Holt-Winters Model (choosing best parameters)
    model_hw = ExponentialSmoothing(
        df_city['Index Value'], 
        trend='mul', 
        seasonal='mul', 
        seasonal_periods=365
    ).fit()

    # Holt-Winters Forecast
    df_city['HW_Predicted'] = model_hw.fittedvalues

    # Calculate residuals (errors)
    df_city['Error'] = df_city['Index Value'] - df_city['HW_Predicted']

    # Feature Engineering for XGBoost
    df_city['day'] = df_city.index.day
    df_city['month'] = df_city.index.month
    df_city['year'] = df_city.index.year
    df_city['dayofweek'] = df_city.index.dayofweek

    # Prepare training data for XGBoost
    features = ['day', 'month', 'year', 'dayofweek']
    X = df_city[features]
    y = df_city['Error']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train XGBoost on errors
    model_xgb = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
    model_xgb.fit(X_train, y_train)

    # Predict future AQI
    future_date = pd.to_datetime(future_date)
    days_ahead = (future_date - df_city.index[-1]).days
    if days_ahead < 1:
        return "Future date must be beyond the last recorded date."

    # Holt-Winters Prediction
    hw_forecast = model_hw.forecast(steps=days_ahead).iloc[-1]

    # Create future features for XGBoost
    future_features = pd.DataFrame({
        "day": [future_date.day],
        "month": [future_date.month],
        "year": [future_date.year],
        "dayofweek": [future_date.dayofweek]
    })

    # XGBoost Prediction (error correction)
    xgb_correction = model_xgb.predict(future_features)[0]

    # Final AQI prediction (Holt-Winters + XGBoost correction)
    final_prediction = hw_forecast + xgb_correction
    final_prediction = max(final_prediction, 0)  # Ensure AQI is non-negative

    # Compute confidence using MAE of past predictions
    past_hw_predictions = model_hw.fittedvalues
    past_actual = df_city['Index Value'].iloc[:len(past_hw_predictions)]
    past_errors = np.abs(past_hw_predictions - past_actual)
    mape = np.mean((past_errors / past_actual.replace(0, np.nan)).dropna())  # Avoid division by zero

    confidence_prob = max(0, 1 - mape) * 100  # Convert MAPE to confidence percentage

    return {
        "City": city,
        "Future Date": future_date.strftime("%Y-%m-%d"),
        "Predicted AQI": round(final_prediction, 2),
        "Prediction Confidence": round(confidence_prob, 2)  # Percentage
    }

# Example usage
if __name__ == "__main__":
    city_name = input("Enter city name: ")
    future_date = input("Enter future date (YYYY-MM-DD): ")
    
    result = train_hybrid_model(city_name, future_date)
    print("\nHybrid AQI Prediction Result:")
    print(result)


ValueError: endog must be strictly positive when usingmultiplicative trend or seasonal components.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Load dataset

df=pd.read_csv(r'C:\Users\ASUS\Desktop\Climate1\AQI.csv')
# Choose a city (modify as needed)
city = "Delhi"
df_city = df[df['City'] == city].copy()

# Convert date column to datetime and set as index
df_city['date'] = pd.to_datetime(df_city['date'])
df_city.set_index('date', inplace=True)
df_city = df_city.asfreq('D')

# Fill missing values with rolling mean
df_city['Index Value'] = df_city['Index Value'].fillna(df_city['Index Value'].rolling(7, min_periods=1).mean())

# Ensure all values are positive for multiplicative test
df_city['Index Value'] = df_city['Index Value'].replace(0, np.nan).fillna(df_city['Index Value'].median()) + 1

# Fit Additive Model
model_add = ExponentialSmoothing(df_city['Index Value'], trend='add', seasonal='add', seasonal_periods=365).fit()
df_city['HW_Add'] = model_add.fittedvalues

# Fit Multiplicative Model (only if all values are positive)
try:
    model_mul = ExponentialSmoothing(df_city['Index Value'], trend='mul', seasonal='mul', seasonal_periods=365).fit()
    df_city['HW_Mul'] = model_mul.fittedvalues
    multiplicative_valid = True
except ValueError as e:
    print("Multiplicative model failed:", e)
    multiplicative_valid = False

# Plot results
plt.figure(figsize=(12,6))
plt.plot(df_city.index, df_city['Index Value'], label="Actual AQI", color='black', alpha=0.7)
plt.plot(df_city.index, df_city['HW_Add'], label="Holt-Winters Additive", linestyle="dashed")
if multiplicative_valid:
    plt.plot(df_city.index, df_city['HW_Mul'], label="Holt-Winters Multiplicative", linestyle="dotted")
plt.legend()
plt.title(f"Holt-Winters Model Comparison for {city}")
plt.xlabel("Date")
plt.ylabel("AQI")
plt.show()

# Compute Mean Absolute Error (MAE) to compare models
mae_add = np.mean(np.abs(df_city['Index Value'] - df_city['HW_Add']))
mae_mul = np.mean(np.abs(df_city['Index Value'] - df_city['HW_Mul'])) if multiplicative_valid else float("inf")

print(f"MAE (Additive): {mae_add:.2f}")
if multiplicative_valid:
    print(f"MAE (Multiplicative): {mae_mul:.2f}")

# Recommend best model
if mae_mul < mae_add and multiplicative_valid:
    print("✅ Multiplicative Model is better.")
else:
    print("✅ Additive Model is better.")


ValueError: Cannot compute initial seasonals using heuristic method with less than two full seasonal cycles in the data.

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load dataset
df=pd.read_csv(r'C:\Users\ASUS\Desktop\Climate1\AQI.csv')  # Update with your file path

# Convert date column to datetime format

df_city['Index Value'] = df_city['Index Value'].interpolate(method='linear')

# Count unique days per city
city_data_counts = df.groupby('City')['date'].nunique()

# Categorize cities
cities_with_2_years_data = city_data_counts[city_data_counts > 730].index.tolist()
cities_with_less_than_2_years_data = city_data_counts[city_data_counts <= 730].index.tolist()

# Store prediction results
prediction_results = []

for city in df['City'].unique():
    df_city = df[df['City'] == city].copy()
    df_city.set_index('date', inplace=True)
    df_city = df_city.asfreq('D')
    
    # Fill missing values using rolling mean
    df_city['Index Value'] = df_city['Index Value'].fillna(df_city['Index Value'].rolling(7, min_periods=1).mean())
    
    # Define features and target
    df_city['day_of_year'] = df_city.index.dayofyear
    df_city['year'] = df_city.index.year
    X = df_city[['day_of_year', 'year']]
    y = df_city['Index Value']
    
    # Split data for training/testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    if city in cities_with_2_years_data:
        # Apply Holt-Winters
        seasonal_periods = min(365, len(df_city) // 2)
        model_hw = ExponentialSmoothing(y_train, trend='add', seasonal='add', seasonal_periods=seasonal_periods).fit()
        hw_pred = model_hw.forecast(len(X_test))
        X_train['hw_prediction'] = model_hw.fittedvalues
        X_test['hw_prediction'] = hw_pred
    
    # Train XGBoost model
    model_xgb = XGBRegressor(objective='reg:squarederror', n_estimators=50, learning_rate=0.1, max_depth=3)
    model_xgb.fit(X_train, y_train)
    xgb_pred = model_xgb.predict(X_test)
    
    # Final prediction
    final_prediction = xgb_pred if city in cities_with_less_than_2_years_data else (0.5 * xgb_pred + 0.5 * hw_pred)
    
    # Compute confidence
    mae = mean_absolute_error(y_test, final_prediction)
    confidence = max(0, 100 - (mae / np.mean(y_test)) * 100)
    
    # Store results
    prediction_results.append({
        "City": city,
        "Predicted AQI": round(final_prediction[-1], 2),
        "Prediction Confidence": round(confidence, 2)
    })

# Output predictions
for result in prediction_results:
    print(result)




XGBoostError: [15:52:44] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\data.cc:514: Check failed: valid: Label contains NaN, infinity or a value too large.

In [21]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import IterativeImputer
from sklearn.experimental import enable_iterative_imputer

# Load dataset
  # Update with your file path

# Convert date column to datetime format
df['date'] = pd.to_datetime(df['date'])
df_city = df_city[~df_city.index.duplicated(keep='first')]

# Count unique days per citydf_city.set_index('date', inplace=True)
df_city = df_city.asfreq('D')

# Fix: Remove duplicate date entries
df_city = df_city[~df_city.index.duplicated(keep='first')]

city_data_counts = df.groupby('City')['date'].nunique()


# Categorize cities
cities_with_2_years_data = city_data_counts[city_data_counts > 730].index.tolist()
cities_with_less_than_2_years_data = city_data_counts[city_data_counts <= 730].index.tolist()

# Store prediction results
prediction_results = []

for city in df['City'].unique():
    df_city = df[df['City'] == city].copy()
    df_city.set_index('date', inplace=True)
    df_city = df_city.asfreq('D')
    
    # Fill missing values using linear interpolation
    df_city['Index Value'] = df_city['Index Value'].interpolate(method='linear')
    
    # Use Iterative Imputer for consecutive missing values
    imputer = IterativeImputer(max_iter=10, random_state=42)
    df_city[['Index Value']] = imputer.fit_transform(df_city[['Index Value']])
    
    # Define features and target
    df_city['day_of_year'] = df_city.index.dayofyear
    df_city['year'] = df_city.index.year
    X = df_city[['day_of_year', 'year']]
    y = df_city['Index Value']
    
    # Split data for training/testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    if city in cities_with_2_years_data:
        # Apply Holt-Winters
        seasonal_periods = min(365, len(df_city) // 2)
        model_hw = ExponentialSmoothing(y_train, trend='add', seasonal='add', seasonal_periods=seasonal_periods).fit()
        hw_pred = model_hw.forecast(len(X_test))
        X_train['hw_prediction'] = model_hw.fittedvalues
        X_test['hw_prediction'] = hw_pred
    
    # Train XGBoost model
    model_xgb = XGBRegressor(objective='reg:squarederror', n_estimators=50, learning_rate=0.1, max_depth=3)
    model_xgb.fit(X_train, y_train)
    xgb_pred = model_xgb.predict(X_test)
    
    # Final prediction
    final_prediction = xgb_pred if city in cities_with_less_than_2_years_data else (0.5 * xgb_pred + 0.5 * hw_pred)
    
    # Compute confidence
    mae = mean_absolute_error(y_test, final_prediction)
    confidence = max(0, 100 - (mae / np.mean(y_test)) * 100)
    
    # Store results
    prediction_results.append({
        "City": city,
        "Predicted AQI": round(final_prediction[-1], 2),
        "Prediction Confidence": round(confidence, 2)
    })

# Output predictions
for result in prediction_results:
    print(result)


  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),
  "Predicted AQI": round(final_prediction[-1], 2),


ValueError: cannot reindex on an axis with duplicate labels

In [23]:
df.head(20)

Unnamed: 0,date,City,No. Stations,Air Quality,Index Value,Prominent Pollutant,Region
0,2015-05-01,Chennai,,Satisfactory,87,CO,Eastern Coastal Region
1,2015-05-01,Varanasi,,Moderate,157,PM10,Indo-Gangetic Region
2,2015-05-01,Hyderabad,,Moderate,189,PM2.5,Tropical wet & dry
3,2015-05-01,Agra,,Moderate,179,PM10,Indo-Gangetic Region
4,2015-05-02,Varanasi,,Moderate,156,PM10,Indo-Gangetic Region
5,2015-05-02,Hyderabad,,Satisfactory,94,PM2.5,Tropical wet & dry
6,2015-05-02,Agra,,Moderate,135,PM10,Indo-Gangetic Region
7,2015-05-02,Chennai,,Moderate,120,PM2.5,Eastern Coastal Region
8,2015-05-03,Hyderabad,,Satisfactory,66,PM2.5,Tropical wet & dry
9,2015-05-03,Varanasi,,Poor,211,PM10,Indo-Gangetic Region
