In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Apply the seaborn "whitegrid" look to every figure in the notebook.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*"; older
# releases only know the legacy name, so fall back to it instead of failing
# with OSError in the very first cell.
PLOT_STYLE = "seaborn-v0_8-whitegrid"
if PLOT_STYLE not in plt.style.available:
    PLOT_STYLE = "seaborn-whitegrid"
plt.style.use(PLOT_STYLE)


In [None]:
# Data Loading and Cleaning
# Read the workbook, discard columns that hold no data at all, and convert the
# -200 placeholder into NaN so it is treated as a missing value.
df = (
    pd.read_excel('AirQualityUCI.xlsx')
    .dropna(axis=1, how='all')
    .replace(-200, np.nan)
)

# Build a single timestamp from the separate Date and Time columns. Values that
# cannot be parsed are coerced to NaT and those rows are removed before the
# timestamp becomes the index; the two source columns are then dropped.
timestamp_text = df['Date'].astype(str) + ' ' + df['Time'].astype(str)
df['Datetime'] = pd.to_datetime(timestamp_text, errors='coerce')
df = (
    df.dropna(subset=['Datetime'])
    .set_index('Datetime')
    .drop(['Date', 'Time'], axis=1)
)

df.head()


In [None]:
# Fill the remaining gaps column by column, interpolating along the
# DatetimeIndex so that the spacing between timestamps is taken into account.
# NOTE(review): this runs before the train/test split and is applied to every
# column, including the CO(GT) target used later — confirm that modelling on
# interpolated target values is intended.
df = df.interpolate(method="time")


In [None]:
# Feature Engineering
# Calendar features read straight from the DatetimeIndex.
timestamps = df.index
df['hour'] = timestamps.hour
df['day_of_week'] = timestamps.dayofweek
df['month'] = timestamps.month

# Mean CO(GT) over a 3-row window, shifted down by one row so that the value
# stored on a row is computed only from the rows before it.
co_rolling_mean = df['CO(GT)'].rolling(window=3).mean()
df['CO_roll3'] = co_rolling_mean.shift(1)

# Remove every row that still has a NaN in any column; this includes the
# leading rows where the shifted rolling mean is undefined.
df = df.dropna()

df.head()


In [None]:
# Exploratory Data Analysis (EDA)
# Line plot of the CO(GT) column against the DatetimeIndex.
fig, ax = plt.subplots(figsize=(12, 4))
df['CO(GT)'].plot(ax=ax, title='CO Concentration Over Time')
ax.set_ylabel('CO (mg/m³)')
plt.show()


In [None]:
# Pairwise correlation of every column in df (DataFrame.corr defaults to Pearson).
# The colour scale is pinned to the full correlation range [-1, 1] and centred
# on 0; without this the diverging 'coolwarm' map is stretched over whatever
# min/max the data happens to have, so the neutral colour would not correspond
# to "no correlation".
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), cmap='coolwarm', vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()


In [None]:
# Box plot of CO(GT) grouped by the 'hour' feature, one box per hour value.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x='hour', y='CO(GT)', ax=ax)
ax.set(
    title='CO Concentration by Hour of Day',
    xlabel='Hour',
    ylabel='CO (mg/m³)',
)
plt.show()


In [None]:
# Prepare Data for Modeling
# CO(GT) is the regression target; every other column of df is a feature.
y = df['CO(GT)']
X = df.drop(columns='CO(GT)')

# Time-based split: rows are taken in their existing order, the first 80% for
# training and the remaining 20% for testing, with no shuffling.
split = int(len(df) * 0.8)
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]


In [None]:
# Model Selection and Evaluation
# Candidate regressors, keyed by the label shown in the results table.
# The ensemble models share random_state=42 so repeated runs give the same fit.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
}

# Fit each candidate on the training rows and score it on the held-out rows.
results = {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    y_hat = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)
    results[label] = {
        'MAE': mean_absolute_error(y_test, y_hat),
        'RMSE': np.sqrt(mse),
        'R2': r2_score(y_test, y_hat),
    }

# One row per model, one column per metric.
df_results = pd.DataFrame(results).transpose()
df_results


In [None]:
# Hyperparameter Tuning (Gradient Boosting)
# TimeSeriesSplit keeps each validation fold after the rows it was trained on,
# so cross-validation respects the row order used for the train/test split.
tscv = TimeSeriesSplit(n_splits=5)

# Several candidates per hyperparameter so the grid search actually compares
# configurations. A grid with one value per entry evaluates a single model and
# "best params" is then just the input. The previously hard-coded configuration
# (200 / 10 / 0.05 / 0.8) is still one of the 12 candidates.
params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8]
}

gb = GradientBoostingRegressor(random_state=42)
# n_jobs=-1 runs the candidate/fold fits in parallel; scores are unaffected.
grid = GridSearchCV(
    gb,
    params,
    cv=tscv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning configuration
# refitted on all of X_train; it is scored once on the held-out test rows.
best_gb = grid.best_estimator_
y_pred_gb = best_gb.predict(X_test)

print("Tuned Gradient Boosting Performance:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_gb):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_gb)):.3f}")
print(f"R²: {r2_score(y_test, y_pred_gb):.3f}")
print(f"Best Params: {grid.best_params_}")


In [None]:
# Final Model (Random Forest)
# Fit a 200-tree forest on the training rows and predict the held-out rows.
rf = RandomForestRegressor(n_estimators=200, random_state=42)
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)

# Compute the three test-set metrics once, then report them to 3 decimals.
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

print("Random Forest Final Performance:")
print(f"MAE: {rf_mae:.3f}")
print(f"RMSE: {rf_rmse:.3f}")
print(f"R²: {rf_r2:.3f}")


In [None]:
# Feature Importance
# Importance scores reported by the fitted forest, labelled with the columns it
# was trained on and sorted from most to least important.
importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# barh draws the first row at the bottom of the axes, so the top-10 slice is
# reversed for plotting; this puts the most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
importances.head(10).iloc[::-1].plot(kind='barh', ax=ax)
ax.set_title('Top 10 Important Features')
ax.set_xlabel('Importance')
plt.show()


In [None]:
# Save Model and Features
# The training column names are stored next to the fitted forest (presumably so
# that code loading the model can rebuild inputs in the same column order).
feature_columns = list(X_train.columns)

# Target file name -> object to serialise, written in this order.
artifacts = {
    'feature_columns.pkl': feature_columns,
    'air_quality_model.pkl': rf,
}
for path, obj in artifacts.items():
    joblib.dump(obj, path)

print("Model and feature columns saved successfully.")
