## Air Quality Index Prediction Model Training

In [None]:
# Fetch the air-quality CSV from GitHub into the current working directory
# (shell escape: needs `wget` on PATH and network access).
!wget https://raw.githubusercontent.com/92-vasim/datasets/main/aqi-dataset/air%20quality%20data.csv

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import logging

logging.basicConfig(level=logging.INFO, format="[%(asctime)s]: %(message)s")

In [None]:
# Read the downloaded CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="air quality data.csv")
df.head(n=5)

#### Check for null values

In [None]:
# Plot the boolean null mask: cells holding a missing value are drawn in a
# contrasting colour, one column of the plot per DataFrame column.
sns.heatmap(df.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

#### Separating independent & dependent features

In [None]:
# Display the column labels so the feature and target names can be identified.
df.columns

In [None]:
# Target is the 'PM 2.5' column; the feature matrix is every remaining column.
y = df['PM 2.5']
X = df.drop(columns=['PM 2.5'])

In [None]:
# Count the missing values remaining in each feature column.
X.isna().sum()

#### Pair plot
A pair plot shows the pairwise relationships and trends between every two features.

In [None]:
# Draw the grid of pairwise plots across the DataFrame's columns.
sns.pairplot(data=df)

#### Correlations

In [None]:
# Pairwise Pearson correlation between all columns (pandas' default method).
df.corr(method='pearson')

#### Correlation matrix with heatmap

Correlation states how the features are related to each other or to the target variable. <br>
Correlation can be positive (an increase in the value of a feature increases the value of the target variable) or negative (an increase in the value of a feature decreases the value of the target variable).<br>
A heatmap makes it easy to identify which features are most related to the target variable. We will plot a heatmap of the correlated features using the seaborn library.

In [None]:
# Compute the pairwise correlation matrix once and reuse it for the plot;
# re-selecting its own index from `df` and calling .corr() again would only
# recompute the same matrix.
correlations = df.corr()
top_corr_features = correlations.index  # labels of the columns covered by the matrix
plt.figure(figsize=(20, 20))

# Annotated heatmap of the correlation coefficients.
sns.heatmap(correlations, annot=True, cmap="RdYlGn")

In [None]:
# Display the row labels of the correlation matrix.
correlations.index

#### Feature importance with graph

You can get the importance of each feature of your dataset from the `feature_importances_` attribute of a fitted model. <br>
Feature importance gives you a score for each feature of your data; the higher the score, the more important or relevant the feature is to your output variable. <br>
Feature importance is a built-in attribute of tree-based regressors. We will use an Extra Trees Regressor to extract the top 5 features of the dataset.

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees builds randomized trees, so an unseeded model yields a different
# importance ranking on every run. Fix the seed to make the ranking repeatable.
tree_regressor = ExtraTreesRegressor(random_state=42)
tree_regressor.fit(X, y)

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Display the fitted regressor's importance scores, one per column of X.
tree_regressor.feature_importances_

In [None]:
# Label each importance score with its feature name, then chart the five
# highest-scoring features as horizontal bars.
import_features = pd.Series(data=tree_regressor.feature_importances_, index=X.columns)
import_features.nlargest(n=5).plot.barh()

In [None]:
# Distribution of the target variable. `sns.distplot` is deprecated and has
# been removed from current seaborn; a density-scaled histogram with a KDE
# overlay is the replacement given in seaborn's migration notes.
sns.histplot(y, kde=True, stat="density", kde_kws=dict(cut=3))

#### Training

In [None]:
!pip install xgboost catboost
!pip install numpy pandas

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Registry of candidate regressors for the grid search.
# Each entry maps a display name to:
#   'model'      - an unfitted estimator prototype
#   'param_grid' - the hyperparameter grid handed to GridSearchCV
#                  (an empty dict means "fit once with the defaults")
# Estimators that use randomness are seeded with 42 so that repeated runs of
# the comparison produce the same scores.
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'param_grid': {}
    },
    'Ridge Regression': {
        'model': Ridge(),
        'param_grid': {
            'alpha': [0.1, 1, 10]
        }
    },
    'Lasso Regression': {
        'model': Lasso(),
        'param_grid': {
            'alpha': [0.1, 1, 10]
        }
    },
    'Elastic Net': {
        'model': ElasticNet(),
        'param_grid': {
            'alpha': [0.1, 1, 10],
            'l1_ratio': [0.1, 0.5, 0.9]
        }
    },
    'Decision Tree Regressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'param_grid': {
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest Regressor': {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    # Disabled entry: uncomment to include Gradient Boosting in the search.
    # 'Gradient Boosting Regressor': {
    #     'model': GradientBoostingRegressor(verbose=1),
    #     'param_grid': {
    #         'n_estimators': [100, 200, 300],
    #         'max_depth': [3, 4, 5],
    #         'learning_rate': [0.01, 0.1, 0.2]
    #     }
    # },
    'Support Vector Regressor': {
        'model': SVR(),
        'param_grid': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf', 'poly']
        }
    },
    'K-Nearest Neighbors Regressor': {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance']
        }
    },
    'XGBoost Regressor': {
        'model': XGBRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    },
    # Disabled entry: uncomment to include CatBoost in the search.
    # 'CatBoost Regressor': {
    #     'model': CatBoostRegressor(verbose=1),
    #     'param_grid': {
    #         'iterations': [100, 200, 300],
    #         'depth': [4, 6, 8],
    #         'learning_rate': [0.01, 0.1, 0.2]
    #     }
    # }
}


In [None]:
# Define a function to train models with different hyperparameters using GridSearchCV
def train_and_select_best_model(X, y, test_size=0.2, random_state=42, min_r2_score=float("-inf")):
    """Grid-search every entry of the module-level ``models`` dict and return the winner.

    The data is split once into train and test partitions. Each candidate is
    tuned with 5-fold ``GridSearchCV`` (scored by R2) on the training partition.
    The estimator that GridSearchCV refits with the best parameters is then
    scored on the held-out test partition, and the candidate with the highest
    test R2 is selected.

    Parameters
    ----------
    X : features, forwarded to ``train_test_split``.
    y : target values aligned with ``X``.
    test_size : float, default 0.2
        Fraction of the data held out for the test partition.
    random_state : int, default 42
        Seed for the train/test split.
    min_r2_score : float, default -inf
        A candidate is only eligible if its test R2 is strictly greater than
        this value. The default accepts any model; pass e.g. ``0.7`` to require
        a minimum quality.

    Returns
    -------
    tuple
        ``(fitted_estimator, best_hyperparameters)`` for the winning candidate,
        or ``(None, None)`` if no candidate exceeded ``min_r2_score``.
    """
    best_model_name = None
    best_estimator = None
    best_r2_score = min_r2_score
    best_hyperparameters = None

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    for model_name, model_data in models.items():
        # GridSearchCV fits clones of the prototype, so the object stored in
        # `models` is never fitted itself.
        grid_search = GridSearchCV(
            model_data['model'], model_data['param_grid'], cv=5, scoring='r2'
        )
        grid_search.fit(X_train, y_train)

        # With the default refit=True, best_estimator_ has been retrained on the
        # whole training partition using the best parameter combination.
        candidate = grid_search.best_estimator_
        test_r2 = candidate.score(X_test, y_test)

        print(f"{model_name} - Best R2 Score: {test_r2}")
        print(f"Best Hyperparameters: {grid_search.best_params_}")

        # NOTE(review): the winner is chosen by its test-set score, so the
        # reported R2 of the selected model is an optimistic estimate.
        if test_r2 > best_r2_score:
            best_r2_score = test_r2
            best_model_name = model_name
            best_estimator = candidate
            best_hyperparameters = grid_search.best_params_

    if best_estimator is None:
        print("No best model found.")
        return None, None

    print(f"The best model is: {best_model_name} with R2 Score: {best_r2_score}")
    print(f"Best Hyperparameters: {best_hyperparameters}")
    # Return the tuned, fitted estimator rather than the unfitted prototype.
    return best_estimator, best_hyperparameters

# Example usage:
# Replace X and y with your dataset and target variable
# best_model, best_hyperparameters = train_and_select_best_model(X, y)


In [None]:
# Run the model search on the feature matrix X and target y built earlier in
# this notebook. The call returns a (model, hyperparameters) pair, or
# (None, None) when no model is selected.
best_model, best_hyperparameters = train_and_select_best_model(X, y)

In [None]:
# Display the estimator returned by the model search.
best_model