<a href="https://colab.research.google.com/github/BuhariS/DataScienceWithPython/blob/main/AirQo_African_Air_Quality_Prediction_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [25]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

# Load data

In [3]:
sample_sub_df = pd.read_csv('/content/drive/MyDrive/Personal Development/competitions/SampleSubmission.csv')
train_df = pd.read_csv('/content/drive/MyDrive/Personal Development/competitions/Train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Personal Development/competitions/Test.csv')

**Data snapshot**

In [4]:
train_df.head(3)

Unnamed: 0,id,site_id,site_latitude,site_longitude,city,country,date,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,...,cloud_cloud_top_height,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,pm2_5
0,id_vjcx08sz91,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-10-25,13,,,...,,,,,,,,,,12.015
1,id_bkg215syli,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-02,12,,,...,,,,,,,,,,42.2672
2,id_oui2pot3qd,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-03,13,,,...,6791.682888,51171.802486,5791.682829,11.816715,0.192757,-96.41189,61.045123,-121.307414,41.898269,39.450741


In [5]:
test_df.head(3)

Unnamed: 0,id,site_id,site_latitude,site_longitude,city,country,date,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,...,cloud_cloud_top_pressure,cloud_cloud_top_height,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle
0,id_ihxgrbq8bw,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-06,13,-7.2e-05,0.762543,...,74217.403083,2710.544562,83569.504246,1710.544483,3.063105,0.263193,-100.317077,27.059646,-86.88567,25.530511
1,id_dg6s4fhiwe,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-07,13,,,...,,,,,,,,,,
2,id_f7hwwtmuzp,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-08,13,-5.1e-05,1.004265,...,,,,,,,,,,


In [6]:
sample_sub_df.head(3)

Unnamed: 0,id,pm2_5
0,id_ihxgrbq8bw,0
1,id_dg6s4fhiwe,0
2,id_f7hwwtmuzp,0


**Dataset metadata**

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8071 entries, 0 to 8070
Data columns (total 80 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   id                                                        8071 non-null   object 
 1   site_id                                                   8071 non-null   object 
 2   site_latitude                                             8071 non-null   float64
 3   site_longitude                                            8071 non-null   float64
 4   city                                                      8071 non-null   object 
 5   country                                                   8071 non-null   object 
 6   date                                                      8071 non-null   object 
 7   hour                                                      8071 non-null   int64  
 8   sulphurdioxide_so2

In [8]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
site_latitude,8071.0,0.635678,2.154605,-3.459970,0.288630,0.332610,0.374653,6.595400
site_longitude,8071.0,30.246982,9.393151,3.201510,32.568643,32.600480,32.641110,36.914272
hour,8071.0,10.877463,0.815875,10.000000,10.000000,11.000000,11.000000,14.000000
sulphurdioxide_so2_column_number_density,3159.0,0.000024,0.000354,-0.000980,-0.000196,0.000012,0.000221,0.002024
sulphurdioxide_so2_column_number_density_amf,3159.0,0.687295,0.142544,0.299364,0.589849,0.669632,0.766817,1.607052
...,...,...,...,...,...,...,...,...
cloud_sensor_azimuth_angle,4994.0,-18.952179,85.873938,-102.706078,-98.865659,-97.477511,72.814735,77.319656
cloud_sensor_zenith_angle,4994.0,45.881380,14.721454,2.996863,37.901279,49.187592,57.156175,66.238778
cloud_solar_azimuth_angle,4994.0,-84.811978,40.627170,-159.058782,-125.371559,-74.597511,-49.902340,-22.251009
cloud_solar_zenith_angle,4994.0,28.708028,7.038309,10.314990,24.102775,29.002745,33.929767,45.496395


**Observation**

- Latitude and longitude have valid entries, may need transformation
- hour ranges from 10:00 AM to 2:00 PM
- the count of all so2 is 3104
- site id seems to be having the same info as city and country
- date can be separated to its constituent numbers
- replace nans with grouped averages

## Merge test and train

In [9]:
train_predictors = train_df.drop('pm2_5', axis=1)

In [16]:
def process_data(train_df, test_df):
    """
    Process the training and testing DataFrames as specified.

    Args:
        train_df (pandas.DataFrame): The training DataFrame.
        test_df (pandas.DataFrame): The testing DataFrame.

    Returns:
        tuple: A tuple containing two processed DataFrames (processed_train, processed_test).
    """
    # Stack the training predictors and test data vertically
    stacked_df = pd.concat([train_df, test_df], axis=0)

    # Perform one-hot encoding for specified columns
    dum_data_df = pd.get_dummies(stacked_df, columns=['site_id', 'city', 'country'], drop_first=True)

    # Separate processed data back into training and testing DataFrames
    processed_train = dum_data_df.iloc[:len(train_df)]
    processed_test = dum_data_df.iloc[len(train_df):]

    return processed_train, processed_test


In [18]:
processed_train, processed_test = process_data(train_predictors, test_df)

## Transform

In [22]:
def transform_dataframe(df):
    # Drop 'id' column
    df = df.drop('id', axis=1)

    # Convert 'date' column to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Extract 'year' from 'date' column
    df['year'] = df['date'].dt.year

    # Drop 'date' column
    df = df.drop('date', axis=1)

    # Replace null values with 0
    df = df.fillna(0)

    return df

In [23]:
# Example usage:
transformed_train = transform_dataframe(processed_train)

In [24]:
# Example usage:
transformed_test = transform_dataframe(processed_test)

## Scaling

In [26]:
def scale_data(df):
    """
    Scales the input DataFrame using StandardScaler.

    Args:
        df (pandas.DataFrame): Input DataFrame to be scaled.

    Returns:
        numpy.ndarray: Scaled data.
    """
    # Initialize StandardScaler
    scaler = StandardScaler()

    # Fit the scaler to the data and transform the data
    X_scaled = scaler.fit_transform(df)

    return X_scaled


In [27]:
X_scaled_train = scale_data(transformed_train)

In [28]:
X_scaled_test = scale_data(transformed_test)

In [29]:
y = train_df['pm2_5']

## Train and evaluate

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

def train_test_model(predictors, target, model):
    """
    Train a model on the provided predictors and target, make predictions, and return RMSE.

    Args:
        predictors (pandas.DataFrame or numpy.ndarray): Predictors/features.
        target (pandas.Series or numpy.ndarray): Target variable.
        model: The machine learning model to be trained.

    Returns:
        float: Root Mean Squared Error (RMSE) of the model's predictions.
    """
    # Split the data into train and test sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate RMSE
    rmse = sqrt(mean_squared_error(y_test, y_pred))

    return rmse


In [44]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Example usage
model = LinearRegression()
r_model = RandomForestRegressor()
rmse = train_test_model(X_scaled_train, y, r_model)
print("Root Mean Squared Error (RMSE):", rmse)


Root Mean Squared Error (RMSE): 16.748629566304647


In [33]:
model = LinearRegression()

In [34]:
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

In [36]:
mean_squared_error(y_test, y_pred, squared=False)

17.835869266917996

In [None]:
r_model = RandomForestRegressor()

In [None]:
r_model.fit(X_scaled_train, y)

In [None]:
y_pred = r_model.predict(X_scaled_test)

In [None]:
mean_squared_error(y_test, y_pred, squared=False)

**Test many**

In [45]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge

def compare_regression_models(predictors, target):
    """
    Compare the performance of multiple regression models using RMSE.

    Args:
        predictors (pandas.DataFrame or numpy.ndarray): Predictors/features.
        target (pandas.Series or numpy.ndarray): Target variable.

    Returns:
        pandas.DataFrame: DataFrame of models with their respective RMSE arranged in ascending order.
    """
    # Instantiate regressor models
    models = [
        LinearRegression(),
        Ridge(),
        Lasso(),
        ElasticNet(),
        SVR(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        KNeighborsRegressor(),
        BayesianRidge()
    ]

    # Dictionary to store model names and their RMSE
    model_rmse = {}

    # Train each model and calculate RMSE
    for model in models:
        model_name = model.__class__.__name__
        rmse = train_test_model(predictors, target, model)
        model_rmse[model_name] = rmse

    # Create DataFrame from model_rmse dictionary
    rmse_df = pd.DataFrame(list(model_rmse.items()), columns=['Model', 'RMSE'])

    # Sort DataFrame by RMSE in ascending order
    rmse_df = rmse_df.sort_values(by='RMSE')

    return rmse_df


In [47]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.neural_network import MLPRegressor

def compare_regression_models2(predictors, target):
    """
    Compare the performance of multiple regression models using RMSE.

    Args:
        predictors (pandas.DataFrame or numpy.ndarray): Predictors/features.
        target (pandas.Series or numpy.ndarray): Target variable.

    Returns:
        pandas.DataFrame: DataFrame of models with their respective RMSE arranged in ascending order.
    """
    # Instantiate regressor models
    models = [
        LinearRegression(),
        Ridge(),
        Lasso(),
        ElasticNet(),
        SVR(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        KNeighborsRegressor(),
        BayesianRidge(),
        AdaBoostRegressor(),
        HuberRegressor(),
        PassiveAggressiveRegressor(),
        TheilSenRegressor(),
        MLPRegressor()
    ]

    # Dictionary to store model names and their RMSE
    model_rmse = {}

    # Train each model and calculate RMSE
    for model in models:
        model_name = model.__class__.__name__
        rmse = train_test_model(predictors, target, model)
        model_rmse[model_name] = rmse

    # Create DataFrame from model_rmse dictionary
    rmse_df = pd.DataFrame(list(model_rmse.items()), columns=['Model', 'RMSE'])

    # Sort DataFrame by RMSE in ascending order
    rmse_df = rmse_df.sort_values(by='RMSE')

    return rmse_df


In [46]:
compare_regression_models(X_scaled_train, y)

Unnamed: 0,Model,RMSE
6,RandomForestRegressor,16.6787
9,BayesianRidge,17.573038
1,Ridge,17.640094
7,GradientBoostingRegressor,17.681601
0,LinearRegression,17.69953
2,Lasso,18.753111
8,KNeighborsRegressor,18.903024
3,ElasticNet,19.440097
5,DecisionTreeRegressor,21.736285
4,SVR,24.532587


In [48]:
compare_regression_models2(X_scaled_train, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0,Model,RMSE
6,RandomForestRegressor,16.86587
14,MLPRegressor,16.91141
9,BayesianRidge,17.573038
1,Ridge,17.640094
7,GradientBoostingRegressor,17.690142
0,LinearRegression,17.69953
11,HuberRegressor,18.027459
2,Lasso,18.753111
8,KNeighborsRegressor,18.903024
3,ElasticNet,19.440097


## Fine tuning random forest

In [50]:
# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled_train, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

# Instantiate Random Forest Regressor
rf_regressor = RandomForestRegressor()

# Perform grid search
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Fit the model using the best parameters
best_estimator.fit(X_train, y_train)

# Evaluate the model
y_pred = best_estimator.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print("Root Mean Squared Error:", rmse)


## Generate submission

In [40]:
def predict_with_model(predictors, target, model):
    """
    Make predictions using the provided model.

    Args:
        predictors (pandas.DataFrame or numpy.ndarray): Predictors/features.
        target (pandas.Series or numpy.ndarray): Target variable.
        model: The machine learning model.

    Returns:
        numpy.ndarray: Predicted target variable.
    """
    # Fit the model to all data
    model.fit(predictors, target)

    # Make predictions
    y_pred = model.predict(X_scaled_test)

    return y_pred


In [41]:
y_pred = predict_with_model(X_scaled_train, y, model)

In [42]:
y_pred

array([2.78534343e+13, 2.78534343e+13, 2.78534343e+13, ...,
       1.37632009e+13, 1.37632009e+13, 1.37632009e+13])

In [None]:
sample_sub_df['pm2_5'] = y_pred

In [None]:
sample_sub_df.to_csv('submission.csv', index=False)