In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Read the training and test sets from CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five rows of the training data
train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [4]:
# Raise the row display limit so the per-column summaries below (one row per
# column, 76 rows) are shown in full. The limit is kept finite on purpose: with
# None, accidentally displaying a full ~79k-row frame would render every row.
pd.set_option('display.max_rows', 100)

In [5]:
# Count the number of missing (NaN) values in each column of the training data
train.isna().sum()

ID_LAT_LON_YEAR_WEEK                                            0
latitude                                                        0
longitude                                                       0
year                                                            0
week_no                                                         0
SulphurDioxide_SO2_column_number_density                    14609
SulphurDioxide_SO2_column_number_density_amf                14609
SulphurDioxide_SO2_slant_column_number_density              14609
SulphurDioxide_cloud_fraction                               14609
SulphurDioxide_sensor_azimuth_angle                         14609
SulphurDioxide_sensor_zenith_angle                          14609
SulphurDioxide_solar_azimuth_angle                          14609
SulphurDioxide_solar_zenith_angle                           14609
SulphurDioxide_SO2_column_number_density_15km               14609
CarbonMonoxide_CO_column_number_density                      2122
CarbonMono

In [6]:
# Get the column labels of the training data (a pandas Index)
column_names = train.columns

# Print the column labels as plain text
print(column_names)

Index(['ID_LAT_LON_YEAR_WEEK', 'latitude', 'longitude', 'year', 'week_no',
       'SulphurDioxide_SO2_column_number_density',
       'SulphurDioxide_SO2_column_number_density_amf',
       'SulphurDioxide_SO2_slant_column_number_density',
       'SulphurDioxide_cloud_fraction', 'SulphurDioxide_sensor_azimuth_angle',
       'SulphurDioxide_sensor_zenith_angle',
       'SulphurDioxide_solar_azimuth_angle',
       'SulphurDioxide_solar_zenith_angle',
       'SulphurDioxide_SO2_column_number_density_15km',
       'CarbonMonoxide_CO_column_number_density',
       'CarbonMonoxide_H2O_column_number_density',
       'CarbonMonoxide_cloud_height', 'CarbonMonoxide_sensor_altitude',
       'CarbonMonoxide_sensor_azimuth_angle',
       'CarbonMonoxide_sensor_zenith_angle',
       'CarbonMonoxide_solar_azimuth_angle',
       'CarbonMonoxide_solar_zenith_angle',
       'NitrogenDioxide_NO2_column_number_density',
       'NitrogenDioxide_tropospheric_NO2_column_number_density',
       'NitrogenDioxide

In [7]:
# Show the data type of each column in the training data
train.dtypes

ID_LAT_LON_YEAR_WEEK                                         object
latitude                                                    float64
longitude                                                   float64
year                                                          int64
week_no                                                       int64
SulphurDioxide_SO2_column_number_density                    float64
SulphurDioxide_SO2_column_number_density_amf                float64
SulphurDioxide_SO2_slant_column_number_density              float64
SulphurDioxide_cloud_fraction                               float64
SulphurDioxide_sensor_azimuth_angle                         float64
SulphurDioxide_sensor_zenith_angle                          float64
SulphurDioxide_solar_azimuth_angle                          float64
SulphurDioxide_solar_zenith_angle                           float64
SulphurDioxide_SO2_column_number_density_15km               float64
CarbonMonoxide_CO_column_number_density         

In [8]:
# Show the (rows, columns) dimensions of the training data
train.shape

(79023, 76)

In [9]:
# Separate the target column from the predictors in the training data
y_train = train['emission']
X_train = train.drop(columns='emission')

# The test frame is used as-is for prediction features. Note this is an alias of
# `test`, not a copy (presumably test.csv has no 'emission' column -- confirm).
X_test = test

In [10]:
# Split the training data into a training (80%) and validation (20%) set.
# The inputs are taken from `train` directly rather than from X_train / y_train,
# because this statement rebinds X_train / y_train: splitting them from themselves
# would shrink the data again every time the cell is re-run. Reading from `train`
# makes the cell idempotent while giving the same split (same rows, same seed).
X_train, X_val, y_train, y_val = train_test_split(
    train.drop('emission', axis=1),
    train['emission'],
    test_size=0.2,
    random_state=0,
)

In [11]:
# Select the columns that contain missing values in ANY of the three splits.
# Looking at X_train alone would skip a column whose NaNs only occur in the
# validation or test rows, leaving those NaNs un-imputed. Column order follows
# X_train.columns.
cols_with_missing = [
    col for col in X_train.columns
    if any(split[col].isnull().any() for split in (X_train, X_val, X_test))
]

# Restrict each split to the columns that will be passed to the imputer
X_train_missing = X_train[cols_with_missing]
X_val_missing = X_val[cols_with_missing]
X_test_missing = X_test[cols_with_missing]

In [None]:
# Define a pipeline that chains the KNNImputer with a regression model, so each
# candidate imputer setting is judged by how well a model fits the imputed data.
# The forest's random_state is pinned so that the cross-validation scores, and
# therefore the selected n_neighbors, are reproducible between runs.
pipe = Pipeline([
    ('imputer', KNNImputer()),
    ('regressor', RandomForestRegressor(random_state=0))
])

# Candidate values to try for the imputer's n_neighbors parameter
param_grid = {'imputer__n_neighbors': list(range(1, 11))}

# Evaluate every candidate with 5-fold cross-validation using the estimator's
# default score. refit=False: only the winning n_neighbors is needed here, the
# imputer itself is fitted separately below. GridSearchCV ranks candidates whose
# fits failed (NaN score) last, unlike np.argmax on a raw score list.
# NOTE: this search is expensive (10 candidates x 5 folds of KNN imputation plus
# a random forest fit).
search = GridSearchCV(pipe, param_grid, cv=5, refit=False)
search.fit(X_train_missing, y_train)

# Mean cross-validation score of each candidate, in param_grid order
scores = search.cv_results_['mean_test_score'].tolist()

# The value of n_neighbors that produced the best cross-validation score
best_n_neighbors = search.best_params_['imputer__n_neighbors']

# Create a KNNImputer object with the best value of n_neighbors
imputer = KNNImputer(n_neighbors=best_n_neighbors)

# Fit the imputer on the training data only and impute its missing values
X_train_result = imputer.fit_transform(X_train_missing)

# Impute the validation and test data with the imputer fitted on the training data
X_val_result = imputer.transform(X_val_missing)
X_test_result = imputer.transform(X_test_missing)

In [None]:
# Wrap each imputed numpy array in a DataFrame, labelling its columns with the
# names of the columns that were passed to the imputer
X_train_df = pd.DataFrame(X_train_result, columns=cols_with_missing)
X_val_df = pd.DataFrame(X_val_result, columns=cols_with_missing)
X_test_df = pd.DataFrame(X_test_result, columns=cols_with_missing)

In [None]:
# Identifier / location / time columns carried over unchanged from the raw splits
cols = ['ID_LAT_LON_YEAR_WEEK', 'latitude', 'longitude', 'year', 'week_no']

# Put the carried-over columns side by side with the imputed columns. The raw
# index is reset first so both halves share a 0-based index and the column-wise
# concat pairs rows by position.
X_train_imputed, X_val_imputed, X_test_imputed = (
    pd.concat([raw_split[cols].reset_index(drop=True), imputed_split], axis=1)
    for raw_split, imputed_split in (
        (X_train, X_train_df),
        (X_val, X_val_df),
        (X_test, X_test_df),
    )
)

In [None]:
# Save the imputed data to 'train_imputed.csv' and 'test_imputed.csv'

# Concatenate the feature DataFrames along the row axis. Both inputs are indexed
# from 0, so ignore_index is needed to give the stacked frame unique row labels.
X = pd.concat([X_train_imputed, X_val_imputed], axis=0, ignore_index=True)

# Concatenate the target Series in the same train-then-validation order.
# y_train / y_val still carry the shuffled row labels produced by
# train_test_split; dropping them makes row i of y the target of row i of X.
y = pd.concat([y_train, y_val], axis=0, ignore_index=True)

# Column assignment aligns on index labels, so X and y must share the same index;
# without the two ignore_index resets above, targets would be attached to the
# wrong rows.
assert len(X) == len(y), "features and targets must have the same number of rows"

# Add the target variable as a new column in the feature DataFrame
X['emission'] = y

# Sort the DataFrame by 'ID_LAT_LON_YEAR_WEEK' in ascending order
X_sorted = X.sort_values(by='ID_LAT_LON_YEAR_WEEK', ascending=True)

# Save the sorted DataFrame to a CSV file
X_sorted.to_csv('train_imputed.csv', index=False)

# Save the imputed test features to a CSV file
X_test_imputed.to_csv('test_imputed.csv', index=False)

In [None]:
# Function to calculate the "Sulphur dioxide" value
def calculate_sulphur_dioxide(row):
    """Return the SO2 slant column density divided by the SO2 ``_amf`` column.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'SulphurDioxide_SO2_slant_column_number_density' and
            'SulphurDioxide_SO2_column_number_density_amf'.

    Returns:
        The quotient of the two values.
    """
    slant_density = row['SulphurDioxide_SO2_slant_column_number_density']
    amf = row['SulphurDioxide_SO2_column_number_density_amf']
    return slant_density / amf

# Function to calculate the "Carbon Monoxide" value
def calculate_co_column_density(row):
    """Combine the four CarbonMonoxide columns into a single value.

    Computes ``(co - h2o) * (altitude / (altitude - cloud_height)) * co``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'CarbonMonoxide_CO_column_number_density',
            'CarbonMonoxide_H2O_column_number_density',
            'CarbonMonoxide_cloud_height' and 'CarbonMonoxide_sensor_altitude'.

    Returns:
        The value of the expression above.
    """
    co = row['CarbonMonoxide_CO_column_number_density']
    h2o = row['CarbonMonoxide_H2O_column_number_density']
    height_of_cloud = row['CarbonMonoxide_cloud_height']
    altitude = row['CarbonMonoxide_sensor_altitude']

    # Sensor altitude relative to its height above the cloud
    altitude_ratio = altitude / (altitude - height_of_cloud)
    return (co - h2o) * altitude_ratio * co

# Function to calculate the "Nitrogen Dioxide" value
def calculate_no2_column_density(row):
    """Combine the three NitrogenDioxide columns into a single value.

    Computes ``(slant - tropospheric) / (1 + stratospheric / slant)``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'NitrogenDioxide_NO2_slant_column_number_density',
            'NitrogenDioxide_tropospheric_NO2_column_number_density' and
            'NitrogenDioxide_stratospheric_NO2_column_number_density'.

    Returns:
        The value of the expression above.
    """
    slant = row['NitrogenDioxide_NO2_slant_column_number_density']
    tropospheric = row['NitrogenDioxide_tropospheric_NO2_column_number_density']
    stratospheric = row['NitrogenDioxide_stratospheric_NO2_column_number_density']

    numerator = slant - tropospheric
    denominator = 1 + (stratospheric / slant)
    return numerator / denominator

# Function to calculate the "Formaldehyde" value
def calculate_hcho_column_density(row):
    """Combine the three Formaldehyde columns into a single value.

    Computes ``(slant - tropospheric) / tropospheric_amf``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Formaldehyde_HCHO_slant_column_number_density',
            'Formaldehyde_tropospheric_HCHO_column_number_density' and
            'Formaldehyde_tropospheric_HCHO_column_number_density_amf'.

    Returns:
        The value of the expression above.
    """
    slant = row['Formaldehyde_HCHO_slant_column_number_density']
    tropospheric = row['Formaldehyde_tropospheric_HCHO_column_number_density']
    tropospheric_amf = row['Formaldehyde_tropospheric_HCHO_column_number_density_amf']

    difference = slant - tropospheric
    return difference / tropospheric_amf

# Function to calculate the "UV Aerosol Index" value
def calculate_uv_aerosol_index(row):
    """Scale the absorbing aerosol index by the aerosol height / pressure ratio.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'UvAerosolIndex_absorbing_aerosol_index',
            'UvAerosolLayerHeight_aerosol_height' and
            'UvAerosolLayerHeight_aerosol_pressure'.

    Returns:
        ``index * (height / pressure)``.
    """
    aerosol_index = row['UvAerosolIndex_absorbing_aerosol_index']
    height_over_pressure = (
        row['UvAerosolLayerHeight_aerosol_height']
        / row['UvAerosolLayerHeight_aerosol_pressure']
    )
    return aerosol_index * height_over_pressure

# Function to calculate the "Ozone" value
def calculate_ozone(row):
    """Return the O3 slant column density divided by the O3 ``_amf`` column.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Ozone_O3_slant_column_number_density' and
            'Ozone_O3_column_number_density_amf'.

    Returns:
        The quotient of the two values.
    """
    slant_density = row['Ozone_O3_slant_column_number_density']
    amf = row['Ozone_O3_column_number_density_amf']
    return slant_density / amf

# Function to calculate the "Cloud" value
def calculate_cloud(row):
    """Scale the cloud top-minus-base pressure difference by the cloud fraction.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Cloud_cloud_fraction', 'Cloud_cloud_top_pressure' and
            'Cloud_cloud_base_pressure'.

    Returns:
        ``fraction * (top_pressure - base_pressure)``.
    """
    fraction = row['Cloud_cloud_fraction']
    pressure_span = row['Cloud_cloud_top_pressure'] - row['Cloud_cloud_base_pressure']
    return fraction * pressure_span

In [None]:
# Define a function to calculate new columns for a DataFrame
def calculate_new_columns(df):
    """Add the seven derived feature columns to ``df`` in place.

    Each derived column is produced by one of the ``calculate_*`` formulas and
    then rounded. Those formulas only index columns by name and combine them
    with arithmetic, so they are applied to the whole frame at once (column-wise)
    instead of once per row through ``df.apply(..., axis=1)``; the values are
    the same, without a Python-level call per row per feature. A zero denominator
    yields inf/NaN under pandas' element-wise division.

    Args:
        df: DataFrame containing the raw columns read by the ``calculate_*``
            formulas. It is modified in place.

    Returns:
        None.
    """
    # (new column name, formula, number of decimals kept)
    derived_features = [
        ('Sulphur_dioxide', calculate_sulphur_dioxide, 7),
        ('Carbon_Monoxide', calculate_co_column_density, 7),
        ('Nitrogen_Dioxide', calculate_no2_column_density, 7),
        ('Formaldehyde', calculate_hcho_column_density, 7),
        ('UV_Aerosol_Index', calculate_uv_aerosol_index, 7),
        ('Ozone', calculate_ozone, 3),
        ('Cloud', calculate_cloud, 3),
    ]
    for column_name, formula, decimals in derived_features:
        df[column_name] = formula(df).round(decimals)

# Add the derived feature columns to each split (the frames are modified in place)
calculate_new_columns(X_train_imputed)
calculate_new_columns(X_val_imputed)
calculate_new_columns(X_test_imputed)

# Keep only the identifier/location/time columns and the derived features
columns = [
    'ID_LAT_LON_YEAR_WEEK', 'latitude', 'longitude', 'year', 'week_no',
    'Sulphur_dioxide', 'Carbon_Monoxide', 'Nitrogen_Dioxide', 'Formaldehyde',
    'UV_Aerosol_Index', 'Ozone', 'Cloud',
]

# NOTE: this rebinds the three names to frames without the raw measurement
# columns, so calculate_new_columns cannot be called on them again; re-running
# this cell requires re-running the cells that build the *_imputed frames first.
X_train_imputed, X_val_imputed, X_test_imputed = (
    split_frame[columns]
    for split_frame in (X_train_imputed, X_val_imputed, X_test_imputed)
)


In [None]:
# Save the engineered data to 'Rwanda_CO2_emission_dataset_train.csv' and
# 'Rwanda_CO2_emission_dataset_test.csv'

# Concatenate the feature DataFrames along the row axis. Both inputs are indexed
# from 0, so ignore_index is needed to give the stacked frame unique row labels.
X = pd.concat([X_train_imputed, X_val_imputed], axis=0, ignore_index=True)

# Concatenate the target Series in the same train-then-validation order.
# y_train / y_val still carry the shuffled row labels produced by
# train_test_split; dropping them makes row i of y the target of row i of X.
y = pd.concat([y_train, y_val], axis=0, ignore_index=True)

# Column assignment aligns on index labels, so X and y must share the same index;
# without the two ignore_index resets above, targets would be attached to the
# wrong rows.
assert len(X) == len(y), "features and targets must have the same number of rows"

# Add the target variable as a new column in the feature DataFrame
X['emission'] = y

# Sort the DataFrame by 'ID_LAT_LON_YEAR_WEEK' in ascending order
X_sorted = X.sort_values(by='ID_LAT_LON_YEAR_WEEK', ascending=True)

# Save the sorted DataFrame to a CSV file
X_sorted.to_csv('Rwanda_CO2_emission_dataset_train.csv', index=False)

# Save the engineered test features to a CSV file
X_test_imputed.to_csv('Rwanda_CO2_emission_dataset_test.csv', index=False)