In [1]:
# Install missing libraries if not already available
!pip install pandas numpy scikit-learn matplotlib seaborn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Install missing libraries
!pip install missingno imbalanced-learn xgboost statsmodels

import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from imblearn.over_sampling import SMOTE

import xgboost as xgb
import statsmodels.api as sm




In [8]:
# Step 3: Upload and Load Dataset

import pandas as pd
from google.colab import files

# Upload datasets (select all CSVs including faostat.csv)
uploaded = files.upload()

# Load Kaggle datasets
station_hour = pd.read_csv("station_hour.csv")
stations     = pd.read_csv("stations.csv")
city_hour    = pd.read_csv("city_hour.csv")
station_day  = pd.read_csv("station_day.csv")
city_day     = pd.read_csv("city_day.csv")

# Load FAOSTAT dataset
faostat = pd.read_csv("faostat.csv")

# Check shapes
print("station_hour:", station_hour.shape)
print("stations:", stations.shape)
print("city_hour:", city_hour.shape)
print("station_day:", station_day.shape)
print("city_day:", city_day.shape)
print("faostat:", faostat.shape)

# Quick look at FAOSTAT
faostat.head()


Saving faostat.csv to faostat.csv
Saving station_hour.csv to station_hour (1).csv
Saving stations.csv to stations (1).csv
Saving city_hour.csv to city_hour (1).csv
Saving station_day.csv to station_day (1).csv
Saving city_day.csv to city_day (1).csv


  station_hour = pd.read_csv("station_hour.csv")


station_hour: (2589083, 16)
stations: (230, 5)
city_hour: (707875, 16)
station_day: (108035, 16)
city_day: (29531, 16)
faostat: (253, 8)


Unnamed: 0,Domain,Area,Element,Item,Year,Source,Unit,Value
0,Emissions totals,India,Emissions (CO2eq) from N2O (AR5),Crop Residues,2002,FAO TIER 1,kt,16163.887
1,Emissions totals,India,Emissions (CO2eq) (AR5),Crop Residues,2002,FAO TIER 1,kt,16163.887
2,Emissions totals,India,Emissions (CO2eq) from N2O (AR5),Crop Residues,2003,FAO TIER 1,kt,17746.2285
3,Emissions totals,India,Emissions (CO2eq) (AR5),Crop Residues,2003,FAO TIER 1,kt,17746.2285
4,Emissions totals,India,Emissions (CO2eq) from N2O (AR5),Crop Residues,2004,FAO TIER 1,kt,17508.921


In [9]:
station_hour.info()
station_day.info()
city_hour.info()
city_day.info()
stations.info()
faostat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2589083 entries, 0 to 2589082
Data columns (total 16 columns):
 #   Column      Dtype  
---  ------      -----  
 0   StationId   object 
 1   Datetime    object 
 2   PM2.5       float64
 3   PM10        float64
 4   NO          float64
 5   NO2         float64
 6   NOx         float64
 7   NH3         float64
 8   CO          float64
 9   SO2         float64
 10  O3          float64
 11  Benzene     float64
 12  Toluene     float64
 13  Xylene      float64
 14  AQI         float64
 15  AQI_Bucket  object 
dtypes: float64(13), object(3)
memory usage: 316.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108035 entries, 0 to 108034
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   StationId   108035 non-null  object 
 1   Date        108035 non-null  object 
 2   PM2.5       86410 non-null   float64
 3   PM10        65329 non-null   float64
 4   NO          90

In [10]:
for df, name in [(station_hour, "station_hour"), (station_day, "station_day"),
                 (city_hour, "city_hour"), (city_day, "city_day"),
                 (stations, "stations"), (faostat, "faostat")]:
    print(f"{name} -> Missing values:\n", df.isnull().sum(), "\n")

station_hour -> Missing values:
 StationId           0
Datetime            0
PM2.5          647689
PM10          1119252
NO             553711
NO2            528973
NOx            490808
NH3           1236618
CO             499302
SO2            742737
O3             725973
Benzene        861579
Toluene       1042366
Xylene        2075104
AQI            570190
AQI_Bucket     570190
dtype: int64 

station_day -> Missing values:
 StationId         0
Date              0
PM2.5         21625
PM10          42706
NO            17106
NO2           16547
NOx           15500
NH3           48105
CO            12998
SO2           25204
O3            25568
Benzene       31455
Toluene       38702
Xylene        85137
AQI           21010
AQI_Bucket    21010
dtype: int64 

city_hour -> Missing values:
 City               0
Datetime           0
PM2.5         145088
PM10          296737
NO            116632
NO2           117122
NOx           123224
NH3           272542
CO             86517
SO2           