In [6]:
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg
from pymongo import MongoClient
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from glob import glob

### The goal is to figure out if time series model such as ARIMA or AutoReg will be better predictive models of Air Quality (PM2.5) in Lagos/Nigeria than a regular Linear or Ridge regression

So imagine, you got like 5 csv files from a sleek looking website. Turns out, all the data looking dirty like coz they were processes on MS Excel on a silly MAC

In [73]:
data_files = glob("./data/tmp*.csv")

In [74]:
data_files[0]
pd.read_csv(data_files[0]).head(2)

Unnamed: 0,sensor_id;sensor_type;location;lat;lon;timestamp;value_type;value
0,4852;DHT22;3627;6.515;3.400;2024-02-01T12:02:5...
1,4852;DHT22;3627;6.515;3.400;2024-02-01T12:02:5...


In [64]:
temp_df = pd.read_csv(data_files[0])
columns = temp_df.columns.str.split(";")
temp_df = temp_df["sensor_id;sensor_type;location;lat;lon;timestamp;value_type;value"].str.split(";", expand=True)
temp_df.columns = list(columns)[0]
temp_df.head()

Unnamed: 0,sensor_id,sensor_type,location,lat,lon,timestamp,value_type,value
0,4852,DHT22,3627,6.515,3.4,2024-02-01T12:02:52.744148+00:00,humidity,31.0
1,4852,DHT22,3627,6.515,3.4,2024-02-01T12:02:52.744148+00:00,temperature,30.0
2,4852,DHT22,3627,6.515,3.4,2024-02-01T12:03:47.839650+00:00,humidity,31.3
3,4852,DHT22,3627,6.515,3.4,2024-02-01T12:03:47.839650+00:00,temperature,30.1
4,4852,DHT22,3627,6.515,3.4,2024-02-01T12:04:46.909811+00:00,humidity,31.4


In [66]:
import pytz
temp_df["timestamp"] = pd.to_datetime(temp_df["timestamp"])
temp_df = temp_df.set_index("timestamp")
temp_df.index.tz_convert("Africa/Lagos")[:5]

DatetimeIndex(['2024-02-01 13:02:52.744148+01:00',
               '2024-02-01 13:02:52.744148+01:00',
               '2024-02-01 13:03:47.839650+01:00',
               '2024-02-01 13:03:47.839650+01:00',
               '2024-02-01 13:04:46.909811+01:00'],
              dtype='datetime64[ns, Africa/Lagos]', name='timestamp', freq=None)

In [34]:
temp_df.value_type.value_counts()

value_type
P2             4270
P1             4270
P0             4270
humidity       4144
temperature    4143
Name: count, dtype: int64

In [37]:
## Starting off with the Linear and Ridge Regression Models
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import mean_absolute_error

In [75]:
# Dealing with data wrangling
def wrangle(data_file):
    df = pd.read_csv(data_file)
    columns = df.columns.str.split(";")
    df = df["sensor_id;sensor_type;location;lat;lon;timestamp;value_type;value"].str.split(";", expand=True)
    df.columns = list(columns)[0]

    # Working with only PM2.5 value_type
    df_p2 = df["value_type"] == "P2"
    df = df[df_p2]

    # Remove irrelevant columns
    df.drop(columns=['sensor_id','sensor_type', 'location', 'value_type'], inplace=True)

    # Convert timeezone to Lagos's
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.set_index("timestamp")
    df.index.tz_convert("Africa/Lagos")
    
    return df

In [81]:
data_files

['./data\\tmp29zv95aa.csv',
 './data\\tmpdhpbd64k.csv',
 './data\\tmpf7lx946x.csv',
 './data\\tmpoh50r5me.csv',
 './data\\tmpq5nzwa0h.csv']

In [80]:
for data in data_files:
    df = wrangle(data)
    print(df.head(2))

                                    lat    lon  value
timestamp                                            
2024-02-01 16:31:44.331333+00:00  6.515  3.400  53.57
2024-02-01 16:38:14.403956+00:00  6.515  3.400  56.89
                                    lat    lon  value
timestamp                                            
2023-12-01 05:32:46.225178+00:00  6.540  3.297  45.70
2023-12-01 05:34:06.969479+00:00  6.540  3.297  43.67
                                    lat    lon  value
timestamp                                            
2024-01-01 00:31:56.569920+00:00  6.428  3.435  52.00
2024-01-01 00:37:01.145977+00:00  6.428  3.435  46.00
                                    lat    lon  value
timestamp                                            
2023-11-01 16:44:52.764459+00:00  6.428  3.435  19.00
2023-11-01 17:52:08.924982+00:00  6.540  3.297   3.50
                                    lat    lon  value
timestamp                                            
2024-03-01 00:00:39.896476+0

In [None]:
Merge order of datafiles items [3, 1, 2, 0, 4]