In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Gathering Data

**Since all the csv files contain the same columns, it should be better to just combine all of them for easier**

In [13]:
dataset_path = './data/'
datasets = [os.path.join(dataset_path, filename) for filename in os.listdir(dataset_path)]

def load_and_cat_dataframes(paths):
    dataframes = []
    for path in paths:
        df = pd.read_csv(path)
        dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

df = load_and_cat_dataframes(datasets)
df.head()

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7,Dongsi
1,2,2013,3,1,1,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,NW,3.9,Dongsi
2,3,2013,3,1,2,7.0,7.0,,17.0,300.0,60.0,-1.2,1025.3,-24.6,0.0,NNW,5.3,Dongsi
3,4,2013,3,1,3,3.0,3.0,5.0,18.0,,,-1.4,1026.2,-25.5,0.0,N,4.9,Dongsi
4,5,2013,3,1,4,3.0,3.0,7.0,,200.0,84.0,-1.9,1027.1,-24.5,0.0,NNW,3.2,Dongsi


# Assessing Data

**Assess and find out what the data is about and figure out what information is in the dataframe**

In [29]:
df.describe(include="all")

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
count,420768.0,420768.0,420768.0,420768.0,420768.0,412029.0,414319.0,411747.0,408652.0,400067.0,407491.0,420370.0,420375.0,420365.0,420378.0,418946,420450.0,420768
unique,,,,,,,,,,,,,,,,16,,12
top,,,,,,,,,,,,,,,,NE,,Dongsi
freq,,,,,,,,,,,,,,,,43335,,35064
mean,17532.5,2014.66256,6.52293,15.729637,11.5,79.793428,104.602618,15.830835,50.638586,1230.766454,57.372271,13.538976,1010.746982,2.490822,0.064476,,1.729711,
std,10122.116943,1.177198,3.448707,8.800102,6.922195,80.822391,91.772426,21.650603,35.127912,1160.182716,56.661607,11.436139,10.474055,13.793847,0.821004,,1.246386,
min,1.0,2013.0,1.0,1.0,0.0,2.0,2.0,0.2856,1.0265,100.0,0.2142,-19.9,982.4,-43.4,0.0,,0.0,
25%,8766.75,2014.0,4.0,8.0,5.75,20.0,36.0,3.0,23.0,500.0,11.0,3.1,1002.3,-8.9,0.0,,0.9,
50%,17532.5,2015.0,7.0,16.0,11.5,55.0,82.0,7.0,43.0,900.0,45.0,14.5,1010.4,3.1,0.0,,1.4,
75%,26298.25,2016.0,10.0,23.0,17.25,111.0,145.0,20.0,71.0,1500.0,82.0,23.3,1019.0,15.1,0.0,,2.2,


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420768 entries, 0 to 420767
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   No       420768 non-null  int64  
 1   year     420768 non-null  int64  
 2   month    420768 non-null  int64  
 3   day      420768 non-null  int64  
 4   hour     420768 non-null  int64  
 5   PM2.5    412029 non-null  float64
 6   PM10     414319 non-null  float64
 7   SO2      411747 non-null  float64
 8   NO2      408652 non-null  float64
 9   CO       400067 non-null  float64
 10  O3       407491 non-null  float64
 11  TEMP     420370 non-null  float64
 12  PRES     420375 non-null  float64
 13  DEWP     420365 non-null  float64
 14  RAIN     420378 non-null  float64
 15  wd       418946 non-null  object 
 16  WSPM     420450 non-null  float64
 17  station  420768 non-null  object 
dtypes: float64(11), int64(5), object(2)
memory usage: 57.8+ MB


**There are several notes that can be taken from the general structure of the csv and the above df.describe and df.info:**
- The "No" column is the index of the data grouped by station
- The "year", "month", "day", and "hour" columns can be turned into a single time column
- The data has a time range from 2013 to 2017
- The data contains the concentration density(?) of some atmospheric gas(?) in some metric I'm not sure

**Since there are some terms I am not familiar with, I use ChatGPT to find out more about them, and here is what it said word to word:**

- PM2.5: Particulate matter with a diameter of 2.5 micrometers or less (μg/m³). These fine particles can affect human health.
- PM10 : Particulate matter with a diameter of 10 micrometers or less (μg/m³). These are larger than PM2.5 but still harmful.
- SO2  : Sulfur dioxide concentration (μg/m³), a gas commonly associated with air pollution.
- NO2  : Nitrogen dioxide concentration (μg/m³), another pollutant that can affect respiratory health.
- CO   : Carbon monoxide concentration (mg/m³), a harmful gas resulting from incomplete combustion.
- O3   : Ozone concentration (μg/m³), which can contribute to smog and affect respiratory health.
- TEMP : Temperature (°C), the air temperature at the time of recording.
- PRES : Atmospheric pressure (hPa or millibars), which can influence weather patterns.
- DEWP : Dew point (°C), the temperature at which air becomes saturated and dew forms.
- RAIN : Precipitation amount (mm), indicating how much rainfall occurred.
- wd   : Wind direction, likely categorical (e.g., "N" for north, "S" for south).
- WSPM : Wind speed (m/s), indicating the speed of the wind.

In [24]:
df.isnull().sum().rename('null_count')

Unnamed: 0,null_count
No,0
year,0
month,0
day,0
hour,0
PM2.5,8739
PM10,6449
SO2,9021
NO2,12116
CO,20701


In [20]:
print("Number of duplicated rows:", df.duplicated().sum())

Number of duplicated rows: 0


# Cleaning Data

In [30]:
cleaned_df = df.copy()

**Combine "year", "month", "day", "hour" columns into one column "time" to make the dataframe easier to read**

In [32]:
cleaned_df['time'] = pd.to_datetime(cleaned_df[['year', 'month', 'day', 'hour']])
cleaned_df.drop(['year', 'month', 'day', 'hour'], axis=1, inplace=True)
cleaned_df.head()

Unnamed: 0,No,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station,time
0,1,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7,Dongsi,2013-03-01 00:00:00
1,2,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,NW,3.9,Dongsi,2013-03-01 01:00:00
2,3,7.0,7.0,,17.0,300.0,60.0,-1.2,1025.3,-24.6,0.0,NNW,5.3,Dongsi,2013-03-01 02:00:00
3,4,3.0,3.0,5.0,18.0,,,-1.4,1026.2,-25.5,0.0,N,4.9,Dongsi,2013-03-01 03:00:00
4,5,3.0,3.0,7.0,,200.0,84.0,-1.9,1027.1,-24.5,0.0,NNW,3.2,Dongsi,2013-03-01 04:00:00
