In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder that holds one CSV file per province
folder_path = R"C:\Users\andre\OneDrive - Alma Mater Studiorum Università di Bologna\University\UniBo\Machine Learning\PR2.20\data"

# Every CSV in the folder except the 'InfoComune.csv' file
csv_files = [
    file
    for file in os.listdir(folder_path)
    if file.endswith(".csv") and file != 'InfoComune.csv'
]

# Maps each file name (without extension) to the DataFrame read from it
dataframes = {}

for file in csv_files:
    # the dictionary key is the file name stripped of its extension
    df_name = os.path.splitext(file)[0]
    csv_path = os.path.join(folder_path, file)

    # first line is the header; the line right after it (index 1) is skipped
    # NOTE(review): presumably that line holds units/descriptions — confirm against the raw files
    dataframes[df_name] = pd.read_csv(csv_path, header=0, skiprows = [1])


In [None]:
# Aliases for the dictionary views, to make looping over the data easier.
# These are live views: when a value in `dataframes` is replaced later on,
# the replacement is what the views yield on the next iteration.
data_prov_pairs = dataframes.items()   # (province name, DataFrame) pairs
provinces = dataframes.keys()          # province names only
datasets = dataframes.values()         # DataFrames only

In [None]:
# Normalise missing values and dtypes for every province.
# The converted frame is written back into `dataframes`: assigning it to the
# loop variable alone would only rebind a local name and discard the result.
for province, df in list(dataframes.items()):
    dataframes[province] = (
        df
        # change missing values to the proper format
        .replace('---', np.nan)
        # ensure a unique format
        .convert_dtypes()
    )

In [None]:
# Inspect the column labels of the Bologna frame
dataframes['bologna'].columns.values

In [None]:
# Preview the Bologna frame after the '---' placeholders have been replaced
dataframes['bologna']

In [None]:
# Target names for the date columns and the positions they occupy in each frame
date = ['YYYY', 'MM', 'DD']
date_pos = list(range(3))

# Target names for the meteorological columns and the positions they occupy
met = ['TG', 'TN', 'TX', 'HU', 'PP', 'QQ', 'RR']
met_pos = range(6, 13)

# Pollutant columns to keep (per the original note: only the average value)
pollutants = ['CO', 'NH3', 'NMVOC', 'NO2', 'NO', 'O3', 'PANS', 'PM10', 'PM2.5', 'SO2']

# Rename the date and meteorological columns, selected by position, in place
for df in datasets:
    # read both sets of current labels before any renaming happens
    old_date, old_met = df.columns[date_pos], df.columns[met_pos]

    for old_labels, new_labels in ((old_date, date), (old_met, met)):
        df.rename(columns=dict(zip(old_labels, new_labels)), inplace=True)

In [None]:
# Preview the Bologna frame after the date/meteorological columns were renamed
dataframes['bologna']

In [None]:
# Columns retained for the analysis: date parts, meteorology, pollutants
selected_columns = date + met + pollutants

for province, df in list(data_prov_pairs):
    # Keep only the selected columns. The explicit .copy() makes the stored
    # frame independent of the original, so later in-place column edits do not
    # raise SettingWithCopyWarning.
    dataframes[province] = df[selected_columns].copy()

In [None]:
# Preview the Bologna frame restricted to the selected columns
dataframes['bologna']

In [None]:
# Create a single 'date' column for all the dataframes, replacing 'YYYY', 'MM', 'DD'
for province, df in list(data_prov_pairs):
    # pd.to_datetime assembles dates from a frame whose columns are named
    # year/month/day; this is vectorised, unlike joining strings row by row
    date_parts = df[['YYYY', 'MM', 'DD']].rename(
        columns={'YYYY': 'year', 'MM': 'month', 'DD': 'day'}
    )

    # Everything except the three date-part columns, in the original order
    other_columns = [col for col in df.columns if col not in ('YYYY', 'MM', 'DD', 'date')]

    # Build a new frame instead of adding/dropping columns in place on a selection
    with_date = df[other_columns].copy()
    # 'date' goes first
    with_date.insert(0, 'date', pd.to_datetime(date_parts))

    dataframes[province] = with_date

In [None]:
# Preview the Bologna frame with the combined 'date' column
dataframes['bologna']

In [None]:
# All columns that must hold numbers: meteorology plus pollutants
numerics = met + pollutants

for province, df in data_prov_pairs:
    # Parse the columns as numbers (anything unparseable becomes NaN), then
    # round to two decimals for better visualization
    df[numerics] = (
        df[numerics]
        .apply(lambda column: pd.to_numeric(column, errors = 'coerce'))
        .round(2)
    )

In [None]:
# Preview the Bologna frame after numeric conversion and rounding
dataframes['bologna']

In [None]:
# Restrict every series to 2018-01-01 .. 2020-12-28 (both ends included),
# the span chosen so that the series have no missing values
for province, df in data_prov_pairs:
    in_window = df['date'].between(pd.to_datetime('2018-01-01'), pd.to_datetime('2020-12-28'))

    # keep the filtered frame both in the dictionary and in `df`
    dataframes[province] = df = df[in_window]

In [None]:
# Preview the Bologna frame restricted to the selected date range
dataframes['bologna']

In [None]:
# Count the missing values per column in the Bologna frame
# (author's note: we still have 1 missing value)
dataframes['bologna'].isnull().sum()

In [None]:
# Show the rows of the Bologna frame that contain at least one missing value
# (author's note: the missing value is the 29th of February)
dataframes['bologna'][dataframes['bologna'].isnull().any(axis=1)]

Because the remaining missing value lies in the interior of the series rather than at its beginning or end, we impute it with the median, which is robust to extreme values.

In [None]:
from sklearn.impute import SimpleImputer

# Replace every missing value with the median of its column
imputer = SimpleImputer(strategy='median')

for province, df in list(data_prov_pairs):
    # Work on an explicit copy: the stored frames come from a boolean filter,
    # and assigning columns into such a selection raises SettingWithCopyWarning
    df = df.copy()

    # the imputer is re-fitted per province, so each frame uses its own medians
    df[numerics] = imputer.fit_transform(df[numerics])

    dataframes[province] = df


In [None]:
# Preview the Bologna frame after median imputation
dataframes['bologna']

In [None]:
# Re-count the missing values per column after imputation
dataframes['bologna'].isnull().sum()

With the following we apply smoothing to the series, in order to reduce noise and the impact of outliers on the dataset.

The smoothing function is the Savitzky-Golay filter, which applies polynomial smoothing over the time span indicated by the window parameter.

The advantage of this function compared to other smoothing methods like moving average is that it doesn't introduce missing values in the series.

In [None]:
from scipy.signal import savgol_filter

def smooth(window, window_pp, poly_pp, poly = 2):
    """Smooth every numeric column of every province with a Savitzky-Golay filter.

    Reads the notebook-level ``data_prov_pairs`` and ``numerics`` and stores the
    smoothed frames back into ``dataframes``; nothing is returned.

    Parameters
    ----------
    window : int
        Filter window length for every column except 'PP'.
    window_pp : int
        Filter window length used only for the 'PP' column.
    poly_pp : int
        Polynomial order used only for the 'PP' column.
    poly : int, default 2
        Polynomial order for every column except 'PP'.
    """
    # iterate over a snapshot, since `dataframes` is updated inside the loop
    for province, df in list(data_prov_pairs):
        # smooth a copy so the input frame is not modified in place
        smoothed = df.copy()

        for column in numerics:
            # Choose the settings per column. Overwriting `window`/`poly` in the
            # 'PP' branch would leak the PP settings into every column (and every
            # province) processed afterwards.
            # For PP we need higher smoothing due to outliers.
            if column == 'PP':
                column_window, column_poly = window_pp, poly_pp
            else:
                column_window, column_poly = window, poly

            # plain float array for the filter
            values = smoothed[column].to_numpy(dtype=float)
            smoothed[column] = savgol_filter(values, column_window, column_poly)

        dataframes[province] = smoothed

In [None]:
# Smooth all provinces: window 10 / order 2 in general, window 25 / order 5 for 'PP'
# NOTE(review): window=10 is an even window length; SciPy releases before 1.11
# appear to accept only odd values — confirm the installed version or use an odd window
smooth(window = 10, window_pp = 25, poly_pp = 5)