In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Folder containing the CSV files. The default is the original author's local
# path; set the PR2_DATA_DIR environment variable to point the notebook at a
# different copy of the data without editing this cell.
folder_path = os.environ.get("PR2_DATA_DIR") or (
    R"C:\Users\andre\OneDrive - Alma Mater Studiorum Università di Bologna\University\UniBo\Machine Learning\PR2.20\data"
)

# CSV files to load, skipping 'InfoComune.csv'. os.listdir returns entries in
# arbitrary order, so sort them to make the load order (and therefore the
# key order of `dataframes`) reproducible.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.endswith(".csv") and file != 'InfoComune.csv'
)

# Maps each file's base name (file name without the .csv extension) to its DataFrame
dataframes = {}

for file in csv_files:
    # Extract the file name (excluding .csv); it becomes the dictionary key
    df_name = os.path.splitext(file)[0]

    # header=0 takes the column names from the first line; skiprows=[1] skips
    # the line right after the header so it is not parsed as data.
    dataframes[df_name] = pd.read_csv(os.path.join(folder_path, file), header=0, skiprows=[1])


In [3]:
for province in dataframes:
    # '---' marks a missing value in the raw files: swap it for pd.NA, then let
    # pandas infer the best nullable dtype for every column
    df = dataframes[province].replace('---', pd.NA).convert_dtypes()
    # store the cleaned frame back under the same key
    dataframes[province] = df

In [None]:
# Inspect the column labels of the 'bologna' frame as a NumPy array
dataframes['bologna'].columns.values

In [None]:
# Display the DataFrame stored under the 'bologna' key
dataframes['bologna']

In [5]:
# pollutant columns to keep; they are selected by their existing names
# (original note: "keep the average value only")
pollutants = ['CO', 'NH3', 'NMVOC', 'NO2', 'NO', 'O3', 'PANS', 'PM10', 'PM2.5', 'SO2']
# meteorological information: names given to the columns at positions 6 to 12
met = ['TG', 'TN', 'TX', 'HU', 'PP', 'QQ', 'RR']

# date values: names given to the columns at positions 0 to 2
date = ['YYYY', 'MM', 'DD']

# columns retained in every frame, in this order
selected_columns = date + met + pollutants

# NOTE: the renaming below is positional, so this cell is meant to run once on
# the frames as loaded; re-running it on already-trimmed frames would relabel
# the wrong columns.
for province, df in dataframes.items():
    # Build the new header as a plain list and assign it in one step. Writing
    # into `df.columns.values` edits the Index's backing array behind pandas'
    # back, which can leave its cached label lookups out of sync.
    renamed = df.columns.tolist()
    if len(renamed) < 13:
        raise ValueError(
            f"{province}: expected at least 13 columns, found {len(renamed)}"
        )
    renamed[0:3] = date
    renamed[6:13] = met
    df = df.set_axis(renamed, axis=1)

    # Keep only the selected columns. The explicit copy makes the result an
    # independent frame, so later cells can add or drop columns on it without
    # pandas warning about writing to a slice.
    df = df[selected_columns].copy()

    # Update the dataframe in the dictionary
    dataframes[province] = df
    

In [6]:
# Display the 'bologna' frame (placed right after the column renaming/selection cell)
dataframes['bologna']

Unnamed: 0,YYYY,MM,DD,TG,TN,TX,HU,PP,QQ,RR,CO,NH3,NMVOC,NO2,NO,O3,PANS,PM10,PM2.5,SO2
0,2017,1,1,0.34,-3.44,5.83,82.72,1023.09,60.94,0.0,,,,,,,,,,
1,2017,1,2,2.27,-2.24,8.13,84.16,1018.51,60.34,0.0,,,,,,,,,,
2,2017,1,3,1.92,-1.84,6.88,83.48,1018.54,51.3,0.0,,,,,,,,,,
3,2017,1,4,2.82,-2.61,9.56,79.68,1012.73,73.41,0.0,,,,,,,,,,
4,2017,1,5,3.01,-3.44,7.99,51.05,1015.32,70.92,0.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,2020,12,27,1.83,-0.08,4.24,60.21,1009.94,55.89,7.43,230.37,3.99,16.54,11.98,1.67,62.43,0.87,12.86,10.69,1.25
1457,2020,12,28,2.22,-0.17,3.23,77.08,990.34,0.0,18.34,228.37,2.54,19.26,11.950000000000001,2.15,51.54,1.03,11.55,9.38,0.84
1458,2020,12,29,1.34,-2.66,3.25,73.42,998.6,17.23,7.78,,,,,,,,,,
1459,2020,12,30,2.93,0.61,5.29,75.44,1004.8,36.44,0.01,,,,,,,,,,


In [9]:
# measurement columns that still hold strings and must become numbers
numeric_columns = met + pollutants

for province, df in dataframes.items():
    # Work on a copy so the frame currently stored in the dictionary is not
    # modified in place.
    df = df.copy()
    # Convert ONLY the measurement columns and write them back into the frame.
    # Reassigning `df` to the converted subset would drop every other column
    # (including YYYY/MM/DD, which the date-building cell still needs).
    # Values that cannot be parsed become missing (errors='coerce'); results
    # are rounded to two decimals for better visualization.
    df[numeric_columns] = (
        df[numeric_columns]
        .apply(pd.to_numeric, errors='coerce')
        .round(2)
    )
    # update the dataframe in the dictionary
    dataframes[province] = df

In [11]:
# create a date variable for all the dataframes
date_parts = ['YYYY', 'MM', 'DD']

for province, df in dataframes.items():
    # Already converted (e.g. this cell was re-run): the parts are gone and
    # 'date' exists, so there is nothing left to do for this frame.
    if 'date' in df.columns and not any(part in df.columns for part in date_parts):
        continue

    # Assemble 'YYYY-MM-DD'-style strings column-wise (vectorized) instead of
    # joining every row in Python, then parse them into datetimes.
    date_strings = (
        df['YYYY'].astype(str) + '-' + df['MM'].astype(str) + '-' + df['DD'].astype(str)
    )
    parsed_dates = pd.to_datetime(date_strings, format='%Y-%m-%d')

    # Build a new frame without the date parts (and without any stale 'date'
    # column) rather than mutating the stored one, then put 'date' first.
    converted = df.drop(columns=date_parts + ['date'], errors='ignore')
    converted.insert(0, 'date', parsed_dates)

    dataframes[province] = converted

In [12]:
# Display the 'bologna' frame (placed right after the cell that builds the 'date' column)
dataframes['bologna']

Unnamed: 0,date,TG,TN,TX,HU,PP,QQ,RR,CO,NH3,NMVOC,NO2,NO,O3,PANS,PM10,PM2.5,SO2
0,2017-01-01,0.34,-3.44,5.83,82.72,1023.09,60.94,0.0,,,,,,,,,,
1,2017-01-02,2.27,-2.24,8.13,84.16,1018.51,60.34,0.0,,,,,,,,,,
2,2017-01-03,1.92,-1.84,6.88,83.48,1018.54,51.3,0.0,,,,,,,,,,
3,2017-01-04,2.82,-2.61,9.56,79.68,1012.73,73.41,0.0,,,,,,,,,,
4,2017-01-05,3.01,-3.44,7.99,51.05,1015.32,70.92,0.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,2020-12-27,1.83,-0.08,4.24,60.21,1009.94,55.89,7.43,230.37,3.99,16.54,11.98,1.67,62.43,0.87,12.86,10.69,1.25
1457,2020-12-28,2.22,-0.17,3.23,77.08,990.34,0.0,18.34,228.37,2.54,19.26,11.95,2.15,51.54,1.03,11.55,9.38,0.84
1458,2020-12-29,1.34,-2.66,3.25,73.42,998.6,17.23,7.78,,,,,,,,,,
1459,2020-12-30,2.93,0.61,5.29,75.44,1004.8,36.44,0.01,,,,,,,,,,
