In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder containing the per-province CSV files.
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
folder_path = R"C:\Users\andre\OneDrive - Alma Mater Studiorum Università di Bologna\University\UniBo\Machine Learning\PR2.20\data"

# CSV files to load, excluding 'InfoComune.csv'.
# os.listdir returns entries in arbitrary order, so the names are sorted to make the
# load order (and the row order of anything later built from `dataframes`) reproducible.
csv_files = sorted(
    file
    for file in os.listdir(folder_path)
    if file.endswith(".csv") and file != 'InfoComune.csv'
)

# Maps each file's base name (without the ".csv" extension) to its DataFrame.
dataframes = {}

for file in csv_files:
    # Dictionary key: the file name without its extension.
    df_name = os.path.splitext(file)[0]

    # header=0 uses the first line as column names; skiprows=[1] drops the line right after it
    # (presumably a units/description row -- TODO confirm against the raw files).
    dataframes[df_name] = pd.read_csv(os.path.join(folder_path, file), header=0, skiprows=[1])


In [None]:
# For every loaded frame: turn the '---' placeholder into a proper missing value (pd.NA),
# then let pandas pick the best nullable dtype for each column.
dataframes = {
    name: frame.replace('---', pd.NA).convert_dtypes()
    for name, frame in dataframes.items()
}

In [None]:
# Inspect the column labels of the 'bologna' frame.
dataframes['bologna'].columns.values

In [None]:
# Display the 'bologna' frame to eyeball its current contents.
dataframes['bologna']

In [None]:
# columns to keep the average value only
# Pollutant columns to keep (only the average value of each is retained).
pollutants = ['CO', 'NH3', 'NMVOC', 'NO2', 'NO', 'O3', 'PANS', 'PM10', 'PM2.5', 'SO2']
# Meteorological variables; these become the labels of columns 6..12.
met = ['TG', 'TN', 'TX', 'HU', 'PP', 'QQ', 'RR']

# Date components; these become the labels of columns 0..2.
date = ['YYYY', 'MM', 'DD']

# Final column layout shared by every province frame.
selected_columns = date + met + pollutants

for province, df in dataframes.items():
    # Build the new labels on a plain list rather than writing into `df.columns.values`:
    # an Index is meant to be immutable, and mutating its backing array in place is
    # unsupported and can leave cached label lookups out of sync with the visible labels.
    new_labels = df.columns.tolist()
    if len(new_labels) < 13:
        raise ValueError(
            f"'{province}' has {len(new_labels)} columns; at least 13 are needed to rename positions 0-2 and 6-12"
        )

    # Step 1: rename columns 0 to 2 (date components).
    new_labels[0:3] = date

    # Step 2: rename columns 6 to 12 (meteorological variables).
    new_labels[6:13] = met

    # Step 3: apply the labels on a new frame and keep only the selected columns.
    # .copy() makes the result an independent frame, so later column assignments
    # do not operate on a slice of the original.
    dataframes[province] = df.set_axis(new_labels, axis=1)[selected_columns].copy()
    

In [None]:
# Display the 'bologna' frame to check the renamed and selected columns.
dataframes['bologna']

In [None]:
# create a date variable for all the dataframes
for province, df in dataframes.items():
    # Combine 'YYYY', 'MM', 'DD' columns into a new 'date' column
    df['date'] = pd.to_datetime(df[['YYYY', 'MM', 'DD']].astype(str).agg('-'.join, axis=1), format='%Y-%m-%d')
    
    # Remove 'YYYY', 'MM', 'DD' columns
    df.drop(['YYYY', 'MM', 'DD'], axis=1, inplace=True)
    
    # Reorder columns with 'date' as the first column
    dataframes[province] = df[['date'] + [col for col in df.columns if col != 'date']]

In [None]:
# Display the 'bologna' frame to check the new 'date' column.
dataframes['bologna']

In [None]:
# Create an empty list to store modified dataframes
dfs = []

# Iterate through the dictionary items
for province, df in dataframes.items():
    # Add a 'province' column with the current province name
    df['province'] = province
    # Append the modified dataframe to the list
    dfs.append(df)

In [None]:
# Concatenate all dataframes in the list along the rows
full_df = pd.concat(dfs, ignore_index=True)
full_df

In [None]:
# bring province at the beginning
full_df = full_df[['province'] + [col for col in full_df.columns if col != 'province']]
full_df

In [None]:
# Reset the index of the final dataframe to ensure it's not distorted by concatenation
full_df.reset_index(drop=True, inplace=True)
full_df

In [None]:
# Inspect the column dtypes: some columns that should be float are still strings,
# so they need to be converted.
full_df.dtypes

In [None]:
# Convert every pollutant column to a numeric dtype; values that cannot be parsed become missing.
full_df[pollutants] = full_df[pollutants].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

In [None]:
# Re-check the column dtypes.
full_df.dtypes

In [None]:
# round numbers to the 2nd decimal place
integer_columns = full_df.select_dtypes(include='int').columns
full_df[integer_columns] = full_df[integer_columns].round(2)

In [24]:
# Display the combined frame.
full_df

Unnamed: 0,province,date,TG,TN,TX,HU,PP,QQ,RR,CO,NH3,NMVOC,NO2,NO,O3,PANS,PM10,PM2.5,SO2
0,bologna,2017-01-01,0.34,-3.44,5.83,82.72,1023.09,60.94,0.0,,,,,,,,,,
1,bologna,2017-01-02,2.27,-2.24,8.13,84.16,1018.51,60.34,0.0,,,,,,,,,,
2,bologna,2017-01-03,1.92,-1.84,6.88,83.48,1018.54,51.3,0.0,,,,,,,,,,
3,bologna,2017-01-04,2.82,-2.61,9.56,79.68,1012.73,73.41,0.0,,,,,,,,,,
4,bologna,2017-01-05,3.01,-3.44,7.99,51.05,1015.32,70.92,0.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13144,rimini,2020-12-27,1.97,-0.42,5.02,70.08,1007.42,47.68,0.14,201.54,3.22,12.45,9.18,0.89,58.81,0.95,10.37,7.4,1.13
13145,rimini,2020-12-28,5.97,1.29,10.08,80.59,991.55,22.09,13.85,197.51,0.94,8.21,4.71,0.16,74.46,0.96,9.97,7.65,0.98
13146,rimini,2020-12-29,7.94,3.97,11.95,79.58,999.3,35.86,9.03,,,,,,,,,,
13147,rimini,2020-12-30,4.11,2.35,7.23,84.01,1003.39,42.15,3.67,,,,,,,,,,
