In [8]:
import pandas as pd
import os

In [9]:
#Define the clean and combine function
def clean_and_combine(file_path):
    #Load the data
    weather_data = pd.read_csv(file_path)

    #Clean Data
    weather_data['Date'] = pd.to_datetime(weather_data[['Year', 'Month', 'Day']])
    weather_data = weather_data.drop(columns=["Product code", 
                                            "Bureau of Meteorology station number",
                                            "Days of accumulation of maximum temperature", 
                                            "Quality",
                                            "Year", "Month", "Day"])
    weather_data = weather_data.loc[weather_data['Date'] >= '2009-01-01']

    weather_data = weather_data.rename(columns={'Maximum temperature (Degree C)': 'temperature'})
    
    # Correct NaN
    # Calculate the rolling mean with a window of 7 days (3 days before, the current day, and 3 days after)
    weather_data['rolling_mean'] = weather_data['temperature'].rolling(window=3, min_periods=1, center=True).mean()
    weather_data['solar_exposure'] = weather_data['temperature'].fillna(weather_data['rolling_mean'])
    weather_data = weather_data.drop(columns=['rolling_mean'])


    # Define the city-region mapping
    city_region_map = {
            'adelaide': 'SA1',
            'brisbane': 'QLD1',
            'sydney': 'NSW1',
            'melbourne': 'VIC1',
            'hobart': 'TAS1'
                }  
    
    # Extract the city name from the file name
    city_name = os.path.basename(file_name).split('weather_')[-1].split('.csv')[0]
    region_code = city_region_map.get(city_name)
    weather_data['regionid'] = region_code
        
    # Append cleaned data to list
    all_data.append(weather_data)
    print(f'Data cleaned and added to list for {region_code}')

In [10]:
# Specify the directory where your CSV files are stored 
data_dir = 'C:/Users/David/Documents/VWL/Master Toulouse/Semester 2 M1/Applied  Metrics Project/Data'
weather_dir = f'{data_dir}/weather data'
all_data = []
# Loop through each CSV file in the directory
for file_name in os.listdir(weather_dir):
    if file_name.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(weather_dir, file_name)

        # Call the function to clean 
        clean_and_combine(file_path)


# Merge all data frames
merged_data = pd.concat(all_data)

# Save merged data to CSV
merged_data.to_csv(f'{data_dir}/07-weather-merged.csv', index=False)
print('All data merged and saved to CSV')


Data cleaned and added to list for SA1
Data cleaned and added to list for QLD1
Data cleaned and added to list for TAS1
Data cleaned and added to list for VIC1
Data cleaned and added to list for NSW1
All data merged and saved to CSV


In [11]:
merged_data.isna()

Unnamed: 0,temperature,Date,solar_exposure,regionid
19724,False,False,False,False
19725,False,False,False,False
19726,False,False,False,False
19727,False,False,False,False
19728,False,False,False,False
...,...,...,...,...
31062,False,False,False,False
31063,False,False,False,False
31064,False,False,False,False
31065,False,False,False,False
