In [1]:
import pandas as pd
import requests
import os, json
from json import JSONDecodeError

In [27]:
def get_json(from_year, to_year):
    """Fetch Swiss public holidays for every year from ``from_year`` to ``to_year``.

    One request per year is sent to the date.nager.at v2 endpoint for country
    code ``CH``; the decoded JSON lists are concatenated in year order.

    Args:
        from_year: First year to fetch (int or int-like string). Must satisfy
            ``2000 < from_year < 2050``.
        to_year: Last year to fetch, inclusive. Same bounds as ``from_year``.

    Returns:
        list: The decoded holiday records of all requested years. An empty
        list is returned when either bound is outside the accepted window,
        or when ``from_year > to_year`` (empty year range).

    Raises:
        ValueError: If a bound cannot be converted with ``int()``.
        requests.HTTPError: If the API answers with a non-success status.
    """
    # Convert once so the bounds check and range() work on the same integers
    # (the original validated int(from_year) but iterated over the raw value).
    first_year, last_year = int(from_year), int(to_year)
    # Both bounds are validated; the original checked from_year twice and
    # never looked at to_year.
    if not (2000 < first_year < 2050 and 2000 < last_year < 2050):
        return []
    all_holidays = []
    for year in range(first_year, last_year + 1):
        # A timeout keeps a stalled connection from hanging the notebook.
        r = requests.get(f'https://date.nager.at/api/v2/publicholidays/{year}/CH',
                         timeout=10)
        # Fail loudly instead of extending the result with an error payload.
        r.raise_for_status()
        all_holidays.extend(r.json())
    return all_holidays

def set_date_index(df, col='date'):
    """Return a copy of ``df`` indexed by column ``col`` parsed as datetimes.

    The source column is removed from the columns of the result; ``df`` itself
    is left untouched.
    """
    parsed_dates = pd.to_datetime(df[col])
    indexed = df.set_index(parsed_dates)
    return indexed.drop(columns=col)

def filter_canton(df, can='BS'):
    """Keep the rows of ``df`` whose ``counties`` entry contains ``'CH-' + can``.

    Rows whose ``counties`` value is ``None`` are always kept.
    """
    canton_code = str('CH-' + can)
    keep = []
    for counties in df.counties:
        if counties is None:
            # No county list on the row: it is never filtered out.
            keep.append(True)
        else:
            keep.append(canton_code in counties)
    return df[keep]

def get_holiday_data(canton='BS', from_year=2018, to_year=2018):
    """Build the holiday table for one canton over a range of years.

    The records returned by ``get_json`` are loaded into a DataFrame, indexed
    via ``set_date_index`` (default column) and then narrowed down with
    ``filter_canton``.
    """
    records = get_json(from_year, to_year)
    raw_frame = pd.DataFrame.from_records(records)
    dated_frame = set_date_index(raw_frame)
    return filter_canton(dated_frame, can=canton)

In [22]:
# 105: Basel
def import_weather_data(station_index=105, data_dir="weather_data/2018"):
    """Collect one station's record from every ``.json`` file under ``data_dir``.

    The directory tree is walked recursively. Each JSON file is decoded and the
    element at ``station_index`` is taken from it. Files that cannot be decoded,
    or that do not contain the requested index, are counted as errors and
    skipped. A summary line with both counts is printed.

    Args:
        station_index: Position of the wanted station inside each decoded file.
        data_dir: Root directory to scan. Defaults to the directory the
            function previously had hard-coded.

    Returns:
        pandas.DataFrame: One row per successfully loaded file, without the
        ``code`` and ``station`` columns. An empty frame is returned when no
        record could be loaded.
    """
    stationdata = []
    station_error = 0

    for root, dirs, files in os.walk(data_dir):
        # Sorting makes the row order independent of the filesystem's listing order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue
            filename = os.path.join(root, file)
            try:
                with open(filename) as json_file:
                    data = json.load(json_file)
                stationdata.append(data[station_index])
            except (UnicodeDecodeError, JSONDecodeError):
                # Unreadable or malformed file.
                station_error += 1
            except (IndexError, KeyError, TypeError):
                # Valid JSON, but without an entry at station_index; previously
                # this aborted the whole import.
                station_error += 1
    print("Loaded: %d - errors: %d" % (len(stationdata), station_error))
    pandata = pd.DataFrame(stationdata)
    if pandata.empty:
        # Nothing loaded means there are no columns to drop; drop() would raise.
        return pandata
    return pandata.drop(columns=['code', 'station'])

def clean_weather_data(weather_df):
    """Convert the measurement columns to numbers and index the frame by time.

    ``sunshine`` and ``precipitation`` are converted with ``pd.to_numeric``;
    the ``dateTime`` column is parsed, becomes the index and is removed from
    the columns. The result is sorted by that index.

    Args:
        weather_df: Frame with ``sunshine``, ``precipitation`` and ``dateTime``
            columns. It is not modified.

    Returns:
        pandas.DataFrame: A new, time-indexed and time-sorted frame.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    # Both conversions operate on the argument. The original converted
    # precipitation on an undefined name (`weather_data`), which only worked
    # when a leftover global of that name existed in the kernel.
    # assign() builds a new frame, so the caller's frame is not mutated.
    converted = weather_df.assign(
        sunshine=pd.to_numeric(weather_df['sunshine']),
        precipitation=pd.to_numeric(weather_df['precipitation']),
    )
    timestamps = pd.to_datetime(converted['dateTime'])
    return converted.set_index(timestamps).drop('dateTime', axis=1).sort_index()

def get_weather_score():
    """Return the weather score table: the cleaned weather data's ``sunshine`` column.

    The data comes from ``import_weather_data`` with its default arguments and
    is passed through ``clean_weather_data``.
    """
    raw_weather = import_weather_data()
    cleaned_weather = clean_weather_data(raw_weather)
    # First version of the score: the sunshine value alone.
    score_columns = ['sunshine']
    return cleaned_weather[score_columns]
    

In [None]:
def import_basel_data():
    """Read the semicolon-separated file ``raw_data/Basel_Daten.csv`` into a DataFrame."""
    csv_path = "raw_data/Basel_Daten.csv"
    return pd.read_csv(csv_path, sep=';')

def get_location_data(df, location):
    """Split ``df`` by site name.

    Returns a pair ``(matching, rest)``: the rows whose ``SiteName`` equals
    ``location`` and all remaining rows.
    """
    at_location = df['SiteName'].eq(location)
    matching = df[at_location]
    rest = df[~at_location]
    return matching, rest

def clean_location_data(location_df):
    """Return ``location_df`` indexed by a parsed ``Time`` built from ``DateTimeFrom``.

    Each ``DateTimeFrom`` string is cut at its first ``'+'`` (presumably the
    UTC-offset suffix; confirm against the raw data) and the remainder is
    parsed with the format ``%Y-%m-%dT%H:%M:%S``. The input frame is not
    modified.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%S"
    trimmed = []
    for raw_stamp in location_df['DateTimeFrom'].values:
        # Keep only the part in front of the first '+'.
        trimmed.append(raw_stamp.split('+')[0])
    parsed = pd.to_datetime(trimmed, format=stamp_format)
    return location_df.assign(Time=parsed).set_index('Time')

def get_location_split_dict(data):
    """Split ``data`` into one cleaned DataFrame per distinct ``SiteName``.

    Returns a dict that maps each site name to the result of
    ``clean_location_data`` applied to that site's rows. Sites are processed
    in sorted order.
    """
    frames_by_site = {}
    remaining = data
    # All locations
    for site in sorted(set(data['SiteName'])):
        # Rows already assigned to a site are dropped from the working set.
        site_rows, remaining = get_location_data(remaining, site)
        frames_by_site[site] = clean_location_data(site_rows)
    return frames_by_site

def resample_location_data(location_df, frequency):
    """Sum selected columns of ``location_df`` per resampling period.

    Args:
        location_df: Frame whose index supports ``resample``.
        frequency: ``'D'`` sums the ``Total`` column; ``'H'`` sums ``Total``,
            ``Month``, ``Weekday`` and ``HourFrom``.

    Returns:
        pandas.DataFrame: The selected columns, resampled to ``frequency``
        and summed.

    Raises:
        ValueError: If ``frequency`` is neither ``'D'`` nor ``'H'``.
    """
    if frequency == 'D':
        cols = ['Total']
    elif frequency == 'H':
        cols = ['Total', 'Month', 'Weekday', 'HourFrom']
    else:
        # The original printed a message and then crashed on the unbound
        # `cols`; report the bad argument explicitly instead.
        raise ValueError(
            "INCORRECT FREQUENCY: %r (expected 'D' or 'H')" % (frequency,)
        )
    return location_df[cols].resample(frequency).sum()

In [29]:
# Build every input table using the loaders' default arguments.
holiday_df = get_holiday_data()
weather_df = get_weather_score()
data = import_basel_data()
# Per-site tables keyed by SiteName: one dict for hourly sums, one for daily sums.
location_hourly_sum = {}
location_daily_sum = {}
for loc, loc_data in get_location_split_dict(data).items():
    location_daily_sum[loc] = resample_location_data(loc_data, 'D')    
    location_hourly_sum[loc] = resample_location_data(loc_data, 'H')

Loaded: 13691 - errors: 554


In [30]:
# Preview the first rows of the holiday table.
holiday_df.head()

Unnamed: 0_level_0,counties,countryCode,fixed,global,launchYear,localName,name,type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01,,CH,True,True,1967.0,Neujahr,New Year's Day,Public
2018-03-30,"[CH-ZH, CH-BE, CH-LU, CH-UR, CH-SZ, CH-OW, CH-...",CH,False,False,,Karfreitag,Good Friday,Public
2018-04-02,"[CH-ZH, CH-BE, CH-LU, CH-UR, CH-SZ, CH-OW, CH-...",CH,False,False,1642.0,Ostermontag,Easter Monday,Public
2018-05-01,"[CH-ZH, CH-FR, CH-SO, CH-BS, CH-BL, CH-SH, CH-...",CH,True,False,,Tag der Arbeit,Labour Day,Public
2018-05-10,,CH,False,True,,Auffahrt,Ascension Day,Public


In [31]:
# Preview the first rows of the weather score table.
weather_df.head()

Unnamed: 0_level_0,sunshine
dateTime,Unnamed: 1_level_1
2018-03-03 14:30:00,0.0
2018-03-03 14:40:00,0.0
2018-03-03 14:40:00,0.0
2018-03-03 14:50:00,0.0
2018-03-03 15:00:00,0.0
