In [30]:
import pandas as pd
import numpy as np

In [31]:
# Load the semicolon-separated export: skip the first three lines of the file,
# keep only the columns at positions 0, 1, 2 and 4, and assign our own names.
df = pd.read_csv(
    '../data/tourismus.csv',
    sep=';',
    skiprows=3,
    usecols=[0, 1, 2, 4],
    names=['Year', 'Month', 'Tour_Arrivals', 'Tour_Stays'],
)

In [32]:
# Carry the last seen Year value forward into the rows where it is missing.
year_filled = df['Year'].ffill()
df['Year'] = year_filled

In [33]:
# Remove the rows labelled 0-3, then renumber the remaining rows from zero.
df = df.drop(labels=[0, 1, 2, 3], axis='index')
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Year,Month,Tour_Arrivals,Tour_Stays
0,2019,Januar,914931,2159841
1,2019,Februar,949002,2235628
2,2019,März,1110954,2642288
3,2019,April,1184986,3009169
4,2019,Mai,1277674,3010462
5,2019,Juni,1244717,3047930
6,2019,Juli,1225219,3216242
7,2019,August,1274832,3399163
8,2019,September,1255860,3035799
9,2019,Oktober,1273608,3120607


In [34]:
# German month names in calendar order; both lookup tables below share these keys.
german_months = [
    'Januar', 'Februar', 'März', 'April', 'Mai', 'Juni',
    'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember',
]

# Month name -> month number, stored as a string ('1' .. '12').
month_mapping = {
    name: str(number) for number, name in enumerate(german_months, start=1)
}

# Month name -> number of days. February is fixed at 28 in this table.
days_in_month = dict(
    zip(german_months, [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
)

In [35]:
# Look up the length of each month from the German month name.
df['Days'] = df['Month'].map(days_in_month)

# The static days_in_month table always gives February 28 days. In leap years
# February has 29, so correct those rows; otherwise the daily averages derived
# from Days are overstated for leap-year Februaries.
# Year is parsed with errors='coerce' so non-numeric values become NaN and
# simply never match the leap-year test.
year_num = pd.to_numeric(df['Year'], errors='coerce')
leap_year = (year_num % 4 == 0) & ((year_num % 100 != 0) | (year_num % 400 == 0))
df.loc[(df['Month'] == 'Februar') & leap_year, 'Days'] = 29

# Replace the month name with its number (as a string). This must run after
# the lookups above, which still need the original names.
df['Month'] = df['Month'].map(month_mapping)

df.head(10)

Unnamed: 0,Year,Month,Tour_Arrivals,Tour_Stays,Days
0,2019,1,914931,2159841,31.0
1,2019,2,949002,2235628,28.0
2,2019,3,1110954,2642288,31.0
3,2019,4,1184986,3009169,30.0
4,2019,5,1277674,3010462,31.0
5,2019,6,1244717,3047930,30.0
6,2019,7,1225219,3216242,31.0
7,2019,8,1274832,3399163,31.0
8,2019,9,1255860,3035799,30.0
9,2019,10,1273608,3120607,31.0


In [36]:
# Parse the count columns as numbers; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df['Tour_Arrivals'] = pd.to_numeric(df['Tour_Arrivals'], errors='coerce')
df['Tour_Stays'] = pd.to_numeric(df['Tour_Stays'], errors='coerce')

# Calculate average daily arrivals and stays
df['Avg_Daily_Arrivals'] = df['Tour_Arrivals'] / df['Days']
df['Avg_Daily_Stays'] = df['Tour_Stays'] / df['Days']

# Retain only necessary columns. The explicit .copy() makes the result an
# independent frame, so assigning columns on it afterwards does not trigger
# pandas' SettingWithCopyWarning.
df = df[['Month', 'Year', 'Tour_Arrivals', 'Tour_Stays', 'Avg_Daily_Arrivals', 'Avg_Daily_Stays']].copy()

In [37]:
# Round the daily averages to whole numbers with the vectorised Series.round()
# rather than calling np.round once per element through .apply().
df['Avg_Daily_Arrivals'] = df['Avg_Daily_Arrivals'].round()
df['Avg_Daily_Stays'] = df['Avg_Daily_Stays'].round()

# Store Month as text.
# NOTE(review): any Month value that is missing at this point would presumably
# become the literal string 'nan' here - confirm that is intended.
df["Month"] = df["Month"].astype(str)
df.head()

Unnamed: 0,Month,Year,Tour_Arrivals,Tour_Stays,Avg_Daily_Arrivals,Avg_Daily_Stays
0,1,2019,914931.0,2159841.0,29514.0,69672.0
1,2,2019,949002.0,2235628.0,33893.0,79844.0
2,3,2019,1110954.0,2642288.0,35837.0,85235.0
3,4,2019,1184986.0,3009169.0,39500.0,100306.0
4,5,2019,1277674.0,3010462.0,41215.0,97112.0


In [38]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df.to_csv(
    '../data/tourism_clean.csv',
    index=False,
)