In [89]:
import sys
sys.path.append("../..")

In [90]:
# import libraries
import pandas as pd
import numpy as np
import os

In [91]:
# import custom modules
from survival.utils import lower_case
from survival.utils import show_all


In [92]:
# load the raw data (semicolon-separated, latin1-encoded CSV)
raw = pd.read_csv(
    '../../data/raw/raw.csv',
    sep=';',
    encoding='latin1',
)

In [93]:
# rename columns for readability
# (names are assigned by position, so the order must match the CSV's column order)
raw.columns = [
    # customer type
    'is_institutional', 'is_individual', 'school_type', 'rank',
    # customer contact / location
    'country_code', 'email', 'municipality', 'city',
    # event and purchase
    'production', 'season', 'purchase_date', 'start_date',
    'ticket_type', 'currency', 'price',
    # flags
    'is_canceled', 'is_free', 'is_dead',
    # remaining ticket and customer attributes
    'artform', 'ticket_num', 'gender', 'birthdate', 'age', 'id',
]

In [94]:
# convert all strings to lower case (delegated to survival.utils.lower_case)
raw = raw.pipe(lower_case)

In [95]:
# datetime conversion
# Values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): no explicit `format` is given; recent pandas versions infer the
# format from the first value and coerce non-matching rows to NaT — confirm the
# NaT count after this cell is what you expect.

# purchase_date: keep the calendar date only. dt.normalize() sets the time to
# midnight and stays in datetime64, avoiding a round trip through Python
# `date` objects and a second to_datetime call.
raw['purchase_date'] = pd.to_datetime(raw['purchase_date'], dayfirst=True, errors='coerce').dt.normalize()

# start_date: keep the full timestamp (time of day is preserved)
raw['start_date'] = pd.to_datetime(raw['start_date'], dayfirst=True, errors='coerce')

# birthdate: keep the calendar date only
raw['birthdate'] = pd.to_datetime(raw['birthdate'], dayfirst=True, errors='coerce').dt.normalize()

In [96]:
# delete canceled purchases
raw = raw[raw['is_canceled'] == 0]

# drop unnecessary columns
raw = raw.drop(columns=['is_dead', 'currency', 'is_canceled', 'school_type', 'is_individual', 'is_institutional'])

# retain only ballet and opera
# .copy() detaches the filtered rows from the frame they were selected from, so
# the column assignments below do not raise SettingWithCopyWarning
raw = raw[raw['artform'].isin(['ballet', 'opera'])].copy()

# replace double spaces in production names with a single space
# (regex=False throughout this cell: every pattern is plain text, and the
# default of `regex` differs between pandas 1.x and 2.x)
raw['production'] = raw['production'].str.replace('  ', ' ', regex=False)

# remove the '/ ' in certain production names
raw['production'] = raw['production'].str.replace('/ ', '', regex=False)

# raw['price'] to float: decimal comma -> decimal point; unparseable values become NaN
raw['price'] = raw['price'].str.replace(',', '.', regex=False)
raw['price'] = pd.to_numeric(raw['price'], errors='coerce')

# delete price outlier (rows with a missing price are kept, because NaN != 2500)
# .copy() for the same reason as above: the season/production columns are
# assigned right after this filter
raw = raw[raw['price'] != 2500].copy()

# remove 'seizoen' from season for clarity and use '_' instead of '-'
raw['season'] = raw['season'].str.replace('seizoen ', '', regex=False)
raw['season'] = raw['season'].str.replace('-', '_', regex=False)

# fill missing values of production with dansers van morgen 2022
raw['production'] = raw['production'].fillna('dansers van morgen 2022')

In [97]:
# streamline production names of flirt events
from survival.constants import flirt_mapping_dict
raw['production'] = raw['production'].replace(flirt_mapping_dict)

# streamline production names of dansers van morgen events
raw.loc[raw['production'] == 'dansers van morgen 2022', 'production'] = '21/22 dansers van morgen'

# un-prefixed 'dansers van morgen' rows get their season prefix from start_date.
# Seasons are half-open intervals [1 Aug, 1 Aug of the next year), so a start_date
# falling exactly on a boundary is still labeled. Rows with a missing (NaT)
# start_date match neither interval and keep the un-prefixed name.
raw.loc[(raw['production'] == 'dansers van morgen') & (raw['start_date'] >= '2022-08-01') & (raw['start_date'] < '2023-08-01'), 'production'] = '22/23 dansers van morgen'
raw.loc[(raw['production'] == 'dansers van morgen') & (raw['start_date'] >= '2023-08-01') & (raw['start_date'] < '2024-08-01'), 'production'] = '23/24 dansers van morgen'

# streamline production name typos
raw.loc[raw['production'] == 'la traviata', 'production'] = '21/22 la traviata'
raw.loc[raw['production'] == '18/19 die zauberfloete', 'production'] = '18/19 die zauberflote'

# standardize 21/22 hans van manen programmes (progr i-iv are merged into one production)
raw.loc[raw['production'].isin(['21/22 hans van manen festival progr i',
                                '21/22 hans van manen festival progr ii',
                                '21/22 hans van manen festival progr iii',
                                '21/22 hans van manen festival progr iv']), 'production'] = '21/22 hans van manen festival'

# standardize walkure adventure seats
# NOTE(review): this literal previously read 'walkÃ¼re' (UTF-8 'ü' mis-decoded as
# latin1); restored to 'walküre' — confirm it matches the spelling used in the
# regular-programme file, otherwise these rows are dropped by the later filter.
raw.loc[raw['production'] == '19/20 die walkure adventure seats', 'production'] = '19/20 die walküre'

In [98]:
# load regular programme data
reg = pd.read_csv('../../data/processed/operaballet_reg_prods_clean.csv')

# keep only the productions that are in the regular programme
# (exact string match on `production`; rows whose name does not appear in
# reg['production'] are dropped)
# .copy() makes the result an independent frame: later cells assign to its
# columns, which would otherwise raise SettingWithCopyWarning
raw = raw[raw['production'].isin(reg['production'])].copy()

In [99]:
# map countries
from survival.constants import country_mapping_dict

def map_country(country, mapping=None):
    """Return the key of the first mapping entry whose value contains `country`.

    Parameters
    ----------
    country : scalar
        The value to look up (typically a lower-cased country string).
        Missing values (NaN / None / pd.NA) are returned as NaN without
        touching the mapping.
    mapping : dict, optional
        Maps a canonical key to a value that supports the `in` operator
        (e.g. a list of aliases). Defaults to `country_mapping_dict`.
        NOTE(review): if a value is a plain string, `in` performs substring
        matching — confirm the values of country_mapping_dict are collections.

    Returns
    -------
    The first matching key in the mapping's iteration order, or `np.nan` when
    nothing matches.
    """
    # A float NaN / None tested with `in` against a string value raises
    # TypeError, so short-circuit missing input up front.
    if pd.isna(country):
        return np.nan
    if mapping is None:
        mapping = country_mapping_dict
    for key, value in mapping.items():
        if country in value:
            return key
    return np.nan

# translate every country_code value through map_country (unmatched values become NaN)
raw['country_code'] = raw['country_code'].map(map_country)

In [100]:
# fix municipality names
# every spelling variant of Nuenen is mapped to the full municipality name
raw.loc[
    raw['municipality'].isin(['nuenen- gerwen en nederwetten',
                              'nuenen. gerwen en nederwetten',
                              'nuenen']),
    'municipality',
] = 'nuenen, gerwen en nederwetten'

# 's-gravenhage is renamed to den haag
raw.loc[raw['municipality'] == "'s-gravenhage", 'municipality'] = 'den haag'

In [101]:
# the filters above leave gaps in the index: renumber rows from 0 and discard the old index
raw = raw.reset_index(drop=True)

# write the cleaned frame to the processed-data folder as parquet
raw.to_parquet(path='../../data/processed/raw_clean.parquet')