In [1601]:
import sys
import os
sys.path.append("../..")

In [1602]:
# import libraries and custom modules
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from rapidfuzz import process, fuzz
from geopy.distance import geodesic

import survival.utils
importlib.reload(survival.utils)
from survival.utils import show_all
from survival.utils import validate_imputation



In [1603]:
# load data
data = pd.read_parquet("../../data/processed/raw_clean.parquet")
hh = pd.read_csv("../../data/processed/hh_clean.csv")
cities = pd.read_csv('../../data/processed/cities.csv')
nl = pd.read_csv('../../data/processed/nl.csv')
countries = pd.read_csv('../../data/processed/countries.csv')
income = pd.read_csv('../../data/processed/income.csv')
gemeenten = pd.read_csv('../../data/processed/gemeenten.csv')

In [1604]:
# find the earliest purchase date per performance, in case people bought multiple tickets at different times
min_purchase_date = data.groupby(['id', 'start_date']).agg(
    min_purchase_date =( 'purchase_date', 'min')).reset_index()

data = data.merge(min_purchase_date, on=['id', 'start_date'], how='left')

In [1605]:
# per id per start_date, count the amount of tickets bought and store in column 'order_size', and count per ticket type the amount of tickets bought and store in columns 'order_size_<ticket_type>'. fill with 0 if no tickets bought
data['order_size'] = data.groupby(['id', 'start_date'])['id'].transform('count')
data['total_order_value'] = data.groupby(['id', 'start_date'])['price'].transform('sum')
data['avg_order_value'] = data.groupby(['id', 'start_date'])['price'].transform('mean')
data['total_order_value'] = data['total_order_value'].round(2)
data['avg_order_value'] = data['avg_order_value'].round(2)

In [1606]:
# drop these, but perhaps drop ticket_num earlier -> figure out if its necessary in a grouping operation
data = data.drop(columns=['ticket_num', 'price'])

In [1607]:
daily_sales = data.groupby(['start_date', 'purchase_date']).size().reset_index(name='tickets_sold')
    
# Calculate cumulative sales for each performance
result = daily_sales.sort_values(['start_date', 'purchase_date'])
result['cumulative_sales'] = result.groupby('start_date')['tickets_sold'].cumsum()

result = result[[
    'start_date',
    'purchase_date',
    'tickets_sold',
    'cumulative_sales'
]]

In [1608]:
data = data.merge(result, on=['start_date', 'purchase_date'], how='left').drop(columns=['purchase_date'])


In [1609]:
#remove subscription tickets
subscription_ticket = ['abo standaard', 'abo vk dno', 'abonnement 22/23', 'kassa abo standaard', 'abonnement 24/25', 'abo vk hnb', 'abo vrij']

# remove all subscription tickets from activity
data = data[~data['ticket_type'].isin(subscription_ticket)]

In [1610]:
# delete records where is_free == 1 and drop the column
data = data[data['is_free'] != 1]
data = data.drop('is_free', axis=1)

In [1611]:
# drop all educatie tickets because these visitors are not unique
data = data[~data['ticket_type'].str.contains('educatie')]

# drop all ticket where ticket_type are related to employees
employee_ticket = [
    'zoekplaats',
    'huiskorting',
    'medewerker',
    'medewerker no&b',
    'vrijplaats',
    'paniek',
    'balletorkest',
    'orkest',
    'nedpho'
    ]

data = data[~data['ticket_type'].isin(employee_ticket)]

# drop all records where email contains @operaballet.nl
data = data[data['email'].notna() & ~data['email'].str.contains('@operaballet.nl', na=False)]

# drop the following ids because they are related to employees, institutions or groups
from survival.constants import nonvisitor_ids
data = data[~data['id'].isin(nonvisitor_ids)]

In [1612]:
data = data.join(
    data.groupby(['id', 'start_date', 'ticket_type'])
    .size()
    .unstack(fill_value=0)
    .add_prefix('tickets_type_'), 
    on=['id', 'start_date']
)

In [1613]:
# drop ticket_type column
data = data.drop(columns='ticket_type')

# group by id and start_date and remove duplicates
data = data.drop_duplicates(subset=['id', 'start_date'])

In [1614]:
data = data[data['total_order_value'] > 0]

In [1615]:
data = pd.get_dummies(data, columns=['season'], dtype=int)

In [1616]:
# map opera and ballet
data['artform'] = data['artform'].map({'opera': 1, 'ballet': 0})

In [1617]:
data = data.reset_index(drop=True)

In [1618]:
# sort data by id and min_purchase_date'])
data = data.sort_values(by=['id', 'min_purchase_date'])

# group by id and get the first 5 min_purchase_date']) values
data = data.groupby('id').head(5)

# Create a column for the purchase number
data['purchase_number'] = data.groupby('id').cumcount() + 1

# pivot the data to get each purchase's order value as a separate column
pivot_data_avg_order_value = data.pivot(index='id', columns='purchase_number', values='avg_order_value')
pivot_data_total_order_value = data.pivot(index='id', columns='purchase_number', values='total_order_value')

# rename the columns for clarity
pivot_data_avg_order_value.columns = [f'avg_order_value_{col}' for col in pivot_data_avg_order_value.columns]
pivot_data_total_order_value.columns = [f'total_order_value_{col}' for col in pivot_data_total_order_value.columns]

# merge back with the original data if you need to keep other columns
data = data.merge(pivot_data_avg_order_value, on='id', how='left')
data = data.merge(pivot_data_total_order_value, on='id', how='left')





In [1619]:
ballet_rank_replace_dict = {
    'premium': 1,
    'rang 1': 2,
    'rang 2': 3,
    'rang 3': 4,
    'rang 4': 5,
    'rang 5': 6,
    'rang 6': 7
}

opera_rank_replace_dict = {
    'rang 1': 1,
    'rang 2': 2,
    'rang 3': 3,
    'rang 4': 4,
    'rang 5': 5,
    'rang 6': 6,
    'rang 7': 7
}

# if artform = 0, replace the rank values with the ballet rank values
data.loc[data['artform'] == 0, 'rank'] = data.loc[data['artform'] == 0, 'rank'].replace(ballet_rank_replace_dict)

# if artform = 1, replace the rank values with the opera rank values
data.loc[data['artform'] == 1, 'rank'] = data.loc[data['artform'] == 1, 'rank'].replace(opera_rank_replace_dict)

# create dummy variables for rank where artform = 0 (ballet) and artform = 1 (opera), name the columns accordingly
#ballet_dummies = pd.get_dummies(data.loc[data['artform'] == 0, 'rank'], prefix='ballet_rank', dtype=int)
#opera_dummies = pd.get_dummies(data.loc[data['artform'] == 1, 'rank'], prefix='opera_rank', dtype=int)

# get dummies for rank
dummies = pd.get_dummies(data['rank'], prefix='rank_', dtype=int).fillna(0)




  data.loc[data['artform'] == 0, 'rank'] = data.loc[data['artform'] == 0, 'rank'].replace(ballet_rank_replace_dict)
  data.loc[data['artform'] == 1, 'rank'] = data.loc[data['artform'] == 1, 'rank'].replace(opera_rank_replace_dict)


In [1620]:
# add lead days feature
data['lead_days'] = (data['start_date'] - data['min_purchase_date']).dt.days

# retain only lead days that are 0 or above
data = data[data['lead_days'] >= 0]


In [1621]:
# create next_purchase_date and time columns
data['next_purchase_date'] = data.groupby('id')['min_purchase_date'].shift(-1)
data['time'] = (data['next_purchase_date'] - data['min_purchase_date']).dt.days

In [1622]:
# age at time of purchase
data['age_at_purchase'] = (data['min_purchase_date'] - data['birthdate']).dt.days / 365.25
data['age_at_purchase'] = data['age_at_purchase'].apply(np.floor)

In [1623]:
# create columns 'age_under_18', 'age_18_35', 'age_36_55', 'age_over_55' and fill with 0
data['age_under_18'] = 0
data['age_18_35'] = 0
data['age_36_55'] = 0
data['age_over_55'] = 0
data['age_unknown'] = 0
data['age_other'] = 0


data.loc[(data['age_at_purchase'] < 18) & (data['age_at_purchase'] > 10), 'age_under_18'] = 1
data.loc[(data['age_at_purchase'] >= 18) & (data['age_at_purchase'] <= 35), 'age_18_35'] = 1
data.loc[(data['age_at_purchase'] >= 36) & (data['age_at_purchase'] <= 55), 'age_36_55'] = 1
data.loc[data['age_at_purchase'] > 55, 'age_over_55'] = 1
data.loc[data['age_at_purchase'].isnull(), 'age_unknown'] = 1
data.loc[(data['age_at_purchase'] < 8) | (data['age_at_purchase'] > 99), 'age_other'] = 1

In [1624]:
# create binary variables for country code if nl or not, email if provided or not, if municipality is provided or not, if city if provided or not
data['is_nl'] = (data['country_code'] == 'nl').astype(int)
data['email_provided'] = data['email'].notna().astype(int)
data['municipality_provided'] = data['municipality'].notna().astype(int)
data['city_provided'] = data['city'].notna().astype(int)

In [1625]:
max_purchase_date = data['min_purchase_date'].max()
data['days_since_first_purchase'] = (max_purchase_date - data['min_purchase_date']).dt.days

In [1626]:
# get month of year and day of week of start_date and min_purchase_date
data['start_month'] = data['start_date'].dt.month
data['start_dayofweek'] = data['start_date'].dt.dayofweek

data['purchase_month'] =  data['min_purchase_date'].dt.month
data['purchase_dayofweek'] = data['min_purchase_date'].dt.dayofweek

# dict for mapping dayofweek
dayofweek_map = {
    0: 'monday',
    1: 'tuesday',
    2: 'wednesday',
    3: 'thursday',
    4: 'friday',
    5: 'saturday',
    6: 'sunday'
}

month_map = {
    1: 'january',
    2: 'february',
    3: 'march',
    4: 'april',
    5: 'may',
    6: 'june',
    7: 'july',
    8: 'august',
    9: 'september',
    10: 'october',
    11: 'november',
    12: 'december'
}

# map start_month, start_dayofweek, purchase_month, purchase_dayofweek to dayofweek_map and month_map
data['start_month'] = data['start_month'].apply(lambda x: month_map.get(x, None))
data['start_dayofweek'] = data['start_dayofweek'].apply(lambda x: dayofweek_map.get(x, None))

data['purchase_month'] = data['purchase_month'].apply(lambda x: month_map.get(x, None))
data['purchase_dayofweek'] = data['purchase_dayofweek'].apply(lambda x: dayofweek_map.get(x, None))

# get dummies for start_month, start_dayofweek, purchase_month, purchase_dayofweek
data = pd.get_dummies(data, columns=['start_month', 'start_dayofweek', 'purchase_month', 'purchase_dayofweek'], dtype=int)


In [1627]:
# dummies just for male and female
data['gender_male'] = (data['gender'] == 'male').astype(int)
data['gender_female'] = (data['gender'] == 'female').astype(int)

### Get the location from municipalities, cities, and then countries (in that order)

In [1628]:
# the first_start date of each production should be 1 since it is the first performance of the production, indicating premiere status
# flirt should remain 0 
data['is_premiere'] = data.groupby('production')['start_date'].transform('min')
data['is_premiere'] = (data['start_date'] == data['is_premiere']).astype(int)
data.loc[data['production'].str.contains(' flirt', case=False, na=False), 'is_premiere'] = 0

data['is_flirt'] = data['production'].str.contains(' flirt', case=False, na=False).astype(int)

# remove ' flirt' from production name
data['production'] = data['production'].str.replace(' flirt', '', case=False)

In [1629]:
from survival.utils import geonames_cleaner
geonames_cleaner(data, ['city', 'municipality'])
geonames_cleaner(nl, ['name'])
geonames_cleaner(hh, ['municipality'])
geonames_cleaner(gemeenten, ['oude_naam', 'nieuwe_naam'])
geonames_cleaner(cities, ['name'])

Unnamed: 0,name,latitude,longitude,country_code
0,vila,42.53176,1.56654,ad
1,soldeu,42.57688,1.66769,ad
2,sispony,42.53368,1.51613,ad
3,el tarter,42.57952,1.65362,ad
4,sant julia de lòria,42.46372,1.49129,ad
...,...,...,...,...
213594,banket,-17.38333,30.40000,zw
213595,epworth,-17.89000,31.14750,zw
213596,chitungwiza,-18.01274,31.07555,zw
213597,harare western suburbs,-17.84150,30.87674,zw


### Cities longitude and latitude

In [1630]:
# First get all Dutch cities from the data
dutch_cities = data[data['country_code'] == 'nl']

# Create a clean mapping of city names to coordinates from nl dataset
city_coords = nl[['name', 'latitude', 'longitude']].drop_duplicates(subset=['name']).loc[~(nl['alternatenames'].isnull() & nl['name'].duplicated())]

# Merge the coordinates with the Dutch cities
data = data.merge(
    city_coords,
    left_on='city',
    right_on='name',
    how='left'
).drop(columns='name')

# Check which cities are missing coordinates
missing_cities = data[
    (data['country_code'] == 'nl') & 
    (data['longitude'].isna()) & 
    (data['city'].notna())
]['city'].unique()

In [1631]:
# Create a dictionary mapping alternative names to the main city name
alt_names_dict = {}
for _, row in nl.iterrows():
    if isinstance(row['alternatenames'], str):  # Check if alternatenames exists
        alt_names = row['alternatenames'].split(',')
        geonames_cleaner(pd.DataFrame(alt_names, columns=['name']), ['name'])
        for alt_name in alt_names:
            alt_names_dict[alt_name] = row['name']

# For cities still missing coordinates, try matching through alternative names
for city in missing_cities:
    if city in alt_names_dict:
        # Get the main name for this city
        main_name = alt_names_dict[city]
        
        # Get coordinates from the main name
        coords = nl[nl['name'] == main_name][['latitude', 'longitude']].iloc[0]
        
        # Update the coordinates in the main dataframe
        mask = (data['country_code'] == 'nl') & (data['city'] == city)
        data.loc[mask, 'latitude'] = coords['latitude']
        data.loc[mask, 'longitude'] = coords['longitude']

# Check remaining missing cities
still_missing = data[
    (data['country_code'] == 'nl') & 
    (data['longitude'].isna()) & 
    (data['city'].notna())
]['city'].unique()

print(f"Number of cities still missing: {len(still_missing)}")
print("Still missing cities:", still_missing)

Number of cities still missing: 887
Still missing cities: ['s gravenhage' 'breukelen ut' 'noordwijk zh' 'ede gld' 'valkenburg lb'
 'hengelo ov' 'elsloo lb' 'capelle a/d shissel' 'katwijk zh' '242' '232'
 'ijsselstein ut' '627234370' 'rijswijk zh' 'spijkerboor nh' 'nijkerk gld'
 'vianen ut' 'heusden gem heusden' 'stuttgart' 'amsterdaam' 'laren nh'
 'vooburg' 'boy@easyshootsnl' '65 58' 'elst gld' '929' '15' 'buren gld'
 '/ s gravenhage' '33' 'beek berg en dal' 'amerpoort' '4' 'vienna'
 'laan op zuid' '2102zr' 'vianen u' 'cotignac   france'
 'nieuwerkerk ad ijssel' 'oosterhout nb' 'voorschotten' 'velp gld'
 'beets nh' 'loenen gld' 'midden beemster' 'molenhoek lb' 'enkuizen'
 'sleenaken' 'nieuw vannep' 'ede / the netherlands' 'anna palowna'
 'amstertdam' 'oudorp nh' 'loenen a/d vecht' 'alphen gld' 'aalst waalre'
 'wyk by duurstede' 'sofia' 'katwyk zh' 'vianen ut nederland'
 'kapel avezaath buren' 'des haag' 'valkenburg zh' 'antwerpen' 'brussel'
 'cappele aan de ijssel' 'seeheim jugenheim' 

In [1632]:
# Create a list of all possible names (main names and alternative names)
main_names = nl['name'].unique()
alt_names = nl['alternatenames'].dropna().str.split(', ').explode().unique()
all_names = np.concatenate([main_names, alt_names])

# Do fuzzy matching for still missing cities
for city in still_missing:
    # Get the best match
    match = process.extractOne(city, all_names)
    
    if match[1] > 85:  # If confidence score is high enough
        matched_name = match[0]
        
        # Case 1: Matched name is a main name in nl dataset
        if matched_name in nl['name'].values:
            coords = nl[nl['name'] == matched_name][['latitude', 'longitude']].iloc[0]
            
        # Case 2: Matched name is an alternative name
        else:
            # Find the main city name for this alternative name
            main_city = nl[nl['alternatenames'].fillna('').str.contains(matched_name, regex=False)]['name']
            if not main_city.empty:
                coords = nl[nl['name'] == main_city.iloc[0]][['latitude', 'longitude']].iloc[0]
            else:
                continue  # Skip if we can't find the main city
                
        # Update coordinates in main dataframe
        mask = (data['country_code'] == 'nl') & (data['city'] == city)
        data.loc[mask, 'latitude'] = coords['latitude']
        data.loc[mask, 'longitude'] = coords['longitude']

# Final check for missing cities
final_missing = data[
    (data['country_code'] == 'nl') & 
    (data['longitude'].isna()) & 
    (data['city'].notna())
]['city'].unique()

print(f"Number of cities still missing after fuzzy matching: {len(final_missing)}")
print("Final missing cities:", final_missing)

Number of cities still missing after fuzzy matching: 133
Final missing cities: ['242' '232' '627234370' 'stuttgart' 'boy@easyshootsnl' '65 58' '929' '33'
 'vienna' '2102zr' 'sofia' 'katwyk zh' '33hs' '358' 'mexico' 'stavanger'
 'grevesmuhlen' 'nul part' 'odessa' 'gfcfxfdxdfy' 'zaporoz' '24 mrt'
 'koln' '141' '1024 ez' '57a' '3040' '89 3l' '3020' '1442' 'hannover'
 'moscow' 'neuendorf' 'mainz' 'karlsruhe' 'dirrietz' '603' 'dublin' '7'
 '16a' '173' 'zulpich' '3532ga' 'lonender' 'arnehm' 'leda' 'sdfdfg'
 'meulebeke' 'uccle' '75006' 'berlin' '1079 vr' '3706ve' '56' 'olathe'
 '204' 'darmstadt' 'stockholm' '48c' '638' 'rhhon' '2497cs' 'lanaken'
 'varna' 'cologne' 'yes' '48b1' 'jougne' 'onbekend' '7hs' 'haenwijck'
 '119' 'lecce' '2b' '5405' '106' 'princeton' '197' 'iasi' 'washington dc'
 '13i' '206' '2352he' 'erfurt' 'kuala lumpur' '3582pn' 'bodø' '1091kw'
 '178' '06 154 27028' '78' '129' 'k1' 'dubai' '147' 'madrid' 'nannet'
 'uppsala' '697' 'kumtich' '1190' 'stephanskirchen' '33a'
 'ydwineza

In [1633]:
# check where lon and lat is null but municipality is not null

### Municipalities

In [1634]:
# get all names which contain gemeente
nl_municipalities = nl[
    (nl['alternatenames'].str.contains(' munici', na=False)) | 
    (nl['alternatenames'].str.contains('gemeente', na=False) |
    (nl['name'].str.contains('gemeente', na=False)) | 
    (nl['feature_code'] == 'adm2'))
    ]
# since the municipality names in data omit the prefix, we delete it here too
nl_municipalities.loc[nl_municipalities['name'].str.startswith('gemeente '), 'name'] = nl_municipalities['name'].str.replace('gemeente ', '')

# change the old name of westvoorne to voorne aan zee in nl_municipalities 
nl_municipalities.loc[nl_municipalities['name'] == 'westvoorne', 'name'] = 'voorne aan zee'

In [1635]:
# Add municipality coordinates only where coordinates are still missing
municipality_coords = nl_municipalities[['name', 'latitude', 'longitude']].rename(
    columns={
        'latitude': 'municipality_latitude',
        'longitude': 'municipality_longitude'
    }
)

# Merge municipality coordinates
data = data.merge(
    municipality_coords,
    left_on='municipality',
    right_on='name',
    how='left'
).drop(columns='name')

# Fill missing coordinates with municipality coordinates where available
mask = data['longitude'].isna()
data.loc[mask, 'longitude'] = data.loc[mask, 'municipality_longitude']
data.loc[mask, 'latitude'] = data.loc[mask, 'municipality_latitude']

# Clean up by dropping the temporary municipality coordinate columns
data = data.drop(columns=['municipality_latitude', 'municipality_longitude'])

# Check remaining missing coordinates
still_missing = data[
    (data['longitude'].isna()) & 
    (data['city'].notna()) &
    (data['country_code'] == 'nl')
]
print(f"Number of records still missing coordinates: {len(still_missing)}")

Number of records still missing coordinates: 97


### Countries

In [1636]:
pcli = countries[countries['feature_code'] == 'pcli'].drop(columns=['name', 'feature_code'])

In [1637]:
# mask records that need country coordinates
missing_coords_mask = (data['longitude'].isna()) & (data['country_code'].notna())

# country coordinates mapping
country_coords = pcli.set_index('country_code')[['latitude', 'longitude']]

# update latitude and longitude
data.loc[missing_coords_mask, 'latitude'] = (
    data.loc[missing_coords_mask, 'country_code']
    .map(country_coords['latitude'])
)
data.loc[missing_coords_mask, 'longitude'] = (
    data.loc[missing_coords_mask, 'country_code']
    .map(country_coords['longitude'])
)

In [1638]:
# check the missing countries by checking the feature_code starting with 'pcl' in countries and add long and lat to data
missing_cc = data.loc[(data['longitude'].isna()) & (data['country_code'] != 'nl'), 'country_code'].unique()

missing_countries = countries[
	countries['country_code'].isin(missing_cc) & 
	countries['feature_code'].str.startswith('pcl', na=False)
].drop_duplicates(subset='country_code')

data.loc[
	(data['longitude'].isna()) & (data['country_code'].notna()), 
	'longitude'
] = data.loc[
	(data['longitude'].isna()) & (data['country_code'].notna()), 
	'country_code'
].map(missing_countries.set_index('country_code')['longitude'])

data.loc[
	(data['latitude'].isna()) & (data['country_code'].notna()), 
	'latitude'
] = data.loc[
	(data['latitude'].isna()) & (data['country_code'].notna()), 
	'country_code'
].map(missing_countries.set_index('country_code')['latitude'])

In [1639]:
# manually look up the lat and long of american samoa and namibia
american_samoa = (-14.23377, -169.47767)
namibia = (-22.00000, 17.00000)

# update the lat and long of american samoa and namibia
data.loc[data['country_code'] == 'as', 'latitude'] = american_samoa[0]
data.loc[data['country_code'] == 'as', 'longitude'] = american_samoa[1]

data.loc[data['country_code'] == 'na', 'latitude'] = namibia[0]
data.loc[data['country_code'] == 'na', 'longitude'] = namibia[1]


### Get the distance of known longitude and latitude to nob

In [1640]:
# for every lan and long, calculate the distance to the nob
nob = (52.367608492466346, 4.901889580039182)

data['distance_to_nob'] = data.apply(
    lambda row: geodesic(nob, (row['latitude'], row['longitude'])).kilometers
    if pd.notnull(row['latitude']) and pd.notnull(row['longitude']) else None,
    axis=1
)


In [1641]:
# create binary flag to indicate whether distance to nob is known
data['nl_spatial_imputed'] = data['distance_to_nob'].isna().astype(int)

# fill missing distances with the median of the distance of country_code nl
data['distance_to_nob'] = data['distance_to_nob'].fillna(data[data['country_code'] == 'nl']['distance_to_nob'].median())

# Median Income 

In [1642]:
# get the income from hh and match with data municipality
data = data.merge(hh, left_on='municipality', right_on='municipality', how='left')

In [1643]:

# filter Dutch records with valid coordinates
nl_data = data[
    (data['country_code'] == 'nl') & 
    data['latitude'].notna() & 
    data['longitude'].notna()
]

# Define features and target
features = ['latitude', 'longitude', 'distance_to_nob']
target = 'median_income'

# Create the imputer
imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=100, 
        random_state=42,
        max_depth=10  # Add depth limit to prevent overfitting
    ),
    random_state=42,
    max_iter=10,
    initial_strategy='mean'
)

# Prepare the data
X = nl_data[features + [target]].copy()

# Fit and transform
X_imputed = imputer.fit_transform(X)


In [1644]:
# After running validation
#results_df = validate_imputation(nl_data, features, target)
#
## Calculate confidence intervals
#for metric in ['RMSE', 'MAE', 'R2', 'NRMSE']:
#    mean = results_df[metric].mean()
#    std = results_df[metric].std()
#    ci = std * 1.96  # 95% ci
#    print(f"\n{metric}:")
#    print(f"Mean: {mean:.4f} ± {ci:.4f}")

In [1645]:
# create a binary flag to indicate whether the income is known and country_code == nl
data['domestic_median_imputed'] = 0

nl_missing_mask = (data['country_code'] == 'nl') & (data['median_income'].isna())
data.loc[X.index, 'median_income'] = np.where(
    nl_data['median_income'].isna(),
    pd.DataFrame(X_imputed, columns=features + [target], index=X.index)['median_income'],
    data.loc[X.index, 'median_income']
)
# flag for imputed nl records
data.loc[nl_missing_mask, 'domestic_median_imputed'] = 1

In [1646]:
# imputation flag
data['foreign_median_imputed'] = 0

# fill non nl records with known country data
non_nl_mask = (data['country_code'].notna()) & (data['country_code'] != 'nl') & (data['median_income'].isna())
data.loc[non_nl_mask, 'median_income'] = data.loc[non_nl_mask, 'country_code'].map(
    income.set_index('country_code')['median_income']
)
data.loc[non_nl_mask, 'foreign_median_imputed'] = 1

# median of non nl records
foreign_median = data.loc[(data['country_code'] != 'nl') & (data['median_income'].notna()), 'median_income'].median()

# fill missing values with foreign median
remaining_missing_mask = data['median_income'].isna()
data.loc[remaining_missing_mask, 'median_income'] = foreign_median
data.loc[remaining_missing_mask, 'foreign_median_imputed'] = 1

In [1647]:
# fill records with no country code using nl median
nl_median = data.loc[data['country_code'] == 'nl', 'median_income'].median()
no_country_mask = data['country_code'].isna() & data['median_income'].isna()
data.loc[no_country_mask, 'median_income'] = nl_median

# mark as domestic median imputed
data.loc[no_country_mask, 'domestic_median_imputed'] = 1

In [1648]:
# export full processed data for eda
data.to_csv('../../data/processed/data_for_eda.csv', index=False)

In [1649]:
data[((data['season_2021_2022'] == 1) |
    (data['season_2022_2023'] == 1) |
    (data['season_2023_2024'] == 1) |
    (data['season_2024_2025'] == 1)) &
    (data['purchase_number'] <= 2)
]
     

Unnamed: 0,rank,country_code,email,municipality,city,production,start_date,artform,gender,birthdate,...,gender_female,is_premiere,is_flirt,latitude,longitude,distance_to_nob,nl_spatial_imputed,median_income,domestic_median_imputed,foreign_median_imputed
0,1.0,nl,ljjschuurmans@gmail.com,delft,delft,22/23 turandot,2022-12-12 20:00:00,1,unknown,1999-03-23,...,0,0,0,52.00667,4.35556,54.854622,0,29.600000,0,0
1,2.0,nl,ljjschuurmans@gmail.com,delft,delft,22/23 messa da requiem,2023-02-13 20:15:00,1,unknown,1999-03-23,...,0,0,0,52.00667,4.35556,54.854622,0,29.600000,0,0
3,2.0,nl,eskersterneberg@gmail.com,amsterdam,amsterdam,21/22 made in amsterdam,2022-02-20 14:00:00,0,female,1991-08-11,...,1,1,0,52.37403,4.88969,1.095892,0,33.400000,0,0
6,4.0,nl,e.houtman@houthoff.com,,amsterdam,22/23 the sleeping beauty,2022-12-27 19:30:00,0,unknown,NaT,...,0,0,0,52.37403,4.88969,1.095892,0,33.434026,1,0
43,2.0,nl,joopmuller7@gmail.com,amsterdam,amsterdam,21/22 der freischutz,2022-06-18 19:30:00,1,female,1939-06-01,...,1,0,0,52.37403,4.88969,1.095892,0,33.400000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434708,1.0,nl,linda.joep@casema.nl,,houten,24/25 die fledermaus,2024-12-29 14:00:00,1,female,1957-02-06,...,1,0,0,52.02833,5.16806,41.909596,0,48.684134,1,0
434709,2.0,nl,f_duursema@hotmail.com,,amsterdam,24/25 idomeneo,2025-02-15 19:30:00,1,unknown,NaT,...,0,0,0,52.37403,4.88969,1.095892,0,33.434026,1,0
434710,4.0,nl,mirna@upcmail.nl,,weesp,24/25 die fledermaus,2024-12-10 19:30:00,1,unknown,2003-10-31,...,0,0,0,52.30750,5.04167,11.640816,0,34.573199,1,0
434711,3.0,,jonathanebrito@gmail.com,,,24/25 notenkraker en muizenkoning,2024-12-28 14:00:00,0,,NaT,...,0,0,0,,,24.667152,1,69.338000,0,1


In [1650]:
data.groupby('id').head(2)

Unnamed: 0,rank,country_code,email,municipality,city,production,start_date,artform,gender,birthdate,...,gender_female,is_premiere,is_flirt,latitude,longitude,distance_to_nob,nl_spatial_imputed,median_income,domestic_median_imputed,foreign_median_imputed
0,1.0,nl,ljjschuurmans@gmail.com,delft,delft,22/23 turandot,2022-12-12 20:00:00,1,unknown,1999-03-23,...,0,0,0,52.00667,4.35556,54.854622,0,29.600000,0,0
1,2.0,nl,ljjschuurmans@gmail.com,delft,delft,22/23 messa da requiem,2023-02-13 20:15:00,1,unknown,1999-03-23,...,0,0,0,52.00667,4.35556,54.854622,0,29.600000,0,0
2,2.0,nl,eskersterneberg@gmail.com,amsterdam,amsterdam,18/19 juditha triumphans,2019-02-01 20:00:00,1,female,1991-08-11,...,1,0,0,52.37403,4.88969,1.095892,0,33.400000,0,0
3,2.0,nl,eskersterneberg@gmail.com,amsterdam,amsterdam,21/22 made in amsterdam,2022-02-20 14:00:00,0,female,1991-08-11,...,1,1,0,52.37403,4.88969,1.095892,0,33.400000,0,0
6,4.0,nl,e.houtman@houthoff.com,,amsterdam,22/23 the sleeping beauty,2022-12-27 19:30:00,0,unknown,NaT,...,0,0,0,52.37403,4.88969,1.095892,0,33.434026,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434708,1.0,nl,linda.joep@casema.nl,,houten,24/25 die fledermaus,2024-12-29 14:00:00,1,female,1957-02-06,...,1,0,0,52.02833,5.16806,41.909596,0,48.684134,1,0
434709,2.0,nl,f_duursema@hotmail.com,,amsterdam,24/25 idomeneo,2025-02-15 19:30:00,1,unknown,NaT,...,0,0,0,52.37403,4.88969,1.095892,0,33.434026,1,0
434710,4.0,nl,mirna@upcmail.nl,,weesp,24/25 die fledermaus,2024-12-10 19:30:00,1,unknown,2003-10-31,...,0,0,0,52.30750,5.04167,11.640816,0,34.573199,1,0
434711,3.0,,jonathanebrito@gmail.com,,,24/25 notenkraker en muizenkoning,2024-12-28 14:00:00,0,,NaT,...,0,0,0,,,24.667152,1,69.338000,0,1


In [1663]:
# data for training
general_cols = ['artform','order_size','min_purchase_date', 'total_order_value', 'avg_order_value',
                'tickets_sold', 'cumulative_sales', 'lead_days', 'next_purchase_date', 'time', 'is_nl',
                'email_provided', 'municipality_provided', 'city_provided', 'days_since_first_purchase',
                'distance_to_nob', 'nl_spatial_imputed', 'domestic_median_imputed', 'foreign_median_imputed',
                'median_income', 'is_flirt','is_premiere']

age_cols = ['age_under_18', 'age_18_35', 'age_36_55', 'age_over_55', 'age_unknown', 'age_other']

rank_cols = [col for col in data.columns if col.startswith('ballet_rank') or col.startswith('opera_rank')]

purchase_date_cols = [col for col in data.columns if col.startswith('purchase_dayofweek') or col.startswith('purchase_month')]
start_date_cols = [col for col in data.columns if col.startswith('start_dayofweek') or col.startswith('start_month')]

season_cols = [col for col in data.columns if col.startswith('season_')]

ticket_type_cols = [col for col in data.columns if col.startswith('tickets_type_')]

# retain only the first and second instance of a purchase (if available)
training_data = data[data['purchase_number'] <= 2]

training_data = data[general_cols + rank_cols + age_cols + purchase_date_cols + start_date_cols + season_cols + ticket_type_cols]

# drop columns with only one unique value
training_data = training_data.drop(columns=training_data.columns[training_data.nunique() == 1])

In [1652]:
# convert all numerical columns to float32
for col in training_data.select_dtypes(include='number').columns:
    training_data[col] = training_data[col].astype('float32')

In [1653]:
training_data = training_data.reset_index(drop=True)

In [1655]:
# export training data as parquet
training_data.to_parquet('../../data/processed/training_data.parquet')