In [858]:
import sys
import os
sys.path.append("../..")

In [859]:
# import libraries and custom modules
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from rapidfuzz import process, fuzz

import survival.utils
importlib.reload(survival.utils)
from survival.utils import show_all



In [860]:
# load data
data = pd.read_parquet("../../data/processed/raw_clean.parquet")
hh = pd.read_csv("../../data/processed/hh_clean.csv")
cities = pd.read_csv('../../data/processed/cities.csv')
nl = pd.read_csv('../../data/processed/nl.csv')

In [861]:
# find the earliest purchase date per performance, in case people bought multiple tickets at different times
min_purchase_date = data.groupby(['id', 'start_date']).agg(
    min_purchase_date =( 'purchase_date', 'min')).reset_index()

data = data.merge(min_purchase_date, on=['id', 'start_date'], how='left')

In [862]:
# per id per start_date, count the amount of tickets bought and store in column 'order_size', and count per ticket type the amount of tickets bought and store in columns 'order_size_<ticket_type>'. fill with 0 if no tickets bought
data['order_size'] = data.groupby(['id', 'start_date'])['id'].transform('count')
data['total_order_value'] = data.groupby(['id', 'start_date'])['price'].transform('sum')
data['avg_order_value'] = data.groupby(['id', 'start_date'])['price'].transform('mean')
data['total_order_value'] = data['total_order_value'].round(2)
data['avg_order_value'] = data['avg_order_value'].round(2)

In [863]:
# drop these, but perhaps drop ticket_num earlier -> figure out if its necessary in a grouping operation
data = data.drop(columns=['ticket_num', 'price'])

In [864]:
data

Unnamed: 0,rank,country_code,email,municipality,city,production,season,purchase_date,start_date,ticket_type,is_free,artform,gender,birthdate,age,id,min_purchase_date,order_size,total_order_value,avg_order_value
0,rang 1,nl,joostplomp@xs4all.nl,waadhoeke,oudebildtzijl,21/22 raymonda,2021_2022,2021-11-02,2022-04-10 14:00:00,jeugdkorting t/m 16 jaar,0,ballet,male,NaT,,0037q000007bfcjqac,2021-11-02,3,180.0,60.0
1,rang 4,nl,joopvanderstraaten@planet.nl,lingewaard,gendt,22/23 carmen,2022_2023,2022-04-12,2022-09-18 14:00:00,abonnement 22/23,0,opera,male,1943-05-02,81.0,0037q00000boc0mqax,2022-04-12,1,86.0,86.0
2,rang 2,nl,diepvriesconijn@quicknet.nl,wormerland,wormer,22/23 the sleeping beauty,2022_2023,2022-05-31,2022-10-29 19:30:00,standaard,0,ballet,female,1967-02-07,57.0,0037q00000bnum0qah,2022-05-31,10,650.0,65.0
3,rang 1,nl,gier@kpnmail.nl,breda,breda,22/23 konigskinder,2022_2023,2022-04-21,2022-10-09 14:00:00,abonnement 22/23,0,opera,male,1951-08-22,73.0,0037q000007b11eqac,2022-04-21,1,123.0,123.0
4,rang 3,nl,jvbelkum@xs4all.nl,amersfoort,amersfoort,22/23 messa da requiem,2022_2023,2022-04-04,2023-02-19 14:00:00,abonnement 22/23,0,opera,male,1937-12-08,86.0,0037q00000bojphqa5,2022-04-04,3,225.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732627,rang 3,nl,carollychia@gmail.com,utrecht,utrecht,24/25 jewels,2024_2025,2024-12-02,2025-02-16 14:00:00,standaard,0,ballet,female,1973-02-14,51.0,0037q00000qcsdqqam,2024-12-02,2,88.0,44.0
1732628,rang 2,nl,hanqingzhou@foxmail.com,den haag,'s-gravenhage,24/25 die fledermaus,2024_2025,2024-12-02,2024-12-10 19:30:00,standaard,0,opera,male,1997-05-20,27.0,0037q000007cvwaqao,2024-12-02,2,308.0,154.0
1732629,rang 2,nl,hanqingzhou@foxmail.com,den haag,'s-gravenhage,24/25 die fledermaus,2024_2025,2024-12-02,2024-12-10 19:30:00,standaard,0,opera,male,1997-05-20,27.0,0037q000007cvwaqao,2024-12-02,2,308.0,154.0
1732630,rang 1,ca,don@blueskier.com,,toronto,24/25 lady macbeth,2024_2025,2024-12-02,2025-04-15 20:15:00,standaard,0,ballet,male,1937-07-12,87.0,003qs00000id4hoiar,2024-12-02,2,144.0,72.0


In [865]:
daily_sales = data.groupby(['start_date', 'purchase_date']).size().reset_index(name='tickets_sold')
    
# Calculate cumulative sales for each performance
result = daily_sales.sort_values(['start_date', 'purchase_date'])
result['cumulative_sales'] = result.groupby('start_date')['tickets_sold'].cumsum()

result = result[[
    'start_date',
    'purchase_date',
    'tickets_sold',
    'cumulative_sales'
]]

In [866]:
data = data.merge(result, on=['start_date', 'purchase_date'], how='left').drop(columns=['purchase_date'])


In [867]:
#remove subscription tickets
subscription_ticket = ['abo standaard', 'abo vk dno', 'abonnement 22/23', 'kassa abo standaard', 'abonnement 24/25', 'abo vk hnb', 'abo vrij']

# remove all subscription tickets from activity
data = data[~data['ticket_type'].isin(subscription_ticket)]

In [868]:
# delete records where is_free == 1 and drop the column
data = data[data['is_free'] != 1]
data = data.drop('is_free', axis=1)

In [869]:
# drop all educatie tickets because these visitors are not unique
data = data[~data['ticket_type'].str.contains('educatie')]

# drop all ticket where ticket_type are related to employees
employee_ticket = [
    'zoekplaats',
    'huiskorting',
    'medewerker',
    'medewerker no&b',
    'vrijplaats',
    'paniek',
    'balletorkest',
    'orkest',
    'nedpho'
    ]

data = data[~data['ticket_type'].isin(employee_ticket)]

# drop the following ids because they are related to employees, institutions or groups
from survival.constants import nonvisitor_ids
data = data[~data['id'].isin(nonvisitor_ids)]

In [870]:
data = data.join(
    data.groupby(['id', 'start_date', 'ticket_type'])
    .size()
    .unstack(fill_value=0)
    .add_prefix('tickets_type_'), 
    on=['id', 'start_date']
)

In [871]:
# drop ticket_type column
data = data.drop(columns='ticket_type')

# group by id and start_date and remove duplicates
data = data.drop_duplicates(subset=['id', 'start_date'])

In [872]:
data = data[data['total_order_value'] > 0]

In [873]:
data = pd.get_dummies(data, columns=['season'], dtype=int)

In [874]:
# map opera and ballet
data['artform'] = data['artform'].map({'opera': 1, 'ballet': 0})

In [875]:
data = data.reset_index(drop=True)

In [876]:
# sort data by id and min_purchase_date'])
data = data.sort_values(by=['id', 'min_purchase_date'])

# group by id and get the first 5 min_purchase_date']) values
data = data.groupby('id').head(5)

# Create a column for the purchase number
data['purchase_number'] = data.groupby('id').cumcount() + 1

# Pivot the data to get each purchase's order value as a separate column
pivot_data_avg_order_value = data.pivot(index='id', columns='purchase_number', values='avg_order_value')
pivot_data_total_order_value = data.pivot(index='id', columns='purchase_number', values='total_order_value')

# rename the columns for clarity
pivot_data_avg_order_value.columns = [f'avg_order_value_{col}' for col in pivot_data_avg_order_value.columns]
pivot_data_total_order_value.columns = [f'total_order_value_{col}' for col in pivot_data_total_order_value.columns]

# Merge back with the original data if you need to keep other columns
data = data.merge(pivot_data_avg_order_value, on='id', how='left')
data = data.merge(pivot_data_total_order_value, on='id', how='left')





In [877]:
ballet_rank_replace_dict = {
    'premium': 1,
    'rang 1': 2,
    'rang 2': 3,
    'rang 3': 4,
    'rang 4': 5,
    'rang 5': 6,
    'rang 6': 7
}

opera_rank_replace_dict = {
    'rang 1': 1,
    'rang 2': 2,
    'rang 3': 3,
    'rang 4': 4,
    'rang 5': 5,
    'rang 6': 6,
    'rang 7': 7
}

# if artform = 0, replace the rank values with the ballet rank values
data.loc[data['artform'] == 0, 'rank'] = data.loc[data['artform'] == 0, 'rank'].replace(ballet_rank_replace_dict)

# if artform = 1, replace the rank values with the opera rank values
data.loc[data['artform'] == 1, 'rank'] = data.loc[data['artform'] == 1, 'rank'].replace(opera_rank_replace_dict)


  data.loc[data['artform'] == 0, 'rank'] = data.loc[data['artform'] == 0, 'rank'].replace(ballet_rank_replace_dict)
  data.loc[data['artform'] == 1, 'rank'] = data.loc[data['artform'] == 1, 'rank'].replace(opera_rank_replace_dict)


In [878]:
# add lead days feature
data['lead_days'] = (data['start_date'] - data['min_purchase_date']).dt.days

# retain only lead days that are 0 or above
data = data[data['lead_days'] >= 0]


In [879]:
# create next_purchase_date and time columns
data['next_purchase_date'] = data.groupby('id')['min_purchase_date'].shift(-1)
data['time'] = (data['next_purchase_date'] - data['min_purchase_date']).dt.days

data[['id', 'min_purchase_date', 'next_purchase_date', 'time']].sort_values(by='min_purchase_date')

Unnamed: 0,id,min_purchase_date,next_purchase_date,time
323061,0037q00000bonyaqap,2014-02-03,2023-01-02,3255.0
323060,0037q00000bonyaqap,2014-02-03,2014-02-03,0.0
338606,0037q00000dmp6fqat,2014-03-12,2015-03-28,381.0
319555,0037q00000bolqnqax,2014-04-01,2015-03-25,358.0
216700,0037q00000bngk3qad,2014-10-14,NaT,
...,...,...,...,...
209997,0037q00000bneueqap,2024-12-01,NaT,
400191,003qs000003jcjeia0,2024-12-01,NaT,
123733,0037q000007cvwaqao,2024-12-02,NaT,
369938,0037q00000qcsdqqam,2024-12-02,NaT,


In [880]:
from survival.utils import geonames_cleaner
geonames_cleaner(data, ['city', 'municipality'])
geonames_cleaner(nl, ['name'])

Unnamed: 0,name,alternatenames,latitude,longitude,feature_code,country_code
0,den oord,oord,51.97083,5.27083,ppl,nl
1,drijberse veld,"drijbersche veld,drijberse veld",52.77077,6.54501,lcty,nl
2,delfshavensche schie,"de schie,delfshavensche schie,delfshavense sch...",51.90172,4.45371,stmc,nl
3,aa,"a,aa,de aa riviere,de aa rivière,l'aa,leie,lei...",51.65000,5.31667,stm,nl
4,zwormertorenbrug,,52.23333,6.21667,bdg,nl
...,...,...,...,...,...,...
22833,bastion hotel eindhoven,,51.40400,5.47621,htl,nl
22834,stadsmuseum grave,,51.75744,5.73881,mus,nl
22835,saskerlei,,52.55951,4.76623,dam,nl
22836,ijsselhunten,,51.90696,6.35163,pplf,nl


In [881]:
# get all city from data where country code is nl
nl_cities = data[data['country_code'] == 'nl']['city'].unique()
nl_cities

array(['delft', 'bloemendaal', 'amsterdam', ..., 'midwolde', 'rohel',
       'oudenhoorn'], shape=(3034,), dtype=object)

In [882]:
# check how many of these cities match with nl['name']
sum(city in nl['name'].values for city in nl_cities)

# check how many are not in nl['name']
sum(city not in nl['name'].values for city in nl_cities)



1027

In [883]:
# get all names which contain gemeente, handle NaN values
gemeenten = nl[(nl['alternatenames'].str.contains(' munici', na=False)) | (nl['alternatenames'].str.contains('gemeente', na=False) | (nl['name'].str.contains('gemeente', na=False)) | (nl['feature_code'] == 'adm2'))]
# since the municipalityy names in data omit the prefix, we delete it here too
gemeenten.loc[gemeenten['name'].str.startswith('gemeente '), 'name'] = gemeenten['name'].str.replace('gemeente ', '')

In [884]:
# add the latitude and longitude of the gemeenten to the data
data = data.merge(gemeenten[['name', 'latitude', 'longitude']], left_on='municipality', right_on='name', how='left').drop(columns='name')

data

Unnamed: 0,rank,country_code,email,municipality,city,production,start_date,artform,gender,birthdate,...,total_order_value_1,total_order_value_2,total_order_value_3,total_order_value_4,total_order_value_5,lead_days,next_purchase_date,time,latitude,longitude
0,1.0,nl,ljjschuurmans@gmail.com,delft,delft,22/23 turandot,2022-12-12 20:00:00,1,unknown,1999-03-23,...,332.0,300.0,,,,171,2022-12-13,172.0,51.99968,4.36405
1,2.0,nl,ljjschuurmans@gmail.com,delft,delft,22/23 messa da requiem,2023-02-13 20:15:00,1,unknown,1999-03-23,...,332.0,300.0,,,,62,NaT,,51.99968,4.36405
2,2.0,nl,r.bernelotmoens@operaballet.nl,bloemendaal,bloemendaal,17/18 mata hari,2017-10-21 20:15:00,0,female,1988-05-16,...,30.0,36.0,322.0,36.0,40.0,19,2017-11-01,30.0,52.36230,4.58968
3,3.0,nl,r.bernelotmoens@operaballet.nl,bloemendaal,bloemendaal,17/18 la boheme,2017-12-26 14:00:00,1,female,1988-05-16,...,30.0,36.0,322.0,36.0,40.0,55,2017-12-06,35.0,52.36230,4.58968
4,2.0,nl,r.bernelotmoens@operaballet.nl,bloemendaal,bloemendaal,17/18 la boheme,2017-12-16 20:00:00,1,female,1988-05-16,...,30.0,36.0,322.0,36.0,40.0,10,2018-05-24,169.0,52.36230,4.58968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446889,2.0,nl,f_duursema@hotmail.com,,amsterdam,24/25 idomeneo,2025-02-15 19:30:00,1,unknown,NaT,...,338.0,,,,,78,NaT,,,
446890,1.0,nl,,,heemstede,24/25 die fledermaus,2024-12-25 14:00:00,1,female,NaT,...,712.0,,,,,26,NaT,,,
446891,4.0,nl,mirna@upcmail.nl,,weesp,24/25 die fledermaus,2024-12-10 19:30:00,1,unknown,2003-10-31,...,98.0,,,,,11,NaT,,,
446892,3.0,,jonathanebrito@gmail.com,,,24/25 notenkraker en muizenkoning,2024-12-28 14:00:00,0,,NaT,...,30.0,,,,,29,NaT,,,


In [885]:
# if data['country_code'] is nl and data['longitude'] is na, match data['city'] with nl['name'] and add the corresponding latitude and longitude
data.loc[(data['country_code'] == 'nl') & (data['longitude'].isna()), 'longitude'] = data.loc[(data['country_code'] == 'nl') & (data['longitude'].isna()), 'city'].map(nl.drop_duplicates(subset='name').set_index('name')['longitude'])
data.loc[(data['country_code'] == 'nl') & (data['latitude'].isna()), 'latitude'] = data.loc[(data['country_code'] == 'nl') & (data['latitude'].isna()), 'city'].map(nl.drop_duplicates(subset='name').set_index('name')['latitude'])


In [886]:
missing_cities = data['city'][(data['country_code'] == 'nl') & (data['longitude'].isna()) & (data['city'].notna())].unique()

In [887]:
# create a dictionary with nl['name'] as keys and nl['alternatenames'] as values. alternatenames should be split by commas. leave nan values as empty lists
nl_alternatenames_dict = nl.set_index('name')['alternatenames'].apply(lambda x: x.split(', ') if isinstance(x, str) else []).to_dict()

In [888]:
# create a list of all unique names in nl['name'] and nl['alternatenames']
nl_names_list = nl['name'].unique()
nl_alternatenames_list = nl['alternatenames'].apply(lambda x: x.split(', ') if isinstance(x, str) else []).explode().unique()
full_names_list = np.concatenate([nl_names_list, nl_alternatenames_list])


In [889]:
# check if any of the missing cities match on the on the nl_alternatenames_list, if so, check the nl_alternatnames_dict for the key, then add the corresponding latitude and longitude
for city in missing_cities:
    if city in nl_alternatenames_list:
        key = [key for key, value in nl_alternatenames_dict.items() if city in value]
        if key:
            data.loc[(data['country_code'] == 'nl') & (data['city'] == city), 'longitude'] = nl.loc[nl['name'] == key[0], 'longitude'].values[0]
            data.loc[(data['country_code'] == 'nl') & (data['city'] == city), 'latitude'] = nl.loc[nl['name'] == key[0], 'latitude'].values[0]

# check if there are any missing cities left
still_missing_cities = data['city'][(data['country_code'] == 'nl') & (data['longitude'].isna()) & (data['city'].notna())].unique()
still_missing_cities

array(['stuttgart', 's gravenhage', 'vienna', '2102zr',
       'cotignac   france', 'gnam@operaballetnl', 'sofia', 'den haag',
       'antwerpen', 'brussel', 'seeheim jugenheim', 'heusden gem heusden',
       '20', 'elst gld', 'st augustine', 'mexico', 'nijkerk gld',
       'stavanger', 'drassburg', 'grevesmuhlen', 'paris', 'beerse',
       'court st etienne', 'kleve', 'state college', 'nh', 'a',
       'trondheim', 'neu reisenberg', 'stad', 'nul part', 'odessa',
       'dusseldorf', 'zaporoz', 'koln', '1024 ez', 'netherlands',
       'almere hout', '1442', 'hannover', 'grenoble',
       'saint hilaire de riez', 'moscow', 'neuendorf', 'mainz',
       'karlsruhe', 'dirrietz', 'munich', 'wichelen', 'zennewijnwn',
       'dublin', 'noord holland', 'praha 10', 'jar', 'zulpich', 'ht',
       'milano', 'p', 'amstelween', 'amsterdam city centre',
       '?s heerenberg', 'fritzlar', 'bergisch gladbach',
       'villars sur glane', 'meulebeke', 'uccle', 'planegg', '75006',
       'berlin', 'fra

In [890]:
# do a fuzzy match on the still missing dutch cities. use the full_names_list for matching, if matched, check if the matched value is a key in the nl_alternatenames_dict, if so, add the corresponding latitude and longitude, if not a key, check if it matched on a value in the nl_alternatenames_dict, if so, add the corresponding latitude and longitude of the key
for city in still_missing_cities:
    match = process.extractOne(city, full_names_list)
    if match[1] > 85:
        if match[0] in nl_alternatenames_dict.keys():
            data.loc[(data['country_code'] == 'nl') & (data['city'] == city), 'longitude'] = nl.loc[nl['name'] == match[0], 'longitude'].values[0]
            data.loc[(data['country_code'] == 'nl') & (data['city'] == city), 'latitude'] = nl.loc[nl['name'] == match[0], 'latitude'].values[0]
        else:
            key = [key for key, value in nl_alternatenames_dict.items() if match[0] in value]
            if key:
                data.loc[(data['country_code'] == 'nl') & (data['city'] == city), 'longitude'] = nl.loc[nl['name'] == key[0], 'longitude'].values[0]
                data.loc[(data['country_code'] == 'nl') & (data['city'] == city), 'latitude'] = nl.loc[nl['name'] == key[0], 'latitude'].values[0]




In [891]:
# check if there are any missing cities left
missing_still = data['city'][(data['country_code'] == 'nl') & (data['longitude'].isna()) & (data['city'].notna())].unique()
missing_still # acceptable amount of missing cities

In [895]:
data[(data['country_code'] == 'nl') & (data['longitude'].isna()) & (data['city'].notna())]

Unnamed: 0,rank,country_code,email,municipality,city,production,start_date,artform,gender,birthdate,...,total_order_value_1,total_order_value_2,total_order_value_3,total_order_value_4,total_order_value_5,lead_days,next_purchase_date,time,latitude,longitude
2771,1.0,nl,nemes.johanna@icloud.com,,stuttgart,19/20 notenkraker en muizenkoning,2019-12-31 14:00:00,0,female,NaT,...,174.0,,,,,47,NaT,,,
5771,2.0,nl,m.altena@gmx.mt,,vienna,18/19 the new classics,2018-09-25 20:15:00,0,female,NaT,...,74.0,,,,,33,NaT,,,
6354,1.0,nl,ingrid47c@hotmail.com,,2102zr,19/20 giselle,2020-02-21 20:15:00,0,female,NaT,...,174.0,,,,,6,NaT,,,
12882,3.0,nl,gi_j.nam@operaballet.nl,,gnam@operaballetnl,19/20 cosi fan tutte,2019-10-21 19:00:00,1,male,NaT,...,36.0,36.0,,,,24,2019-11-28,62.0,,
12883,4.0,nl,gi_j.nam@operaballet.nl,,gnam@operaballetnl,19/20 la cenerentola,2019-12-28 20:00:00,1,male,NaT,...,36.0,36.0,,,,30,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442196,4.0,nl,avertney25@gmail.com,,hoogvliet rotterdam,24/25 don quichot,2024-11-02 20:00:00,0,unknown,2002-04-25,...,118.0,,,,,40,NaT,,,
442737,2.0,nl,lina-rebel@live.nl,,driehuis nh,24/25 don quichot flirt,2024-10-10 20:00:00,0,female,2004-02-13,...,16.0,,,,,14,NaT,,,
443984,4.0,nl,ichristinalai@gmail.com,,1075vk,24/25 don quichot,2024-10-31 20:00:00,0,female,2005-01-08,...,168.0,,,,,19,NaT,,,
444466,3.0,nl,roversbos@ziggo.nl,,driehuis nh,24/25 don quichot,2024-10-30 20:00:00,0,unknown,NaT,...,152.0,,,,,13,NaT,,,


In [893]:
# age at time of purchase
#data['age_at_purchase'] = (data['purchase_date'] - data['birthdate']).dt.days / 365.25
#data['age_at_purchase'] = data['age_at_purchase'].apply(np.floor)
