In [213]:
import sys
import os
sys.path.append("../..")

In [214]:
# import libraries and custom modules
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import survival.utils
importlib.reload(survival.utils)
from survival.utils import show_all



In [215]:
# load data
# `data` is read from the processed parquet file and `hh` from the processed CSV.
# Both paths are relative to the notebook's working directory.
data = pd.read_parquet("../../data/processed/raw_clean.parquet")
hh = pd.read_csv("../../data/processed/hh_clean.csv")

In [216]:
# A visitor may buy tickets for the same performance on several dates.
# Take the earliest purchase date per (id, start_date) and attach it to every row.
min_purchase_date = (
    data.groupby(['id', 'start_date'])['purchase_date']
    .min()
    .rename('min_purchase_date')
    .reset_index()
)

data = pd.merge(data, min_purchase_date, how='left', on=['id', 'start_date'])

In [217]:
# Order-level aggregates per (id, start_date), broadcast back onto every row:
#   order_size        - number of rows (tickets) in the order
#   total_order_value - sum of `price`, rounded to 2 decimals
#   avg_order_value   - mean of `price`, rounded to 2 decimals
order_groups = data.groupby(['id', 'start_date'])
data['order_size'] = order_groups['id'].transform('count')
data['total_order_value'] = order_groups['price'].transform('sum').round(2)
data['avg_order_value'] = order_groups['price'].transform('mean').round(2)

In [218]:
# These two columns are dropped here.
# TODO: ticket_num could perhaps be dropped earlier -- check whether any grouping step needs it.
data = data.drop(['ticket_num', 'price'], axis=1)

In [219]:
# Inspect the frame after adding the order-level columns (rendered as a truncated head/tail view).
data

Unnamed: 0,is_institutional,rank,country,email,municipality,city,production,season,purchase_date,start_date,...,is_free,artform,gender,birthdate,age,id,min_purchase_date,order_size,total_order_value,avg_order_value
0,1,rang 1,nl,joostplomp@xs4all.nl,waadhoeke,oudebildtzijl,21/22 raymonda,2021_2022,2021-11-02,2022-04-10 14:00:00,...,0,ballet,male,NaT,,0037q000007bfcjqac,2021-11-02,3,180.0,60.0
1,1,rang 4,nl,joopvanderstraaten@planet.nl,lingewaard,gendt,22/23 carmen,2022_2023,2022-04-12,2022-09-18 14:00:00,...,0,opera,male,1943-05-02,81.0,0037q00000boc0mqax,2022-04-12,1,86.0,86.0
2,1,rang 2,nl,diepvriesconijn@quicknet.nl,wormerland,wormer,22/23 the sleeping beauty,2022_2023,2022-05-31,2022-10-29 19:30:00,...,0,ballet,female,1967-02-07,57.0,0037q00000bnum0qah,2022-05-31,10,650.0,65.0
3,1,rang 1,nl,gier@kpnmail.nl,breda,breda,22/23 konigskinder,2022_2023,2022-04-21,2022-10-09 14:00:00,...,0,opera,male,1951-08-22,73.0,0037q000007b11eqac,2022-04-21,1,123.0,123.0
4,1,rang 3,nl,jvbelkum@xs4all.nl,amersfoort,amersfoort,22/23 messa da requiem,2022_2023,2022-04-04,2023-02-19 14:00:00,...,0,opera,male,1937-12-08,86.0,0037q00000bojphqa5,2022-04-04,3,225.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732627,1,rang 3,nl,carollychia@gmail.com,utrecht,utrecht,24/25 jewels,2024_2025,2024-12-02,2025-02-16 14:00:00,...,0,ballet,female,1973-02-14,51.0,0037q00000qcsdqqam,2024-12-02,2,88.0,44.0
1732628,1,rang 2,nl,hanqingzhou@foxmail.com,den haag,'s-gravenhage,24/25 die fledermaus,2024_2025,2024-12-02,2024-12-10 19:30:00,...,0,opera,male,1997-05-20,27.0,0037q000007cvwaqao,2024-12-02,2,308.0,154.0
1732629,1,rang 2,nl,hanqingzhou@foxmail.com,den haag,'s-gravenhage,24/25 die fledermaus,2024_2025,2024-12-02,2024-12-10 19:30:00,...,0,opera,male,1997-05-20,27.0,0037q000007cvwaqao,2024-12-02,2,308.0,154.0
1732630,1,rang 1,ca,don@blueskier.com,,toronto,24/25 lady macbeth,2024_2025,2024-12-02,2025-04-15 20:15:00,...,0,ballet,male,1937-07-12,87.0,003qs00000id4hoiar,2024-12-02,2,144.0,72.0


In [220]:
# Number of ticket rows per performance (start_date) per purchase date.
daily_sales = (
    data.groupby(['start_date', 'purchase_date'])
    .size()
    .reset_index(name='tickets_sold')
)

# Running total of tickets sold for each performance, accumulated in purchase-date order.
result = daily_sales.sort_values(['start_date', 'purchase_date'])
result = result.assign(
    cumulative_sales=result.groupby('start_date')['tickets_sold'].cumsum()
)

# Fix the column order explicitly.
result = result.loc[:, ['start_date', 'purchase_date', 'tickets_sold', 'cumulative_sales']]

In [221]:
# Attach tickets_sold / cumulative_sales for each row's (start_date, purchase_date).
# purchase_date is dropped afterwards, so it is no longer available from this cell onwards.
data = pd.merge(data, result, how='left', on=['start_date', 'purchase_date'])
data = data.drop(columns=['purchase_date'])


In [222]:
# Ticket types that denote subscription tickets.
subscription_ticket = [
    'abo standaard',
    'abo vk dno',
    'abonnement 22/23',
    'kassa abo standaard',
    'abonnement 24/25',
    'abo vk hnb',
    'abo vrij',
]

# Keep only the rows whose ticket_type is not a subscription type.
is_subscription = data['ticket_type'].isin(subscription_ticket)
data = data.loc[~is_subscription]

In [223]:
# Discard rows flagged as free (is_free == 1); the flag column is dropped afterwards.
not_free = data['is_free'] != 1
data = data.loc[not_free].drop(columns='is_free')

In [224]:
# Drop all "educatie" tickets because these visitors are not unique.
# na=False treats a missing ticket_type as "not educatie"; without it the mask
# contains NaN and the `~` / boolean indexing raises. regex=False makes the
# match a plain substring test.
is_educatie = data['ticket_type'].str.contains('educatie', regex=False, na=False)
data = data[~is_educatie]

# Drop all tickets whose ticket_type is related to employees.
employee_ticket = [
    'zoekplaats',
    'huiskorting',
    'medewerker',
    'medewerker no&b',
    'vrijplaats',
    'paniek',
    'balletorkest',
    'orkest',
    'nedpho'
    ]

data = data[~data['ticket_type'].isin(employee_ticket)]

# Drop the ids listed in nonvisitor_ids because they are related to employees,
# institutions or groups.
from survival.constants import nonvisitor_ids
data = data[~data['id'].isin(nonvisitor_ids)]

In [225]:
# One column per ticket type (prefixed 'tickets_type_') holding the number of rows
# of that type per (id, start_date); combinations that do not occur are filled with 0.
ticket_type_counts = (
    data.groupby(['id', 'start_date', 'ticket_type'])
    .size()
    .unstack(fill_value=0)
    .add_prefix('tickets_type_')
)

# Look the counts up by (id, start_date) and append them to every row.
data = data.join(ticket_type_counts, on=['id', 'start_date'])

In [226]:
# ticket_type is dropped, then the frame is collapsed to one row per (id, start_date).
# drop_duplicates keeps the first occurrence of each pair (pandas default).
data = (
    data.drop(columns='ticket_type')
    .drop_duplicates(subset=['id', 'start_date'])
)

In [227]:
# Keep only orders with a strictly positive total value.
has_positive_value = data['total_order_value'] > 0
data = data.loc[has_positive_value]

In [228]:
# One-hot encode `season` into integer (0/1) indicator columns; the original column is replaced.
data = pd.get_dummies(data, columns=['season'], dtype=int)

In [229]:
# Encode artform numerically: opera -> 1, ballet -> 0.
# Series.map turns any value that is not a key of the mapping into NaN.
artform_codes = {'opera': 1, 'ballet': 0}
data['artform'] = data['artform'].map(artform_codes)

In [230]:
# Replace the index left over from the row filters above with a fresh 0..n-1 index.
data = data.reset_index(drop=True)

In [231]:
# Order each visitor's rows by min_purchase_date and keep at most the first five per id.
data = data.sort_values(by=['id', 'min_purchase_date']).groupby('id').head(5)

# Sequence number (1..5) of each retained purchase within its id.
data['purchase_number'] = data.groupby('id').cumcount() + 1

# Wide tables: one row per id, one column per purchase number.
pivot_data_avg_order_value = data.pivot(index='id', columns='purchase_number', values='avg_order_value')
pivot_data_total_order_value = data.pivot(index='id', columns='purchase_number', values='total_order_value')

# Give the purchase-number columns descriptive names, e.g. avg_order_value_1.
pivot_data_avg_order_value = pivot_data_avg_order_value.set_axis(
    [f'avg_order_value_{col}' for col in pivot_data_avg_order_value.columns], axis=1
)
pivot_data_total_order_value = pivot_data_total_order_value.set_axis(
    [f'total_order_value_{col}' for col in pivot_data_total_order_value.columns], axis=1
)

# Attach both wide tables to every row of the matching id, keeping all other columns.
for order_value_pivot in (pivot_data_avg_order_value, pivot_data_total_order_value):
    data = data.merge(order_value_pivot, on='id', how='left')





In [232]:
# Rank label -> numeric code for ballet rows (artform == 0).
# Ballet has a 'premium' label ahead of 'rang 1', so its 'rang N' codes are N + 1.
ballet_rank_replace_dict = {
    'premium': 1,
    'rang 1': 2, 'rang 2': 3, 'rang 3': 4,
    'rang 4': 5, 'rang 5': 6, 'rang 6': 7,
}

# Rank label -> numeric code for opera rows (artform == 1): 'rang N' maps to N.
opera_rank_replace_dict = {
    'rang 1': 1, 'rang 2': 2, 'rang 3': 3, 'rang 4': 4,
    'rang 5': 5, 'rang 6': 6, 'rang 7': 7,
}

# Apply the matching mapping to the rows of each artform (ballet first, then opera).
# Series.replace leaves labels that are not keys of the mapping unchanged.
for artform_code, rank_codes in ((0, ballet_rank_replace_dict), (1, opera_rank_replace_dict)):
    is_artform = data['artform'] == artform_code
    data.loc[is_artform, 'rank'] = data.loc[is_artform, 'rank'].replace(rank_codes)


  data.loc[data['artform'] == 0, 'rank'] = data.loc[data['artform'] == 0, 'rank'].replace(ballet_rank_replace_dict)
  data.loc[data['artform'] == 1, 'rank'] = data.loc[data['artform'] == 1, 'rank'].replace(opera_rank_replace_dict)


In [234]:
# lead_days: whole days between the earliest purchase and the performance start.
lead_time = data['start_date'] - data['min_purchase_date']
data['lead_days'] = lead_time.dt.days

# Keep rows with a non-negative lead time; rows where lead_days is missing
# are removed as well, because the comparison evaluates to False for NaN.
has_valid_lead = data['lead_days'] >= 0
data = data[has_valid_lead]


In [235]:
# next_purchase_date: the min_purchase_date of the following row of the same id
# (NaT on the last row of each id). This relies on the current row order within each id.
purchase_dates_by_id = data.groupby('id')['min_purchase_date']
data['next_purchase_date'] = purchase_dates_by_id.shift(-1)

# time: whole days from this purchase to the next one (NaN when there is no next purchase).
days_to_next = data['next_purchase_date'] - data['min_purchase_date']
data['time'] = days_to_next.dt.days

# Show the new columns, ordered by purchase date.
data.loc[:, ['id', 'min_purchase_date', 'next_purchase_date', 'time']].sort_values(by='min_purchase_date')

Unnamed: 0,id,min_purchase_date,next_purchase_date,time
323061,0037q00000bonyaqap,2014-02-03,2023-01-02,3255.0
323060,0037q00000bonyaqap,2014-02-03,2014-02-03,0.0
338606,0037q00000dmp6fqat,2014-03-12,2015-03-28,381.0
319555,0037q00000bolqnqax,2014-04-01,2015-03-25,358.0
216700,0037q00000bngk3qad,2014-10-14,NaT,
...,...,...,...,...
209997,0037q00000bneueqap,2024-12-01,NaT,
400191,003qs000003jcjeia0,2024-12-01,NaT,
123733,0037q000007cvwaqao,2024-12-02,NaT,
369938,0037q00000qcsdqqam,2024-12-02,NaT,


In [243]:
# Inspect all rows whose city is exactly 'bergen'.
data.loc[data['city'] == 'bergen']

Unnamed: 0,is_institutional,rank,country,email,municipality,city,production,start_date,artform,gender,...,avg_order_value_4,avg_order_value_5,total_order_value_1,total_order_value_2,total_order_value_3,total_order_value_4,total_order_value_5,lead_days,next_purchase_date,time
280,1,2.0,nl,emma30-9@hotmail.com,bergen (nh.),bergen,15/16 der rosenkavalier,2015-09-24 18:30:00,1,female,...,15.0,15.0,15.0,46.0,15.0,15.0,15.0,0,2015-09-25,1.0
281,1,3.0,nl,emma30-9@hotmail.com,bergen (nh.),bergen,15/16 notenkraker en muizenkoning,2016-01-01 14:00:00,0,female,...,15.0,15.0,15.0,46.0,15.0,15.0,15.0,98,2016-09-15,356.0
282,1,2.0,nl,emma30-9@hotmail.com,bergen (nh.),bergen,16/17 le nozze di figaro,2016-09-15 19:00:00,1,female,...,15.0,15.0,15.0,46.0,15.0,15.0,15.0,0,2017-01-24,131.0
283,1,2.0,nl,emma30-9@hotmail.com,bergen (nh.),bergen,16/17 die entfuhrung aus dem serail,2017-01-24 19:30:00,1,female,...,15.0,15.0,15.0,46.0,15.0,15.0,15.0,0,2017-02-16,23.0
284,1,1.0,nl,emma30-9@hotmail.com,bergen (nh.),bergen,16/17 made in amsterdam 1,2017-02-21 20:15:00,0,female,...,15.0,15.0,15.0,46.0,15.0,15.0,15.0,5,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361876,1,5.0,no,lidiakjode@gmail.com,,bergen,22/23 forsythe,2023-06-23 20:15:00,0,female,...,,,40.0,,,,,22,NaT,
362160,1,3.0,nl,marieke.denbleker@gmail.com,bergen (nh.),bergen,23/24 frida,2024-02-11 14:00:00,0,unknown,...,,,110.0,,,,,171,NaT,
402606,1,3.0,no,oksun12232000@yahoo.co.kr,,bergen,23/24 raymonda,2023-12-13 20:00:00,0,female,...,,,76.0,,,,,2,NaT,
409838,1,2.0,unknown,arry@dehaasvroege.nl,bergen (nh.),bergen,23/24 dancing dutch,2024-04-14 14:00:00,0,female,...,,,102.0,,,,,57,NaT,


In [240]:
# Row counts for the 100 most frequent cities (value_counts sorts in descending order).
data['city'].value_counts().head(100)

city
amsterdam        139343
utrecht           18615
haarlem           10680
's-gravenhage      9526
rotterdam          9234
                  ...  
hoorn               453
veenendaal          445
doorn               443
moscow              440
krommenie           438
Name: count, Length: 100, dtype: int64

In [None]:
# Age in whole years at the time of purchase.
# `purchase_date` was dropped from `data` right after the daily-sales merge, so
# referencing it here raises KeyError. The order's earliest purchase date
# (`min_purchase_date`) is still present and is used instead.
# Dividing by 365.25 is an approximation of calendar years; rows with a missing
# birthdate get NaN.
age_in_days = (data['min_purchase_date'] - data['birthdate']).dt.days
data['age_at_purchase'] = np.floor(age_in_days / 365.25)
