# Process Attraction Features Data

This notebook reads the attraction features from a JSON file, saves them as a parquet file, and then joins them onto the merged ride data as `feature_*` columns.

In [5]:
import os
import json
from pathlib import Path

import pandas as pd

Define paths

In [6]:
# All locations are derived from one relative data root so the notebook is not tied to an absolute path.
data_dir = Path('../data')
raw_dir = data_dir / 'raw'
ride_features_dir = data_dir / 'processed' / 'ride_features'

# Input: raw attraction features (JSON). Output: the flattened feature table (parquet).
json_file = raw_dir / 'attraction_features.json'
parquet_file = ride_features_dir / 'attraction_features.parquet'

# Create the output directory up front; exist_ok keeps re-running this cell harmless.
os.makedirs(ride_features_dir, exist_ok=True)

Read JSON data

In [3]:
# Load the raw JSON document; the encoding is given explicitly so the result
# does not depend on the platform default.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Flatten the records stored under 'attractions_features' into one column per field.
df_expanded = pd.json_normalize(df['attractions_features'].tolist())

print("DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df_expanded.info()
print("\nFirst few rows:")
display(df_expanded.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               36 non-null     object 
 1   attraction_type    36 non-null     object 
 2   category           36 non-null     object 
 3   max_height         35 non-null     float64
 4   track_length       27 non-null     float64
 5   max_speed          34 non-null     float64
 6   g_force            36 non-null     float64
 7   min_age            36 non-null     int64  
 8   min_height         35 non-null     float64
 9   capacity_per_hour  36 non-null     int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 2.9+ KB
None

First few rows:


Unnamed: 0,name,attraction_type,category,max_height,track_length,max_speed,g_force,min_age,min_height,capacity_per_hour
0,silver star,Hyper Coaster,thrill,73.0,1620.0,127.0,4.0,8,140.0,1750
1,blue fire megacoaster,Launched Coaster,thrill,38.0,1056.0,100.0,3.8,7,130.0,1720
2,VirtualLine: WODAN - Timburcoaster,Wooden Coaster,thrill,40.0,1050.0,100.0,3.5,8,140.0,1250
3,voletarium,Flying Theater,family,16.0,,25.0,1.5,0,0.0,1400
4,alpine express enzian,Powered Coaster,family,6.0,430.0,35.0,1.5,4,95.0,900


The output above shows the DataFrame info and the first few rows of the flattened feature table.

Save as parquet

In [7]:
# Write the flattened feature table to parquet_file; index=False leaves the
# DataFrame index out of the stored file.
df_expanded.to_parquet(parquet_file, index=False)
print(f"Data saved to {parquet_file}")

Data saved to ../data/processed/ride_features/attraction_features.parquet


Verify the parquet file can be read

In [8]:
# Read the file back to confirm that what was written is loadable.
df_parquet = pd.read_parquet(parquet_file)

# Cheap round-trip checks: the reloaded table must have the same shape and
# columns as the frame that was saved.
assert df_parquet.shape == df_expanded.shape, "parquet round trip changed the table shape"
assert list(df_parquet.columns) == list(df_expanded.columns), "parquet round trip changed the columns"

print("Verification - DataFrame Info:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) produced.
df_parquet.info()
print("\nVerification - First few rows:")
display(df_parquet.head())

Verification - DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               36 non-null     object 
 1   attraction_type    36 non-null     object 
 2   category           36 non-null     object 
 3   max_height         35 non-null     float64
 4   track_length       27 non-null     float64
 5   max_speed          34 non-null     float64
 6   g_force            36 non-null     float64
 7   min_age            36 non-null     int64  
 8   min_height         35 non-null     float64
 9   capacity_per_hour  36 non-null     int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 2.9+ KB
None

Verification - First few rows:


Unnamed: 0,name,attraction_type,category,max_height,track_length,max_speed,g_force,min_age,min_height,capacity_per_hour
0,silver star,Hyper Coaster,thrill,73.0,1620.0,127.0,4.0,8,140.0,1750
1,blue fire megacoaster,Launched Coaster,thrill,38.0,1056.0,100.0,3.8,7,130.0,1720
2,VirtualLine: WODAN - Timburcoaster,Wooden Coaster,thrill,40.0,1050.0,100.0,3.5,8,140.0,1250
3,voletarium,Flying Theater,family,16.0,,25.0,1.5,0,0.0,1400
4,alpine express enzian,Powered Coaster,family,6.0,430.0,35.0,1.5,4,95.0,900


In [9]:
def get_attraction_features_vectorized(df, features_dict):
    """Return a copy of ``df`` extended with one ``feature_<key>`` column per attraction feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``ride_name`` column. It is not modified.
    features_dict : dict
        Mapping ``{ride_name: {feature_key: value}}``. Only rides that occur
        in ``df['ride_name']`` are looked up.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the new ``feature_*`` columns appended. Rows
        whose ride has no value for a given feature get a missing value in
        that column. A column is only created for keys found on at least one
        ride present in ``df``.
    """
    ride_names = df['ride_name']

    # Invert {ride: {key: value}} into {feature_<key>: {ride: value}} so that
    # each output column can be produced with a single Series.map call.
    per_column_lookup = {}
    for ride_name in ride_names.unique():
        for attr, attr_value in features_dict.get(ride_name, {}).items():
            per_column_lookup.setdefault(f'feature_{attr}', {})[ride_name] = attr_value

    # Work on a copy so the caller's frame is left untouched.
    enriched = df.copy()
    for column, lookup in per_column_lookup.items():
        enriched[column] = ride_names.map(lookup)

    return enriched

# Key the feature table by attraction name: {name: {feature: value}} is the
# lookup structure get_attraction_features_vectorized expects.
attraction_features_dict = df_expanded.set_index('name').to_dict(orient='index')

# Ride-level table to enrich (already merged with holiday data, per the file name).
merged_rides = pd.read_parquet(data_dir / 'processed/ep/merged_with_holidays.parquet')

# Attach the feature_* columns using per-column Series.map lookups rather than
# a row-wise apply.
merged_rides_with_features = get_attraction_features_vectorized(
    merged_rides,
    attraction_features_dict,
)

In [11]:
# Persist the enriched ride table next to its input; index=False leaves the
# DataFrame index out of the stored file.
merged_rides_with_features.to_parquet(data_dir / 'processed' / 'ep' / 'merged_with_ride_features.parquet', index=False)

print("\nVerification of merged rides with features:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids emitting a stray "None" line after the report.
merged_rides_with_features.info()
print("\nSample of merged rides with features:")
display(merged_rides_with_features.head())

# Columns present after the enrichment but absent from the input frame.
new_columns = [col for col in merged_rides_with_features.columns if col not in merged_rides.columns]
print("\nNewly added feature columns:")
print(new_columns)


Verification of merged rides with features:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14196903 entries, 0 to 14196902
Data columns (total 19 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   ride_name                  object        
 1   timestamp                  datetime64[ns]
 2   wait_time                  float64       
 3   closed                     bool          
 4   temperature                float64       
 5   rain                       float64       
 6   wind                       float64       
 7   is_german_holiday          bool          
 8   is_swiss_holiday           bool          
 9   is_french_holiday          bool          
 10  feature_attraction_type    object        
 11  feature_category           object        
 12  feature_max_height         float64       
 13  feature_track_length       float64       
 14  feature_max_speed          float64       
 15  feature_g_force            float64  

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind,is_german_holiday,is_swiss_holiday,is_french_holiday,feature_attraction_type,feature_category,feature_max_height,feature_track_length,feature_max_speed,feature_g_force,feature_min_age,feature_min_height,feature_capacity_per_hour
0,alpine express enzian,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1,False,False,False,Powered Coaster,family,6.0,430.0,35.0,1.5,4.0,95.0,900.0
1,poppy towers,2017-05-23 09:00:00,0.0,False,18.7,0.0,1.1,False,False,False,Children's Drop Tower,children,8.0,,15.0,1.5,4.0,100.0,500.0
2,silver star,2017-05-23 09:00:00,0.0,False,18.7,0.0,1.1,False,False,False,Hyper Coaster,thrill,73.0,1620.0,127.0,4.0,8.0,140.0,1750.0
3,swiss bob run,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1,False,False,False,Bobsled Coaster,family,20.0,500.0,65.0,2.8,6.0,120.0,1200.0
4,tirol log flume,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1,False,False,False,Log Flume,water,22.0,600.0,50.0,2.0,4.0,100.0,1400.0



Newly added feature columns:
['feature_attraction_type', 'feature_category', 'feature_max_height', 'feature_track_length', 'feature_max_speed', 'feature_g_force', 'feature_min_age', 'feature_min_height', 'feature_capacity_per_hour']
