Process Attraction Features Data
This script reads the attraction features data from JSON and converts it to a parquet file

import pandas as pd
import json
from pathlib import Path

In [1]:
import pandas as pd
import json
from pathlib import Path


Define paths

In [2]:
data_dir = Path('../data')
json_file = data_dir / 'attraction_features.json'
parquet_file = data_dir / 'attraction_features.parquet'


Read JSON data

In [3]:
# Read JSON data
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Expand the dictionary into separate columns
df_expanded = pd.json_normalize(df['attractions_features'].tolist())

# Display first few rows and info
print("DataFrame Info:")
print(df_expanded.info())
print("\nFirst few rows:")
display(df_expanded.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               36 non-null     object
 1   attraction_type    36 non-null     object
 2   category           36 non-null     object
 3   max_height         36 non-null     object
 4   track_length       36 non-null     object
 5   max_speed          36 non-null     object
 6   g_force            36 non-null     object
 7   min_age            36 non-null     object
 8   min_height         36 non-null     object
 9   capacity_per_hour  36 non-null     int64 
dtypes: int64(1), object(9)
memory usage: 2.9+ KB
None

First few rows:


Unnamed: 0,name,attraction_type,category,max_height,track_length,max_speed,g_force,min_age,min_height,capacity_per_hour
0,Silver Star,Hyper Coaster,thrill,73 meters,1620 meters,127 km/h,4 G,8 years,140 cm,1750
1,Blue Fire Megacoaster,Launched Coaster,thrill,38 meters,1056 meters,100 km/h,3.8 G,7 years,130 cm,1720
2,Wodan – Timburcoaster,Wooden Coaster,thrill,40 meters,1050 meters,100 km/h,3.5 G,8 years,140 cm,1250
3,Voletarium,Flying Theater,family,16 meters,,25 km/h,1.5 G,All ages,,1400
4,Alpine Express Enzian,Powered Coaster,family,6 meters,430 meters,35 km/h,1.5 G,4 years,95 cm (with adult),900


Display first few rows and info

Save as parquet

In [4]:
df_expanded.to_parquet(parquet_file, index=False)
print(f"Data saved to {parquet_file}")

Data saved to ..\data\attraction_features.parquet


Verify the parquet file can be read

In [5]:
df_parquet = pd.read_parquet(parquet_file)
print("Verification - DataFrame Info:")
print(df_parquet.info())
print("\nVerification - First few rows:")
display(df_parquet.head())

Verification - DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               36 non-null     object
 1   attraction_type    36 non-null     object
 2   category           36 non-null     object
 3   max_height         36 non-null     object
 4   track_length       36 non-null     object
 5   max_speed          36 non-null     object
 6   g_force            36 non-null     object
 7   min_age            36 non-null     object
 8   min_height         36 non-null     object
 9   capacity_per_hour  36 non-null     int64 
dtypes: int64(1), object(9)
memory usage: 2.9+ KB
None

Verification - First few rows:


Unnamed: 0,name,attraction_type,category,max_height,track_length,max_speed,g_force,min_age,min_height,capacity_per_hour
0,Silver Star,Hyper Coaster,thrill,73 meters,1620 meters,127 km/h,4 G,8 years,140 cm,1750
1,Blue Fire Megacoaster,Launched Coaster,thrill,38 meters,1056 meters,100 km/h,3.8 G,7 years,130 cm,1720
2,Wodan – Timburcoaster,Wooden Coaster,thrill,40 meters,1050 meters,100 km/h,3.5 G,8 years,140 cm,1250
3,Voletarium,Flying Theater,family,16 meters,,25 km/h,1.5 G,All ages,,1400
4,Alpine Express Enzian,Powered Coaster,family,6 meters,430 meters,35 km/h,1.5 G,4 years,95 cm (with adult),900


In [6]:
from tqdm import tqdm

# Read the merged_rides parquet file
merged_rides = pd.read_parquet(data_dir / 'raw/51/merged_rides.parquet')

# Create a mapping dictionary from attraction name to its features
attraction_features_dict = df_expanded.set_index('name').to_dict('index')

# Function to get features for an attraction
def get_attraction_features(row):
    features = attraction_features_dict.get(row['ride_name'], {})
    for key, value in features.items():
        row[f'feature_{key}'] = value
    return row

# Apply the features to each row with tqdm progress bar
tqdm.pandas(desc="Adding features to rides")
merged_rides_with_features = merged_rides.progress_apply(get_attraction_features, axis=1)

# Save the updated DataFrame
merged_rides_with_features.to_parquet(data_dir / 'merged_rides_with_features.parquet', index=False)

# Verify the new file
print("\nVerification of merged rides with features:")
print(merged_rides_with_features.info())
print("\nSample of merged rides with features:")
display(merged_rides_with_features.head())

# Show which columns were added
new_columns = [col for col in merged_rides_with_features.columns if col not in merged_rides.columns]
print("\nNewly added feature columns:")
print(new_columns)

Adding features to rides: 100%|██████████| 6268117/6268117 [1:17:03<00:00, 1355.61it/s] 



Verification of merged rides with features:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6268117 entries, 0 to 6268116
Data columns (total 12 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   feature_attraction_type    object        
 1   feature_capacity_per_hour  float64       
 2   feature_category           object        
 3   feature_g_force            object        
 4   feature_max_height         object        
 5   feature_max_speed          object        
 6   feature_min_age            object        
 7   feature_min_height         object        
 8   feature_track_length       object        
 9   ride_name                  object        
 10  timestamp                  datetime64[ns]
 11  wait_time                  float64       
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 573.9+ MB
None

Sample of merged rides with features:


Unnamed: 0,feature_attraction_type,feature_capacity_per_hour,feature_category,feature_g_force,feature_max_height,feature_max_speed,feature_min_age,feature_min_height,feature_track_length,ride_name,timestamp,wait_time
0,,,,,,,,,,Alpine Express 'Enzian',2019-06-11 09:00:00,5.0
1,,,,,,,,,,Alpine Express 'Enzian',2019-06-11 09:05:00,5.0
2,,,,,,,,,,Alpine Express 'Enzian',2019-06-11 09:10:00,1.0
3,,,,,,,,,,Alpine Express 'Enzian',2019-06-11 09:15:00,1.0
4,,,,,,,,,,Alpine Express 'Enzian',2019-06-11 09:20:00,1.0



Newly added feature columns:
['feature_attraction_type', 'feature_capacity_per_hour', 'feature_category', 'feature_g_force', 'feature_max_height', 'feature_max_speed', 'feature_min_age', 'feature_min_height', 'feature_track_length']
