# USS Queue Tolerance Prediction - Feature & Label Engineering

## Project Overview
This notebook performs comprehensive feature engineering and label generation for predicting tourist queue tolerance at Universal Studios Singapore (USS). The focus is on wait time tolerance analysis using NLP-based approaches with minimal hardcoded keywords.

**Main Tasks:**
- Queue tolerance classification (3-class)
- Wait time threshold regression
- Wait experience satisfaction (4-level)
- Time sensitivity classification (binary)

**Features (32D):** Fine-grained sentiment (12D) + Temporal-spatial (5D) + Facility (8D) + User behavior (5D) + Time sensitivity (2D)

## 1. Environment Setup

In [4]:
!pip install vaderSentiment spacy pandas numpy scikit-learn -q
!python -m spacy download en_core_web_sm -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import os

# A kernel started inside the notebooks/ folder is moved up one level so the
# relative data/ and output/ paths used later resolve from the project root.
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    os.chdir(os.pardir)
    print("Adjusted working directory to project root")

Adjusted working directory to project root


In [4]:
import ast
import json
import re
import warnings
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

warnings.filterwarnings('ignore')

## 2. Data Loading & Configuration

In [5]:
# Load NLP models and configurations
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

# Load facility configurations.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default, which open() falls back to when no encoding is given.
with open('output/dashboard/config/FullList_normalized_v1.json', 'r', encoding='utf-8') as f:
    facility_config = json.load(f)
with open('output/dashboard/config/others_normalized_v1.json', 'r', encoding='utf-8') as f:
    others_config = json.load(f)

# Load review dataset
data_path = 'data/processed/'
csv_file = "USS_Reviews_Silver_Facility_Labeled.csv"
df = pd.read_csv(os.path.join(data_path, csv_file))

print(f"Data loaded: {df.shape}")

Data loaded: (19224, 9)


In [6]:
# Create facility thrill scores and popularity metrics
def create_facility_scores():
    """Assemble the facility-name -> thrill-score lookup.

    Hand-assigned scores for known rides and shows are merged first. Every other
    facility found in the module-level ``facility_config`` then receives a
    fallback (0.3 for the 'rides' / 'shows' categories, 0.0 for anything else),
    and finally every entry of ``others_config`` is set to 0.0.

    Returns:
        dict: facility name -> thrill score.
    """
    # Hand-assigned ride scores (described by the author as research-based).
    rides_scores = {
        "battlestar galactica cylon": 1.0,
        "battlestar galactica human": 0.9,
        "revenge of the mummy": 0.95,
        "transformers the ride the ultimate 3d battle": 0.8,
        "jurassic park rapids adventure": 0.7,
        "canopy flyer": 0.5,
        "enchanted airways": 0.5,
        "puss in boots giant journey": 0.4,
        "accelerator": 0.4,
        "magic potion spin": 0.3,
        "dino soarin": 0.2,
        "sesame street spaghetti space chase": 0.3,
        "silly swirly": 0.2,
        "buggie boogie": 0.2,
        "despicable me minion mayhem": 0.4,
        "treasure hunters": 0.3,
        "sesame street goes bollywood": 0.3,
    }

    # Hand-assigned show scores.
    shows_scores = {
        "waterworld": 0.8,
        "lights camera action": 0.6,
        "shrek 4 d adventure": 0.4,
        "transformers voices of cybertron": 0.3,
        "hatched featuring dr rodney": 0.1,
        "raptor encounter with blue": 0.2,
        "raptor encounter generations": 0.2,
        "despicable me family portrait": 0.1,
    }

    # Shows are merged after rides, so a shared key would take the show score.
    all_scores = dict(rides_scores)
    all_scores.update(shows_scores)

    # Fill in any configured facility that has no hand-assigned score yet.
    for categories in facility_config.values():
        for category, items in categories.items():
            fallback = 0.3 if category in ('rides', 'shows') else 0.0
            for item in items:
                all_scores.setdefault(item, fallback)

    # Entries of others_config always end up at 0.0, overriding earlier values.
    all_scores.update((item, 0.0) for item in others_config)

    return all_scores

facility_thrill_scores = create_facility_scores()

# Calculate facility popularity from dataset mentions
def calculate_facility_popularity(df):
    """Compute, per facility, the share of reviews that mention it.

    The ``label_rides`` / ``label_shows`` / ``label_other`` cells hold list
    literals stored as text. They are parsed with ``ast.literal_eval`` instead
    of ``eval`` so that cell content coming from the CSV can never execute code.

    Args:
        df: DataFrame with the three label columns.

    Returns:
        dict: facility name -> mentions / len(df). Rows whose label cells cannot
        be parsed as literals (or do not concatenate as lists) are skipped but
        still count in the denominator.
    """
    all_facilities = []
    for _, row in df.iterrows():
        try:
            rides = ast.literal_eval(row['label_rides']) if pd.notna(row['label_rides']) else []
            shows = ast.literal_eval(row['label_shows']) if pd.notna(row['label_shows']) else []
            others = ast.literal_eval(row['label_other']) if pd.notna(row['label_other']) else []
            all_facilities.extend(rides + shows + others)
        except (ValueError, SyntaxError, TypeError):
            # Malformed or non-list cell: best-effort skip of this row only.
            continue

    facility_counts = Counter(all_facilities)
    total_reviews = len(df)
    return {facility: count / total_reviews for facility, count in facility_counts.items()}

# Popularity is computed from the full review table `df` loaded above.
facility_popularity = calculate_facility_popularity(df)
# The reported item count is the size of the thrill-score lookup, not of the popularity table.
print(f"Facility configuration completed: {len(facility_thrill_scores)} items")

Facility configuration completed: 92 items


## 3. Data Preprocessing

In [7]:
# Extract temporal information using regex patterns
def extract_temporal_info(integrated_review):
    """Pull the ``[VISIT_TIME: ...]`` and ``[WAIT_TIME: ...]`` tag values out of a review.

    Args:
        integrated_review: Review text carrying the bracketed tags. Non-string
            input (e.g. NaN/None from a missing CSV cell) is treated as having
            no tags instead of raising ``TypeError`` inside ``re.search``.

    Returns:
        tuple: ``(visit_time, wait_time)``; each element is the stripped tag
        value, or ``None`` when the tag is absent.
    """
    if not isinstance(integrated_review, str):
        return None, None

    def _tag_value(tag):
        # First occurrence of "[TAG: value]"; the value is everything up to the closing bracket.
        match = re.search(r'\[' + tag + r': ([^\]]+)\]', integrated_review)
        return match.group(1).strip() if match else None

    return _tag_value('VISIT_TIME'), _tag_value('WAIT_TIME')

# Apply temporal extraction and filter data.
# Both columns are built from a plain list of (visit_time, wait_time) tuples;
# wrapping each row's result in pd.Series inside .apply is a slow path.
temporal_pairs = [extract_temporal_info(text) for text in df['integrated_review']]
df[['visit_time', 'wait_time']] = pd.DataFrame(
    temporal_pairs, index=df.index, columns=['visit_time', 'wait_time']
)

# Keep only reviews carrying both tags; .copy() gives an independent frame so
# later column assignments do not write through to df.
df_filtered = df.dropna(subset=['visit_time', 'wait_time']).copy()

# Convert wait time strings to numeric values
def convert_wait_time_numeric(wait_time_str):
    """Map a wait-time bucket label to a representative number of minutes.

    Recognised buckets (case-insensitive substring match): "no wait" -> 0,
    "up to 10 min" -> 5, "1030 min" -> 20, "3060 min" -> 45, "1 hr" -> 60.
    Range dashes and "+" are stripped before matching, so the un-normalised
    spellings "10-30 min", "30–60 min" and "1+ hr" land in the same buckets
    instead of silently falling through to 0.

    Args:
        wait_time_str: Bucket label; NaN/None is accepted.

    Returns:
        int: Minutes for the bucket; 0 for missing or unrecognised input.
    """
    if pd.isna(wait_time_str):
        return 0

    normalized = str(wait_time_str).lower()
    # "10-30 min" / "10 – 30 min" -> "1030 min" (the form the dataset uses).
    normalized = re.sub(r'\s*[-\u2010-\u2015\u2212]\s*', '', normalized)
    # "1+ hr" -> "1 hr"
    normalized = normalized.replace('+', '')

    # Order matters: the first marker found wins, as in an if/elif chain.
    buckets = (
        ('no wait', 0),
        ('up to 10 min', 5),
        ('1030 min', 20),
        ('3060 min', 45),
        ('1 hr', 60),
    )
    for marker, minutes in buckets:
        if marker in normalized:
            return minutes
    return 0

# Turn each wait-time bucket label into representative minutes; labels the
# converter does not recognise come back as 0.
df_filtered['wait_time_numeric'] = df_filtered['wait_time'].apply(convert_wait_time_numeric)
print(f"Preprocessed data: {df_filtered.shape[0]} rows")

Preprocessed data: 5409 rows


## 4. Feature Engineering - Fine-grained Sentiment Analysis (12D)

In [8]:
# Advanced sentiment analysis using spaCy NLP features
def extract_fine_grained_sentiment_spacy(text):
    """Extract 12D sentiment features using spaCy linguistic analysis

    Relies on the module-level ``nlp`` (spaCy pipeline) and ``analyzer``
    (VADER) objects.

    Args:
        text: Review text; passed unchanged to both ``analyzer.polarity_scores``
            and ``nlp``.

    Returns:
        list: 12 values in this order: wait_queue_perception,
        wait_queue_management, wait_environment, facility_quality,
        technical_issues, safety_perception, staff_attitude, customer_service,
        express_value, overall_value, recommend_score, overall_satisfaction.
    """
    overall_scores = analyzer.polarity_scores(text)
    doc = nlp(text)

    # 1-3: Wait Experience Sentiment using dependency parsing
    wait_related_spans = []
    # NOTE(review): confirm the loaded pipeline actually emits a "DURATION"
    # label; if it does not, that entry simply never matches.
    time_entities = [ent for ent in doc.ents if ent.label_ in ["TIME", "DURATION", "CARDINAL"]]

    # Find sentences containing time expressions or queue-related dependencies
    for sent in doc.sents:
        # Check for time entities or queue-related tokens
        # An entity counts for a sentence only when it lies fully inside it.
        has_time_ref = any(ent.start >= sent.start and ent.end <= sent.end for ent in time_entities)
        has_queue_deps = any(token.lemma_ in ['wait', 'queue', 'line'] for token in sent)

        if has_time_ref or has_queue_deps:
            wait_related_spans.append(sent.text)

    if wait_related_spans:
        # Score all wait-related sentences together as one text.
        wait_text = ' '.join(wait_related_spans)
        wait_sentiment = analyzer.polarity_scores(wait_text)
        wait_queue_perception = wait_sentiment['compound']
        wait_queue_management = wait_sentiment['pos'] - wait_sentiment['neg']
        wait_environment = len(wait_related_spans) / len(list(doc.sents))  # Proportion of wait mentions
    else:
        wait_queue_perception = 0
        wait_queue_management = 0
        wait_environment = 0

    # 4-6: Facility Experience using semantic roles
    attraction_spans = []
    for sent in doc.sents:
        # Look for entertainment/attraction related semantic patterns
        # A sentence qualifies when a noun/proper noun has one of the listed
        # adjectives attached as an adjectival modifier (amod).
        for token in sent:
            if (token.pos_ in ['NOUN', 'PROPN'] and 
                any(child.dep_ == 'amod' and child.lemma_ in ['fun', 'exciting', 'boring', 'great'] 
                    for child in token.children)):
                attraction_spans.append(sent.text)
                break  # one match is enough; avoid adding the sentence twice

    if attraction_spans:
        attraction_text = ' '.join(attraction_spans)
        facility_sentiment = analyzer.polarity_scores(attraction_text)
        facility_quality = facility_sentiment['compound']
    else:
        # No qualifying sentence: fall back to the whole-review compound score.
        facility_quality = overall_scores['compound']

    # Technical issues using negative semantic patterns
    # NOTE(review): the penalty is applied only when the close/break/maintenance
    # token has a negation child (e.g. "not closed"); un-negated mentions are
    # ignored. Confirm this is the intended direction.
    technical_issues = 0
    for token in doc:
        if (token.lemma_ in ['close', 'break', 'maintenance'] and 
            any(child.dep_ == 'neg' for child in token.children)):
            technical_issues -= 0.1

    # Safety perception using sentiment-bearing adjectives
    safety_score = 0
    for token in doc:
        if token.pos_ == 'ADJ' and token.lemma_ in ['safe', 'secure']:
            safety_score += 0.2
        elif token.pos_ == 'ADJ' and token.lemma_ in ['dangerous', 'unsafe']:
            safety_score -= 0.2
    safety_perception = np.clip(safety_score, -1, 1)

    # 7-8: Service Quality using person entities and service verbs
    service_spans = []
    for sent in doc.sents:
        # Look for staff/service related mentions
        has_person_ref = any(ent.label_ == 'PERSON' for ent in doc.ents 
                           if ent.start >= sent.start and ent.end <= sent.end)
        has_service_verb = any(token.lemma_ in ['help', 'serve', 'assist'] for token in sent)

        if has_person_ref or has_service_verb:
            service_spans.append(sent.text)

    if service_spans:
        service_text = ' '.join(service_spans)
        service_sentiment = analyzer.polarity_scores(service_text)
        staff_attitude = service_sentiment['compound']
        customer_service = service_sentiment['pos'] - service_sentiment['neg']
    else:
        staff_attitude = 0
        customer_service = 0

    # 9-10: Price Perception using money entities and value expressions
    price_spans = []
    money_entities = [ent for ent in doc.ents if ent.label_ == 'MONEY']

    for sent in doc.sents:
        has_money_ref = any(ent.start >= sent.start and ent.end <= sent.end for ent in money_entities)
        has_value_term = any(token.lemma_ in ['price', 'cost', 'value', 'worth', 'expensive'] 
                            for token in sent)

        if has_money_ref or has_value_term:
            price_spans.append(sent.text)

    if price_spans:
        price_text = ' '.join(price_spans)
        price_sentiment = analyzer.polarity_scores(price_text)
        # express_value is non-zero only when the price sentences mention "express".
        express_value = price_sentiment['compound'] if 'express' in price_text.lower() else 0
        overall_value = price_sentiment['pos'] - price_sentiment['neg']
    else:
        express_value = 0
        overall_value = 0

    # 11-12: Overall Satisfaction using semantic patterns
    # NOTE(review): unlike safety_perception, recommend_score is not clipped,
    # so it can exceed 1.0 when several tokens match.
    recommend_score = 0
    for token in doc:
        if token.lemma_ in ['recommend', 'suggest'] and token.dep_ == 'ROOT':
            recommend_score += 0.3
        elif token.lemma_ in ['return', 'again'] and any(child.lemma_ == 'would' for child in token.children):
            recommend_score += 0.2

    overall_satisfaction = overall_scores['compound']

    return [
        wait_queue_perception, wait_queue_management, wait_environment,
        facility_quality, technical_issues, safety_perception,
        staff_attitude, customer_service,
        express_value, overall_value,
        recommend_score, overall_satisfaction
    ]

# Score every retained review, then spread the 12-element lists into named columns.
sentiment_features = df_filtered['review'].apply(extract_fine_grained_sentiment_spacy)
sentiment_df = pd.DataFrame(
    data=sentiment_features.tolist(),
    columns=[
        # wait experience
        'wait_queue_perception', 'wait_queue_management', 'wait_environment',
        # facility experience
        'facility_quality', 'technical_issues', 'safety_perception',
        # service
        'staff_attitude', 'customer_service',
        # price
        'express_value', 'overall_value',
        # overall
        'recommend_score', 'overall_satisfaction',
    ],
)

print("Fine-grained sentiment features extracted (12D)")

Fine-grained sentiment features extracted (12D)


In [9]:
sentiment_df.describe()

Unnamed: 0,wait_queue_perception,wait_queue_management,wait_environment,facility_quality,technical_issues,safety_perception,staff_attitude,customer_service,express_value,overall_value,recommend_score,overall_satisfaction
count,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0
mean,0.13895,0.039444,0.254061,0.515927,-7.4e-05,0.001035,0.065949,0.025947,0.010243,0.024823,0.038491,0.527108
std,0.352155,0.110499,0.298179,0.476243,0.002719,0.020326,0.243168,0.103063,0.093706,0.104304,0.114295,0.488991
min,-0.9692,-0.756,0.0,-0.9888,-0.1,-0.2,-0.9361,-0.61,-0.9539,-0.615,0.0,-0.9888
25%,0.0,0.0,0.0,0.3291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3257
50%,0.0,0.0,0.181818,0.6757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7099
75%,0.3612,0.069,0.428571,0.8703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8957
max,0.9925,0.726,1.0,0.9992,0.0,0.6,0.9896,1.0,0.9676,0.839,1.8,0.9997


## 5. Feature Engineering - Temporal-Spatial Context (5D)

In [10]:
# Extract temporal-spatial context features
def extract_temporal_spatial_features(row):
    """Build the 5 temporal context features for one review row.

    Args:
        row: Mapping/Series with ``visit_time``, ``wait_time_numeric`` and
            ``publishedAtDate`` (expected as ``%m/%d/%y`` text).

    Returns:
        list: ``[weekday, weekend, holiday, wait_time_numeric, season_factor]``.
        The first three are a one-hot encoding of ``visit_time``.
        ``season_factor`` is 1.0 for months Dec-Feb and Jun-Aug, 0.5 for other
        months, and 0.75 when the date is missing or cannot be parsed.
    """
    # Visit time one-hot encoding
    visit_time = row['visit_time']
    weekday = 1 if visit_time == 'Weekday' else 0
    weekend = 1 if visit_time == 'Weekend' else 0
    holiday = 1 if visit_time == 'Public holiday' else 0

    # Numeric wait time
    wait_time_numeric = row['wait_time_numeric']

    # Season factor. Peak season: Dec-Feb (holidays), Jun-Aug (summer).
    season_factor = 0.75  # "unknown" fallback
    try:
        date_obj = pd.to_datetime(row['publishedAtDate'], format='%m/%d/%y')
        # A NaN date parses to NaT without raising, and NaT.month is NaN, which
        # would otherwise be misread as an off-peak month (0.5).
        if not pd.isna(date_obj):
            season_factor = 1.0 if date_obj.month in (12, 1, 2, 6, 7, 8) else 0.5
    except Exception:
        # Best effort: any parsing problem keeps the "unknown" fallback.
        season_factor = 0.75

    return [weekday, weekend, holiday, wait_time_numeric, season_factor]

# Row-wise extraction yields one 5-element list per review; unpack into columns.
temporal_features = df_filtered.apply(extract_temporal_spatial_features, axis='columns')
temporal_df = pd.DataFrame(
    data=temporal_features.tolist(),
    columns=['weekday', 'weekend', 'holiday', 'wait_time_numeric', 'season_factor'],
)

print("Temporal-spatial features extracted (5D)")

Temporal-spatial features extracted (5D)


In [11]:
temporal_df.describe()

Unnamed: 0,weekday,weekend,holiday,wait_time_numeric,season_factor
count,5409.0,5409.0,5409.0,5409.0,5409.0
mean,0.667221,0.270845,0.061934,17.542984,0.746534
std,0.471252,0.444437,0.241058,21.431294,0.249999
min,0.0,0.0,0.0,0.0,0.5
25%,0.0,0.0,0.0,0.0,0.5
50%,1.0,0.0,0.0,5.0,0.5
75%,1.0,1.0,0.0,20.0,1.0
max,1.0,1.0,1.0,60.0,1.0


## 6. Feature Engineering - Facility Characteristics (8D)

In [12]:
# Extract facility-related features
def extract_facility_features(row):
    """Build the 8 facility features for one review row.

    The label cells hold list literals stored as text; they are parsed with
    ``ast.literal_eval`` rather than ``eval`` so CSV content cannot execute code.
    If any of the three cells fails to parse, all three are treated as empty.

    Args:
        row: Mapping/Series with ``label_rides``, ``label_shows``, ``label_other``.

    Returns:
        list: ``[rides_count, shows_count, other_count, avg_thrill_level,
        max_thrill_level, facility_diversity, express_usage,
        main_facility_popularity]``.
    """
    try:
        rides = ast.literal_eval(row['label_rides']) if pd.notna(row['label_rides']) else []
        shows = ast.literal_eval(row['label_shows']) if pd.notna(row['label_shows']) else []
        others = ast.literal_eval(row['label_other']) if pd.notna(row['label_other']) else []
    except (ValueError, SyntaxError, TypeError):
        rides, shows, others = [], [], []

    # Basic facility counts
    rides_count = len(rides)
    shows_count = len(shows)
    other_count = len(others)

    # Thrill level calculations (facilities without a score count as 0)
    all_facilities = rides + shows + others
    thrill_scores = [facility_thrill_scores.get(facility, 0) for facility in all_facilities]

    avg_thrill_level = np.mean(thrill_scores) if thrill_scores else 0
    max_thrill_level = max(thrill_scores) if thrill_scores else 0

    # Facility diversity: fraction of the three categories that are present
    facility_types = sum([rides_count > 0, shows_count > 0, other_count > 0])
    facility_diversity = facility_types / 3

    # Express pass usage indicator
    express_usage = 1 if 'express' in others else 0

    # Main facility popularity: the most popular facility mentioned in the review
    popularities = [facility_popularity.get(facility, 0) for facility in all_facilities]
    main_facility_popularity = max(popularities) if popularities else 0

    return [
        rides_count, shows_count, other_count,
        avg_thrill_level, max_thrill_level,
        facility_diversity, express_usage, main_facility_popularity
    ]

# One 8-element list per review, unpacked into the facility feature table.
facility_features = df_filtered.apply(extract_facility_features, axis='columns')
facility_df = pd.DataFrame(
    data=facility_features.tolist(),
    columns=[
        # counts per category
        'rides_count', 'shows_count', 'other_count',
        # thrill summary
        'avg_thrill_level', 'max_thrill_level',
        # mix, express flag, popularity
        'facility_diversity', 'express_usage', 'main_facility_popularity',
    ],
)

print("Facility features extracted (8D)")

Facility features extracted (8D)


In [13]:
facility_df.describe()

Unnamed: 0,rides_count,shows_count,other_count,avg_thrill_level,max_thrill_level,facility_diversity,express_usage,main_facility_popularity
count,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0,5409.0
mean,0.599187,0.69144,1.527639,0.198509,0.361675,0.544155,0.264929,0.178051
std,1.169212,0.874103,1.415659,0.22167,0.35794,0.24474,0.441336,0.083959
min,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,5.2e-05
25%,0.0,0.0,1.0,0.0,0.0,0.333333,0.0,0.111527
50%,0.0,1.0,1.0,0.15,0.3,0.333333,0.0,0.183885
75%,1.0,1.0,2.0,0.3,0.7,0.666667,1.0,0.269143
max,10.0,9.0,12.0,1.0,1.0,1.0,1.0,0.269143


## 7. Feature Engineering - User Behavior Analysis (5D)

In [14]:
# Extract user behavior features using spaCy linguistic analysis
def extract_user_behavior_features_spacy(text):
    """Extract the 5 user-behaviour features from one review.

    Relies on the module-level ``nlp``, ``analyzer`` and ``facility_thrill_scores``.

    Args:
        text: Review text. An empty string is handled (all length-normalised
            terms become 0) instead of raising ``ZeroDivisionError``.

    Returns:
        list: ``[review_length_norm, detailed_mention, complaint_intensity,
        positive_sentiment_ratio, group_interaction_indicator]``.
    """
    doc = nlp(text)

    # 1. Review complexity (normalized length, capped at 500 characters)
    review_length_norm = min(len(text) / 500, 1.0)

    # 2. Detailed facility engagement using entity recognition
    facility_entity_mentions = 0
    for ent in doc.ents:
        if ent.label_ in ['ORG', 'PRODUCT', 'EVENT'] or ent.text.lower() in facility_thrill_scores:
            facility_entity_mentions += 1
    detailed_mention = min(facility_entity_mentions / 3, 1.0)

    # 3. Complaint intensity using linguistic features
    exclamation_count = text.count('!')
    caps_ratio = sum(1 for c in text if c.isupper()) / len(text) if text else 0

    # Analyze negation patterns and emotional intensity
    # NOTE(review): the intensifier branch looks for ADJ *children* of the
    # adverb; in "very good" the adverb is normally a child of the adjective,
    # so this branch may rarely fire — confirm whether token.head was intended.
    negative_modifiers = 0
    for token in doc:
        if token.dep_ == 'neg' or token.lemma_ in ['not', 'never', 'no']:
            negative_modifiers += 1
        elif token.pos_ == 'ADV' and any(child.pos_ == 'ADJ' for child in token.children):
            if token.lemma_ in ['very', 'extremely', 'really']:
                negative_modifiers += 0.5

    # Guard the per-token normalisation: an empty review yields a zero-length doc.
    token_count = len(doc)
    negation_density = negative_modifiers / token_count if token_count else 0

    complaint_intensity = min(
        (exclamation_count / 5 + caps_ratio * 3 + negation_density) / 3,
        1.0
    )

    # 4. Positive sentiment ratio from VADER
    sentiment_scores = analyzer.polarity_scores(text)
    positive_sentiment_ratio = sentiment_scores['pos']

    # 5. Group interaction patterns
    personal_pronouns = [token.text.lower() for token in doc if token.pos_ == "PRON"]
    we_us_count = sum(1 for p in personal_pronouns if p in ['we', 'us', 'our'])
    i_me_count = sum(1 for p in personal_pronouns if p in ['i', 'me', 'my'])

    # Family/group indicators: multi-word PERSON entities and group nouns
    group_indicators = 0
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and len(ent.text.split()) > 1:  # Multiple names
            group_indicators += 1

    for token in doc:
        if token.lemma_ in ['family', 'friend', 'kid', 'child', 'together', 'group']:
            group_indicators += 1

    # Group interaction score: 1.0 strong, 0.5 some evidence, 0.0 none
    if we_us_count > i_me_count and group_indicators > 0:
        group_interaction_indicator = 1.0
    elif we_us_count > 0 or group_indicators > 0:
        group_interaction_indicator = 0.5
    else:
        group_interaction_indicator = 0.0

    return [
        review_length_norm, detailed_mention, complaint_intensity,
        positive_sentiment_ratio, group_interaction_indicator
    ]

# One 5-element list per review text, unpacked into the user-behaviour table.
user_features = df_filtered['review'].apply(extract_user_behavior_features_spacy)
user_df = pd.DataFrame(
    data=user_features.tolist(),
    columns=[
        'review_length_norm',
        'detailed_mention',
        'complaint_intensity',
        'positive_sentiment_ratio',
        'group_interaction_indicator',
    ],
)

print("User behavior features extracted (5D)")

User behavior features extracted (5D)


In [15]:
user_df.describe()

Unnamed: 0,review_length_norm,detailed_mention,complaint_intensity,positive_sentiment_ratio,group_interaction_indicator
count,5409.0,5409.0,5409.0,5409.0,5409.0
mean,0.480635,0.169656,0.060851,0.202715,0.219264
std,0.32559,0.289308,0.086207,0.147405,0.286678
min,0.02,0.0,0.0,0.0,0.0
25%,0.198,0.0,0.021978,0.099,0.0
50%,0.382,0.0,0.032609,0.179,0.0
75%,0.774,0.333333,0.059091,0.281,0.5
max,1.0,1.0,1.0,1.0,1.0


## 8. Feature Engineering - Time Sensitivity Indicators (2D)

In [16]:
# Extract time sensitivity features
def extract_time_sensitivity_features(row):
    """Derive the two time-sensitivity features for one row.

    Args:
        row: Mapping/Series with ``visit_time``, ``wait_time_numeric`` and
            ``season_factor``.

    Returns:
        list: ``[holiday_pressure_factor, seasonal_mood_adjustment]`` where the
        first is wait/60 on public holidays (0 otherwise) and the second is
        ``season_factor * wait/60``.
    """
    on_public_holiday = row['visit_time'] == 'Public holiday'
    wait_fraction_of_hour = row['wait_time_numeric'] / 60

    # Holiday pressure only applies to public-holiday visits.
    holiday_pressure = wait_fraction_of_hour if on_public_holiday else 0

    # Peak season scales the impact of the wait.
    seasonal_adjustment = row['season_factor'] * wait_fraction_of_hour

    return [holiday_pressure, seasonal_adjustment]

# The extractor needs visit/wait columns from df_filtered plus season_factor
# from temporal_df; both sides are re-indexed 0..n-1 so they line up by position.
temp_combined = pd.concat(
    [
        df_filtered[['visit_time', 'wait_time_numeric']].reset_index(drop=True),
        temporal_df[['season_factor']].reset_index(drop=True),
    ],
    axis=1,
)

time_sensitivity_features = temp_combined.apply(extract_time_sensitivity_features, axis='columns')
time_sensitivity_df = pd.DataFrame(
    data=time_sensitivity_features.tolist(),
    columns=['holiday_pressure_factor', 'seasonal_mood_adjustment'],
)

print("Time sensitivity features extracted (2D)")

Time sensitivity features extracted (2D)


In [17]:
time_sensitivity_df.describe()

Unnamed: 0,holiday_pressure_factor,seasonal_mood_adjustment
count,5409.0,5409.0
mean,0.023248,0.226559
std,0.130764,0.304287
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.083333
75%,0.0,0.333333
max,1.0,1.0


## 9. Label Engineering - Queue Tolerance Tasks

In [19]:
# Generate queue tolerance classification labels
def _wait_related_sentiment(review):
    """Return the VADER compound score of the wait-related sentences of ``review``.

    A sentence is wait-related when it fully contains a TIME/DURATION/CARDINAL
    entity or a token whose lemma is wait/queue/line. When no sentence
    qualifies, the compound score of the whole review is returned instead.
    """
    doc = nlp(review)
    time_entities = [ent for ent in doc.ents if ent.label_ in ["TIME", "DURATION", "CARDINAL"]]

    wait_related_spans = []
    for sent in doc.sents:
        has_time_ref = any(ent.start >= sent.start and ent.end <= sent.end for ent in time_entities)
        has_wait_verb = any(token.lemma_ in ['wait', 'queue', 'line'] for token in sent)
        if has_time_ref or has_wait_verb:
            wait_related_spans.append(sent.text)

    if wait_related_spans:
        return analyzer.polarity_scores(' '.join(wait_related_spans))['compound']
    return analyzer.polarity_scores(review)['compound']


def generate_tolerance_labels_advanced(row):
    """Assign a 3-class queue tolerance label to one review row.

    The wait time selects a band (short <= 15 min, medium <= 45 min, long
    otherwise); within the band the star rating — and, for medium/long waits,
    the wait-related sentiment — selects the label. The bands are contiguous,
    so fractional waits such as 15.5 fall in the medium band rather than
    dropping through to "long". The NLP-based sentiment is computed only for
    medium and long waits, where it is actually used.

    Args:
        row: Mapping/Series with ``review``, ``stars`` and ``wait_time_numeric``.

    Returns:
        str: 'high_tolerance', 'critical_tolerance' or 'tolerance_collapse'.
    """
    stars = row['stars']
    wait_time = row['wait_time_numeric']

    # No wait / short wait: the star rating alone decides.
    if wait_time <= 15:
        if stars >= 4:
            return 'high_tolerance'
        if stars == 3:
            return 'critical_tolerance'
        return 'tolerance_collapse'

    wait_sentiment = _wait_related_sentiment(row['review'])

    # Medium wait: sentiment about the wait becomes relevant.
    if wait_time <= 45:
        if stars >= 4 and wait_sentiment >= 0.0:
            return 'high_tolerance'  # High stars + positive/neutral wait sentiment
        if stars == 3 or (stars >= 4 and wait_sentiment < 0.0):
            return 'critical_tolerance'  # Neutral stars OR high stars with negative wait sentiment
        return 'tolerance_collapse'

    # Long wait (also reached when wait_time is NaN): stricter bar for high tolerance.
    if stars == 5 and wait_sentiment >= 0.0:
        return 'high_tolerance'
    if stars >= 3:
        return 'critical_tolerance'
    return 'tolerance_collapse'

# Generate wait time threshold regression labels
def generate_wait_threshold_labels(row):
    """Estimate the wait-time threshold (minutes) for one review row.

    A base threshold chosen from the average thrill level of the mentioned
    facilities is multiplied by a personal factor (stars and overall
    sentiment), a group factor (group vs. individual wording) and a temporal
    factor (visit time). When an actual wait was reported, the result is the
    larger of that model value and an experience-based threshold. The final
    value is clamped to 5-120 minutes.

    Label cells are parsed with ``ast.literal_eval`` rather than ``eval`` so
    CSV content cannot execute code. Group/individual indicators are matched
    as whole words, so e.g. "us" no longer matches inside "because" and "our"
    no longer matches inside "hour".

    Args:
        row: Mapping/Series with ``stars``, ``wait_time_numeric``,
            ``visit_time``, ``review`` and the three ``label_*`` columns.

    Returns:
        Threshold in minutes, between 5 and 120.
    """
    stars = row['stars']
    wait_time = row['wait_time_numeric']
    visit_time = row['visit_time']
    overall_sentiment = analyzer.polarity_scores(row['review'])['compound']

    # Extract facility information for thrill level calculation
    try:
        rides = ast.literal_eval(row['label_rides']) if pd.notna(row['label_rides']) else []
        shows = ast.literal_eval(row['label_shows']) if pd.notna(row['label_shows']) else []
        others = ast.literal_eval(row['label_other']) if pd.notna(row['label_other']) else []
        all_facilities = rides + shows + others
    except (ValueError, SyntaxError, TypeError):
        all_facilities = []

    # Average thrill level of the mentioned facilities (0.3 when unknown/none)
    if all_facilities:
        thrill_scores = [facility_thrill_scores.get(facility, 0.3) for facility in all_facilities]
        avg_thrill_level = np.mean(thrill_scores)
    else:
        avg_thrill_level = 0.3  # Default moderate thrill level

    # Base threshold from facility thrill level
    if avg_thrill_level >= 0.7:
        base_threshold = 45  # High thrill facilities warrant longer waits
    elif avg_thrill_level >= 0.4:
        base_threshold = 30  # Medium thrill facilities
    else:
        base_threshold = 20  # Low thrill facilities and services

    # Personal adjustment factor based on satisfaction indicators
    satisfaction_factor_map = {5: 1.5, 4: 1.2, 3: 1.0, 2: 0.8, 1: 0.6}
    personal_factor = satisfaction_factor_map.get(stars, 1.0)

    # Sentiment adjustment within personal factor
    if overall_sentiment >= 0.3:
        personal_factor *= 1.1  # Boost for very positive sentiment
    elif overall_sentiment <= -0.3:
        personal_factor *= 0.9  # Penalty for negative sentiment

    # Group interaction adjustment based on review wording.
    # Each pattern contributes at most 1 to its score; \b keeps matches to whole
    # words, and the plural forms are spelled out explicitly.
    review_text = row['review'].lower()
    group_patterns = (
        r'\bfamil(?:y|ies)\b', r'\bfriends?\b', r'\bkids?\b', r'\bchild(?:ren)?\b',
        r'\btogether\b', r'\bwe\b', r'\bus\b', r'\bour\b',
    )
    individual_patterns = (r'\bi\b', r'\bmy\b', r'\bme\b')

    group_score = sum(1 for pattern in group_patterns if re.search(pattern, review_text))
    individual_score = sum(1 for pattern in individual_patterns if re.search(pattern, review_text))

    if group_score > individual_score and group_score >= 2:
        group_factor = 0.8  # Families/groups less tolerant due to coordination complexity
    elif individual_score > group_score:
        group_factor = 1.0  # Individual baseline
    else:
        # NOTE(review): this branch also covers reviews with no indicators at all.
        group_factor = 1.2  # Friend groups more tolerant and flexible

    # Temporal adjustment factor based on visit timing
    temporal_factor_map = {
        'Weekday': 1.2,     # More tolerant during weekdays (less crowded)
        'Weekend': 1.0,     # Baseline weekend expectations
        'Public holiday': 0.8  # Less tolerant during holidays (higher stress)
    }
    temporal_factor = temporal_factor_map.get(visit_time, 1.0)

    # Model-based threshold
    calculated_threshold = base_threshold * personal_factor * group_factor * temporal_factor

    # Actual wait time consideration for threshold adjustment
    if wait_time > 0:
        if stars >= 4:
            # High satisfaction suggests tolerance above actual wait time
            experience_threshold = wait_time + 10
        elif stars <= 2:
            # Low satisfaction suggests threshold below actual wait time
            experience_threshold = max(wait_time - 10, 5)
        else:
            # Neutral satisfaction suggests actual wait time approximates threshold
            experience_threshold = wait_time

        # Take maximum of calculated and experience-based thresholds
        final_threshold = max(calculated_threshold, experience_threshold)
    else:
        final_threshold = calculated_threshold

    # Clamp between 5-120 minutes
    return max(min(final_threshold, 120), 5)

# Generate wait experience satisfaction labels
def generate_wait_experience_satisfaction(row):
    """Assign a 4-level wait-experience satisfaction label to one review row.

    The whole-review VADER compound score is reduced by a wait penalty
    (0.2 per hour waited); the adjusted score and the star rating must both
    clear a tier's minimums for that tier to apply.

    Args:
        row: Mapping/Series with ``stars``, ``wait_time_numeric`` and ``review``.

    Returns:
        str: 'very_satisfied', 'satisfied', 'neutral' or 'dissatisfied'.
    """
    stars = row['stars']
    wait_time = row['wait_time_numeric']
    overall_sentiment = analyzer.polarity_scores(row['review'])['compound']

    # Longer waits pull the satisfaction score down.
    adjusted_satisfaction = overall_sentiment - wait_time / 60 * 0.2

    # (label, minimum adjusted score, minimum stars), checked best tier first.
    tiers = (
        ('very_satisfied', 0.3, 4),
        ('satisfied', 0.0, 3),
        ('neutral', -0.3, 2),
    )
    for label, min_score, min_stars in tiers:
        if adjusted_satisfaction >= min_score and stars >= min_stars:
            return label
    return 'dissatisfied'

# Generate time sensitivity classification labels
def generate_time_sensitivity_labels(row):
    """Classify one review row as 'time_sensitive' or 'time_tolerant'.

    A row is time sensitive when either
      * the visit was on a weekend/public holiday, the wait was at least
        30 minutes and the rating is 3 stars or lower, or
      * the wait was at least 45 minutes and the rating is 2 stars or lower.

    Args:
        row: Mapping/Series with ``visit_time``, ``stars``, ``wait_time_numeric``.

    Returns:
        str: 'time_sensitive' or 'time_tolerant'.
    """
    visit_time = row['visit_time']
    stars = row['stars']
    wait_time = row['wait_time_numeric']

    peak_period_drop = (
        visit_time in ('Public holiday', 'Weekend') and wait_time >= 30 and stars <= 3
    )
    long_wait_drop = wait_time >= 45 and stars <= 2

    return 'time_sensitive' if (peak_period_drop or long_wait_drop) else 'time_tolerant'

# Run each label generator row-wise over df_filtered, in the listed order.
label_generators = {
    'tolerance_label': generate_tolerance_labels_advanced,
    'wait_threshold': generate_wait_threshold_labels,
    'wait_satisfaction': generate_wait_experience_satisfaction,
    'time_sensitivity': generate_time_sensitivity_labels,
}
for label_column, label_generator in label_generators.items():
    df_filtered[label_column] = df_filtered.apply(label_generator, axis=1)

print("All labels generated successfully")

All labels generated successfully


## 10. Rides Ranking Analysis

In [20]:
# Generate rides-specific tolerance ranking data
def extract_rides_ranking_data(df_filtered):
    """Aggregate per-ride tolerance metrics and rank rides by a composite score.

    Each review contributes one record per ride listed in ``label_rides``
    (parsed with ``ast.literal_eval`` rather than ``eval`` so CSV content
    cannot execute code). Records are averaged per ride and combined into
    ``tolerance_score = 0.4 * tolerance_rate + 0.3 * avg_stars / 5
    + 0.3 * (avg_sentiment + 1) / 2``.

    Args:
        df_filtered: DataFrame with ``label_rides``, ``stars``,
            ``wait_time_numeric``, ``tolerance_label``, ``wait_threshold``,
            ``visit_time`` and ``review``.

    Returns:
        pandas.DataFrame: one row per ride (indexed by ride name), sorted by
        ``tolerance_score`` descending; an empty DataFrame when no ride is found.
    """
    rides_data = []

    for _, row in df_filtered.iterrows():
        try:
            rides = ast.literal_eval(row['label_rides']) if pd.notna(row['label_rides']) else []
            if not rides:
                continue
            # The review sentiment is the same for every ride of this row, so
            # it is scored once instead of once per ride.
            review_sentiment = analyzer.polarity_scores(row['review'])['compound']
            row_records = [
                {
                    'ride_name': ride,
                    'stars': row['stars'],
                    'wait_time_numeric': row['wait_time_numeric'],
                    'tolerance_label': row['tolerance_label'],
                    'wait_threshold': row['wait_threshold'],
                    'visit_time': row['visit_time'],
                    'thrill_level': facility_thrill_scores.get(ride, 0),
                    'review_sentiment': review_sentiment,
                }
                for ride in rides
            ]
        except Exception:
            # Best effort, as before: a row that cannot be processed is skipped.
            # Exception (not a bare except) lets KeyboardInterrupt through.
            continue
        rides_data.extend(row_records)

    rides_df = pd.DataFrame(rides_data)

    if len(rides_df) == 0:
        return pd.DataFrame()

    # Named aggregation ties each output column to its source explicitly,
    # instead of renaming the aggregated columns by position afterwards.
    rides_ranking = rides_df.groupby('ride_name').agg(
        avg_stars=('stars', 'mean'),
        avg_wait_time=('wait_time_numeric', 'mean'),
        avg_threshold=('wait_threshold', 'mean'),
        tolerance_rate=('tolerance_label', lambda labels: (labels == 'high_tolerance').mean()),
        thrill_level=('thrill_level', 'first'),
        avg_sentiment=('review_sentiment', 'mean'),
    ).round(3)

    # Composite tolerance score; each term is scaled to the 0-1 range.
    rides_ranking['tolerance_score'] = (
        rides_ranking['tolerance_rate'] * 0.4 +
        (rides_ranking['avg_stars'] / 5) * 0.3 +
        ((rides_ranking['avg_sentiment'] + 1) / 2) * 0.3
    )

    return rides_ranking.sort_values('tolerance_score', ascending=False)

rides_ranking = extract_rides_ranking_data(df_filtered)
print(f"Rides ranking analysis completed: {len(rides_ranking)} rides")

Rides ranking analysis completed: 17 rides


## 11. Data Export & Summary

In [21]:
# The feature tables are joined by position, so every one of them must have
# exactly one row per row of df_filtered; otherwise concat would silently pad
# the shorter tables with NaN.
for feature_block in (sentiment_df, temporal_df, facility_df, user_df, time_sensitivity_df):
    assert len(feature_block) == len(df_filtered), "feature table is not row-aligned with df_filtered"

# Combine all features into final dataset
all_features = pd.concat([
    sentiment_df,
    temporal_df,
    facility_df,
    user_df,
    time_sensitivity_df
], axis=1)

# Create final dataset with metadata and labels (both sides re-indexed 0..n-1)
final_dataset = pd.concat([
    df_filtered[['review_index', 'stars', 'name', 'review', 'publishedAtDate', 
                'visit_time', 'wait_time', 'tolerance_label', 'wait_threshold', 
                'wait_satisfaction', 'time_sensitivity']].reset_index(drop=True),
    all_features.reset_index(drop=True)
], axis=1)

# Export datasets
final_dataset.to_csv('data/processed/uss_features_labels.csv', index=False)
if len(rides_ranking) > 0:
    rides_ranking.to_csv('data/processed/rides_ranking_analysis.csv')

# Display summary statistics (.to_dict() yields plain Python ints, so the
# printed counts are not wrapped in numpy scalar reprs)
print(f"Final dataset shape: {final_dataset.shape}")
print(f"Feature dimensions: {all_features.shape[1]}D")
print(f"Tolerance labels: {final_dataset['tolerance_label'].value_counts().to_dict()}")
print(f"Wait satisfaction: {final_dataset['wait_satisfaction'].value_counts().to_dict()}")
print(f"Time sensitivity: {final_dataset['time_sensitivity'].value_counts().to_dict()}")
print("\nData export completed successfully")

Final dataset shape: (5409, 43)
Feature dimensions: 32D
Tolerance labels: {'high_tolerance': np.int64(4006), 'critical_tolerance': np.int64(745), 'tolerance_collapse': np.int64(658)}
Wait satisfaction: {'very_satisfied': np.int64(3511), 'dissatisfied': np.int64(764), 'satisfied': np.int64(684), 'neutral': np.int64(450)}
Time sensitivity: {'time_tolerant': np.int64(4987), 'time_sensitive': np.int64(422)}

Data export completed successfully
