In [14]:
import sys
sys.path.append("../..")

In [35]:
# import libraries
import pandas as pd
import numpy as np
import os
from rapidfuzz import process, fuzz

In [43]:
# import custom modules
from survival.utils import lower_case
from survival.utils import show_all


In [17]:
# load the data
raw = pd.read_csv('../../data/raw/raw.csv', encoding='latin1', sep=';')

In [18]:
# rename columns for readability
raw.columns = ['is_institutional', 'is_individual', 'school_type', 'rank', 'country_code', 'email', 'municipality', 'city', 'production', 'season', 'purchase_date',
               'start_date', 'ticket_type', 'currency', 'price', 'is_canceled', 'is_free', 'is_dead',
               'artform', 'ticket_num', 'gender', 'birthdate', 'age', 'id']

In [19]:
# all string to lower case
raw = lower_case(raw)

In [20]:
# datetime conversion
raw['purchase_date'] = pd.to_datetime(raw['purchase_date'], dayfirst=True, errors='coerce')
raw['purchase_date'] = pd.to_datetime(raw['purchase_date'].dt.date, dayfirst=True, errors='coerce')

raw['start_date'] = pd.to_datetime(raw['start_date'], dayfirst=True, errors='coerce')

raw['birthdate'] = pd.to_datetime(raw['birthdate'], dayfirst=True, errors='coerce')
raw['birthdate'] = pd.to_datetime(raw['birthdate'].dt.date, dayfirst=True, errors='coerce')

In [21]:
# delete canceled purchases
raw = raw[raw['is_canceled'] == 0]

# drop unnessesary columns
raw = raw.drop(columns=['is_dead', 'currency', 'is_canceled', 'school_type', 'is_individual', 'is_institutional'])

# retain only ballet and opera
raw = raw[raw['artform'].isin(['ballet', 'opera'])]

# strip all string in production of double spaces
raw['production'] = raw['production'].str.replace('  ', ' ')

# remove the '/ ' in certain production names
raw['production'] = raw['production'].str.replace('/ ', '')

# raw['price'] to float
raw['price'] = raw['price'].str.replace(',', '.')
raw['price'] = pd.to_numeric(raw['price'], errors='coerce')

# delete price outlier
raw = raw[raw['price'] != 2500]

# remove 'seizoen' from season for clarity
raw['season'] = raw['season'].str.replace('seizoen ', '')
raw['season'] = raw['season'].str.replace('-', '_')

# fill missing values of production with dansers van morgen 2022
raw['production'] = raw['production'].fillna('dansers van morgen 2022')

In [22]:
# streamline production names of flirt events
from survival.constants import flirt_mapping_dict
raw['production'] = raw['production'].replace(flirt_mapping_dict)

# streamline production names of danser van morgen events
raw.loc[raw['production'] == 'dansers van morgen 2022', 'production'] = '21/22 dansers van morgen'
raw.loc[(raw['production'] == 'dansers van morgen') & (raw['start_date'] > '2022-08-01') & (raw['start_date'] < '2023-08-01'), 'production'] = raw['production'].str.replace('dansers van morgen', '22/23 dansers van morgen')
raw.loc[(raw['production'] == 'dansers van morgen') & (raw['start_date'] > '2023-08-01') & (raw['start_date'] < '2024-08-01'), 'production'] = raw['production'].str.replace('dansers van morgen', '23/24 dansers van morgen')

# streamline production name typos
raw.loc[raw['production'] == 'la traviata', 'production'] = '21/22 la traviata'
raw.loc[raw['production'] == '18/19 die zauberfloete', 'production'] = '18/19 die zauberflote'

# standardize 21/22 hans van manen programmes
raw.loc[raw['production'] == '21/22 hans van manen festival progr i', 'production'] = '21/22 hans van manen festival'
raw.loc[raw['production'] == '21/22 hans van manen festival progr ii', 'production'] = '21/22 hans van manen festival'
raw.loc[raw['production'] == '21/22 hans van manen festival progr iii', 'production'] = '21/22 hans van manen festival'
raw.loc[raw['production'] == '21/22 hans van manen festival progr iv', 'production'] = '21/22 hans van manen festival'

# standardize walkure adventure seats
raw.loc[raw['production'] == '19/20 die walkure adventure seats', 'production'] = '19/20 die walküre'

In [23]:
# load regular programme data
reg = pd.read_csv('../../data/processed/operaballet_reg_prods_clean.csv')

# keep only the productions that are in the regular programme
raw = raw[raw['production'].isin(reg['production'])]

In [24]:
# map countries
from survival.constants import country_mapping_dict

def map_country(country):
    for key, value in country_mapping_dict.items():
        if country in value:
            return key
    return np.nan

raw['country_code'] = raw['country_code'].apply(map_country)

In [25]:
# fix municipality names
raw.loc[raw['municipality'] == 'nuenen- gerwen en nederwetten', 'municipality'] = 'nuenen'
raw.loc[raw['municipality'] == 'nuenen. gerwen en nederwetten', 'municipality'] = 'nuenen'
raw.loc[raw['municipality'] == "'s-gravenhage", 'municipality'] = 'den haag'

In [27]:
cities = pd.read_csv('../../data/processed/cities.csv')

In [36]:
def get_best_city_match(raw, cities):
    """
    Match city names from raw dataframe with standardized names from cities dataframe
    using fuzzy matching within the same country.
    
    Args:
        raw: Series containing 'city' and 'country_code'
        cities: DataFrame with columns 'name' and 'country_code'
    
    Returns:
        str: Best matching city name or None if no good match found
    """
    try:
        # Check if required fields exist and are not empty
        if pd.isna(raw['city']) or pd.isna(raw['country_code']) or raw['city'].strip() == '':
            return None
        
        # Standardize input
        input_city = raw['city'].strip().lower()
        input_country = raw['country_code'].strip().upper()
        
        # Filter cities by country code
        country_cities = cities[cities['country_code'] == input_country]
        
        if country_cities.empty:
            return None
            
        # Create list of candidate cities
        candidate_cities = country_cities['name'].str.lower().tolist()
        
        if not candidate_cities:
            return None
            
        # Perform fuzzy matching with token_sort_ratio for better handling of word order
        best_match, score, _ = process.extractOne(
            input_city,
            candidate_cities,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=80
        )
        
        if best_match:
            # Return the original case version from the cities dataframe
            original_case = country_cities[country_cities['name'].str.lower() == best_match]['name'].iloc[0]
            return original_case
            
        return None
        
    except Exception as e:
        print(f"Error processing city match: {e}")
        return None

# Apply the function to your dataframe
raw['corrected_city'] = raw.apply(lambda x: get_best_city_match(x, cities), axis=1)

KeyboardInterrupt: 

In [46]:
from rapidfuzz import process, fuzz
import pandas as pd

def create_city_mapping(raw_df, cities_df):
    """
    Create city name mapping in three passes:
    1. Exact match using sets
    2. Fuzzy match on city with country filter
    3. Fuzzy match on remaining cities
    """
    # Prepare mappings dictionary
    mappings = {}
    
    # Get unique city-country combinations
    unique_combinations = raw_df[['city', 'country_code']].drop_duplicates()
    
    # Standardize cities dataframe once
    cities_df = cities_df.copy()
    cities_df['name_std'] = cities_df['name'].apply(lambda x: str(x).strip().lower() if pd.notna(x) else None)
    cities_df['country_code'] = cities_df['country_code'].str.upper()
    
    # PASS 1: Exact matching using sets
    print("Pass 1: Exact matching...")
    unmatched = []
    
    # Create dictionary of sets for each country code
    country_city_sets = {}
    city_to_original = {}
    for _, row in cities_df.iterrows():
        country = row['country_code']
        std_name = row['name_std']
        orig_name = row['name']
        
        if country not in country_city_sets:
            country_city_sets[country] = set()
        country_city_sets[country].add(std_name)
        city_to_original[(country, std_name)] = orig_name
    
    # Do exact matching
    for _, row in unique_combinations.iterrows():
        city = str(row['city']).strip().lower() if pd.notna(row['city']) else None
        country = str(row['country_code']).upper() if pd.notna(row['country_code']) else None
        
        if city is None or country is None:
            mappings[(row['city'], row['country_code'])] = None
            continue
            
        # Check if city exists in country set
        if country in country_city_sets and city in country_city_sets[country]:
            mappings[(row['city'], row['country_code'])] = city_to_original[(country, city)]
        else:
            unmatched.append(row)
    
    # PASS 2: Fuzzy matching within country
    print(f"Pass 2: Fuzzy matching within countries for {len(unmatched)} cities...")
    still_unmatched = []
    
    for row in unmatched:
        city = str(row['city']).strip().lower() if pd.notna(row['city']) else None
        country = str(row['country_code']).upper() if pd.notna(row['country_code']) else None
        
        if city is None or country is None:
            mappings[(row['city'], row['country_code'])] = None
            continue
        
        # Get cities for this country
        country_cities = cities_df[cities_df['country_code'] == country]
        
        if not country_cities.empty:
            result = process.extractOne(
                city,
                country_cities['name_std'].tolist(),
                scorer=fuzz.token_sort_ratio,
                score_cutoff=80
            )
            
            if result is not None:  # Check if a match was found
                best_match, score, _ = result
                original_case = country_cities[
                    country_cities['name_std'] == best_match
                ]['name'].iloc[0]
                mappings[(row['city'], row['country_code'])] = original_case
            else:
                still_unmatched.append(row)
        else:
            still_unmatched.append(row)
    
    # PASS 3: Fuzzy matching globally for remaining cities
    print(f"Pass 3: Global fuzzy matching for {len(still_unmatched)} remaining cities...")
    all_cities = cities_df['name_std'].tolist()
    
    for row in still_unmatched:
        city = str(row['city']).strip().lower() if pd.notna(row['city']) else None
        
        if city is None:
            mappings[(row['city'], row['country_code'])] = None
            continue
            
        result = process.extractOne(
            city,
            all_cities,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=75
        )
        
        if result is not None:  # Check if a match was found
            best_match, score, _ = result
            original_case = cities_df[
                cities_df['name_std'] == best_match
            ]['name'].iloc[0]
            mappings[(row['city'], row['country_code'])] = original_case
        else:
            mappings[(row['city'], row['country_code'])] = None
    
    return mappings

def apply_city_corrections(raw_df, cities_df):
    """Apply city corrections to the dataframe"""
    # Create mapping
    mapping = create_city_mapping(raw_df, cities_df)
    
    # Apply corrections
    raw_df['corrected_city'] = raw_df.apply(
        lambda x: mapping.get((x['city'], x['country_code'])), 
        axis=1
    )
    
    # Print summary
    total = len(mapping)
    corrected = sum(1 for v in mapping.values() if v is not None)
    print(f"\nSummary:")
    print(f"Total unique cities: {total}")
    print(f"Successfully matched: {corrected}")
    print(f"Unmatched: {total - corrected}")
    
    return raw_df

def show_corrections(df):
    """Show all corrections made"""
    changes = df[df['city'] != df['corrected_city']][
        ['city', 'corrected_city', 'country_code']
    ].drop_duplicates()
    
    print("\nCorrections made:")
    print(changes)

In [47]:
# Apply corrections
corrected_df = apply_city_corrections(raw, cities)

# Show what changes were made
show_corrections(corrected_df)

Pass 1: Exact matching...
Pass 2: Fuzzy matching within countries for 5875 cities...
Pass 3: Global fuzzy matching for 3673 remaining cities...

Summary:
Total unique cities: 13106
Successfully matched: 10925
Unmatched: 2181

Corrections made:
                      city corrected_city country_code
0            oudebildtzijl           None           nl
6                   wormer        cormery           nl
11                vreeland        zeeland           nl
14              katwijk zh           None           nl
18                laren nh          laren           nl
...                    ...            ...          ...
2120202           andouque        nanuque           fr
2120326  akasaka minato-ku           None           jp
2121438       kitaku,tokyo           None           jp
2121604               baar           None          NaN
2121776     the plains, va     the plains           us

[5785 rows x 3 columns]


In [45]:
# Show all unmatched cities
unmatched = raw[raw['corrected_city'].isna()][['city', 'country_code']].drop_duplicates()
print("All unmatched cities:")

with show_all():
    display(unmatched.value_counts())


All unmatched cities:


city                                        country_code
#naam?                                      nl              1
oudendijk nh                                nl              1
ouder a/d amstel                            nl              1
ouder-amstel                                nl              1
ouderamstel                                 nl              1
ouderkerk                                   nl              1
ouderkerk a.d.ijssel                        nl              1
ouderkerk a/d amstel                        nl              1
ouderkerk a/d/ amstel                       nl              1
ouderkerk a:d amstel                        nl              1
oudheusden                                  nl              1
oudheuseden                                 nl              1
oudorp (nh)                                 nl              1
oudorp nh                                   nl              1
oudwoude                                    nl              1
ouroux en mor

In [28]:
# reset index and export as parquet
raw = raw.reset_index(drop=True)
#raw.to_parquet('../../data/processed/raw_clean.parquet')