In [18]:
import difflib  # For fuzzy matching (80% similarity)
from metaphone import doublemetaphone  # For phonetic matching (sounds like)

# Step 1: Canonical city names, each paired with its known spellings
# (Hebrew and Latin transliterations) and a unique numeric code.
city_mapping = {
    "תל אביב": {
        "variations": [
            "תל-אביב",
            "תל אביב",
            "tel-aviv",
            "tel aviv",
            "תל אביב 00",
            "יפו",
            "jafa",
            "jaffa",
            "יפו 00",
        ],
        "code": 111,
    },
    "ירושלים": {
        "variations": [
            "ירושלים",
            "jerusalem",
            "yerusalem",
            "ירושלים 00",
        ],
        "code": 112,
    },
    "חיפה": {
        "variations": [
            "חיפה",
            "haifa",
            "hifa",
            "חיפה 00",
        ],
        "code": 113,
    },
}

In [19]:
# Step 2: Function to preprocess city names
def preprocess_city_name(city_name):
    """Return a cleaned, comparison-ready form of a city name.

    Hyphens become spaces, single and double quotes are dropped, the text is
    lower-cased, and every run of whitespace (including leading and trailing
    whitespace) is collapsed to a single space.

    Args:
        city_name: Raw city name as a string.

    Returns:
        The cleaned string; empty if the input contained only whitespace,
        hyphens, or quotes.
    """
    cleaned = city_name.replace("-", " ").replace("'", "").replace('"', "").lower()
    # Collapse whitespace AFTER the hyphen replacement: a leading/trailing or
    # space-padded hyphen ("tel - aviv", "-haifa") would otherwise leave stray
    # spaces behind and defeat the exact-match comparison.
    return " ".join(cleaned.split())

In [20]:
# Step 3: Function to check for fuzzy match (80% similarity by default)
def fuzzy_match(city_name, city_list, cutoff=0.8):
    """Return the entry of ``city_list`` most similar to ``city_name``.

    Args:
        city_name: The (already cleaned) name to look up.
        city_list: Candidate strings to compare against.
        cutoff: Minimum difflib similarity ratio, between 0 and 1, that a
            candidate must reach to count as a match. Defaults to 0.8.

    Returns:
        The best-scoring candidate, or None when no candidate reaches
        ``cutoff`` (which includes the case of an empty ``city_list``).
    """
    # n=1 asks difflib for the single best candidate only.
    closest_match = difflib.get_close_matches(city_name, city_list, n=1, cutoff=cutoff)
    return closest_match[0] if closest_match else None

In [21]:
# Step 4: Extract all variations into a flattened list (for fuzzy matching)
def get_all_variations(mapping=None):
    """Flatten every city's spelling variations into a single list.

    Args:
        mapping: Optional dict whose values each hold a ``"variations"`` list.
            When omitted, the module-level ``city_mapping`` is used.

    Returns:
        A new list containing all variations, in the mapping's iteration
        order and, within each city, in the order they are listed.
    """
    if mapping is None:
        mapping = city_mapping
    return [
        variation
        for data in mapping.values()
        for variation in data["variations"]
    ]

In [22]:
# Step 5: Function to normalize city names
def normalize_city(city_name):
    """Map a raw city string to its standard name and numeric code.

    Matching is attempted in three stages, stopping at the first hit:

    1. exact match of the cleaned input against the cleaned variations,
    2. fuzzy match via ``fuzzy_match``,
    3. phonetic match on the primary Double Metaphone code.

    Args:
        city_name: Raw city name. Non-string values (for example None or a
            float NaN from a missing cell) are treated as unmatched.

    Returns:
        A ``(standard_city, code)`` tuple taken from ``city_mapping``, or
        ``("Unknown", -1)`` when no stage produces a match.
    """
    unknown = ("Unknown", -1)

    # Missing values have no text to clean; report them as unmatched rather
    # than failing on the string methods used during preprocessing.
    if not isinstance(city_name, str):
        return unknown

    city_name = preprocess_city_name(city_name)
    if not city_name:
        return unknown

    # Clean every variation once and remember which city it belongs to, so the
    # three stages below share the same candidate list.
    candidates = [
        (preprocess_city_name(var), standard_city, data["code"])
        for standard_city, data in city_mapping.items()
        for var in data["variations"]
    ]

    # Stage 1: exact match on the cleaned text
    for cleaned_var, standard_city, code in candidates:
        if cleaned_var == city_name:
            return standard_city, code

    # Stage 2: fuzzy match, then look up which city owns the matched variation
    fuzzy_match_result = fuzzy_match(city_name, [entry[0] for entry in candidates])
    if fuzzy_match_result:
        for cleaned_var, standard_city, code in candidates:
            if cleaned_var == fuzzy_match_result:
                return standard_city, code

    # Stage 3: phonetic match on the primary Double Metaphone code.
    # An empty code carries no phonetic information (expected for text with no
    # Latin letters, such as Hebrew or digits -- verify against the metaphone
    # package). Comparing two empty codes would wrongly match such input to
    # the first variation that also encodes to nothing, so skip this stage.
    city_phonetic = doublemetaphone(city_name)[0]
    if city_phonetic:
        for cleaned_var, standard_city, code in candidates:
            if doublemetaphone(cleaned_var)[0] == city_phonetic:
                return standard_city, code

    # If no match found, return "Unknown" with code -1
    return unknown

In [23]:
# Step 6: Run the normalizer over a sample of raw spellings
input_cities = [
    "תל-אביב",
    "תל אביב 00",
    "tel-aviv",
    "jafa",
    "יפו",
    "טל אביב",
    "tel-avib",
    "ירושלים",
    "חיפה",
    "hifa",
    "yerusalem",
]
# Each entry becomes (raw input, standard name, code)
normalized_cities = [(city,) + tuple(normalize_city(city)) for city in input_cities]

# Step 7: Report how each raw spelling was resolved
for original, normalized, code in normalized_cities:
    print(f"Original: {original} -> Normalized: {normalized}, Code: {code}")

Original: תל-אביב -> Normalized: תל אביב, Code: 111
Original: תל אביב 00 -> Normalized: תל אביב, Code: 111
Original: tel-aviv -> Normalized: תל אביב, Code: 111
Original: jafa -> Normalized: תל אביב, Code: 111
Original: יפו -> Normalized: תל אביב, Code: 111
Original: טל אביב -> Normalized: תל אביב, Code: 111
Original: tel-avib -> Normalized: תל אביב, Code: 111
Original: ירושלים -> Normalized: ירושלים, Code: 112
Original: חיפה -> Normalized: חיפה, Code: 113
Original: hifa -> Normalized: חיפה, Code: 113
Original: yerusalem -> Normalized: ירושלים, Code: 112
