# Allocate the "true" city with Levenshtein distance

### Reading the data from CSV files

In [5]:
import pandas as pd

# Load the datasets
raw_cities_df = pd.read_csv('raw_cities.csv')
normalized_cities_df = pd.read_csv('normalized_cities - normalized_cities.csv')

# Ensure all city names are strings, with missing values as empty strings.
# fillna must run before astype(str): casting first turns NaN into the literal
# text 'nan', which fillna no longer recognises as missing.
raw_cities_df['city'] = raw_cities_df['city'].fillna('').astype(str)
normalized_cities_df['city'] = normalized_cities_df['city'].fillna('').astype(str)

# Display the first few rows of each dataframe
print(raw_cities_df.head())
print(normalized_cities_df.head())

                  city
0               cleron
1            aveillans
2  paray-vieille-poste
3                issac
4                rians
                    city
0  abergement clemenciat
1    abergement de varey
2      amberieu en bugey
3    amberieux en dombes
4                ambleon


### Compute the Levenshtein distance

In [6]:
!pip3 install Levenshtein


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [9]:
import Levenshtein

def find_closest_city(raw_city, normalized_cities, threshold=10):
    """Return the normalized city with the smallest Levenshtein distance to ``raw_city``.

    Ties go to the candidate that appears first in ``normalized_cities``.
    Returns ``None`` when there are no candidates or when the smallest
    distance is greater than ``threshold``.
    """
    # Pair each candidate with its edit distance; distances are computed lazily.
    scored = (
        (Levenshtein.distance(raw_city, candidate), candidate)
        for candidate in normalized_cities
    )
    # min() keeps the first of several equally small distances.
    best = min(scored, key=lambda pair: pair[0], default=None)
    if best is None:
        return None

    best_distance, best_city = best
    return best_city if best_distance <= threshold else None

# Apply the function to find closest city for each raw city with a given threshold
threshold = 10  # Maximum edit distance at which a match is still accepted
raw_cities = raw_cities_df['city'].tolist()
normalized_cities = normalized_cities_df['city'].tolist()

# Each lookup scans every normalized city, so resolve each distinct raw name
# only once and reuse the answer for any repeated rows.
closest_by_raw_city = {
    raw_city: find_closest_city(raw_city, normalized_cities, threshold)
    for raw_city in dict.fromkeys(raw_cities)
}
results = [(raw_city, closest_by_raw_city[raw_city]) for raw_city in raw_cities]

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['raw_city', 'closest_normalized_city'])
print(results_df.head())

# Evaluate the performance by inspecting a sample of results.
# The sample is capped at the number of rows (sample() raises ValueError when
# asked for more rows than exist) and seeded so re-running shows the same rows.
sample_size = 20
sample_results = results_df.sample(min(sample_size, len(results_df)), random_state=42)
print(sample_results)


              raw_city closest_normalized_city
0               cleron                  cleron
1            aveillans                voillans
2  paray-vieille-poste     paray vieille poste
3                issac                   issac
4                rians                   rians
                                       raw_city closest_normalized_city
4160                                     hodoul                   hadol
9252                           epinay sur-seine        epinay sur seine
6570                                    vertain                 vertain
2026                        st michel de la roe  saint michel de la roë
7399                                 pernes n/a                  pernes
2963                          tresques tresques            questrecques
3878                       boutier-saint-trojan   boutiers saint trojan
772                                 peyregrosse                peyrusse
9949                          le kremlin-bictre         kremlin bicêtr

# Does this method work?

##### Based on the results, we can make several observations about the effectiveness of the Levenshtein distance method for matching cities from the 'raw_cities' dataset to the 'normalized_cities' dataset.

##### (1)Successful Matches: The method successfully matched some cities with exact or very similar names (e.g., "cleron" matched with "cleron", "issac" matched with "issac").

##### (2)Partial Matches: There are cases where the method found a reasonably close match, but it might not be the best possible match. For example, "aveillans" matched with "voillans". While "voillans" is somewhat close, it is not the exact match.

##### (3) Exact and Close Matches: For some cities with slight differences in formatting or spelling, the method was able to correctly identify the matches (e.g., "bourg-lastic france" matched with "bourg lastic").

##### (4) Incorrect Matches: There are instances where the method matched a city to the wrong name (e.g., "hodoul" matched with "hadol"), even though other entries were matched correctly (e.g., "epinay sur-seine" matched with "epinay sur seine").

##### (5) Unmatched Entries: In some cases, the method returned None, indicating that no sufficiently close match was found (e.g., "aigrefeuille-d'aunis aigrefeuille-d'aunis" matched with None).

##### The Levenshtein distance method works reasonably well for matching cities with minor variations in spelling, formatting, or typographical errors. 


# Could you do better work with additional resources?

##### Yes! Of course!

##### Adjust Threshold: Experiment with different threshold values to balance between missing matches and incorrect matches.

##### Geographic Data: Utilize external geographic databases or APIs to cross-reference and validate city names.

##### Machine Learning: Explore machine learning models trained on labeled datasets to predict the correct city matches based on various features of the city names.
