In [28]:
from thefuzz import fuzz
from thefuzz import process
import pandas as pd

In [5]:
# input strings
string_1 = "Data Science and AI"
string_2 = "Data Science and Artificial Intelligence"

In [7]:
similarity_score = fuzz.ratio(string_1, string_2)
print(f"{similarity_score}%")

64%


### Finding best match

In [17]:
choices = ["Data Science & AI", "Machine Learning", "Data Sci and AI", "AI and Date Science",]
query = "Data Science and AI"
best_match = process.extractOne(query, choices)
print(best_match)

('Data Science & AI', 95)


In [18]:
# Extracting best matches based on threshold
threshold = 90
matches = process.extractBests(query, choices, score_cutoff=threshold)
for match in matches:
    print(match)

('Data Science & AI', 95)
('AI and Date Science', 90)


### Partial Ratio

In [25]:
partial_score = fuzz.partial_ratio("Data Science", "Data Sci. and AI")
print(f'{partial_score}%')

80%


In [32]:
# Sample data
data = {
    "Company Name": [
        "Google Inc.", "Google LLC", "Goooogle", "GooGle!", "Apple Inc.", "Apple", "Microsoft Corp.", 
        "Microsoft Corporation", "MicroSoft", "Amazon.com", "Amazon Inc", "Amaaaazon"
    ]
}

df = pd.DataFrame(data)

def standardize_company_names(df, threshold=80):
    # Create a dictionary to map original names to standardized names
    standard_name_map = {}

    # Iterate over each company name
    for name in df['Company Name']:
        if name not in standard_name_map:
            # Find the best match for the current name within existing names in the map
            match_result = process.extractOne(name, standard_name_map.keys(), scorer=fuzz.partial_ratio, score_cutoff=threshold)

            if match_result:
                match, score = match_result
                standard_name_map[name] = standard_name_map[match]
            else:
                standard_name_map[name] = name

    # Replace the names in the DataFrame with the standardized names
    df['Standardized Company Name'] = df['Company Name'].map(standard_name_map)

    return df

df_standardized = standardize_company_names(df)
print("\nStandardized Data:\n", df_standardized)


Standardized Data:
              Company Name Standardized Company Name
0             Google Inc.               Google Inc.
1              Google LLC               Google Inc.
2                Goooogle               Google Inc.
3                 GooGle!               Google Inc.
4              Apple Inc.               Google Inc.
5                   Apple               Google Inc.
6         Microsoft Corp.           Microsoft Corp.
7   Microsoft Corporation           Microsoft Corp.
8               MicroSoft           Microsoft Corp.
9              Amazon.com                Amazon.com
10             Amazon Inc                Amazon.com
11              Amaaaazon                Amazon.com
