In [1]:
!pip install Levenshtein

DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621
DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python3.9 -m pip install --upgrade pip


In [2]:
import pandas as pd
from Levenshtein import matching_blocks, editops, ratio
import difflib

# Read data

In [3]:
brands_1_path = "data/brands_list_1_short.csv"
brands_2_path = "data/brands_list_2_short.csv"

# Brand names are free text: read every field as a string and switch off the
# default NA parsing, otherwise a brand literally spelled "NA", "NULL" or "N/A"
# is loaded as a float NaN and breaks the string comparisons further down.
brand_csv_options = dict(delimiter=';', header=None, dtype=str, keep_default_na=False)

# Files have no header row; the brand names are in the first column (label 0).
brands_1 = pd.read_csv(brands_1_path, **brand_csv_options)[0]
brands_2 = pd.read_csv(brands_2_path, **brand_csv_options)[0]


# Individual similarities

In [4]:
def similar_words(compared_brand, series_of_brand):
    """
    Compare one brand with a series of brands and count the words they share.

    Brands are split on whitespace; each distinct word is counted once and the
    comparison is case-sensitive.

    Parameters:
    - compared_brand (string): Compared brand.
    - series_of_brand (pd.Series): Series of other brands to compare to.
      Any iterable of strings is accepted; entries that are not strings
      (e.g. None/NaN) are skipped.

    Returns:
    A dataframe with columns 'compared_brand', 'other_brand' and
    'similar_words', restricted to rows with at least one shared word and
    sorted by 'similar_words' in descending order. Row labels are the
    positions of the retained brands inside series_of_brand.
    """
    columns = ['compared_brand', 'other_brand', 'similar_words']
    compared_words = set(compared_brand.split())  # loop-invariant, split once

    positions = []
    rows = []
    # Iterate over the values instead of indexing with series_of_brand[i]:
    # that lookup is label-based and fails on a series without a 0..n-1 index.
    for position, other_brand in enumerate(series_of_brand):
        if not isinstance(other_brand, str):
            continue  # nothing to split on a missing / non-text entry
        shared_count = len(compared_words.intersection(other_brand.split()))
        positions.append(position)
        rows.append((compared_brand, other_brand, shared_count))

    # Build the frame once rather than growing it cell by cell with .loc.
    intersections = pd.DataFrame(rows, columns=columns, index=positions)

    has_shared_word = intersections['similar_words'] > 0
    return intersections[has_shared_word].sort_values(by='similar_words', ascending=False)

# Example: brands of the second list sharing at least one word with 'A2C'.
similar_words(compared_brand='A2C', series_of_brand=brands_2)

Unnamed: 0,compared_brand,other_brand,similar_words
2,A2C,A2C GRANULAT,1


In [5]:
def calculate_levenshtein_ratio(compared_brand, series_of_brand, threshold=.5):
    """
    Calculates the Levenshtein ratio between a brand and each brand of a series.

    The score of every pair is whatever `Levenshtein.ratio` returns for
    (compared_brand, other_brand).

    Parameters:
    - compared_brand (string): Compared brand.
    - series_of_brand (pd.Series): Series of other brands to compare to.
      Any iterable of strings is accepted; entries that are not strings
      (e.g. None/NaN) are skipped.
    - threshold (float): Only rows whose ratio is strictly greater than this
      value are returned. Defaults to 0.5.

    Returns:
    A dataframe with columns 'compared_brand', 'other_brand' and
    'levenshtein_ratio', restricted to rows above the threshold and sorted by
    'levenshtein_ratio' in descending order. Row labels are the positions of
    the retained brands inside series_of_brand.
    """
    columns = ['compared_brand', 'other_brand', 'levenshtein_ratio']

    positions = []
    rows = []
    # Iterate over the values instead of indexing with series_of_brand[i]:
    # that lookup is label-based and fails on a series without a 0..n-1 index.
    for position, other_brand in enumerate(series_of_brand):
        if not isinstance(other_brand, str):
            continue  # missing / non-text entry: no meaningful ratio
        positions.append(position)
        rows.append((compared_brand, other_brand, ratio(compared_brand, other_brand)))

    # Build the frame once rather than growing it cell by cell with .loc.
    levenshtein = pd.DataFrame(rows, columns=columns, index=positions)

    above_threshold = levenshtein['levenshtein_ratio'] > threshold
    return levenshtein[above_threshold].sort_values(by='levenshtein_ratio', ascending=False)

# Example: brands of the second list whose Levenshtein ratio with 'A2C' exceeds 0.5.
calculate_levenshtein_ratio(compared_brand='A2C', series_of_brand=brands_2)

Unnamed: 0,compared_brand,other_brand,levenshtein_ratio
11,A2C,ADC,0.666667
56,A2C,APCO,0.571429
67,A2C,ARIC,0.571429
361,A2C,EDAC,0.571429


In [6]:
def difflib_ratio(compared_brand, series_of_brand, threshold=.5):
    """
    Calculates the difflib similarity ratio between a brand and each brand of a series.

    The score of every pair is `difflib.SequenceMatcher(None, compared_brand,
    other_brand).ratio()`.

    Parameters:
    - compared_brand (string): Compared brand.
    - series_of_brand (pd.Series): Series of other brands to compare to.
      Any iterable of strings is accepted; entries that are not strings
      (e.g. None/NaN) are skipped.
    - threshold (float): Only rows whose ratio is strictly greater than this
      value are returned. Defaults to 0.5.

    Returns:
    A dataframe with columns 'compared_brand', 'other_brand' and
    'difflib_ratio', restricted to rows above the threshold and sorted by
    'difflib_ratio' in descending order. Row labels are the positions of the
    retained brands inside series_of_brand.
    """
    columns = ['compared_brand', 'other_brand', 'difflib_ratio']

    positions = []
    rows = []
    # Iterate over the values instead of indexing with series_of_brand[i]:
    # that lookup is label-based and fails on a series without a 0..n-1 index.
    for position, other_brand in enumerate(series_of_brand):
        if not isinstance(other_brand, str):
            continue  # missing / non-text entry: no meaningful ratio
        # Argument order is kept as (compared_brand, other_brand) because
        # SequenceMatcher.ratio() is not guaranteed to be symmetric.
        score = difflib.SequenceMatcher(None, compared_brand, other_brand).ratio()
        positions.append(position)
        rows.append((compared_brand, other_brand, score))

    # Build the frame once rather than growing it cell by cell with .loc.
    difflib_df = pd.DataFrame(rows, columns=columns, index=positions)

    above_threshold = difflib_df['difflib_ratio'] > threshold
    return difflib_df[above_threshold].sort_values(by='difflib_ratio', ascending=False)


# Example: brands of the second list whose difflib ratio with 'A2C' exceeds 0.5.
difflib_ratio(compared_brand='A2C', series_of_brand=brands_2)

Unnamed: 0,compared_brand,other_brand,difflib_ratio
11,A2C,ADC,0.666667
56,A2C,APCO,0.571429
67,A2C,ARIC,0.571429
361,A2C,EDAC,0.571429


# Combined similarities

In [7]:
def combine_similarities(compared_brand, series_of_brands):
    """
    Combine the three similarity measures of one brand into a single dataframe.

    Parameters:
    - compared_brand (string): Compared brand.
    - series_of_brands (pd.Series): Series of other brands to compare to.

    Returns:
    A dataframe with columns 'compared_brand', 'other_brand', 'similar_words',
    'levenshtein_ratio' and 'difflib_ratio'. A candidate brand is listed as
    soon as it is retained by at least one of similar_words,
    calculate_levenshtein_ratio or difflib_ratio (outer merge); the measures
    that did not retain it are NaN.
    """
    merge_keys = ['compared_brand', 'other_brand', 'position']

    def with_position(frame):
        # The three helpers label each row with the candidate's position in
        # series_of_brands; expose it as a column so it can serve as a merge key.
        return frame.rename_axis('position').reset_index()

    similar_words_df = with_position(similar_words(compared_brand, series_of_brands))
    levenshtein_ratio_df = with_position(calculate_levenshtein_ratio(compared_brand, series_of_brands))
    difflib_ratio_df = with_position(difflib_ratio(compared_brand, series_of_brands))

    # Merging on the brand names alone multiplies rows when the same brand
    # occurs more than once in series_of_brands; the position keeps every
    # occurrence matched with itself only.
    combined_similarities = pd.merge(similar_words_df, levenshtein_ratio_df, on=merge_keys, how='outer')
    combined_similarities = pd.merge(combined_similarities, difflib_ratio_df, on=merge_keys, how='outer')

    # The position is only a technical key; drop it to keep the original columns.
    return combined_similarities.drop(columns='position')

# Example: every similarity measure of 'A2C' against the second list, side by side.
combine_similarities(compared_brand='A2C', series_of_brands=brands_2)

Unnamed: 0,compared_brand,other_brand,similar_words,levenshtein_ratio,difflib_ratio
0,A2C,A2C GRANULAT,1.0,,
1,A2C,ADC,,0.666667,0.666667
2,A2C,APCO,,0.571429,0.571429
3,A2C,ARIC,,0.571429,0.571429
4,A2C,EDAC,,0.571429,0.571429


# Execute for all brands

In [8]:
# Compare every brand of the first list with the whole second list and keep
# the per-brand results so they can be inspected afterwards.
# Iterating over the values avoids label-based lookups such as brands_1[i].
similarity_frames = [combine_similarities(brand, brands_2) for brand in brands_1]

# Brands without any match yield empty frames; leave them out of the concat.
non_empty_frames = [frame for frame in similarity_frames if not frame.empty]

if non_empty_frames:
    all_similarities = pd.concat(non_empty_frames, ignore_index=True)
else:
    # pd.concat refuses an empty list, so fall back to an empty frame with the
    # columns produced by combine_similarities.
    all_similarities = pd.DataFrame(
        columns=['compared_brand', 'other_brand', 'similar_words', 'levenshtein_ratio', 'difflib_ratio']
    )

# Show a preview only; the full result stays available in all_similarities.
all_similarities.head()