In [10]:
import json
import pandas as pd

# sentence_transformers from here: https://github.com/UKPLab/sentence-transformers
# just pip install it
from sentence_transformers import (SentenceTransformer, util)

In [2]:
# pick the pretrained sentence-embedding model used for the matching
# catalogue of available models: https://www.sbert.net/docs/pretrained_models.html
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [7]:
# load the list of sectors/names to be used as reference
# JSON text is UTF-8; stating the encoding explicitly keeps non-ASCII sector
# names intact regardless of the platform's default locale encoding
with open('Data/IOIC_sectors.json', 'r', encoding='utf-8') as f:
    IOIC_sectors = json.load(f)
# NOTE(review): later cells index IOIC_sectors by integer position, so the
# file is presumably a JSON array of sector names - confirm against the data

# embed every reference entry with the machine learning model
IOIC_embeddings = model.encode(IOIC_sectors)

In [4]:
# names/products that should be matched against the reference list
products = [
    'ADPE System Configuration',
    'Geophysical Instruments',
]
# embed those names/products with the machine learning model
products_embeddings = model.encode(products)

In [8]:
# cosine similarity of every name/product to-be-matched against every reference entry
scores = util.pytorch_cos_sim(products_embeddings, IOIC_embeddings)
# rank the reference entries of each row from best to worst match, keeping both
# the ordered scores and the positions those scores came from
ranking = scores.sort(descending=True, dim=1)
sorted_scores, indices = ranking.values, ranking.indices

In [15]:
# store the ranked matches in a dataframe indexed by (product, order)

# the number of similarities per name/product to-be-matched that will be provided
# you can see this as the number of attempts the algorithm gets at matching
# a product to the reference
number_of_matches = 5

# never ask for more candidates than there are reference sectors, otherwise
# indexing the sorted scores would run past the end of each row
top_k = min(number_of_matches, len(IOIC_sectors))

# collect one record per (product, rank) pair and build the dataframe once;
# growing a dataframe with pd.concat inside the loop re-copies it every iteration
# and, combined with a transpose, leaves every column with object dtype
records = [
    {
        'product': product,
        'order': rank + 1,  # 1-based rank, 1 = best match
        'sector': IOIC_sectors[int(indices[i][rank])],
        'similarity': float(sorted_scores[i][rank]),
    }
    for i, product in enumerate(products)
    for rank in range(top_k)
]

# passing the column names explicitly keeps the frame well-formed (and
# set_index working) even when there are no products to match
df_results = (
    pd.DataFrame(records, columns=['product', 'order', 'sector', 'similarity'])
    .set_index(['product', 'order'])
)

In [16]:
# show the ranked matches; rows are indexed by product and match order
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,sector,similarity
product,order,Unnamed: 2_level_1,Unnamed: 3_level_1
ADPE System Configuration,1,Office administrative services,0.254899
ADPE System Configuration,2,Computer systems design and related services (...,0.237132
ADPE System Configuration,3,Custom software design and development services,0.226263
ADPE System Configuration,4,"Advertising, public relations, and related ser...",0.223295
ADPE System Configuration,5,Facilities and other support services,0.201962
Geophysical Instruments,1,"Measuring, control and scientific instruments",0.544412
Geophysical Instruments,2,Navigational and guidance instruments,0.40421
Geophysical Instruments,3,Other civil engineering works,0.386468
Geophysical Instruments,4,"Other professional, scientific and technical s...",0.353572
Geophysical Instruments,5,Other electrical equipment and components,0.315113
