In [78]:
pip install sparse_dot_topn

Collecting sparse_dot_topn
[?25l  Downloading https://files.pythonhosted.org/packages/59/e0/284cd42f13b5605c2d446c55b045aed482b000b926ac55f8a940fe6de968/sparse_dot_topn-0.3.1-cp37-cp37m-macosx_10_15_x86_64.whl (307kB)
[K     |████████████████████████████████| 317kB 1.9MB/s eta 0:00:01
Collecting cython>=0.29.15 (from sparse_dot_topn)
[?25l  Downloading https://files.pythonhosted.org/packages/da/ac/e28e88d5846adf97ea81477e8567d5a333a0ab3b8d0f1836a8d2e3083aba/Cython-0.29.24-cp37-cp37m-macosx_10_9_x86_64.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 888kB/s eta 0:00:01
[?25hCollecting setuptools>=42 (from sparse_dot_topn)
[?25l  Downloading https://files.pythonhosted.org/packages/bd/25/5bdf7f1adeebd4e3fa76b2e2f045ae53ee208e40a4231ad0f0c3007e4353/setuptools-57.4.0-py3-none-any.whl (819kB)
[K     |████████████████████████████████| 819kB 1.4MB/s eta 0:00:01
[?25hInstalling collected packages: cython, setuptools, sparse-dot-topn
  Found existing installation: Cython 0.29

In [79]:
import pandas as pd
import numpy as np
import re
import math
import sys
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # Leading Juice for us
import time

In [2]:
## Loading Data
# Main tables
new=pd.read_excel('Produkty.xlsx')
old=pd.read_excel('Artikly.xlsx')

# Scraped data
mcat=pd.read_csv('dataset_moda-category.csv')
mprod=pd.read_csv('dataset_moda-product.csv')
mdet=pd.read_csv('dataset_moda-detail.csv')
scat=pd.read_csv('dataset_sport-category.csv')
sprod=pd.read_csv('dataset_sport-product.csv')
sdet=pd.read_csv('dataset_sport-detail.csv')

In [3]:
#Concatenating scraped tables
scr_cat = mcat.append(scat)
scr_prod=mprod.append(sprod)
scr_det=mdet.append(sdet)

## Clients data

In [4]:
#Setting data types and key columns parsing

new['Produktový kód'] = new['Produktový kód'].str.extract('(\d+)', expand=False)
new.dropna(subset=['Produktový kód'])
old.dropna(subset=['Kód výrobku'])
old=old.convert_dtypes()
old['Kód výrobku']=old['Kód výrobku'].astype(str)

#Merging catalogs to add missing product name for new table

prod=pd.merge(new,old, left_on=['Produktový kód'], right_on=['Kód výrobku'], how='left')
prod['Název produktu'] = np.where(prod['Název produktu'] == '-1', prod['Popis výrobku'], prod['Název produktu'])

In [5]:
#keep only items from recent years (>2017)

prod['Rok']=prod['Sezóna_x'].str.extract('(\d+)')
prod['Rok']=[i.lstrip('0') for i in prod['Rok'].astype(str)]
prod=prod[prod['Rok']!='nan']
prod['Rok']=prod['Rok'].astype(int)
#prod=prod[prod['Rok']>17]


In [6]:
# Splitting category info into separate columns

cat_details = prod['Kategorie'].str.split('-', n=6, expand=True)
prod['Material']=cat_details[0]
prod['Sex']=cat_details[1]
prod['Sex']=prod['Sex'].str.strip()
prod['Type']=cat_details[2]
prod['Type']=prod['Type'].str.strip()
prod["Specification"]=cat_details[3]
prod["Other specification"]=cat_details[4]

In [64]:
#rename selected values
prod["Sex"] = prod["Sex"].str.strip().replace({'PÁNSKÉ':'PANSKE','DÁMSKÉ':'DAMSKE','UNISEX DĚTSKÉ':'DETSKE','CHLAPECKÉ':'PANSKE', 'DÍVČÍ':'DAMSKE','UNISEX DOSPĚLÉ':'UNISEX'})
prod.columns = prod.columns.str.replace(' ', '_')

## Scraped data

In [8]:
#remove leading letter in product_id column and drop duplicated product codes
scr_prod['product_id']=scr_prod['product_id'].str[1:]
scr_prod=scr_prod.drop_duplicates(subset=['product_id'], keep='last', inplace=False)

# Delete columns containing either 65% or more than 65% NaN Values 
perc = 65.0
min_count =  int(((100-perc)/100)*scr_det.shape[0] + 1)
scr_det = scr_det.dropna( axis=1, 
                thresh=min_count)

In [9]:
#convert data types
scr_det=scr_det.dropna(subset=['product_id'])
scr_det.loc[:, ['product_name']]=scr_det.loc[:, ['product_name']].fillna('-')
scr_det['product_id']=scr_det['product_id'].astype(str)

In [10]:
#Reshape data frame using values from parameter_0_name columns as column headers and parameter_0_label as values
scr_det_reshape=scr_det.pivot(index='product_id', columns='parameter/0/parameter_name', values='parameter/0/parameter_label').reset_index()
scr_det_reshape = scr_det_reshape.rename_axis(None, axis=1) 
scr_det_reshape = scr_det_reshape.rename_axis(None, axis=0)
scr_det_reshape=scr_det_reshape[['product_id', 'Výrobce', 'Výrobca', 'Značka']]
scr_det_reshape['Výrobce']=np.where(scr_det_reshape['Výrobce'].isna(), scr_det_reshape['Výrobca'], scr_det_reshape['Výrobce'])

In [11]:
#Reshape data frame using values from parameter_1_name columns as column headers and parameter_1_label as values
scr_det_reshape1=scr_det.pivot(index='product_id', columns='parameter/1/parameter_name', values='parameter/1/parameter_label').reset_index()
scr_det_reshape1 = scr_det_reshape1.rename_axis(None, axis=1) 
scr_det_reshape1 = scr_det_reshape1.rename_axis(None, axis=0)
scr_det_reshape1=scr_det_reshape1[['product_id', 'Výrobce', 'Značka']]

In [12]:
#Join reshaped tables
scr_det_resh=pd.merge(scr_det_reshape, scr_det_reshape1, on='product_id', how='inner')
scr_det_resh['Výrobce_x']=np.where(scr_det_resh['Výrobce_x'].isna(),scr_det_resh['Výrobce_y'], scr_det_resh['Výrobce_x'])
scr_det_resh['Značka_x']=np.where(scr_det_resh['Značka_x'].isna(),scr_det_resh['Značka_y'], scr_det_resh['Značka_x'])
scr_det_resh=scr_det_resh[['product_id', 'Výrobce_x','Značka_x']]
scr_det_resh=scr_det_resh.rename(columns={"Výrobce_x": "Výrobce", "Značka_x": "Značka"})
scr_det=scr_det[['product_id', 'product_name', 'product_url', 'category_name','category_tree']]
scr_det=pd.merge(scr_det, scr_det_resh, on='product_id', how='inner')

In [13]:
scr_cat=scr_cat.drop_duplicates(subset=['category_name'], keep='last', inplace=False)
scr_det=pd.merge(scr_det, scr_cat, on='category_name', how='left')
scr_det=scr_det.convert_dtypes()
scr_det['product_id']=scr_det['product_id'].astype(str)
scr_det['product_id']=scr_det['product_id'].map(lambda x: str(x)[:-2])
scr_det=scr_det.drop(['category_tree_y'], axis=1)
scr_det=scr_det.rename(columns={'category_tree_x':'category_tree'})

In [14]:
#scraped data_final
scr_prod=pd.merge(scr_prod, scr_det, on='product_id', how="left")
scr_prod=scr_prod.drop(['product_name_y','product_url_y'], axis=1)
scr_prod=scr_prod.rename(columns={'product_name_x':'product_name', 'product_url_x':'product_url'})
scr_prod['Značka'] = np.where(scr_prod['Značka'].isna(), scr_prod['Výrobce'], scr_prod['Značka'])
scr_prod['Značka'] = scr_prod['Značka'].str.upper()

In [15]:
#scraped df: function for url parsing with regex
def parse_url(product_urls, re_string):
    '''For a given product urls find give re_string and returns list of matches.'''
    stor = []
    for url in product_urls:
        match = re.search(re_string, str(url))
        if match is not None:
            result = match.group(1)
            stor.append(result)
        else:
            stor.append('')
            
    return stor

In [16]:
def keep_shorter_then(val, n):
    if len(val) > n:
        return ''
    else:
        return val

In [17]:
#scraped df: sex column parsing using regex
sex = parse_url(scr_prod['product_url'], '//(.*?)-')
sex2 = parse_url(scr_prod['category_tree'], '(Dětské|Dámské|Pánské)')
sex = pd.Series(sex).str.upper()
sex2 = pd.Series(sex2).str.upper()
sex = sex.apply(lambda x: keep_shorter_then(x, 6))
scr_prod['sex'] = sex
scr_prod['sex2'] = sex2
scr_prod['sex'] = np.where(scr_prod['sex'].isin(['DAMSKE',
'DETSKE',
'PANSKE',
'PANSKA', 
'DAMSKA']), scr_prod['sex'], None)
# Enlarging dataset
scr_prod['sex'] = np.where(scr_prod['sex'].isna(), scr_prod['sex2'], scr_prod['sex'])
scr_prod=scr_prod.drop(columns=['sex2'])
scr_prod['sex'] = scr_prod['sex'].replace({'PANSKA':'PANSKE', 'DAMSKA':'DAMSKE','DĚTSKÉ':'DETSKE','DÁMSKÉ':'DAMSKE'})

In [18]:
#scraped df: enreaching brand column using regex
brand = parse_url(scr_prod['product_url'], '\.cz/(.*?)-') 
brand=[keep_shorter_then(word, 20) for word in brand]
brand=pd.Series(brand).str.upper()
scr_prod['brand']=brand
scr_prod['Značka']=np.where(scr_prod['Značka'].isna(), scr_prod['brand'], scr_prod['Značka'])

In [19]:
#scraped df: category column parsing using regex
category = parse_url(scr_prod['product_url'], '-(.*?).heureka')
category=pd.Series(category)
category.value_counts()[0:50]

                      54186
obuv                  14083
kalhoty                4880
kola                   4863
ponozky                4428
mikiny                 3720
bryle                  3402
bundy-kabaty           3390
kosile                 3134
doplnky                2613
aktovky                2463
cepice                 2391
lahve                  2387
rukavice               2191
sportovni-obleceni     2140
na-kolo                1921
plavky                 1836
tricka                 1766
pradlo-a-kosilky       1742
svetry-a-rolaky        1741
cyklistika             1740
pro-kolobezky          1528
staji-a-sedloven       1477
sortky                 1437
a-mece                 1416
vazeliny-cistice       1409
nadobi                 1396
saka                   1194
saly                   1193
na-kufry               1147
halenky                1081
zavazadla              1065
sukne                  1051
kotliky                1050
pyzama-a-kosilky       1033
svetry              

In [20]:
prod['Type'].value_counts()

TRIKA                    28693
BUNDY                    16793
MIKINY                   14340
CYKLISTIKA                4737
SJEZD                     1888
INLINE                    1460
SVETRY                    1196
TURISTIKA                  866
MOBILITA                   726
KABÁTY                     708
OSTATNÍ ZIMNÍ SPORTY       573
OSTATNÍ MÍČOVÉ SPORTY        6
OSTATNÍ VODNÍ SPORTY         4
Name: Type, dtype: int64

In [21]:
prod[prod['Type']=='TRIKA'].sample(50)

Unnamed: 0,Produktový kód,Název produktu,Značka_x,Kategorie,Sezóna_x,URL obrázku,URL Web,Kód výrobku,Popis výrobku,Značka_y,Stáří,Skupina artiklů,Sezóna_y,Disciplína,Rok,Material,Sex,Type,Specification,Other specification
66450,1805000094,W HH WARM ICE CREW,HELLY HANSEN,TEXTIL - DÁMSKÉ - TRIKA - KRÁTKÉ RUKÁVY - TERMO,FW16,http://www.sportisimo.cz/pub/products/images/4...,http://www.sportisimo.cz/helly-hansen/w-hh-war...,1805000094.0,48569-983 W HH WARM ICE CREW,HELLY HANSEN,2016/Q3,TEXTIL TERMOPRÁDLO TRIKO DL. DÁMSKÉ,CELOROK,PRADLO,16,TEXTIL,DAMSKE,TRIKA,KRÁTKÉ RUKÁVY,TERMO
40527,1112392680,DROP SHADOW GRAPHIC TEE,UMBRO,TEXTIL - PÁNSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,SS18,http://www.sportisimo.cz/pub/products/images/6...,http://www.sportisimo.cz/umbro/drop-shadow-gra...,1112392680.0,64852U-R97 DROP SHADOW GRAPHIC TEE,UMBRO,2018/Q1,TEXTIL VRCHNÍ TRIKA PÁNSKÉ,CELOROK,VOLNY_CAS,18,TEXTIL,PANSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS
33729,106037,FW PIQUE POLO WHI,UMBRO,TEXTIL - PÁNSKÉ - TRIKA - KRÁTKÉ RUKÁVY POLO -...,-1,http://www.sportisimo.cz/pub/products/images/1...,http://www.sportisimo.cz/umbro/fw-pique-polo-w...,,,,,,,,1,TEXTIL,PANSKE,TRIKA,KRÁTKÉ RUKÁVY POLO,VOLNÝ ČAS
44135,1113264398,W NSW AV15 TOP,NIKE,TEXTIL - DÁMSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,FW16,http://www.sportisimo.cz/pub/products/images/4...,http://www.sportisimo.cz/nike/w-nsw-av15-top/1...,1113264398.0,804062-100 W NSW AV15 TOP,NIKE,2016/Q3,TEXTIL VRCHNÍ TRIKA DÁMSKÉ,CELOROK,VOLNY_CAS,16,TEXTIL,DAMSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS
45575,1113265952,COMMERCIAL CHANNEL LOGO TEE,REEBOK,TEXTIL - DÁMSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,FW19,http://www.sportisimo.cz/pub/products/images/9...,http://www.sportisimo.cz/reebok/commercial-cha...,1113265952.0,EH5808 COMMERCIAL CHANNEL LOGO TEE,REEBOK,2019/Q4,TEXTIL VRCHNÍ TRIKA DÁMSKÉ,CELOROK,VOLNY_CAS,19,TEXTIL,DAMSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS
59988,1383005172,FR SN SS TEE W,ADIDAS,TEXTIL - DÁMSKÉ - TRIKA - KRÁTKÉ RUKÁVY - BĚH,SS18,http://www.sportisimo.cz/pub/products/images/6...,http://www.sportisimo.cz/adidas/fr-sn-ss-tee-w...,1383005172.0,CG0480 FR SN SS TEE W,ADIDAS,2018/Q1,TEXTIL RUNNING TRIKA DÁMSKÉ,CELOROK,BEH,18,TEXTIL,DAMSKE,TRIKA,KRÁTKÉ RUKÁVY,BĚH
42096,1112394456,MH PHOTO TEE,ADIDAS,TEXTIL - PÁNSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,FW19,http://www.sportisimo.cz/pub/products/images/8...,http://www.sportisimo.cz/adidas/mh-photo-tee/2...,1112394456.0,ED7287 MH PHOTO TEE,ADIDAS,2019/Q3,TEXTIL VRCHNÍ TRIKA PÁNSKÉ,CELOROK,VOLNY_CAS,19,TEXTIL,PANSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS
3911,1112390135,502396-1030 LM O'NEILL S/SLV TEE,ONEILL,TEXTIL - PÁNSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,SS15,http://www.sportisimo.cz/pub/products/images/1...,http://www.sportisimo.cz/oneill/502396-1030-lm...,1112390135.0,502396-1030 LM O'NEILL S/SLV TEE,ONEILL,2015/Q1,TEXTIL VRCHNÍ TRIKA PÁNSKÉ,CELOROK,VOLNY_CAS,15,TEXTIL,PANSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS
42671,1112395056,GRAPHIC SERIES REEBOK LINEAR READ TEE,REEBOK,TEXTIL - PÁNSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,SS20,http://www.sportisimo.cz/pub/products/images/9...,http://www.sportisimo.cz/reebok/graphic-series...,,,,,,,,20,TEXTIL,PANSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS
16427,1112389157,06613C-410 CONS M16 SPRAY STAR CHEV TEE,CONVERSE,TEXTIL - PÁNSKÉ - TRIKA - KRÁTKÉ RUKÁVY - VOLN...,SS14,http://www.sportisimo.cz/pub/products/images/9...,http://www.sportisimo.cz/converse/06613c-410-c...,1112389157.0,06613C-410 CONS M16 SPRAY STAR CHEV TEE,CONVERSE,2014/Q1,TEXTIL VRCHNÍ TRIKA PÁNSKÉ,CELOROK,VOLNY_CAS,14,TEXTIL,PANSKE,TRIKA,KRÁTKÉ RUKÁVY,VOLNÝ ČAS


In [57]:
scr_prod['category_name'].value_counts()[0:50]

Hodinky                      5254
Dámská obuv                  5086
Jízdní kola                  4823
Pánská obuv                  4435
Batohy                       3681
Dámské bundy a kabáty        3246
Dětská obuv                  2796
Dámské mikiny                2702
Tašky a aktovky              2316
Dámské ponožky               2092
Pánské košile                1941
Dámské kalhoty               1846
Elektrokola                  1822
Pánské kalhoty               1747
Noční prádlo a košilky       1732
Cyklistické helmy            1723
Pánské svetry a roláky       1666
Pánská trička                1628
Zimní čepice                 1617
Peněženky                    1608
Aminokyseliny                1553
Cyklistické brýle            1525
Komponenty pro koloběžky     1502
Golfové doplňky              1485
Zámky na kolo                1478
Duše                         1449
Proteiny                     1445
Outdoor láhve                1444
Vybavení stájí a sedloven    1442
Ledvinky      

In [22]:
scr_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178634 entries, 0 to 178633
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   product_id     178634 non-null  object 
 1   product_name   178634 non-null  object 
 2   product_url    178634 non-null  object 
 3   category_name  172975 non-null  string 
 4   category_tree  172975 non-null  string 
 5   Výrobce        71878 non-null   string 
 6   Značka         178634 non-null  object 
 7   area           133352 non-null  string 
 8   category_id    133352 non-null  Int64  
 9   category_url   133352 non-null  string 
 10  isLeaf         133352 non-null  boolean
 11  level          133352 non-null  Int64  
 12  parent_id      133352 non-null  Int64  
 13  parent_name    133352 non-null  string 
 14  parent_url     133352 non-null  string 
 15  sex            178634 non-null  object 
 16  brand          178634 non-null  object 
dtypes: Int64(3), boolean(1), obje

In [65]:
prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71990 entries, 0 to 71990
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Produktový_kód       71990 non-null  object
 1   Název_produktu       68790 non-null  object
 2   Značka_x             71990 non-null  object
 3   Kategorie            71990 non-null  object
 4   Sezóna_x             71990 non-null  object
 5   URL_obrázku          50631 non-null  object
 6   URL_Web              57933 non-null  object
 7   Kód_výrobku          55886 non-null  object
 8   Popis_výrobku        55886 non-null  string
 9   Značka_y             55885 non-null  string
 10  Stáří                55809 non-null  string
 11  Skupina_artiklů      55885 non-null  string
 12  Sezóna_y             55885 non-null  string
 13  Disciplína           55885 non-null  string
 14  Rok                  71990 non-null  int64 
 15  Material             71990 non-null  object
 16  Sex 

## Product matching

In [218]:
scr_prod['product_name']=scr_prod['product_name'].apply(lambda x: x.upper())
prod['Název_produktu']=prod['Název_produktu'].dropna().apply(lambda x: x.upper())
scr_prod_subset=scr_prod[['product_name']]
prod_subset=prod[['Název_produktu']].rename(columns={'Název_produktu':'product_name'})

In [251]:
products=prod_subset.append(scr_prod_subset).dropna()

In [237]:
def ngrams(string, n=3):
    string = re.sub(r'[,-./:\(\)]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


In [238]:
prodcts=products['product_name']
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(prodcts)

In [239]:
print(tf_idf_matrix[0])


  (0, 12386)	0.2630495916835194
  (0, 12604)	0.2657276466802666
  (0, 5048)	0.2706151067833461
  (0, 5369)	0.23177149131524064
  (0, 320)	0.17064448251617426
  (0, 6572)	0.1987592801604975
  (0, 9649)	0.1627692984395724
  (0, 7745)	0.2901181190247922
  (0, 10769)	0.293578076435035
  (0, 8543)	0.18280013123207733
  (0, 13066)	0.2439810907453657
  (0, 897)	0.23516075799236152
  (0, 9490)	0.21922821041769996
  (0, 6851)	0.19326426234726904
  (0, 14561)	0.23056864041939612
  (0, 8686)	0.20937170113290476
  (0, 12460)	0.2639232123786796
  (0, 7797)	0.2597281834848195


In [240]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data,indices,indptr),shape=(M,N))

In [241]:
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 0.39108991622924805


In [242]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [246]:
# store the  matches into new dataframe called matched_df and 
# printing 10 samples
matches_df = get_matches_df(matches, prodcts)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # For removing all exact matches
matches_df

KeyError: 0

In [248]:
room=pd.read_csv('room_type.csv')

In [252]:
products

Unnamed: 0,product_name
0,L35198200 TRAIL RUNNER WARM LS ZIP
1,644701-370 PRINTED MILER SS V-NECK (SU15)
2,80320-B ROM JACKET 6282 RED
3,351422-9009 LM VAL JEAN SWEAT
4,ROSEL MACAW GREEN/ANTHRACITE MIKINA PÁNSKÁ
...,...
178629,MAXXWIN BETA ALANINE 300 G
178630,VISION NUTRITION L-GLUTAMINE 500 G
178631,WEFOOD BETA-ALANINE 120 TABLET
178632,OPTIMUM NUTRITION GLUTAMINE POWDER 1050 G
