In [21]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint
import pandas as pd
import json

In [36]:
items = pd.read_csv("../datasets/Total/itemsDescription.csv",sep=";",encoding="latin").drop_duplicates(subset="ARTICLE_ID")
with open("../datasets/Total/descriptions_mapping.json","r") as file:
    descriptions_mapping = json.load(file)

# Mapping of similar description to a unique 
items.DESCRIPTION = items.DESCRIPTION.replace(descriptions_mapping)

# Group items having similar description to a single one 
items.DESCRIPTION = items.DESCRIPTION.apply(lambda x : x.upper().replace('"','').strip())
items = items.groupby('DESCRIPTION')["ARTICLE_ID"].apply(list).to_frame().reset_index()
items["id"]= items.ARTICLE_ID.apply(lambda x : x[0])
items.columns = ["DESCRIPTION","IDS_LIST","ARTICLE_ID"]
items.sort_values(by="DESCRIPTION").to_csv("../datasets/Total/items.csv")

# Save to mapping file 
mapping = {}
def get_mapping(x):
    for i in set(x["IDS_LIST"]):
        if i==x['ARTICLE_ID']:
            continue
        mapping[i]=x['ARTICLE_ID']
items.apply(get_mapping,axis=1)

with open("../datasets/Total/ids_mapping.json", 'w') as outfile:
    json.dump(mapping, outfile)
    

In [37]:
items

Unnamed: 0,DESCRIPTION,IDS_LIST,ARTICLE_ID
0,1 6 RILLETTES THON 125G,[3019081236250],3019081236250
1,"1,5L SUMOL ORAN",[5601045000427],5601045000427
2,1/ BAGUETTE CLUB KAP,[3464381000276],3464381000276
3,1/2 BAG. VIENNOISE X2 170G,"[3760049790801, 3760049790818]",3760049790801
4,10 GOBELETS 27C,[3700232615702],3700232615702
...,...,...,...
11018,ZIGOH DESO FLEUR FUSHIA,[3700619604725],3700619604725
11019,ZIGOTO REGLISSE,[3103220009055],3103220009055
11020,ZIP AMIS LUDENDO,[3700300542008],3700300542008
11021,ZIP POUPEE LAVANDE,[3453131039413],3453131039413


# Cosine similarity

In [33]:
vectorizer = CountVectorizer()
vect_words = vectorizer.fit_transform(items.DESCRIPTION)
vect_similarities = cosine_similarity(vect_words,dense_output=False)

In [34]:
df_similarities = pd.DataFrame(vect_similarities.toarray(),index=items.DESCRIPTION,columns=items.DESCRIPTION)

In [38]:
df_similarities = df_similarities[(df_similarities>0.8)&(df_similarities<0.9)].stack().to_frame()

In [40]:
df_similarities.sort_values(0,ascending=False).to_csv("test.csv")

In [44]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz


In [142]:
promo_products = pd.read_csv("../datasets/Total/promo_products.csv")

AttributeError: 'numpy.ndarray' object has no attribute 'isin'

In [226]:
for i in sorted(items.DESCRIPTION.unique()):
    if i in found:
        continue
    print(i)

1 6 RILLETTES THON 125G
1,5L SUMOL ORAN
1/ BAGUETTE CLUB KAP
1/2 BAG. VIENNOISE X2 170G
10 GOBELETS 27C
10 LINGETTES NETTOYANT CUIR
10 M.FUSIBLES ENFICHABLE
10 MINI FUSIBLE
10 PAINS AU LAIT 350G
10 TRANCHES D'EMMENTAL 200G
100 GOBELETS BLANCS 20CL
12 MADELEINE VE
12 MADELEINE VERI 250G BQ
140G MORTADELL.
150 FILTRES RIZLA 6MM
1664 25CL
1664 33CL BTE
1664 5D5 6X25CL
1664 BLANC 6X25
1664 BLLE 5D5 6X25CL
1664 DIVA H.T 5D5 75CL
1L ENERG.CRAZY TIG
1PILE PWRLIF PH
2 AVERTISSEURS ULTRA SON
2 BROYES DU POI
2 ECLAIRS AU CHOCOLAT 160G
2 LAMPES W5W 12V TKA FLAURAUD
2 MINI BD GARFIELD
2 MINI BD PETIT SPIROU
2 PILES LR14 HI
2 SACHETS LAV ET 1 SAVON LAV COEUR 100G
2 SACHETS LAV ET 1 SAVON LAV OVALE 100G
2 T PREMIUM 5L
2 TEMPS S DOSET
2 TSR MOTO MINERVAOIL 1L
2-1 MICRO USB K
20 MADELEINES L
20 MADELEINES L. CHOCO 250G
20 MADELEINES L.RAISIN 250G
200G EMMENTAL FRANCAIS
24 PANS LAV TRA
250G CAMEMBERT
2TEMPS Z SYNTHE
2TR JAMBON BLANC MADRANGE 80G
2TR JAMBON BLC
3 AMPOULES H4 NORMA
3 D NATURE 135G
3 ROUL

DORITOS CHEESE
DORITOS CHILLI
DORITOS NACH CH
DORITOS SWEET44G
DORTIOS AMERICAN 44G
DOS QF NOR MAXWEL 25X45G
DOS SP PUR ARABICA 125G CO
DOS. SENSEO DECAFE. 18D 125G
DOS.SP.CAFE FAM
DOSET SP EXPRESSO 125G CO
DOSETTE 2 TEMPS
DOSETTES  ETHIO
DOSETTES ARABIC
DOSETTES CAFE 125G
DOUBEL AC 12 24V 70W EN T 1 USB 1A
DOUBLE A.C.T+LED
DOUBLE AC - IL 12 24V GPS
DOUBLE AC 12 24V 1USB 1000MA PIVOT 90DEG
DOUBLE AC 12/24V 15A 300W MAXI
DOUBLE AC 12/24V 16A
DOUBLE AC 12/24V 2M 15A MAX 300W
DOUBLE AC 12/24V MAXI 5A + 2 PRISE USB G
DOUBLE AC 2USB
DOUBLE AC AVEC FIL GPS
DOUBLE AC EN T
DOUBLE AC EN T 12 24V SYNCHRO
DOUBLE AC FIL
DOUBLE CHARGEUR USB 12/24V
DOUBLE FICHE ALL.CIGARE
DOUBLE PRISE AC 2USB
DOUBLE SOUR DIP
DOUBLEUR ALLUME CIGARE TNB
DOUCEUR MIEL CO
DOUCHE AM DCE LPM 250ML
DOUCHE LT LPM 250ML
DOUCHE À  2
DOUCHOU CACHUETES 250G
DOUDOUX NOISET 150G LOU
DOVE DEODORANT 200ML
DRAG SS CHLORO
DRAGIBUS 120G H
DRAGIBUS BICOOL 120G
DRAGIBUS COLOR 100G HARIBO
DRAGIBUS COLOR POPS 30G
DRAGIBUS MINI HARIBO 40
DR

MINI POSTER UE EU AEDIS
MINI PRESIDENT
MINI ROCHERS CHOCO 250G JLC
MINI ROCHERS COCO
MINI ROCHERS NA
MINI ROSETTE 12
MINI SACHET ASS
MINI SAUCISSES 3X50G CO
MINI SAUCISSON
MINI SAUCISSON AUX NOIX 75G DM
MINI SAUCISSON S. 75G CO
MINI SAVANE BAR
MINI SIMPLICITIY SECU EN
MINI STICKS ROQUEFORT&NOIX 100G
MINI SUPPORT SMARTPHON 5.5 T
MINI TOASTS 3X8
MINI TOBLERONE
MINI TORTILLA NATURE 200G LUSTUCRU
MINI VIENNOIS
MINI VIENNOIS POULET EMMENTAL 130G
MINI YP FRAISE
MINI'S PEPITES CHOCOLAT 70G KER CADELAC
MINI'STICKS CHORIZO 100G
MINI-ATLAS FRANCE 2017 MICH
MINIATURE PEUGEOT 2008 DKR16 6CM
MINIBURGER TROLLI 10G
MINIKIT H4 MD 24V PHILIPS
MINIONS VENTILAT BONBONS 8G
MINIROULE FRAMB
MINIS CUBES SNICKERS 150G
MINIZZA BELIN 85G
MINNIE : MON COLORIAGE
MINNIE BALLON M
MINTI         125G LUTTI
MINUTE MAID CITRONNADE 50CL
MINUTE MAID MULTIFRUIT PET 33CL
MINUTE MAID ORA
MINUTE MAID ORANGE PET 33CL
MINUTE MAID POM
MINUTE MAID RGE
MINUTE MAID TROPIC. 33CL
MIR VSL ECORCE
MIRAVAL BLANC 2
MIRAVAL ROSE
MIREILLE 

TW NETTOYANT  P
TW NETTOYANT TEXTILES 300ML
TW NETTOYANT VITRE 500ML
TW PACK ENT EXT
TW PACK ENT INT
TW PCK EFFACE RAYURE
TW PEAU CHAM 28
TW PEAU CHAMOIS 12.5DM
TW PEAU CHAMOISEE 12,5 DM
TW PLASTIC BRIL
TW POLISH MICRO
TW POLISH MICRO 500ML
TW POLISH MICRO RAYURE 500ML
TW RACLETTE INTERIEUR TELESCOPIQUE
TW RACLETTE VITRE ET CAROSSERIE
TW RECHARGE PAD
TW SHAMP. BRILL
TW SHAMP. BRILL 500ML
TW. CHIFFON MICROFIBRE MULTI U. JAUNE
TW.DEMOUST. ANTI FIENTE 400ML
TW.LINGETTES ENT. PLASTIQ X16
TWINUTS  +20% 1
TWINUTS SALE 150G
TWIST GREEN POP
TWIST PINK ROCK
TWIST&WRAP SAUMON 190G
TWIX  CHOCOLAT    58G
TWIX 20G
TWIX 50G
TWIX CAPPUCCINO 46G
TWIX MINIS CUBE
TWIX TOP 21G
TWIX WHITE
TWIX X 3 150G
TWIX X6 300G
TWIX'XTRA 75G
TXW LAVE GLACE
TYGREA TRL1 50AH/420A
TYRELLS CHEDDAR
TYRELLS VINAIGR
TYRRELLS SEA SA
UNE AVALANCHE DE CONSEQUENCES
UNIVERSAL BULB KIT
V. ROCKET 85ML
VACH BOIR FRAMB MA 250ML
VACH BOIR MYRT MA 250ML
VACHE QUI RIT 50% 12PORT. 200G
VACHE-QUI-RIT  8 PORTIONS
VAIVA COCO FRUIT PASSION 3

In [104]:
found = []
matches = {}

In [219]:
to_match  = "TOTAL ADBLUE 1"
result = process.extract(to_match,items[~items.DESCRIPTION.isin(found)].DESCRIPTION,scorer=fuzz.ratio)
pd.DataFrame(result)

Unnamed: 0,0,1,2
0,TOTAL ADBLUE 1,100,10311
1,TOTAL ADBLUE BEC V. 5L,72,10316
2,TOTAL FLUIDE DA 1L,69,10338
3,TOTAL FLUIDE LD,69,10339
4,TOTAL LIQUIDE R,69,10363


In [220]:
min_trigger = 100
[i[0] for i in filter(lambda x : x[1]>=min_trigger,result)]

['TOTAL ADBLUE 1']

In [221]:
matches[to_match]= [i[0] for i in filter(lambda x : x[1]>=min_trigger,result)]
found += [i[0] for i in filter(lambda x : x[1]>=min_trigger,result)]

In [227]:
for i in matches:
    print(i, " : ",matches[i])

BIDON AD BLUE 10L  :  ['AD BLUE 10 L', 'ADBLUE 10L', 'ADBLUE 10L AD', 'TOTAL ADBLUE BIDON 10L', 'TOTAL ADBLUE 10 L', 'TOTAL ADBLUE 10L', 'AD Blue 10L']
ADBLUE BIDON 5L  :  ['ADBLUE 5L', 'ADBLUE BIDON 5L', 'ADBLUE 5L BIDON', 'TOTAL ADBLUE 5', 'TOTAL ADBLUE 5L', 'ADBLUE 5 LITRE', 'ADBLUE 5 L', 'ADBLUE 5 LITRE', 'ADBLUE 5 L']
EVIAN 50CL  :  ['EVIAN 50CL', 'EVIAN 50CL PET', 'Evian 50cl']
TOTAL ACTIVA 9000 FUTUR NFC E 5W30 5L  :  ['TOTAL ACTIVA 9000 FUTUR NFC E 5W30 5L', 'ACTIVA 9000 FUTUR NFC 5L', 'ACTIVA 9000E 5W40 5L']
ELF MOTO 4 ROAD 10W40 1L  :  ['ELF MOTO 4 ROAD 1L 10W40']
TOTAL 5000 D 15W40 2L  :  ['TOTAL 5000 D 15W40 2L']
ADBLUE 1.5L  :  ['ADBLUE 1.5L', 'ADBLUE 4L']
TOTAL ADBLUE 1  :  ['TOTAL ADBLUE 1']


In [223]:
found = []
for i in matches:
    found +=[i]
    found +=matches[i]
found

['BIDON AD BLUE 10L',
 'AD BLUE 10 L',
 'ADBLUE 10L',
 'ADBLUE 10L AD',
 'TOTAL ADBLUE BIDON 10L',
 'TOTAL ADBLUE 10 L',
 'TOTAL ADBLUE 10L',
 'AD Blue 10L',
 'ADBLUE BIDON 5L',
 'ADBLUE 5L',
 'ADBLUE BIDON 5L',
 'ADBLUE 5L BIDON',
 'TOTAL ADBLUE 5',
 'TOTAL ADBLUE 5L',
 'ADBLUE 5 LITRE',
 'ADBLUE 5 L',
 'EVIAN 50CL',
 'EVIAN 50CL',
 'EVIAN 50CL PET',
 'Evian 50cl',
 'TOTAL ACTIVA 9000 FUTUR NFC E 5W30 5L',
 'TOTAL ACTIVA 9000 FUTUR NFC E 5W30 5L',
 'ACTIVA 9000 FUTUR NFC 5L',
 'ACTIVA 9000E 5W40 5L',
 'ELF MOTO 4 ROAD 10W40 1L',
 'ELF MOTO 4 ROAD 1L 10W40',
 'TOTAL 5000 D 15W40 2L',
 'TOTAL 5000 D 15W40 2L',
 'ADBLUE 1.5L',
 'ADBLUE 1.5L',
 'ADBLUE 4L',
 'TOTAL ADBLUE 1',
 'TOTAL ADBLUE 1']

In [224]:
matches["ADBLUE BIDON 5L"] +=['ADBLUE 5 LITRE', 'ADBLUE 5 L']

In [231]:
list_of_matches= [matches[i] for i in matches]

In [238]:
mapping = {}
for i in matches:
    for j in matches[i]:
        mapping[j] = i
        

In [250]:
items.DESCRIPTION = items.DESCRIPTION.replace(mapping)

In [258]:
tst = items.groupby("DESCRIPTION")["ARTICLE_ID"].apply(list).to_frame()

In [271]:
pd.DataFrame([mapping]).T.to_csv("../datasets/Total/descriptions_mapping.csv")

### Update ids_mapping.json file

In [20]:
items = pd.read_csv("../datasets/Total/items.csv",index_col=0)
res = {}
def get_mapping(x):
    for i in x["IDS_LIST"][1:-1].split():
        if i == x["ARTICLE_ID"]:
            continue
        res[i.strip()] = x["ARTICLE_ID"]
items.apply(get_mapping,axis=1)
with open("../datasets/Total/ids_mapping.json","w") as file:
    json.dump(res,file)

In [29]:
items

Unnamed: 0,DESCRIPTION,IDS_LIST,ARTICLE_ID
0,1 6 RILLETTES THON 125G,[3019081236250],3019081236250
1,"1,5L SUMOL ORAN",[5601045000427],5601045000427
2,1/ BAGUETTE CLUB KAP,[3464381000276],3464381000276
3,1/2 BAG. VIENNOISE X2 170G,"[3760049790801, 3760049790818]",3760049790801
4,10 GOBELETS 27C,[3700232615702],3700232615702
...,...,...,...
11018,ZIGOH DESO FLEUR FUSHIA,[3700619604725],3700619604725
11019,ZIGOTO REGLISSE,[3103220009055],3103220009055
11020,ZIP AMIS LUDENDO,[3700300542008],3700300542008
11021,ZIP POUPEE LAVANDE,[3453131039413],3453131039413
