In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import random
from mlxtend.frequent_patterns import apriori, association_rules
from tqdm import tqdm
from gensim.models import Word2Vec 
import warnings;
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel("content_rec_df.xlsx" , engine= "openpyxl")

In [3]:
df.shape

(38299, 6)

In [4]:
df.head()

Unnamed: 0,Montant,Ville de l'affilié,Nom de l'affilié,Adresse PV,Famille Aff.,Groupe Aff.
0,39.0,BARDO,ECHEMI,"AV. HEDI NOUIRA ,ENNASR II",FAST FOOD,RESTAURATION
1,13.0,LA SOUKRA,WOOD'S PIZZA,"3, RES. EL BOSTENE E4",PIZZERIA,RESTAURATION
2,22.31,CITE EL GHAZELA,DI NAPOLI,"45B, AV. HABIB BOURGUIBA",RESTAURANT A LA CARTE,RESTAURATION
3,8.5,CITE EL GHAZELA,EL MAGICO,"95, AV. FETHI ZOUHIR",PIZZERIA,RESTAURATION
4,24.0,DEN-DEN,PIZZERIA KAPARI,"10,RUE IBN DHIEF",RESTAURANT A LA CARTE,RESTAURATION


In [5]:
df['Montant']= df['Montant'].astype(str)
df["Ville de l'affilié"]= df["Ville de l'affilié"].astype(str)
df["Nom de l'affilié"]= df["Nom de l'affilié"].astype(str)
df['Adresse PV']= df['Adresse PV'].astype(str)
df['Famille Aff.']= df['Famille Aff.'].astype(str)
df['Groupe Aff.']= df['Groupe Aff.'].astype(str)

In [6]:
familles = df["Famille Aff."].unique().tolist()
len(familles)

8

In [7]:
# shuffle companies ID's
random.shuffle(familles)

# extract 90% of companies ID's
familles_train = [familles[i] for i in range(round(0.9*len(familles)))]

# split data into train and validation set
train_df = df[df['Famille Aff.'].isin(familles_train)]
validation_df = df[~df['Famille Aff.'].isin(familles_train)]

In [8]:
# list to capture purchase history of the company
locations_train = []

# populate the list with the location codes
for i in tqdm(familles_train):
    temp = train_df[train_df["Famille Aff."] == i]["Ville de l'affilié"].tolist()
    locations_train.append(temp)

100%|██████████| 7/7 [00:00<00:00, 386.39it/s]


In [9]:
locations_train

[['CITE EL GHAZELA',
  'DEN-DEN',
  'BARDO',
  'LES BERGES DU LAC',
  'TUNIS REPUBLIQUE',
  'TUNIS REPUBLIQUE',
  'TUNIS BELVEDERE',
  'CITE EL GHAZELA',
  'DEN-DEN',
  'LES BERGES DU LAC',
  'TUNIS BELVEDERE',
  'NABEUL',
  'FOUCHANA',
  'TUNIS REPUBLIQUE',
  'DEN-DEN',
  'FOUCHANA',
  'DEN-DEN',
  'LES BERGES DU LAC',
  'LES BERGES DU LAC',
  'CENTRE URBAIN NORD',
  'TUNIS REPUBLIQUE',
  'TUNIS REPUBLIQUE',
  'LES BERGES DU LAC',
  'AV. AMMAR DAKHLAOUI IMM. GHAZELA CENTRE ',
  'LES BERGES DU LAC',
  'BARDO',
  'DEN-DEN',
  'LES BERGES DU LAC',
  'FOUCHANA',
  'LE KEF',
  'BARDO',
  'LES BERGES DU LAC',
  'AV. AMMAR DAKHLAOUI IMM. GHAZELA CENTRE ',
  'DEN-DEN',
  'FOUCHANA',
  'NABEUL',
  'DEN-DEN',
  'TUNIS REPUBLIQUE',
  'TUNIS REPUBLIQUE',
  'DEN-DEN',
  'FOUCHANA',
  'TUNIS REPUBLIQUE',
  'TUNIS REPUBLIQUE',
  'TUNIS BELVEDERE',
  'TUNIS REPUBLIQUE',
  'CITE EL GHAZELA',
  'TUNIS REPUBLIQUE',
  'TUNIS REPUBLIQUE',
  'DEN-DEN',
  'TUNIS REPUBLIQUE',
  'LES BERGES DU LAC',
  'MONTPL

In [10]:
# list to capture purchase history of the companies
location_val = []

# populate the list with the location codes
for i in tqdm(validation_df['Famille Aff.'].unique()):
    temp = validation_df[validation_df["Famille Aff."] == i]["Ville de l'affilié"].tolist()
    location_val.append(temp)

100%|██████████| 1/1 [00:00<00:00, 1003.90it/s]


In [11]:
# train word2vec model
model2 = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model2.build_vocab(locations_train, progress_per=200)

model2.train(locations_train, total_examples = model2.corpus_count, 
            epochs=10, report_delay=1)


(64379, 367460)

In [12]:
model2.init_sims(replace=True)

In [13]:
# extract all vectors
X = model2.wv.index_to_key
X

['MONTPLAISIR',
 'LES BERGES DU LAC',
 'TUNIS REPUBLIQUE',
 'CITE EL GHAZELA',
 'TUNIS BELVEDERE',
 'EL MENZAH 9',
 'LE KRAM',
 'TUNIS CARTHAGE',
 'CHOTRANA 2',
 'ARIANA ESSOGHRA',
 'CITE MAHRAJENE',
 'LA SOUKRA',
 'BARDO',
 'TECHNOPOLE EL GHAZELA',
 'EL MENZAH 8',
 'CENTRE URBAIN NORD',
 'LA MARSA',
 'CHARGUIA',
 'AV. AMMAR DAKHLAOUI IMM. GHAZELA CENTRE ',
 'LES BERGES DU LAC ',
 'NABEUL',
 'EL MANAR',
 'CENTRE URBAIN',
 'BOUMHEL',
 'EL MENZAH 7',
 'MEGRINE',
 'DEN-DEN',
 'FOUCHANA',
 'SIDI BOUSAID',
 'EL MOUROUJ',
 'SFAX',
 'KALAA EL KBIRA',
 'EL MENZAH 1',
 'TUNIS',
 'SIDI SALEM ',
 'LE KEF',
 'MARSA ZEPHYR',
 'CITE MEHIRI']

In [14]:
restaurants = train_df[["Ville de l'affilié", "Nom de l'affilié"]]

# remove duplicates
restaurants.drop_duplicates(inplace=True, subset="Ville de l'affilié", keep="last")

# create product-ID and product-description dictionary
restaurants_dict = restaurants.groupby("Ville de l'affilié")["Nom de l'affilié"].apply(list).to_dict()

In [15]:
restaurants_dict['MONTPLAISIR']

['PRET A MANGER']

In [16]:
restaurants_dict['BARDO']

[' LE COIN']

In [17]:
def recommanded_restaurants(v, n = 6):
    
    # extract most similar locations for the input vector
    ms = model2.wv.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar locations
    new_ms = []
    for j in ms:
        pair = (restaurants_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms      

In [18]:
recommanded_restaurants('BARDO')

[('PLAN B', 0.7305173873901367),
 ('PLAN B', 0.6753979921340942),
 ('ELY S COFFE SHOP', 0.6617214679718018),
 ('CANTINE NEWREST', 0.6476643681526184),
 ('PLAN B', 0.6021220684051514),
 ('BAGUETTE ET BAGUETTE', 0.5799875855445862)]

In [19]:
recommanded_restaurants('LES BERGES DU LAC ')

[('PRET A MANGER', 0.7774235010147095),
 ('GOURMANDISE', 0.7257835865020752),
 ('PLAN B', 0.717279851436615),
 ('MALIBU', 0.6239416599273682),
 (' MALIBU ', 0.6236106753349304),
 ('CANTINE NEWREST', 0.5374137163162231)]

In [20]:
recommanded_restaurants('MONTPLAISIR')

[('GOURMANDISE', 0.9252773523330688),
 ('PLAN B', 0.8404488563537598),
 ('SKIFA ARBI', 0.7774235010147095),
 ('MALIBU', 0.7748578190803528),
 ('BROWN SUGAR', 0.765684962272644),
 ('MALIBU', 0.7646350264549255)]