In [289]:
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib import pyplot as plt
from sklearn import preprocessing

In [7]:
# Load the raw purchase log; its CLI_ID and LIBELLE columns are read further down.
all_purchases = pd.read_csv('all_purchases.csv')
# np.unique returns the distinct client ids in sorted order.
# NOTE(review): these ids are later paired positionally with clients_items_list —
# confirm that list was built in the same sorted-by-CLI_ID order.
clients = np.unique(all_purchases['CLI_ID'])

In [3]:
# Load the pre-computed item strings (one comma-separated string of item ids
# per client, as shown in the ITEMS column further down).
# The context manager closes the file handle as soon as unpickling is done;
# the previous bare open() left it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a
# trusted source.
with open('items_list_for_client.txt', 'rb') as items_file:
    clients_items_list = pickle.load(items_file)

def prepSparseMatrix(list_of_str, token_pattern=r"(?u)\b\w\w+\b"):
    r"""Turn item strings into a sparse document-term count matrix.

    Parameters
    ----------
    list_of_str : iterable of str
        One string per client; each token becomes a column and each cell
        holds how many times that token occurs in the client's string.
    token_pattern : str, optional
        Regular expression handed to ``CountVectorizer``. The default is
        scikit-learn's own default, which only keeps tokens of two or more
        word characters. As a consequence single-character item ids
        ("0" .. "9") are silently dropped from the matrix. Pass
        ``r"(?u)\b\w+\b"`` to keep them — but note this changes the feature
        set, so any model fitted on the old columns must be refitted.

    Returns
    -------
    tuple
        ``(sparseMatrix, feature_names)``: the fitted count matrix and the
        array of column names returned by ``get_feature_names_out()``.
    """
    cv = CountVectorizer(token_pattern=token_pattern)
    sparseMatrix = cv.fit_transform(list_of_str)
    return sparseMatrix, cv.get_feature_names_out()

# Vectorize every client's item string: row i of sparseMatrix is built from
# clients_items_list[i], and feature_names holds the column (token) labels.
sparseMatrix, feature_names = prepSparseMatrix(clients_items_list)

In [59]:
# Labelled view of the count matrix: rows indexed by client id, columns by item token.
# NOTE(review): assumes `clients` is in the same order as the rows of sparseMatrix — confirm.
df_sparseMatrix = pd.DataFrame.sparse.from_spmatrix(sparseMatrix, index = clients, columns = feature_names)
# Bare expression so the notebook renders the (truncated) frame.
df_sparseMatrix

Unnamed: 0,10,100,1000,1001,1002,1003,1004,1005,1006,1007,...,990,991,992,993,994,995,996,997,998,999
1490281,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13290776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20163348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20200041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20561854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997048745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997048751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997048769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997048777,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Load the previously fitted model (its .predict() is used below to assign a
# cluster to every client).
# The context manager closes the file handle as soon as unpickling is done;
# the previous bare open() left it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a
# trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [68]:
# One row per client: client id, cluster predicted by the loaded model, raw item string.
# NOTE(review): the three columns are aligned purely by position — this assumes
# clients, the rows of sparseMatrix and clients_items_list share the same order; confirm.
df = pd.DataFrame({"CLI_ID": clients,"CLUSTER": model.predict(sparseMatrix), "ITEMS": clients_items_list})
# Bare expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,CLI_ID,CLUSTER,ITEMS
0,1490281,0,"730, 730, 200, 361, 732"
1,13290776,2,"415, 925, 733, 733, 733, 908, 908, 463, 463"
2,20163348,2,1119
3,20200041,2,"194, 269"
4,20561854,2,"705, 948, 1412"
...,...,...,...
853509,997048745,2,"1440, 1365, 1421, 690"
853510,997048751,2,"81, 281, 925, 448, 652"
853511,997048769,6,"49, 1082, 1081, 874, 874, 559, 1017, 1101, 253..."
853512,997048777,5,"1327, 324, 71, 844, 941, 155, 302, 58, 1305, 7..."


In [149]:
# Concatenate the item strings of all clients of a cluster into one long
# comma-separated string per cluster (columns: CLUSTER, ITEMS).
# The join uses "," while items inside one client's string are separated by ", ",
# so some tokens carry a leading space and others do not.
items_by_cluster_df = df.groupby("CLUSTER")["ITEMS"].apply(','.join).reset_index()
# Bare expression so the notebook renders the frame.
items_by_cluster_df

Unnamed: 0,CLUSTER,ITEMS
0,0,"730, 730, 200, 361, 732,746, 1305, 712, 729, 1..."
1,1,"671, 302,964, 302, 470, 302, 416,246, 991, 943..."
2,2,"415, 925, 733, 733, 733, 908, 908, 463, 463,11..."
3,3,"297, 724, 470, 1306, 727, 302, 760,189, 355, 9..."
4,4,"308, 1360, 713, 1359, 729, 369, 308, 694, 977,..."
5,5,"1451, 356, 1084, 786, 862, 1010, 302, 956, 7, ..."
6,6,"1334, 141, 1108, 1105,1327, 705, 705, 1016, 13..."
7,7,"302, 302, 1082, 302, 969, 1082, 302,302,302, 9..."


In [288]:
# Popularity cut-off: get_relevant_items keeps an item for a cluster only when
# its purchase count in that cluster is strictly greater than this value.
THRESHOLD = 750
# Number of cluster ids (0 .. NB_OF_CLUSTERS - 1) that get_relevant_items iterates over.
# NOTE(review): presumably has to match the number of clusters of the loaded model — confirm.
NB_OF_CLUSTERS = 8

In [256]:
def get_relevant_items(items_by_cluster=None, threshold=None, nb_of_clusters=None):
    """Rank, for every cluster, the items bought more than ``threshold`` times.

    Parameters
    ----------
    items_by_cluster : pandas.DataFrame, optional
        Frame with a ``CLUSTER`` column (cluster id) and an ``ITEMS`` column
        (one comma-separated string of integer item ids per cluster).
        Defaults to the notebook-level ``items_by_cluster_df``.
    threshold : int, optional
        An item is kept only if its count is strictly greater than this.
        Defaults to the notebook-level ``THRESHOLD``.
    nb_of_clusters : int, optional
        Cluster ids ``0 .. nb_of_clusters - 1`` are processed.
        Defaults to the notebook-level ``NB_OF_CLUSTERS``.

    Returns
    -------
    list of list of dict
        One list per cluster id, in cluster-id order. Each inner list holds
        ``{"item": id, "count": n}`` dicts (numpy integers) sorted by
        descending count; ties keep ascending item-id order. A cluster id with
        no row in ``items_by_cluster`` yields an empty list.

    Raises
    ------
    ValueError
        If a non-blank token in an ``ITEMS`` string is not an integer.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if items_by_cluster is None:
        items_by_cluster = items_by_cluster_df
    if threshold is None:
        threshold = THRESHOLD
    if nb_of_clusters is None:
        nb_of_clusters = NB_OF_CLUSTERS

    all_clusters = list()
    for c in range(nb_of_clusters):
        joined = items_by_cluster.loc[items_by_cluster["CLUSTER"] == c, "ITEMS"]
        if joined.empty:
            # No client was assigned to this cluster: nothing to rank.
            all_clusters.append([])
            continue

        # int() tolerates the surrounding spaces left by the ", " separators;
        # blank tokens (e.g. from an empty client string) are skipped.
        item_ids = np.array(
            [int(tok) for tok in joined.iloc[0].split(",") if tok.strip()],
            dtype=int,
        )

        # Single pass: sorted distinct ids together with their frequencies
        # (replaces one full array scan per distinct id).
        items, counts = np.unique(item_ids, return_counts=True)
        popular = counts > threshold
        items, counts = items[popular], counts[popular]

        # Stable sort on negated counts = descending count, ascending id on ties.
        order = np.argsort(-counts, kind="stable")
        cluster = [dict(item=i, count=n) for i, n in zip(items[order], counts[order])]
        all_clusters.append(cluster)
    return all_clusters

In [257]:
# Unpack the per-cluster popularity rankings into one name per cluster id.
# The tuple unpacking requires get_relevant_items() to return exactly eight lists.
cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6, cluster_7 = get_relevant_items()

In [305]:
# Fit a label encoder on the product labels so integer codes can be mapped
# back to label text with le.inverse_transform.
# NOTE(review): assumes the item ids stored in the ITEMS strings are codes from
# this same encoding — confirm against how items_list_for_client.txt was produced.
le = preprocessing.LabelEncoder()
le.fit(all_purchases['LIBELLE'])

In [306]:
def get_top_k_items(cli_id, TOP_K=5):
    """Recommend the most popular items of a client's cluster that they have not bought.

    Parameters
    ----------
    cli_id : int
        Client id, looked up in the ``CLI_ID`` column of the notebook-level ``df``.
    TOP_K : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    dict
        Keys ``"top 1 item"``, ``"top 2 item"``, ... in recommendation order;
        each value is the array returned by ``le.inverse_transform`` for that
        item id. Fewer than ``TOP_K`` entries are returned only when the
        cluster's ranking runs out of items the client has not bought.

    Raises
    ------
    IndexError
        If ``cli_id`` is not present in ``df``.
    """
    client_rows = df[df["CLI_ID"] == cli_id]
    if client_rows.empty:
        raise IndexError(f"client {cli_id} not found in df")

    # int() tolerates the spaces left by the ", " separators; blanks are skipped.
    items_str = client_rows["ITEMS"].iloc[0]
    already_bought_items = {int(tok) for tok in items_str.split(",") if tok.strip()}

    # Pick the ranking of the client's own cluster instead of always cluster 0.
    rankings = (cluster_0, cluster_1, cluster_2, cluster_3,
                cluster_4, cluster_5, cluster_6, cluster_7)
    ranked_items = rankings[int(client_rows["CLUSTER"].iloc[0])]

    rec = dict()
    for entry in ranked_items:
        if len(rec) >= TOP_K:
            break
        if int(entry["item"]) in already_bought_items:
            # Skip items the client already owns and keep walking down the
            # ranking so the result can still reach TOP_K entries.
            continue
        rec[f"top {len(rec) + 1} item"] = le.inverse_transform([entry["item"]])
    return rec

In [311]:
# Ask for a client id interactively, then print that client's recommendations.
# int() raises ValueError when the typed text is not an integer.
client_to_rec = input("Get recommendations for client :")
print(get_top_k_items(int(client_to_rec)))

Get recommendations for client :1490281
{'top 1 item': array(['MDT SD DES LAGONS 150 ML'], dtype=object), 'top 2 item': array(['SVC CREME QUOTIDIENNE T75ml'], dtype=object), 'top 3 item': array(['GD JDM4 GRENADE FL200ML'], dtype=object), 'top 4 item': array(['GD JDM4 LOTUS FL200ML'], dtype=object), 'top 5 item': array(['CD JDM4 AMANDE  FL 200ML'], dtype=object)}
