In [None]:
# Import all necessary libraries
import pandas as pd
print('Pandas version: ', pd.__version__)

import numpy as np
print('NumPy version: ', np.__version__)

import matplotlib
print('Matplotlib version: ', matplotlib.__version__)

from matplotlib import pyplot as plt

import sklearn
print('Scikit-Learn version: ', sklearn.__version__)

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering

import pickle
print('Pickle version: ', pickle.format_version)

import sys
print('Sys version: ', sys.version[0:5])

from sys import exc_info

import ast

In [None]:
# Load the carts: a ticket (= cart) contains one or more libelles (item labels)
carts = pd.read_csv('KaDo.csv', usecols = ['TICKET_ID', 'LIBELLE'])
print('Shape of carts dataset is: ',carts.shape, '\n')
# nunique() counts the distinct ticket ids directly (missing ids are ignored)
print('Number of different tickets is:', carts['TICKET_ID'].nunique())
# Keep a copy of the two selected columns on disk (the row index is written as the first column)
carts.to_csv('all_carts.csv')

In [None]:
# Sorted array holding every distinct ticket id exactly once
tickets = np.unique(carts['TICKET_ID'].to_numpy())

In [None]:
# Turn every libelle into an integer code so the algorithms below can use it
from sklearn import preprocessing
le = preprocessing.LabelEncoder().fit(carts['LIBELLE'])
print('Number of items is:', len(le.classes_))

# NOTE: the encoded column is added to `carts` itself, not to a copy
libelle_encoded = le.transform(carts['LIBELLE'])
carts['LIBELLE_ENCODED'] = libelle_encoded

# Keep only the ticket id and the integer code, and save that frame to a csv file
carts_encoded = carts.drop(columns = ['LIBELLE'])
carts_encoded.to_csv('all_carts_encoded.csv')

In [None]:
# Get a list of strings
# Each string corresponds to the items that are in the ticket (=cart)
def itemsListForTickets(tickets, tickets_data):
    """Build one comma-separated string of item codes per ticket.

    Parameters
    ----------
    tickets : iterable
        Ticket ids to describe; the result follows this order.
    tickets_data : pandas.DataFrame
        Frame with a 'TICKET_ID' and a 'LIBELLE_ENCODED' column, one row per
        item on a ticket.

    Returns
    -------
    list of str
        For each ticket, its item codes joined with ', ' in row order
        (e.g. '10, 30'). A ticket with no row in `tickets_data` yields ''.
    """
    # Group once instead of re-filtering the whole frame for every ticket;
    # sort = False keeps the original row order inside each group.
    items_by_ticket = (
        tickets_data
        .groupby('TICKET_ID', sort = False)['LIBELLE_ENCODED']
        .agg(list)
        .to_dict()
    )
    return [', '.join(map(str, items_by_ticket.get(ticket, ()))) for ticket in tickets]

In [None]:
# Apply the function to the list of unique tickets and the carts_encoded df
tickets_items_list = itemsListForTickets(tickets, carts_encoded)
# One string is built per ticket, so report that count (len(carts) is the number of cart rows)
print('Items list for', len(tickets_items_list), ' tickets')
print('A list of first 10 tickets bought items: \n', tickets_items_list[:10])

In [None]:
# Vectorize each string and return all vectors and feature names
def prepSparseMatrix(list_of_str):
    """Vectorize the ticket strings with a CountVectorizer.

    Tokens are the maximal runs of characters that are neither a comma nor a
    space, and case is left untouched.

    Returns the matrix produced by `fit_transform` together with the
    vectorizer's feature names.
    """
    vectorizer = CountVectorizer(lowercase = False, token_pattern = r'[^\,\ ]+')
    document_term_matrix = vectorizer.fit_transform(list_of_str)
    feature_names = vectorizer.get_feature_names_out()
    return document_term_matrix, feature_names

In [None]:
# Vectorize the per-ticket item strings; keep the matrix and the feature names
sparseMatrix, feature_names = prepSparseMatrix(tickets_items_list)

In [None]:
# Ticket x item table: one row per ticket, one column per item code.
# A cell is 0 when the item is not in the cart, otherwise the number of times
# the item code appears in that ticket's string (CountVectorizer counts tokens).
df_sparseMatrix = pd.DataFrame.sparse.from_spmatrix(
    data = sparseMatrix,
    index = tickets,
    columns = feature_names,
)
df_sparseMatrix

In [None]:
# Verification step: make sure the values in the sparse matrix agree with
# the rows of the initial DF for the first 6 tickets
in_first_six = carts_encoded['TICKET_ID'].isin(tickets[:6])
first_6_tickets_SM = carts_encoded.loc[in_first_six].sort_values(by = 'TICKET_ID')
print(first_6_tickets_SM.T)
# Columns of the sparse frame are the item codes as strings, hence the str() conversion
checked_rows = np.unique(first_6_tickets_SM['TICKET_ID'])
checked_columns = [str(code) for code in np.unique(first_6_tickets_SM['LIBELLE_ENCODED'])]
print(df_sparseMatrix.loc[checked_rows, checked_columns])

In [None]:
# Use elbow method to define the optimized K
# WCSS = Within Clusters Sum of Squares
class elbowMethod():
    """Fit K-Means for a range of K and plot the WCSS curve (elbow method).

    Results accumulate across successive `run` calls, so `run(1, 10)`
    followed by `run(11, 30)` yields one curve covering K = 1..30.
    """

    def __init__(self, sparseMatrix):
        """Store the data to cluster and start with empty result lists."""
        self.sparseMatrix = sparseMatrix
        self.wcss = list()            # inertia of every fitted model, in fit order
        self.differences = list()     # wcss[i] - wcss[i+1] for consecutive fits
        self.cluster_counts = list()  # K used for each entry of self.wcss

    def run(self, init, upto, max_iterations = 300):
        """Fit K-Means for every K in [init, upto] and record each inertia.

        `max_iterations` is forwarded to KMeans as `max_iter`.
        """
        for i in range(init, upto + 1):
            kmeans = KMeans(n_clusters=i, init = 'k-means++', max_iter = max_iterations, n_init = 10, random_state = 0)
            # Fit the matrix handed to this instance, not a notebook-level global
            kmeans.fit(self.sparseMatrix)
            self.wcss.append(kmeans.inertia_)
            self.cluster_counts.append(i)
        # Recomputed from scratch so it also covers results of earlier runs
        self.differences = [previous - current for previous, current in zip(self.wcss, self.wcss[1:])]

    def showPlot(self, boundary = 500, upto_cluster = None):
        """Plot the WCSS curve and the drop in WCSS between consecutive fits.

        boundary     : the red guide lines are drawn at +boundary and -boundary
                       on the differences plot.
        upto_cluster : when given, only the first `upto_cluster` recorded fits
                       are plotted.
        """
        if upto_cluster is None:
            WCSS = self.wcss
        else:
            WCSS = self.wcss[:upto_cluster]
        # n fits give n - 1 consecutive differences (never a negative slice bound)
        DIFF = self.differences[:max(len(WCSS) - 1, 0)]

        # Label the x axis with the K values actually fitted; fall back to 1..n
        # when wcss was filled without going through run()
        if len(self.cluster_counts) == len(self.wcss):
            all_k = list(self.cluster_counts)
        else:
            all_k = list(range(1, len(self.wcss) + 1))
        X_wcss = all_k[:len(WCSS)]
        X_differences = all_k[:len(DIFF)]
        len_differences = len(DIFF)

        fig, (ax_wcss, ax_diff) = plt.subplots(1, 2, figsize=(15, 6))
        ax_wcss.set_title('Elbow Method Graph')
        ax_wcss.plot(X_wcss, WCSS)
        # Positional flag: the old `b=` keyword is not accepted by newer Matplotlib
        ax_wcss.grid(True)
        ax_diff.set_title('Differences in Each Two Consective Clusters')
        ax_diff.plot(X_differences, DIFF)
        ax_diff.plot(X_differences, np.ones(len_differences)*boundary, 'r')
        ax_diff.plot(X_differences, np.ones(len_differences)*(-boundary), 'r')
        ax_diff.grid(True)
        plt.show()

In [None]:
# Build the elbow-method helper around our sparse matrix
elbow_method = elbowMethod(sparseMatrix)
# Try K = 1..10, then plot with guide lines at +/-3000
elbow_method.run(init = 1, upto = 10)
elbow_method.showPlot(boundary = 3000)

In [None]:
# Extend the same curve with K = 11..30 (results are appended, so run this cell only once)
elbow_method.run(init = 11, upto = 30)
elbow_method.showPlot(boundary = 3000)

In [None]:
# Fitting Data on Model and saving it with pickle (kmeans)
kmeans = KMeans(n_clusters=8, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
kmeans.fit(sparseMatrix)
# The context manager closes the file even if pickling fails
with open('model_kmeans.pkl', 'wb') as model_file:
    pickle.dump(kmeans, model_file)
# labels_ already holds the cluster of every fitted row; fit_predict would train the model a second time
clusters = kmeans.labels_

In [None]:
# Fitting Data on Model and saving it with pickle (dbscan)
dbscan = DBSCAN(eps=3, min_samples=2)
dbscan.fit(sparseMatrix)
# The context manager closes the file even if pickling fails
with open('model_dbscan.pkl', 'wb') as model_file:
    pickle.dump(dbscan, model_file)

In [None]:
# Fitting Data on Model and saving it with pickle (spectral)
spectral = SpectralClustering(n_clusters=2, assign_labels='discretize', random_state=0)
spectral.fit(sparseMatrix)
# The context manager closes the file even if pickling fails
with open('model_spectral.pkl', 'wb') as model_file:
    pickle.dump(spectral, model_file)

In [None]:
# Pair every ticket id with the cluster it was assigned to, then display the pairs
tickets_cluster = pd.DataFrame(
    np.column_stack((tickets, clusters)),
    columns = ['TICKET_ID', 'Cluster'],
)
tickets_cluster.T

In [None]:
# For each cluster, how many times do we find each libelle
def clustersLibelles(tickets_cluster, tickets_data):
    """Count, per cluster, how often each item code appears on its tickets.

    Parameters
    ----------
    tickets_cluster : pandas.DataFrame
        Frame with a 'TICKET_ID' and a 'Cluster' column.
    tickets_data : pandas.DataFrame
        Frame with a 'TICKET_ID' and a 'LIBELLE_ENCODED' column, one row per
        item on a ticket.

    Returns
    -------
    list of pandas.DataFrame
        Entry i describes cluster label i, for i in 0..n-1 where n is the
        number of distinct labels (labels are therefore expected to be
        0..n-1). Each frame has the columns 'LIBELLE_ENCODED' and 'Count',
        sorted by 'Count' descending (ties by ascending item code) with a
        fresh 0..m-1 index.
    """
    n_clusters = len(np.unique(tickets_cluster['Cluster']))
    # One join attaches every item row to the cluster of its ticket, instead of
    # scanning the whole item frame once per ticket
    labelled_items = tickets_cluster[['TICKET_ID', 'Cluster']].merge(
        tickets_data[['TICKET_ID', 'LIBELLE_ENCODED']], on = 'TICKET_ID', how = 'inner')
    each_cluster_libelles = list()
    for i in range(n_clusters):
        counts = (
            labelled_items.loc[labelled_items['Cluster'] == i, 'LIBELLE_ENCODED']
            .value_counts(sort = False)
            .sort_index()
        )
        cluster_frame = pd.DataFrame({'LIBELLE_ENCODED': counts.index.to_numpy(), 'Count': counts.to_numpy()})
        # Stable sort keeps the ascending item-code order among equal counts
        each_cluster_libelles.append(
            cluster_frame.sort_values(by = 'Count', ascending = False, kind = 'mergesort').reset_index(drop = True))
    return each_cluster_libelles

# One item-count table per cluster; `carts` carries the LIBELLE_ENCODED column
# because the encoding cell added it to that frame
cluster_libelles = clustersLibelles(tickets_cluster, carts)

In [None]:
# For cluster 1, libelle [LIBELLE_ENCODED] is present [Count] times
# (transposed so that each item is shown as one column)
cluster_libelles[1].T

In [None]:
# How many different tickets are in each cluster ?
# Also check that the sum of the nb of tickets in each cluster == the total number of tickets at start
total_nb_tickets = 0
# Take the number of clusters from the fitted model instead of repeating the literal 8
for i in range(kmeans.n_clusters):
    len_tickets = tickets_cluster[tickets_cluster['Cluster'] == i].shape[0]
    total_nb_tickets += len_tickets
    print('Tickets in Cluster ' + str(i) + ' -> ', len_tickets)
print(f"Total number of tickets : {total_nb_tickets}")
# Actually perform the check announced above instead of only printing the sum
assert total_nb_tickets == len(tickets_cluster), 'per-cluster counts do not add up to the number of tickets'

In [None]:
# Get all the tickets belonging to a certain client (by his id)
# NOTE(review): this reads 'KaDo.part' while the carts were loaded from 'KaDo.csv' -
# confirm both files describe the same tickets, otherwise a client's ticket may have no cluster
tickets_of_clients = pd.read_csv('KaDo.part', usecols = ['TICKET_ID', 'CLI_ID'])

def tickets_of_a_client(client_id):
    """Return the distinct ticket ids recorded for `client_id`.

    Reads the notebook-level `tickets_of_clients` frame ('CLI_ID' and
    'TICKET_ID' columns). The result is an array in order of first
    appearance, empty when the client has no ticket.
    """
    belongs_to_client = tickets_of_clients['CLI_ID'] == client_id
    return tickets_of_clients.loc[belongs_to_client, 'TICKET_ID'].unique()
    
# Client analysed in the rest of the notebook (hard-coded client id)
analyzed_client_tickets = tickets_of_a_client(941958669)
analyzed_client_tickets

In [None]:
# Get all the clusters of a specific client thanks to his former tickets
def clusters_of_a_client(tickets):
    """Return the distinct clusters of the given tickets.

    Reads the notebook-level `tickets_cluster` frame ('TICKET_ID' and
    'Cluster' columns). Clusters are listed in order of first appearance
    while walking `tickets`. A ticket that has no row in `tickets_cluster`
    is skipped instead of raising an IndexError.
    """
    wanted_rows = tickets_cluster[tickets_cluster['TICKET_ID'].isin(list(tickets))]
    # Keep the first row per ticket, matching the previous `.values[0]` lookup
    wanted_rows = wanted_rows.drop_duplicates(subset = 'TICKET_ID', keep = 'first')
    cluster_by_ticket = dict(zip(wanted_rows['TICKET_ID'], wanted_rows['Cluster']))

    client_clusters = list()
    for ticket_id in tickets:
        if ticket_id not in cluster_by_ticket:
            continue
        current_cluster = cluster_by_ticket[ticket_id]
        if current_cluster not in client_clusters:
            client_clusters.append(current_cluster)
    return client_clusters

# Clusters reached by the analysed client's tickets
analyzed_client_clusters = clusters_of_a_client(analyzed_client_tickets)
analyzed_client_clusters

In [None]:
def bought_items(tickets):
    """Return the item codes (as ints) present on the given tickets.

    Reads the notebook-level `df_sparseMatrix` frame (tickets as index, item
    codes as string column names). Items are listed ticket by ticket, so a
    code bought on several tickets appears several times.
    """
    purchased = list()
    for ticket_id in tickets:
        libelles_serie = df_sparseMatrix.loc[ticket_id]
        # Cells are occurrence counts: any value above 0 means the item was bought
        # (testing == 1 would miss an item that appears twice on one ticket)
        purchased.extend(libelles_serie[libelles_serie > 0].index)
    return list(map(int, purchased))
            
# Item codes found on the analysed client's tickets
analyzed_client_bought_items = bought_items(analyzed_client_tickets)
analyzed_client_bought_items

In [None]:
def items_contained_in_clusters(clusters, min_count = 500):
    """Return the frequent item codes of the given clusters, without duplicates.

    Reads the notebook-level `cluster_libelles` list (one frame per cluster
    with 'LIBELLE_ENCODED' and 'Count' columns).

    clusters  : cluster labels, used as positions in `cluster_libelles`.
    min_count : an item is kept when its 'Count' in a cluster is strictly
                greater than this value (default 500).

    Items are returned in order of first appearance across `clusters`.
    """
    frequent_items = list()
    already_listed = set()
    for cluster in clusters:
        cluster_counts = cluster_libelles[cluster]
        items = cluster_counts.loc[cluster_counts['Count'] > min_count, 'LIBELLE_ENCODED'].tolist()
        for item in items:
            if item not in already_listed:
                already_listed.add(item)
                frequent_items.append(item)
    return frequent_items

# Frequent items of the clusters the analysed client belongs to
items_in_clusters = items_contained_in_clusters(analyzed_client_clusters)
items_in_clusters

In [None]:
def items_suggestion(bought_items, items_in_clusters):
    """Return the items of `items_in_clusters` that are not in `bought_items`.

    The order (and any repetition) of `items_in_clusters` is preserved.
    """
    return [candidate for candidate in items_in_clusters if candidate not in bought_items]

# Frequent cluster items the analysed client has not bought yet
suggested_items = items_suggestion(analyzed_client_bought_items, items_in_clusters)
suggested_items

In [None]:
# Decode the suggested item codes back into their libelles so we can read them
# (one vectorized call on the fitted encoder; the previous line used two undefined names)
decoded_suggested_items = le.inverse_transform(suggested_items)
decoded_suggested_items