In [1]:
# Import all necessary libraries
import pandas as pd
import numpy as np
print('NumPy version: ', np.__version__)
import pickle
print('Pickle version: ', pickle.format_version)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

NumPy version:  1.23.4
Pickle version:  4.0


In [2]:
carts = pd.read_csv('all_carts_encoded.csv')
tickets = np.unique(carts['TICKET_ID'])

In [24]:
tickets_items_list = pickle.load(open('items_list_for_ticket.txt', 'rb'))
def prepSparseMatrix(list_of_str):
    cv = CountVectorizer(token_pattern = r'[^\,\ ]+', lowercase = False)
    sparseMatrix = cv.fit_transform(list_of_str)
    return sparseMatrix, cv.get_feature_names_out()
sparseMatrix, feature_names = prepSparseMatrix(tickets_items_list)
df_sparseMatrix = pd.DataFrame.sparse.from_spmatrix(sparseMatrix, index = tickets, columns = feature_names)

In [25]:
model = pickle.load(open(f'models/kmeans/0.pkl', 'rb'))
clusters = model.fit_predict(sparseMatrix)
clusters

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [26]:
# Display each ticket and its corresponding cluster (need clusters given my fit_transform a model)
tickets_cluster = pd.DataFrame(np.concatenate((tickets.reshape(-1,1), clusters.reshape(-1,1)), axis = 1), columns = ['TICKET_ID', 'Cluster'])
tickets_cluster.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2734831,2734832,2734833,2734834,2734835,2734836,2734837,2734838,2734839,2734840
TICKET_ID,32931447,32931448,32931451,32931452,32931453,32931454,32931455,32931456,32931461,32931462,...,36529856,36529857,36529858,36529859,36529860,36529861,36529862,36529863,36529864,36529865
Cluster,0,0,0,0,0,0,7,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# For each cluster, how many times do we find each libelle
def clustersLibelles(tickets_cluster, tickets_data):
    clusters = list(tickets_cluster['Cluster'])
    each_cluster_libelles = list()
    for i in range(len(np.unique(clusters))):
        tickets_list = list(tickets_cluster[tickets_cluster['Cluster'] == i]['TICKET_ID'])
        tickets_libelles_list = list()
        for ticket in tickets_list:    
            tickets_libelles_list.extend(list(tickets_data[tickets_data['TICKET_ID'] == ticket]['LIBELLE_ENCODED']))
        tickets_libelles_counts = list()
        tickets_libelles_counts.extend([[libelle, tickets_libelles_list.count(libelle)] for libelle in np.unique(tickets_libelles_list)])
        each_cluster_libelles.append(pd.DataFrame(tickets_libelles_counts, columns=['LIBELLE_ENCODED', 'Count']).sort_values(by = ['Count'], ascending = False).reset_index(drop=True))
    return each_cluster_libelles

cluster_libelles = clustersLibelles(tickets_cluster, carts)

KeyboardInterrupt: 

In [27]:
# For cluster 1, libelle [LIBELLE_ENCODED] is present [Count] times
cluster_libelles[1].T

NameError: name 'cluster_libelles' is not defined

In [28]:
# How many different tickets are in each cluster ?
# Also check that the sum of the nb of tickets in each cluster == the total number of tickets at start
total_nb_tickets = 0
for i in range(8):
    len_tickets = tickets_cluster[tickets_cluster['Cluster'] == i].shape[0]
    total_nb_tickets += len_tickets
    print('Tickets in Cluster ' + str(i) + ' -> ', len_tickets) 
print(f"Total number of tickets : {total_nb_tickets}")

Tickets in Cluster 0 ->  2227309
Tickets in Cluster 1 ->  65794
Tickets in Cluster 2 ->  45575
Tickets in Cluster 3 ->  18316
Tickets in Cluster 4 ->  129868
Tickets in Cluster 5 ->  48830
Tickets in Cluster 6 ->  144624
Tickets in Cluster 7 ->  54525
Total number of tickets : 2734841


In [29]:
# Get all the tickets belonging to a certain client (by his id)
tickets_of_clients = pd.read_csv('KaDo.csv', usecols = ['TICKET_ID', 'CLI_ID'])
def tickets_of_a_client(client_id):
    tickets_df = tickets_of_clients[tickets_of_clients['CLI_ID'] == client_id]
    return tickets_df['TICKET_ID'].unique()    
    
analyzed_client_tickets = tickets_of_a_client(941958669)
analyzed_client_tickets

array([32946317, 33182830, 33577174, 34015044, 34398198, 34665272,
       35017400, 35221068, 35546515, 35703330, 35816290, 35998180,
       36061723, 36315860, 36434434])

In [30]:
# Get all the clusters of a specific client thanks to his former tickets
def clusters_of_a_client(tickets):
    client_clusters = list()
    for ticket_id in tickets:
        current_cluster = tickets_cluster[tickets_cluster['TICKET_ID'] == ticket_id]['Cluster'].values[0]
        if current_cluster not in client_clusters:
            client_clusters.append(current_cluster)
    return client_clusters

analyzed_client_clusters = clusters_of_a_client(analyzed_client_tickets)
analyzed_client_clusters

[0, 1]

In [32]:
def bought_items(tickets):
    bought_items = list()
    for ticket_id in tickets:
        libelles_serie = df_sparseMatrix.loc[ticket_id]
        for item_id in list(libelles_serie[libelles_serie==1].index):
            bought_items.append(item_id)
    return list(map(int,bought_items))
            
analyzed_client_bought_items = bought_items(analyzed_client_tickets)
analyzed_client_bought_items

[187,
 897,
 169,
 1305,
 600,
 897,
 527,
 18,
 945,
 1259,
 31,
 897,
 1259,
 1334,
 897,
 35,
 1305,
 171,
 920,
 1303,
 1317,
 899,
 667]

In [33]:
def items_contained_in_clusters(clusters):
    items_contained_in_clusters = list()
    for cluster in clusters:
        items = cluster_libelles[cluster][cluster_libelles[cluster]['Count'] > 500]['LIBELLE_ENCODED'].values
        for item in items:
            if item not in items_contained_in_clusters:
                items_contained_in_clusters.append(item)
    return items_contained_in_clusters

items_in_clusters = items_contained_in_clusters(analyzed_client_clusters)
items_in_clusters

NameError: name 'cluster_libelles' is not defined

In [12]:
def items_suggestion(bought_items, items_in_clusters):
    suggested_items = list()
    for item in items_in_clusters:
        if item not in bought_items:
            suggested_items.append(item)
    return suggested_items

suggested_items = items_suggestion(analyzed_client_bought_items, items_in_clusters)
suggested_items

NameError: name 'analyzed_client_bought_items' is not defined

In [13]:
# Decode labels so we can read the
decoded_suggested_items.append(le.inverse_transform(si))
decoded_suggested_items

NameError: name 'decoded_suggested_items' is not defined