In [75]:
import pandas as pd
from IPython.core import display as ICD
from multiprocessing import Pool
import numpy as np
from datetime import timedelta
import re
import os
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
import plotly
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from itertools import groupby, product
import json
# Module-level encoder shared through the notebook's global state;
# sankey_preprocessing() refits it on the group periods to compute `depth`.
le = LabelEncoder()

In [76]:
def read_lcm_output(input_name,folder="lcm_results"):
    """Load one LCM output file into a DataFrame.

    The file is read as a header-less CSV with five columns, which are named
    ``user_ids``, ``support``, ``itemsets``, ``period`` and ``property_values``.

    Parameters
    ----------
    input_name : str
        File name inside ``folder``.
    folder : str
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        ``period`` is parsed with ``pd.to_datetime`` and every ``user_ids``
        cell is converted from a space-separated string to a NumPy array of
        ints.
    """
    file = f'{folder}/{input_name}'
    # Read the first column as text: when every row holds a single id (or when
    # a large file is type-inferred chunk by chunk) pandas would otherwise
    # produce integers and the ``.split`` below would raise AttributeError.
    df = pd.read_csv(file,header=None,dtype={0: str})
    df.columns = ["user_ids","support","itemsets","period","property_values"]
    df["period"] = pd.to_datetime(df["period"])
    # dtype=int keeps the arrays integer-typed even when a cell has no ids.
    df["user_ids"] = df["user_ids"].apply(
        lambda ids: np.array([int(tok) for tok in ids.split(" ") if tok != ""], dtype=int)
    )
    return df


def read_lcm_output_total(input_name,folder="../lcm_results"):
    """Load one LCM output file whose user ids are stored as a bracketed list.

    The file is read as a header-less CSV; its five columns are named
    ``user_ids``, ``support``, ``itemsets``, ``period`` and
    ``property_values``. ``period`` is parsed with ``pd.to_datetime`` and each
    ``user_ids`` cell (text such as ``[1, 2, 3]``, ids optionally wrapped in
    double quotes) becomes a Python list of ints.
    """
    def _parse_ids(raw):
        # Drop the first and last character (the brackets), split on commas,
        # skip empty pieces and strip any double quotes around each id.
        parsed = []
        for piece in raw[1:-1].split(","):
            if piece == "":
                continue
            parsed.append(int(piece.replace('"',"")))
        return parsed

    frame = pd.read_csv(f'{folder}/{input_name}',header=None)
    frame.columns = ["user_ids","support","itemsets","period","property_values"]
    frame["period"] = pd.to_datetime(frame["period"])
    frame["user_ids"] = frame.user_ids.apply(_parse_ids)
    return frame


# Data preprocessing for echarts 

In [79]:
def get_articles_descriptions(x,items,encoder):
    """Return the descriptions of the articles referenced by one encoded itemset.

    Parameters
    ----------
    x : str or int
        Whitespace-separated encoded item ids, e.g. ``"6447 4112"``. A bare
        integer is accepted as well: pandas infers an integer column when
        every itemset in a file has a single item, and ``x.split()`` would
        then fail.
    items : pandas.DataFrame
        Table with a ``DESCRIPTION`` column. Rows are selected through
        ``items.index``, so the index must hold the same labels (and dtype)
        that ``encoder.inverse_transform`` returns.
    encoder : object
        Anything exposing ``inverse_transform(list_of_ints)``.

    Returns
    -------
    list
        ``DESCRIPTION`` of every row of ``items`` whose index label is among
        the decoded ids. The order is the row order of ``items``, not the
        order of the ids in ``x``.
    """
    codes = [int(token) for token in str(x).split()]
    article_ids = encoder.inverse_transform(codes)
    return items.loc[items.index.isin(article_ids)].DESCRIPTION.tolist()
def format_links(x):
    """Build ``[current, next, x["index"]]`` triples from the sequences in ``x[0]``.

    For every sequence in ``x[0]`` each pair of adjacent elements produces one
    triple; sequences with fewer than two elements contribute nothing.
    """
    return [
        [seq[pos], seq[pos + 1], x["index"]]
        for seq in x[0]
        for pos in range(len(seq) - 1)
    ]

def make_links(e,index):
    """Yield ``(source, target, index)`` for each pairing of adjacent entries of ``e``.

    ``e`` is a sequence of collections of group ids; every id of one entry is
    paired with every id of the entry that follows it.
    """
    source_groups = e[0]
    for position in range(1, len(e)):
        target_groups = e[position]
        for link in product(source_groups, target_groups, [index]):
            yield link
        source_groups = target_groups

def extract_demographics(input_file):
    """Extract the demographic column names embedded in an LCM file name.

    The first bracketed block made only of letters, ``_`` and ``|`` is used,
    e.g. ``"9M-5-[1-2001]-[a]-lcm.out"`` gives ``['a']`` (``[1-2001]`` is
    skipped because it contains digits and ``-``). The block may hold one name
    or two names separated by a comma.

    Parameters
    ----------
    input_file : str
        File name to parse.

    Returns
    -------
    list of str
        One or two names, in the order they appear in the file name.

    Raises
    ------
    IndexError
        If the file name contains no such bracketed block.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. The pattern is unchanged.
    matches = re.findall(r"\[([A-Z|a-z|_]+),?([A-Z|a-z|_]+)?\]",input_file)
    if not matches:
        raise IndexError(
            f"no demographics block such as '[SEX]' or '[SEX,AGE]' found in {input_file!r}"
        )
    return [name for name in matches[0] if name != ""]
    
def sankey_preprocessing(input_file,stats_folder='../plots/stats',encoders_folder="../plots/encoders",links_folder="../plots/links",groups_folder='../plots/groups',users_demographics = ["DEPARTEMENT","SEX","AGE"],groups_demographics=["STATION_MGT_TYPE","DEPARTEMENT"],users_consecutive_apparition=3,remove_group_with_no_links=True):
    """Derive the link, user, group and stats files for one LCM output file.

    ``input_file`` is loaded with ``read_lcm_output_total`` (one row per
    group) and the groups are ordered by period. For every user, the groups
    they belong to are chained from one period in which the user appears to
    the next period in which they appear. The following files are written:

    * ``{links_folder}/{input_file}``: CSV of ``(source, target)`` group pairs
      with the comma-joined ids of the users on each link.
    * ``{stats_folder}/users/{input_file}``: JSON with, for each column of
      ``users_demographics``, a list of ``{name, value, users}`` dicts.
    * ``../plots/users/{input_file}``: CSV with the ``users.csv`` rows of the
      linked users (this path is hard-coded).
    * ``{groups_folder}/{input_file}``: CSV of the groups with the added
      ``depth``, ``size``, demographic and ``itemset_name`` columns.
    * ``{stats_folder}/groups/{input_file}``: JSON with, for each requested
      group demographic, the ``str()`` of a list of ``{name, value, groups}``.

    Parameters
    ----------
    input_file : str
        LCM output file name; also used as the name of every output file and
        parsed by ``extract_demographics``.
    stats_folder, links_folder, groups_folder : str
        Output directories (they must already exist).
    encoders_folder : str
        Directory holding ``{input_file}.npy``, loaded as the item encoder's
        ``classes_``.
    users_demographics : list of str
        Columns of ``users.csv`` to summarise.
    groups_demographics : list of str
        Group columns to summarise; only those also found in the file name's
        demographics are used.
    users_consecutive_apparition : int
        Users are kept only if they appear in strictly more than this many
        distinct periods.
    remove_group_with_no_links : bool
        When true, only groups that are the source or target of a link are
        written to the groups file.

    Returns
    -------
    pandas.DataFrame
        The groups table that was written to ``groups_folder``.
    """
    demographics = extract_demographics(input_file)
    users = pd.read_csv("../datasets/Total/users.csv",sep=";")
    df = read_lcm_output_total(input_file).sort_values("period").reset_index(drop=True)

    # Boolean membership matrix: one row per group (same order as df), one
    # column per user id.
    mlb = MultiLabelBinarizer(sparse_output=True)
    _df = mlb.fit_transform(df.user_ids.tolist()).astype(bool)
    _df = pd.DataFrame(_df.toarray(),columns=mlb.classes_)

    e = _df.sum()
    _df = _df[e[e>0].index]
    # Per user: row positions in df of the groups that contain the user.
    _df  = _df.T.apply(lambda x : np.where(x)[0],axis=1)
    # Split each user's positions into runs that share a period; df is sorted
    # by period, so each run is one period and runs are in period order.
    e = _df.to_frame()[0].apply(lambda x: list(list(z) for idx,z in groupby(x,lambda y: df.iloc[y].period)))
    e = e[e.apply(lambda x:len(x))>users_consecutive_apparition]

    # One (source group, target group, user id) row per link.
    res = []
    for user_id, period_groups in e.items():
        res.extend(make_links(period_groups, user_id))
    links = pd.DataFrame(res, columns=["source","target","user_id"])
    links_file = f'{links_folder}/{input_file}'
    links.groupby(["source","target"])["user_id"].apply(lambda x: ','.join(str(i) for i in x)).to_frame().to_csv(links_file)

    # Users demographics stats
    users_stats_file = f'{stats_folder}/users/{input_file}'
    stats = {}
    users_stats = links[["user_id"]].drop_duplicates().merge(users,left_on="user_id",right_on="CUST_ID")[users.columns]
    for column in users_demographics:
        b = users_stats.groupby(column).apply(lambda x: {"name":x[column].unique()[0],"value":x.CUST_ID.shape[0],"users":",".join(str(uid) for uid in x.CUST_ID)}).values
        stats[column] = b.tolist()

    with open(users_stats_file, 'w') as outfile:
        json.dump(stats, outfile)

    # Users (no folder parameter exists for this output)
    users_stats.to_csv(f'../plots/users/{input_file}')

    # Filter groups to the ones appearing in the links. df has a fresh
    # 0..n-1 index, so the positions stored in the links are also its labels.
    if remove_group_with_no_links:
        linked_groups = np.union1d(links.source.unique(),links.target.unique())
        df_reduced_filtred = df.loc[linked_groups].dropna().copy()
    else:
        df_reduced_filtred = df.dropna().copy()
    # Refits the module-level `le` encoder on the periods of the kept groups.
    df_reduced_filtred['depth'] = le.fit_transform(df_reduced_filtred.period)/df_reduced_filtred.period.nunique()
    df_reduced_filtred['size'] = df_reduced_filtred.user_ids.apply(len)
    if len(demographics)==1:
        df_reduced_filtred[demographics[0]]= df_reduced_filtred.property_values
    else:
        df_reduced_filtred[demographics]= df_reduced_filtred.property_values.str.split("_",expand=True)

    # Encoding items to their initial ID + adding names
    items = pd.read_csv("../datasets/Total/items.csv")
    encoder = LabelEncoder()
    encoder.classes_ = np.load(f'{encoders_folder}/{input_file}.npy')
    items = items.set_index("ARTICLE_ID")
    df_reduced_filtred["itemset_name"] = df_reduced_filtred["itemsets"].apply(lambda x : get_articles_descriptions(x,items,encoder))
    df_reduced_filtred.to_csv(f'{groups_folder}/{input_file}')

    # Groups demographics stats
    groups_stats_file = f'{stats_folder}/groups/{input_file}'
    stats = {}
    for column in np.intersect1d(groups_demographics,demographics):
        b = df_reduced_filtred.groupby(column).apply(lambda x : {"name":x[column].unique()[0],"value":x.index.shape[0],"groups":",".join(str(gid) for gid in x.index)}).values
        # Stored as the str() of the list (unlike the users stats); left
        # unchanged because readers of this file may depend on that format.
        stats[column]=str(b.tolist())
    with open(groups_stats_file, 'w') as outfile:
        json.dump(stats, outfile)

    print("Done",input_file)
    return df_reduced_filtred



In [80]:
# Run the preprocessing for the LCM output file of interest; `e` keeps the
# value returned by the last call.
input_file = "9M-5-[1-2001]-[a]-lcm.out"
for i in [input_file]:
    print(i, extract_demographics(i))
    e = sankey_preprocessing(
        i,
        users_consecutive_apparition=1,
        remove_group_with_no_links=False,
    )


9M-5-[1-2001]-[a]-lcm.out ['a']
Done 9M-5-[1-2001]-[a]-lcm.out


In [36]:
# Scratch cell: reload the items table into the notebook's global namespace.
# NOTE(review): set_index("ARTICLE_ID") is commented out below, so this `items`
# keeps read_csv's default index, while get_articles_descriptions() selects
# rows through `items.index`. `encoder` is likewise only assigned in the
# commented-out lines here, so later cells depend on kernel state left by an
# earlier run.
items = pd.read_csv("../datasets/Total/items.csv")
# encoder = LabelEncoder()
# encoders_folder="../plots/encoders"
# encoder.classes_ = np.load(f'{encoders_folder}/{input_file}.npy')
# items.ARTICLE_ID.as
# items = items.set_index("ARTICLE_ID")

In [45]:
# Display the value the preprocessing loop left in `e`.
# NOTE(review): this cell's execution count (In [45]) is lower than the loop's
# (In [80]), so the table below appears to come from an earlier run.
e

Unnamed: 0,user_ids,support,itemsets,period,property_values,depth,size,a
0,"[100169, 100882, 102233, 103922, 105498, 10564...",1180,4112,2018-09-01,1,0.000000,1180,1
4,"[149358, 335918, 490858, 503825, 507317, 59453...",9,3408,2018-09-01,1,0.000000,9,1
5,"[131823, 199655, 402629, 449166, 535449, 62564...",10,5624,2018-09-01,1,0.000000,10,1
8,"[146720, 249738, 250022, 470752, 534467, 10788...",8,6894,2018-09-01,1,0.000000,8,1
26,"[381774, 441047, 684803, 881163, 1007975, 1014...",7,4443,2018-09-01,1,0.000000,7,1
...,...,...,...,...,...,...,...,...
1558,"[156096, 245887, 326603, 342631, 468837, 52639...",13,1391 431,2019-02-01,1,0.666667,13,1
1561,"[123517, 197808, 257982, 315386, 329954, 35725...",17,6447,2019-02-01,1,0.666667,17,1
1562,"[197808, 329954, 387836, 470752, 844497, 890276]",6,6447 4112,2019-02-01,1,0.666667,6,1
1563,"[257982, 315386, 470752, 524845, 811849, 84449...",8,6447 4126,2019-02-01,1,0.666667,8,1


In [40]:
# Cast the ARTICLE_ID column of the global `items` table to int.
items["ARTICLE_ID"] = items["ARTICLE_ID"].astype(int)

In [68]:
# Decode every itemset of `e` into article descriptions and show the raw array.
# NOTE(review): `items` and `encoder` come from the kernel's global state; no
# uncommented cell in view assigns `encoder`.
e["itemsets"].apply(lambda x : get_articles_descriptions(x,items,encoder)).values

array([list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list(['2 LAMPES W5W 12V TKA FLAURAUD']),
       list(['ASPIRATEUR LAVAGE COMPLET']), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list(['LIPTON ICE TEA 50CL']),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
       list([]), list([]), list([]), list([]), list([]), list([]),
      

In [54]:
# Check that the decoded id for code 3408 occurs among the ARTICLE_ID column
# values (`in` on a NumPy array is true when any element compares equal).
# NOTE(review): this tests the column, whereas get_articles_descriptions()
# filters on `items.index`.
encoder.inverse_transform([3408]) in items.ARTICLE_ID.unique()

True