In [1]:
import pandas as pd
from IPython.core import display as ICD
from multiprocessing import Pool
import numpy as np
from datetime import timedelta
import re
import os
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
import plotly
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from itertools import groupby, product
import json
le = LabelEncoder()

In [2]:
def read_lcm_output(input_name,folder="lcm_results"):
    """Read and restructure LCM output file,rename columns output a df """
    file = f'{folder}/{input_name}'
    df = pd.read_csv(file,header=None)
    df.columns = ["user_ids","support","itemsets","period","property_values"]
    df["period"] = pd.to_datetime(df["period"])
    df["user_ids"] = df.user_ids.apply(lambda x : np.array([int(z) for z in x.split(" ") if z != ""]))
    return df


def read_lcm_output_total(input_name,folder="../lcm_results"):
    """Read and restructure LCM output file,rename columns output a df """
    file = f'{folder}/{input_name}'
    df = pd.read_csv(file,header=None)
    df.columns = ["user_ids","support","itemsets","period","property_values"]
    df["period"] = pd.to_datetime(df["period"])
    df["user_ids"] = df.user_ids.apply(lambda x : np.array([int(z.replace('"',"")) for z in x[1:-1].split(",") if z != ""]))
    return df


# Data preprocessing for echarts 

In [31]:
def get_articles_descriptions(x,items,encoder):
    ee = encoder.inverse_transform([int(i) for i in x.split()])
    return items.loc[items.index.isin(ee)].DESCRIPTION.tolist()
def format_links(x):
    res = []
    for i in x[0]:
        for idx in range(len(i)-1):
            res.append([i[idx],i[idx+1],x["index"]])
    return res

def make_links(e,index):
    """Product of groups ids for each two consecutive groups periods"""
    prev = e[0]
    for i in e[1:]:
        yield from product(prev,i,[index])
        prev = i

def extract_demographics(input_file):
    demographics = re.findall("\[([A-Z|a-z|_]+),?([A-Z|a-z|_]+)?\]",input_file)[0]
    return  [i for i in demographics if i !=""]
    
def sankey_preprocessing(input_file,stats_folder='../plots/stats',encoders_folder="../plots/encoders",links_folder="../plots/links",groups_folder='../plots/groups',users_demographics = ["DEPARTEMENT","SEX","AGE"],groups_demographics=["STATION_MGT_TYPE","DEPARTEMENT"],users_consecutive_apparition=3):

    demographics = extract_demographics(input_file)
    users = pd.read_csv("../datasets/Total/users.csv",sep=";")
    df = read_lcm_output_total(input_file).sort_values("period").reset_index(drop=True)
    
    file = f'../plots/links/{input_file}'
    mlb = MultiLabelBinarizer(sparse_output=True)
    _df = mlb.fit_transform(df.user_ids.tolist()).astype(bool)
    _df = pd.DataFrame(_df.toarray(),columns=mlb.classes_)
    
    e = _df.sum()
    _df = _df[e[e>0].index]
    _df  = _df.T.apply(lambda x : np.where(x)[0],axis=1)
    e = _df.to_frame()[0].apply(lambda x: list(list(z) for idx,z in groupby(x,lambda y: df.iloc[y].period)))
    e = e[e.apply(lambda x:len(x))>users_consecutive_apparition]
    
    res = []
    e.to_frame().reset_index().apply(lambda x: [res.append(i) for i in make_links(x[0],x["index"])],axis=1)
    links = pd.DataFrame(res)
    links.columns = ["source","target","user_id"]
    links.groupby(["source","target"])["user_id"].apply(lambda x: ','.join(str(i) for i in x)).to_frame().to_csv(file)

    # Users demographics stats  
    file = f'../plots/stats/users/{input_file}'
    stats = {}
    users_stats = links[["user_id"]].drop_duplicates().merge(users,left_on="user_id",right_on="CUST_ID")[users.columns]
    for i in users_demographics:
        b = users_stats.groupby(i).apply(lambda x: {"name":x[i].unique()[0],"value":x.CUST_ID.shape[0],"users":",".join(str(i) for i in x.CUST_ID)}).values
        stats[i] = b.tolist()
        
    with open(file, 'w') as outfile:
        json.dump(stats, outfile)
  
    
    # Users
    file = f'../plots/users/{input_file}'
    users_stats.to_csv(file)
    
    # filter groups to the ones appearing in the links
    file = f'{groups_folder}/{input_file}'
    df_reduced_filtred = df.loc[np.unique(np.union1d(links.source.unique(),links.target.unique()))].dropna()
    df_reduced_filtred['depth'] = le.fit_transform(df_reduced_filtred.period)/df_reduced_filtred.period.nunique()
    df_reduced_filtred['size'] = df_reduced_filtred.user_ids.apply(lambda x : len(x))
    if len(demographics)==1:
        df_reduced_filtred[demographics[0]]= df_reduced_filtred.property_values
    else:
        df_reduced_filtred[demographics]= df_reduced_filtred.property_values.str.split("_",expand=True)
        
    # Encoding items to their initial ID + adding names
    items = pd.read_csv("../datasets/Total/items.csv")
    encoder = LabelEncoder()
    encoder.classes_ = np.load(f'{encoders_folder}/{input_file}.npy')
    
    items = items.set_index("ARTICLE_ID")
    df_reduced_filtred["itemset_name"] = df_reduced_filtred["itemsets"].apply(lambda x : get_articles_descriptions(x,items,encoder))
    df_reduced_filtred.to_csv(file)
    return df_reduced_filtred
    # Groups demographics stats 
    file = f'../plots/stats/groups/{input_file}'
    stats = {}
    for i in np.intersect1d(groups_demographics,demographics):
        b = df_reduced_filtred.groupby(i).apply(lambda x : {"name":x[i].unique()[0],"value":x.index.shape[0],"groups":",".join(str(i) for i in x.index)}).values
        stats[i]=str(b.tolist())
    with open(file, 'w') as outfile:
        json.dump(stats, outfile)
    
    
    print("Done",input_file)


In [32]:
input_file = "9M-5-[1-2001]-[a]-lcm.out"
for i in ["9M-5-[1-2001]-[a]-lcm.out"]:
    print(i,extract_demographics(i))
    e = sankey_preprocessing(i,users_consecutive_apparition=0)
    

9M-5-[1-2001]-[a]-lcm.out ['a']


In [38]:
    df = read_lcm_output_total(input_file).sort_values("period").reset_index(drop=True)


In [40]:
df.user_ids.apply(lambda x : print(x))

[291758 355524 579353 605098 609205]
[ 505859  540247  719763  809946  963615 1103450 1179385]
[ 182809  254144  307559  332928  338091  482628  505859  539410  540247
  584447  679482  702529  719763  809946  819872  862846  963615  972937
 1097861 1103450 1179385]
[ 171222  331511  337743  387123  409260  438350  526900  540247  554300
  599279  731074  734723  866385  963523  971894  988692 1039399 1061637
 1064097 1108766 1216762]
[ 695135  754357  784359 1036452 1162193 1173969]
[ 149358  172406  498022  516505  527437  606704  622174  695135  754357
  784359  949127  950781  984430 1036452 1056070 1162193 1173969 1215438
 1218083 1232187]
[ 120262  170586  305003  369654  372731  416219  428651  444381  447204
  527590  602261  634043  726943  825189  838936  842057  867960  896272
  930235  934189  963523  984672 1043581 1191002]
[ 305651  378200  446458  447084  488496  515602  527150  532781  536214
  545605  667914  788356  790583  827281  866455  895812  898790  949779
  984

0       None
1       None
2       None
3       None
4       None
        ... 
1591    None
1592    None
1593    None
1594    None
1595    None
Name: user_ids, Length: 1596, dtype: object

3                          [CR SANDWHICH OCEANIQUE (new)]
5                                        [RED BULL 355ML]
9       [CR COOKIE CHOCOLAT LAIT 103 GR, CR SAND PREMIUM]
11      [CR SANDWICH CLASSIC, CR COOKIE CHOCOLAT LAIT ...
13                                      [RECHARGE LAVAGE]
                              ...                        
1581                                    [DOLE PECHE SIRO]
1588                         [CR TARTE CITRON MERINGUÉET]
1589               [LU POMMES FRITES, Lun Salade Accueil]
1591                                                   []
1592                   [HOLLYW.BLANCH.MENTH. POLAIRE 14G]
Name: itemsets, Length: 455, dtype: object

In [78]:
items.loc[[3425909001007,3425909001083]].DESCRIPTION.tolist()

['RECHARGE LAVAGE TW - A', 'REC LAVAGE PJE']

In [73]:
items.loc[[12,3425909001083]]

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'