In [1]:
import pandas as pd
from IPython.core import display as ICD
from multiprocessing import Pool
import numpy as np
from datetime import timedelta
import re
import os
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
import plotly
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from itertools import groupby, product
import json
# Shared module-level encoder; sankey_preprocessing re-fits it on every call
# (via le.fit_transform) to derive the `depth` column of the groups table.
le = LabelEncoder()

In [2]:
def read_lcm_output(input_name,folder="lcm_results"):
    """Load one LCM result file into a DataFrame with named columns.

    The file ``<folder>/<input_name>`` is read as a header-less CSV and its
    five columns are labelled ``user_ids``, ``support``, ``itemsets``,
    ``period`` and ``property_values``.  ``period`` is converted with
    ``pd.to_datetime``; ``user_ids`` (a space-separated string of integers)
    is turned into a numpy array of ints.

    Returns the resulting DataFrame.
    """
    column_names = ["user_ids","support","itemsets","period","property_values"]

    def parse_ids(raw):
        # Splitting on a single space yields empty tokens for repeated,
        # leading or trailing spaces; those are skipped.
        tokens = [tok for tok in raw.split(" ") if tok != ""]
        return np.array([int(tok) for tok in tokens])

    table = pd.read_csv(f'{folder}/{input_name}',header=None)
    table.columns = column_names
    table["period"] = pd.to_datetime(table["period"])
    table["user_ids"] = table["user_ids"].apply(parse_ids)
    return table


# Data preprocessing for echarts 

In [3]:
def format_links(x):
    """Turn each sequence in ``x[0]`` into ``[current, next, x["index"]]`` triples.

    ``x`` is indexed with the keys ``0`` (an iterable of sequences) and
    ``"index"`` (the value appended as third element of every triple).
    Every pair of adjacent elements of every sequence produces one triple;
    the triples are returned in a single flat list.
    """
    return [
        [seq[pos], seq[pos+1], x["index"]]
        for seq in x[0]
        for pos in range(len(seq)-1)
    ]

def make_links(e,index):
    """Yield ``(source, target, index)`` for every pair of consecutive elements of ``e``.

    For each element of ``e`` and the element that follows it, the cartesian
    product of the two is generated and every pair is tagged with ``index``
    as its third item.  ``e`` must be non-empty: ``e[0]`` is read as soon as
    the generator is first advanced.
    """
    sources = e[0]
    for targets in e[1:]:
        for link in product(sources, targets, [index]):
            yield link
        # The current targets become the sources of the next step.
        sources = targets

def extract_demographics(input_file):
    """Return the demographic names encoded in an LCM result file name.

    The first bracketed block of the form ``[NAME]`` or ``[NAME,NAME]``
    (letters, ``_`` and ``|`` only) found in ``input_file`` is used, e.g.
    ``"M-5-[2-5000]-[DEPARTEMENT]-lcm.out"`` gives ``["DEPARTEMENT"]``.
    Blocks containing digits, such as ``[2-5000]``, do not match.

    Returns a list with one or two names.
    Raises IndexError when ``input_file`` contains no such block.
    """
    # Raw string: "\[" in a plain literal is an invalid escape sequence
    # (DeprecationWarning, and SyntaxWarning from Python 3.12 onwards).
    # NOTE(review): the "|" inside the character classes is matched literally,
    # it is not an alternation; kept unchanged to preserve behaviour.
    matches = re.findall(r"\[([A-Z|a-z|_]+),?([A-Z|a-z|_]+)?\]",input_file)
    if not matches:
        # Same exception type as the former bare `[0]`, with a usable message.
        raise IndexError(f"no [NAME] or [NAME,NAME] demographics block found in {input_file!r}")
    # The optional second group is "" when only one name is present.
    return [name for name in matches[0] if name != ""]
    
def sankey_preprocessing(input_file,stats_folder='plots/stats',links_folder="plots/links",groups_folder='plots/groups',users_demographics = ["DEPARTEMENT","SEX","AGE"],groups_demographics=["STATION_MGT_TYPE","DEPARTEMENT"],users_consecutive_apparition=3,users_folder="plots/users"):
    """Derive the link, user, group and stats files for one LCM result file.

    Reads ``input_file`` through ``read_lcm_output_total`` and the users table
    from ``datasets/Total/users.csv``, then writes five files, all named
    ``input_file``:

    * ``<links_folder>/``        - CSV of (source, target) group pairs with the
      comma-joined ids of the users following that link;
    * ``<stats_folder>/users/``  - JSON, per user demographic, of
      ``{"name", "value", "users"}`` records;
    * ``<users_folder>/``        - CSV of the users that appear in the links;
    * ``<groups_folder>/``       - CSV of the groups that appear in the links,
      with ``depth``, ``size`` and demographic columns added;
    * ``<stats_folder>/groups/`` - JSON, per group demographic, of the
      stringified list of ``{"name", "value", "groups"}`` records.

    Parameters
    ----------
    input_file : name of the LCM result file; its bracketed block also gives
        the demographic column names (see ``extract_demographics``).
    stats_folder, links_folder, groups_folder, users_folder : output
        directories.  They are not created here.
    users_demographics : columns of the users table to aggregate on.
    groups_demographics : group demographics to aggregate on; only those also
        present in the file name are used.
    users_consecutive_apparition : a user is kept only when the number of
        distinct-period runs they appear in is strictly greater than this.
        NOTE(review): despite the name, the periods are not checked for being
        consecutive in time - confirm the intent.

    Returns None; prints ``Done <input_file>`` when finished.
    """
    demographics = extract_demographics(input_file)
    users = pd.read_csv("datasets/Total/users.csv",sep=";")
    # Sorting by period and resetting the index makes row position == row label,
    # which the positional lookups (df.iloc) and label lookups (df.loc) below rely on.
    df = read_lcm_output_total(input_file).sort_values("period").reset_index(drop=True)

    # Membership matrix: one row per group, one boolean column per user id.
    file = f'{links_folder}/{input_file}'
    mlb = MultiLabelBinarizer(sparse_output=True)
    _df = mlb.fit_transform(df.user_ids.tolist()).astype(bool)
    _df = pd.DataFrame(_df.toarray(),columns=mlb.classes_)

    # Keep the users that belong to more than one group, then list, per user,
    # the row positions of the groups they belong to.
    e = _df.sum()
    _df = _df[e[e>1].index]
    _df  = _df.T.apply(lambda x : np.where(x)[0],axis=1)
    # Split each user's positions into runs sharing the same period (rows are
    # period-sorted, so equal periods are adjacent).
    e = _df.to_frame()[0].apply(lambda x: list(list(z) for idx,z in groupby(x,lambda y: df.iloc[y].period)))
    e = e[e.apply(lambda x:len(x))>users_consecutive_apparition]

    # One (source, target, user_id) triple per pair of groups taken from two
    # successive runs of the same user.
    res = []
    e.to_frame().reset_index().apply(lambda x: [res.append(i) for i in make_links(x[0],x["index"])],axis=1)
    links = pd.DataFrame(res)
    links.columns = ["source","target","user_id"]
    links.groupby(["source","target"])["user_id"].apply(lambda x: ','.join(str(i) for i in x)).to_frame().to_csv(file)

    # Users demographics stats
    file = f'{stats_folder}/users/{input_file}'
    stats = {}
    users_stats = links[["user_id"]].drop_duplicates().merge(users,left_on="user_id",right_on="CUST_ID")[users.columns]
    for i in users_demographics:
        b = users_stats.groupby(i).apply(lambda x: {"name":x[i].unique()[0],"value":x.CUST_ID.shape[0],"users":",".join(str(uid) for uid in x.CUST_ID)}).values
        stats[i] = b.tolist()

    with open(file, 'w') as outfile:
        json.dump(stats, outfile)


    # Users
    file = f'{users_folder}/{input_file}'
    users_stats.to_csv(file)

    # filter groups to the ones appearing in the links
    file = f'{groups_folder}/{input_file}'
    df_reduced_filtred = df.loc[np.unique(np.union1d(links.source.unique(),links.target.unique()))].dropna()
    # depth: rank of the group's period among the kept periods, scaled by their count.
    df_reduced_filtred['depth'] = le.fit_transform(df_reduced_filtred.period)/df_reduced_filtred.period.nunique()
    df_reduced_filtred['size'] = df_reduced_filtred.user_ids.apply(lambda x : len(x))
    if len(demographics)==1:
        df_reduced_filtred[demographics[0]]= df_reduced_filtred.property_values
    else:
        # Several demographics are packed in property_values, separated by "_".
        df_reduced_filtred[demographics]= df_reduced_filtred.property_values.str.split("_",expand=True)

    df_reduced_filtred.to_csv(file)

    # Groups demographics stats
    file = f'{stats_folder}/groups/{input_file}'
    stats = {}
    for i in np.intersect1d(groups_demographics,demographics):
        b = df_reduced_filtred.groupby(i).apply(lambda x : {"name":x[i].unique()[0],"value":x.index.shape[0],"groups":",".join(str(gid) for gid in x.index)}).values
        # NOTE(review): unlike the users stats, the record list is stored as its
        # str() representation; kept as-is because the consumer of this file
        # is not visible here.
        stats[i]=str(b.tolist())

    with open(file, 'w') as outfile:
        json.dump(stats, outfile)


    print("Done",input_file)


In [6]:

def read_lcm_output_total(input_name,folder="lcm_results"):
    """Load one LCM result file whose user ids are stored as a bracketed list.

    The file ``<folder>/<input_name>`` is read as a header-less CSV and its
    five columns are labelled ``user_ids``, ``support``, ``itemsets``,
    ``period`` and ``property_values``.  ``period`` is converted with
    ``pd.to_datetime``.  Each ``user_ids`` string has its first and last
    character dropped, is split on commas, and every non-empty piece is
    converted to int after removing any double quotes; the result is stored
    as a numpy array.

    Returns the resulting DataFrame.
    """
    def parse_id_list(raw):
        ids = []
        # raw[1:-1] strips the enclosing pair of characters, e.g. "[1, 2]" -> "1, 2".
        for piece in raw[1:-1].split(","):
            if piece == "":
                continue
            ids.append(int(piece.replace('"',"")))
        return np.array(ids)

    table = pd.read_csv(f'{folder}/{input_name}',header=None)
    table.columns = ["user_ids","support","itemsets","period","property_values"]
    table["period"] = pd.to_datetime(table["period"])
    table["user_ids"] = table["user_ids"].apply(parse_id_list)
    return table


In [7]:
# IPython shell capture of the lcm_results/ listing; `a` is not used again in this cell.
a = !ls lcm_results/
# Run the preprocessing on a single hard-coded result file.
for i in ["9M-5-[1-2001]-[a]-lcm.out"]:
    print(i,extract_demographics(i))
    try:
        # sankey_preprocessing has no return statement, so `e` is None on success.
        e = sankey_preprocessing(i,users_consecutive_apparition=2)
    except Exception as e:
        # Best-effort: report the failure for this file and keep looping.
        print("Error",i,e)

9M-5-[1-2001]-[a]-lcm.out ['a']
Done 9M-5-[1-2001]-[a]-lcm.out


In [41]:
# Print a raw LCM result file to inspect its on-disk format.
!cat lcm_results/9M-2-[1-2001]-[a]-lcm.out

"[101937, 103914, 104293, 105293, 108938, 110899, 111737, 112044, 113036, 113376, 114666, 118478, 121147, 123801, 127790, 128703, 130181, 130532, 133892, 137504, 137871, 138455, 139598, 145875, 147589, 147772, 149963, 152339, 153185, 154300, 156634, 161394, 162055, 162567, 167520, 167694, 173071, 173169, 174585, 177160, 177373, 178797, 187833, 191304, 191532, 191561, 199984, 200082, 201600, 205109, 206426, 207305, 210060, 210355, 214721, 217160, 223966, 226086, 231418, 235297, 235462, 244827, 249736, 251568, 251775, 253171, 253363, 256669, 260419, 261384, 265419, 266458, 266488, 268238, 270220, 270341, 270517, 275377, 275782, 277766, 281988, 284704, 288842, 292347, 293947, 296181, 296486, 302056, 303315, 303666, 303718, 312314, 314647, 322597, 326598, 333147, 333668, 336730, 337424, 338762, 341270, 345108, 345427, 346829, 347715, 353175, 356448, 356857, 363093, 363105, 363412, 365174, 367609, 368080, 370988, 372038, 372212, 372351, 373006, 373230, 373743, 373747, 375021, 375197, 377599

"[113487, 192562, 230856, 277744, 381774, 514044, 1014979, 1056623]",8, 3461023,2019-02,1
"[230856, 381774]",2, 3461023 451,2019-02,1
"[113487, 381774]",2, 3461023 18064205,2019-02,1
"[192562, 381774, 1056623]",3, 3461023 3461025,2019-02,1
"[192562, 1014979, 1056623]",3, 3461023 3450011,2019-02,1
"[192562, 1056623]",2, 3461023 3450011 3461025,2019-02,1
"[146883, 183398, 486628, 806597, 928890, 942898, 1200584]",7, 3461041,2019-02,1
"[183398, 486628, 928890]",3, 3461041 3450137,2019-02,1
"[486628, 1200584]",2, 3461041 3450011,2019-02,1
"[122130, 182205, 355247, 584447, 622027, 1241616]",6, 5430104,2019-02,1
"[584447, 622027, 1241616]",3, 5430104 7445935,2019-02,1
"[122130, 622027]",2, 5430104 5401051,2019-02,1
"[266087, 348456, 524087, 654138, 719763, 911614, 1073371, 1077767]",8, 7598035,2019-02,1
"[654138, 719763]",2, 7598035 7445935,2019-02,1
"[266087, 524087, 654138, 911614, 1077767]",5, 7598035 7587895,2019-02,1
"[348456, 719763]",2, 7598035 7770415,2019-02,1
"[7197

In [56]:
# Show rows 9 to 14 of `df`, ordered by the column labelled 2.
# NOTE(review): the only visible module-level assignment of `df` is in the
# cell numbered In [53], which appears further down - this cell depends on
# that execution order; confirm before a Restart & Run All.
df[9:15].sort_values(2)

Unnamed: 0,0,1,2,3,4
10,"[416219, 471775, 535929, 536334, 584447, 75256...",13,13600075 6026,2018-06,78
13,"[565366, 574634, 584447, 602649, 735292, 75135...",9,13600075 6026,2017-03,78
11,"[588730, 842314, 911614, 1001550, 1097861]",5,5401047 6033,2018-06,78
9,"[565366, 610249, 719763, 754685, 1014685, 1047...",7,6027 6026,2018-05,78
14,"[519887, 521885, 565366, 584447, 622027, 75256...",7,6033 6026,2017-03,78
12,"[490024, 584447, 588730, 911614, 1001550]",5,8287519 6033,2018-06,78


In [53]:
# Load a raw LCM result file without a header row, so the columns get integer labels.
# NOTE(review): the source/target/user_id table saved beneath this cell cannot
# come from this assignment (an assignment displays nothing) - presumably
# stale output from another cell; confirm.
df = pd.read_csv("lcm_results/M-5-[2-5000]-[DEPARTEMENT]-lcm.out",header=None)

Unnamed: 0,source,target,user_id
0,55,81,5
1,55,914,5
2,914,81,5
3,914,914,5
4,81,83,5
...,...,...,...
233795,4914,4914,1246606
233796,4438,4439,1246606
233797,4438,4914,1246606
233798,4914,4439,1246606


In [479]:
# Materialise every element of `res` as a list for display.
# NOTE(review): `res` is not assigned at module level in any visible cell
# (only as a local inside sankey_preprocessing) - presumably leftover kernel
# state; confirm.
[list(i) for i in res]

[[[(55, 81, 5), (55, 914, 5), (914, 81, 5), (914, 914, 5)],
  [(81, 83, 5), (81, 914, 5), (914, 83, 5), (914, 914, 5)],
  [(83, 113, 5), (83, 914, 5), (914, 113, 5), (914, 914, 5)],
  [(113, 154, 5), (113, 914, 5), (914, 154, 5), (914, 914, 5)],
  [(154, 156, 5), (154, 914, 5), (914, 156, 5), (914, 914, 5)],
  [(156, 177, 5), (156, 914, 5), (914, 177, 5), (914, 914, 5)],
  [(177, 192, 5), (177, 914, 5), (914, 192, 5), (914, 914, 5)],
  [(192, 193, 5), (192, 914, 5), (914, 193, 5), (914, 914, 5)]],
 [],
 [],
 [],
 [[(485, 538, 101540),
   (485, 1046, 101540),
   (1046, 538, 101540),
   (1046, 1046, 101540)],
  [(538, 540, 101540),
   (538, 1046, 101540),
   (1046, 540, 101540),
   (1046, 1046, 101540)]],
 [[(3568, 3572, 101603),
   (3568, 3971, 101603),
   (3971, 3572, 101603),
   (3971, 3971, 101603)],
  [(3572, 3574, 101603),
   (3572, 3971, 101603),
   (3971, 3574, 101603),
   (3971, 3971, 101603)]],
 [],
 [],
 [[(400, 400, 102352),
   (400, 1470, 102352),
   (1439, 400, 102352),
   

In [75]:
# Display the station type table read with pandas' default "," separator.
# NOTE(review): the saved output shows a single fused column
# 'STATION_ID;"STATION_MGT_TYPE"', so the file looks ";"-separated like
# users.csv - presumably sep=";" is wanted here; confirm and re-run.
pd.read_csv("datasets/Total/STATION_TYPE.csv")

Unnamed: 0,"STATION_ID;""STATION_MGT_TYPE"""
0,"NF078198;""CODO"""
1,"NF079079;""DODO"""
2,"NF078243;""COCO"""
3,"NF058570;""CODO"""
4,"NF050214;""CODO"""
...,...
3481,"NF014416;"""""
3482,"NF014474;"""""
3483,"NF016375;"""""
3484,"NF015915;"""""
