In [5]:
import json
import os
import re
from datetime import timedelta
from itertools import groupby, product
from multiprocessing import Pool

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
from IPython.core import display as ICD
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer,LabelEncoder
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer


# Notebook-level encoder instances reused by later cells: `le` is fitted on
# `period` columns and `mlb` on lists of user ids.
le = LabelEncoder()
mlb = MultiLabelBinarizer()

In [6]:
def read_lcm_output(input_name,folder="lcm_results"):
    """Load one LCM result file into a DataFrame.

    The headerless CSV at ``{folder}/{input_name}`` is read and its five
    columns are named ``user_ids``, ``support``, ``itemsets``, ``period`` and
    ``property_values``. ``period`` is parsed to datetimes and ``user_ids``
    (a space-separated string) is turned into a numpy array of ints.
    """
    def _parse_ids(raw):
        # Consecutive spaces produce empty tokens, which are skipped.
        return np.array([int(token) for token in raw.split(" ") if token != ""])

    path = f'{folder}/{input_name}'
    lcm = pd.read_csv(path,header=None)
    lcm.columns = ["user_ids","support","itemsets","period","property_values"]
    lcm["period"] = pd.to_datetime(lcm["period"])
    lcm["user_ids"] = lcm["user_ids"].apply(_parse_ids)
    return lcm


In [7]:
def lcm_results_overview(folder_name):
    """Summarise every LCM result file found in ``folder_name``.

    Parameters
    ----------
    folder_name : str
        Directory containing headerless LCM output CSV files.

    Returns
    -------
    DataFrame
        One row per file with the file name, the mean number of rows per
        distinct ``property_values``, the number of distinct
        ``property_values``, the file shape and ``indice`` (mean number of
        distinct property values per period divided by the number of distinct
        property values).
    """
    # Pure-Python listing instead of the `!ls` shell magic: no shell
    # interpolation of `folder_name`, and the function also works outside
    # IPython. Hidden entries and sub-directories are skipped, names are sorted.
    files = sorted(
        name for name in os.listdir(folder_name)
        if not name.startswith(".") and os.path.isfile(os.path.join(folder_name,name))
    )
    res = []
    for name in files:
        a = pd.read_csv(os.path.join(folder_name,name),sep=",",header=None)
        # Same column order as read_lcm_output (user ids come first in the files).
        a.columns = ["user_ids","support","itemsets","period","property_values"]
        # Keep only the part of the period label that precedes the first "_".
        a["period"] = a["period"].apply(lambda x:x.split("_")[0])
        mean = a["property_values"].value_counts().mean()
        unique = a["property_values"].nunique()
        per_period = a[["period","property_values"]].drop_duplicates().period.value_counts().mean()
        indice = per_period/unique
        res.append((name,mean,unique,a.shape,indice))
    # Passing the column names to the constructor also handles an empty folder.
    return pd.DataFrame(
        res,
        columns=["filename","mean property_values occurence","unique property values","file shape","indice"],
    )
def format_name(name):
    """Split the second ``/``-separated component of ``name`` on ``-``.

    Raises IndexError when ``name`` contains no ``/``.
    """
    path_parts = name.split("/")
    file_part = path_parts[1]
    return file_part.split("-")

def jaccard_similarity(a, b):
    """Jaccard similarity of the sets stored at ``a[0]`` and ``b[0]``.

    Inputs :
        a: indexable whose first element is a set
        b: indexable whose first element is a set
    Returns ``|A ∩ B| / |A ∪ B|``; raises ZeroDivisionError when both sets
    are empty.
    """
    left, right = a[0], b[0]
    shared = left.intersection(right)
    combined = left.union(right)
    return len(shared)/len(combined)


def similar_groups_union(groups,g_index,threashold):
    """
    Merge groups whose Jaccard distance is below ``threashold``.

    The first group seeds the result. Every following group is compared with
    each entry accumulated so far; each entry whose Jaccard distance to the
    group is below ``threashold`` is replaced by its union with the group and
    receives the group's index. A group that matches no entry is appended as
    a new entry.

    Parameters
    ----------
    groups : list of tuples of str
        Each tuple contains the user ids of one group.
    g_index: list of int
        Corresponding indexes in the initial dataset.
    threashold: number
        Jaccard distance under which a group is merged into an entry.
    Returns
    -------
    list of ``[members, indexes]`` pairs: the new groups and the indexes of
    the original groups merged into each of them.
    """
    # Seed with the first group unchanged (it is not converted to a set/tuple).
    # NOTE(review): raises IndexError when `groups` is empty.
    new_groups = [[groups[0],[g_index[0]]],]
    for group,g_idx in zip(groups[1:],g_index[1:]):
        group = set(group)
        found = False
        # There is no break out of this loop, so one group can be merged into
        # several entries.
        for i in new_groups:
            distance = 1-len(group.intersection(i[0]))/len(group.union(i[0]))
            if distance<threashold:
                group_n = tuple(group.union(i[0]))
                # Look for an entry already holding exactly this union tuple.
                # NOTE(review): tuple equality is order-sensitive and `group_n`
                # is built from a set, so this match presumably depends on set
                # iteration order -- confirm the intent.
                for z in new_groups:
                    if z[0]== group_n:
                        z[1].append(g_idx)
                        # NOTE(review): removes from `new_groups` while the
                        # enclosing `for i in new_groups` is still iterating.
                        new_groups.remove(i)
                        break
                # Executed even when `i` was just removed from `new_groups`.
                i[0] = tuple(group.union(i[0]))
                i[1].append(g_idx)
                found = True
        if not found:
            new_groups.append([tuple(group),[g_idx]])
    return new_groups

def recompute_groups(df,properties,threashold):
    """
    Regroup similar groups of each period into new groups according to the
    Jaccard distance between them.

    Parameters
    ----------
        df: DataFrame
            Must contain a ``user_ids`` column plus the columns named in
            ``properties``.
        properties: list of two column names
            Grouping keys; the first is stored as ``period`` and the second as
            ``property_values`` in the result.
        threashold: number
            Jaccard distance under which two groups are considered similar.
    Result
    ------
        DataFrame with columns ``user_ids``, ``groups_ids``,
        ``property_values`` and ``period`` and a fresh 0..n-1 index. An empty
        input yields an empty frame with those columns.
    """
    out_columns = ["user_ids","groups_ids","property_values","period"]
    # Collect the per-key frames and concatenate once: concatenating inside
    # the loop copies the accumulated frame on every iteration.
    merged_frames = []
    for (period,values),i in df.groupby(properties):
        g = [tuple(str(z) for z in x) for x in i.user_ids]
        res = pd.DataFrame(similar_groups_union(g,i.index,threashold))
        res["properties"] = values
        res["period"] = period
        merged_frames.append(res)
    if not merged_frames:
        return pd.DataFrame(columns=out_columns)
    groups = pd.concat(merged_frames,axis=0,sort=False)
    groups.columns = out_columns
    # `drop` takes a boolean; the string "True" only worked because it is truthy.
    return groups.reset_index(drop=True)


def compute_distance(x):
    """Jaccard distance between the sets ``x["user_ids_x"]`` and ``x["user_ids_y"]``.

    ``x`` is any mapping/row exposing those two keys with set values.
    Returns ``1 - |X ∩ Y| / |X ∪ Y|``; two empty sets are treated as
    identical (distance 0.0).
    """
    left, right = x["user_ids_x"], x["user_ids_y"]
    # The union must combine both sides (it used to be taken with `left` twice).
    combined = left.union(right)
    if not combined:
        return 0.0
    return 1-len(left.intersection(right))/len(combined)


def compute_groups_interactions(df_reduced,input_name,interaction_folder="groups_interactions",groups_folder="groups"):
    """Create the Jaccard pairwise-distance table for groups having the same
    property_values over two consecutive periods.

    Parameters
    ----------
    df_reduced : DataFrame
        Needs ``user_ids``, ``period`` and ``property_values`` columns.
        Its index labels are also used as row positions of the binarised
        membership matrix, so a default 0..n-1 index is assumed.
    input_name : str
        Name of the CSV written inside ``interaction_folder``.
    interaction_folder : str
        Output directory (created when missing).
    groups_folder : str
        Accepted for compatibility; not used.

    Returns
    -------
    DataFrame
        One row per pair of groups with Jaccard distance below 1, holding the
        columns of both groups (``_x`` = earlier, ``_y`` = later), the shared
        users in ``changes`` and their count in ``#changes``. The same frame
        is written to ``{interaction_folder}/{input_name}``.
    """
    os.makedirs(interaction_folder,exist_ok=True)
    file = f"{interaction_folder}/{input_name}"

    lb = MultiLabelBinarizer()
    users = lb.fit_transform(df_reduced.user_ids.tolist()).astype(bool)

    pair_frames = []
    for (period,property_values),i in df_reduced.groupby(["period","property_values"]):
        # Later groups with the same property value, less than 60 days ahead.
        is_next = (
            (df_reduced.period>period)
            & (df_reduced.period-period < timedelta(60))
            & (df_reduced.property_values==property_values)
        )
        comp_idx = df_reduced[is_next].index
        if comp_idx.empty:
            continue
        res = pd.DataFrame(pairwise_distances(users[i.index],users[comp_idx],metric="jaccard",n_jobs=-1),index=i.index,columns=comp_idx)
        # Keep only overlapping pairs; the explicit dropna() keeps the result
        # independent of whether stack() drops the masked (NaN) cells.
        res = res[res<1].stack().dropna().reset_index()
        res.columns = [0,1,2]
        pair_frames.append(res)

    if pair_frames:
        groups_df = pd.concat(pair_frames,ignore_index=True)
    else:
        # No overlapping pair at all: keep typed empty columns so the merges
        # below still work and an empty table is produced.
        groups_df = pd.DataFrame({
            0: pd.Series(dtype="int64"),
            1: pd.Series(dtype="int64"),
            2: pd.Series(dtype="float64"),
        })

    # The merges join on an "index" column; build it from the index when the
    # frame does not already carry one.
    if "index" in df_reduced.columns:
        lookup = df_reduced
    else:
        lookup = df_reduced.rename_axis("index").reset_index()

    df_trans = groups_df.merge(lookup,left_on=0,right_on="index").merge(lookup,left_on=1,right_on="index")
    df_trans = df_trans.drop([0,1,2],axis=1)
    df_trans["changes"] = [
        set(earlier).intersection(later)
        for earlier,later in zip(df_trans["user_ids_x"],df_trans["user_ids_y"])
    ]
    df_trans["#changes"] = df_trans["changes"].apply(len)

    df_trans.to_csv(file,header=True,index=False)

    return df_trans

def read_reduced_groups(filename,folder="reduced_groups"):
    """Read a reduced-groups CSV and parse its ``user_ids`` column.

    Parameters
    ----------
    filename : str
        File name, or a relative path when ``folder`` is empty.
    folder : str
        Directory prepended to ``filename``. An empty string leaves
        ``filename`` untouched instead of turning it into an absolute path.

    Returns
    -------
    DataFrame
        The CSV content (first column used as index) with ``user_ids``
        converted from its textual tuple/list form to a list of ints.
    """
    def _parse_ids(raw):
        # Strip the enclosing brackets, then the quotes around each id. Empty
        # tokens (trailing comma of a one-element tuple, or an empty
        # container) are skipped.
        tokens = (token.replace("'","").strip() for token in raw[1:-1].split(","))
        return [int(token) for token in tokens if token]

    file = os.path.join(folder,filename)
    df = pd.read_csv(file,index_col=0)
    df["user_ids"] = df["user_ids"].apply(_parse_ids)
    return df


### Plot function 

In [8]:
def get_color(i):
    """Map a group size ``i`` to a hex colour.

    Sizes up to 5, 10, 20 and 35 (inclusive) each get their own colour;
    anything larger is black.
    """
    palette = (
        (5,'#fecb00'),
        (10,'#d47600'),
        (20,'#bb133e'),
        (35,'#002664'),
    )
    for upper_bound,colour in palette:
        if i<=upper_bound:
            return colour
    return '#000000'


def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram',df_reduced=[]):
    """Build a plotly Sankey figure dict from paths between groups.

    Parameters
    ----------
    df : DataFrame
        One row per path; each column in ``cat_cols`` holds the positional id
        of a group in ``df_reduced`` (missing values mark shorter paths).
    cat_cols : list
        Ordered path columns; links connect each column to the next one.
        At least two columns are required.
    value_cols : str
        Name of the column holding the weight of each path.
    title : str
        Figure title.
    df_reduced : DataFrame
        Group table read by position; must provide ``user_ids``, ``period``,
        ``position_x`` and ``position_y``.

    Returns
    -------
    dict
        ``{"data": [sankey_trace], "layout": layout}``.

    Raises
    ------
    ValueError
        When fewer than two ``cat_cols`` are given (no link can be built).
    """
    if len(cat_cols) < 2:
        raise ValueError("genSankey needs at least two columns in cat_cols to build links")

    # Node ids: distinct non-missing values of the path columns, in first-seen
    # order so that node numbering is reproducible.
    labelList = list(dict.fromkeys(
        value
        for catCol in cat_cols
        for value in df[catCol].values
        if not pd.isna(value)
    ))

    # Transform df into source-target pairs, one frame per pair of
    # neighbouring columns, then sum the weights of identical links once.
    link_frames = []
    for source_col,target_col in zip(cat_cols[:-1],cat_cols[1:]):
        pair = df[[source_col,target_col,value_cols]].copy()
        pair.columns = ['source','target','count']
        link_frames.append(pair)
    sourceTargetDf = (
        pd.concat(link_frames)
        .groupby(['source','target'])
        .agg({'count':'sum'})
        .reset_index()
    )

    # Look every node up once in the group table.
    node_rows = [df_reduced.iloc[int(label)] for label in labelList]
    colorList = [get_color(int(len(row.user_ids))) for row in node_rows]
    position_x = [row.position_x for row in node_rows]
    position_y = [row.position_y for row in node_rows]
    node_labels = [f'size={len(row.user_ids)}-{row.period}' for row in node_rows]

    # Add the node index of each link end (dict lookup instead of list.index).
    node_id = {label: idx for idx,label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = [node_id[value] for value in sourceTargetDf['source']]
    sourceTargetDf['targetID'] = [node_id[value] for value in sourceTargetDf['target']]

    # Creating the sankey diagram
    data = dict(
        arrangement = "snap",
        type='sankey',
        node = dict(
          pad = 1,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = node_labels,
          x = position_x,
          y=position_y,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )

    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig

#### Overview of all results 

In [9]:
lcm_results_overview("lcm_results")

Unnamed: 0,filename,mean property_values occurence,unique property values,file shape,indice
0,2M-5-[2-5000]-[DEPARTEMENT]-lcm.out,79.666667,3,"(239, 5)",0.784314
1,3M-5-[2-5000]-[DEPARTEMENT]-lcm.out,103.333333,3,"(310, 5)",0.878788
2,M-10-[2-5000]-[AGE]-lcm.out,3637.0,2,"(7274, 5)",1.0
3,M-10-[2-5000]-[DEPARTEMENT]-lcm.out,3.0,1,"(3, 5)",1.0
4,M-10-[2-5000]-[SEX]-lcm.out,14549.0,2,"(29098, 5)",1.0
5,M-2-[1-5000]-[DEPARTEMENT]-lcm.out,52.769231,13,"(686, 5)",0.897436
6,M-2-[2-5000]-[DEPARTEMENT]-lcm.out,48.8,5,"(244, 5)",0.733333
7,"M-5-[2-5000]-[AGE,SEX]-lcm.out",12942.25,4,"(51769, 5)",1.0
8,M-5-[2-5000]-[DEPARTEMENT]-lcm.out,48.333333,3,"(145, 5)",0.65625
9,M-5-[2-5000]-[SEX]-lcm.out,50606.0,2,"(101212, 5)",1.0


# Plot sankeys

In [6]:
aa = !ls reduced_groups/7*
aa

['reduced_groups/7D-5-[5-50000]-[user_age]-lcm.out']

In [None]:
for i in ['reduced_groups/7D-5-[5-50000]-[user_age]-lcm.out']:
    # To rebuild the reduced groups from the raw LCM output instead of loading them:
    #     df = read_lcm_output(i)
    #     df_reduced = recompute_groups(df,["period","property_values"],0.4)
    #     df_reduced.to_csv(f"reduced_groups/{i}")
    # `i` already contains the folder; "." keeps the path relative, whereas an
    # empty folder would be formatted as the absolute path "/reduced_groups/...".
    df_reduced = read_reduced_groups(i,folder=".")

    # One boolean row per user, one column per group.
    users = mlb.fit_transform(df_reduced.user_ids.tolist()).astype(bool).T
    periods = df_reduced["period"].values
    # For every user present in more than one group, split the positions of
    # their groups by period.
    memberships = [np.where(user)[0] for user in users if user.sum()>1]
    paths = [
        tuple(list(z) for _,z in groupby(member_groups,lambda y: periods[y]))
        for member_groups in memberships
    ]
    # Keep users seen in more than two periods.
    paths = [path for path in paths if len(path)>2]
    print("done")
    if not paths:
        print("Empty")
        continue

    # Every combination of one group per period is a path through the diagram.
    combos = [combo for path in paths for combo in product(*path)]
    b = pd.DataFrame(combos)
    c = b.fillna(" ").groupby(b.columns.to_list()).size().reset_index(name='size')
    c = c.replace(" ",np.nan)
    c["property_values"] = c[0].apply(lambda x : df_reduced.iloc[x].property_values)

    # Node coordinates: x follows the period order, y spreads the groups of a period.
    df_reduced["position_x"] = le.fit_transform(df_reduced["period"])/df_reduced["period"].nunique()
    df_reduced["position_y"] = 1.0
    for period, x in df_reduced.groupby("period"):
        # Single .loc assignment: the chained form may write to a temporary copy.
        df_reduced.loc[x.index,"position_y"] = np.arange(x.shape[0])/x.shape[0]

    for prop_value,ii in c.groupby("property_values"):
        fig = genSankey(ii,cat_cols=ii.columns.tolist()[:-2],value_cols='size',title=f"{i},{prop_value}",df_reduced=df_reduced)
        plotly.offline.plot(fig, validate=False)

done


1. Read the LCM output
2. Recompute groups
3. Create transactions from groups
4. Store the results in /groups/ and /groups_interactions/

In [None]:
def groups_interactions(input_name,jaccard_threshold=0.5,output_folder="",plot=True):
    """Recompute groups interaction for one LCM output file.

    Parameters
    ----------
    input_name : str
        LCM output file name, read with ``read_lcm_output``.
    jaccard_threshold : float
        Jaccard distance under which groups of a period are merged.
    output_folder : str
        Accepted for compatibility; not used (results are written by
        ``compute_groups_interactions`` to its own default folder).
    plot : bool
        Accepted for compatibility; not used.

    Returns
    -------
    DataFrame
        The interaction table returned by ``compute_groups_interactions``.
    """
    df = read_lcm_output(input_name)

    # Reduce groups by doing union for groups having jaccard distance under threshold
    df_reduced = recompute_groups(df,["period","property_values"],jaccard_threshold)
    print(f"Reduced groups for{input_name}")

    # Merge groups of consecutive periods, reporting only once it is done
    interactions = compute_groups_interactions(df_reduced,input_name)
    print(f"Interaction computed for {input_name}")
    return interactions
   
    
# Run the interaction pipeline for a single LCM output file; the commented-out
# line below is a batch variant over a list of paths.
# [groups_interactions(i.split("/")[1]) for i in a[5:]]
a = groups_interactions("M-5-[5-50000]-[user_gender]-lcm.out")

# Data preprocessing for echarts 

In [59]:
def format_links(x):
    """Turn the paths of one row into ``[source, target, owner]`` links.

    ``x[0]`` is an iterable of paths (sequences of node ids) and
    ``x["index"]`` the value attached to every link built from them. Each
    pair of neighbouring nodes in a path yields one link.
    """
    owner = x["index"]
    return [
        [source,target,owner]
        for path in x[0]
        for source,target in zip(path,path[1:])
    ]

def sankey_preprocessing(input_file,stats_folder='plots/stats',links_folder="plots/links",groups_folder='plots/groups',demographics = ["DEPARTEMENT","SEX","AGE"]):
    """Prepare the link, stats, user and group files used to draw a Sankey
    diagram for one LCM output file.

    Parameters
    ----------
    input_file : str
        LCM output file name (read with ``read_lcm_output_total``); also used
        as the name of every file written.
    stats_folder, links_folder, groups_folder : str
        Output directories for the JSON stats, the links CSV and the groups CSV.
    demographics : list of str
        Columns of the users table for which per-value statistics are written.

    Returns
    -------
    DataFrame
        The groups that appear in at least one link, with ``depth`` and
        ``size`` columns added.

    Raises
    ------
    ValueError
        When no user belongs to more than 3 groups spread over more than 5 periods.
    """
    users = pd.read_csv("datasets/Total/users.csv",sep=";")
    # Use the parameter (the body used to read a notebook-level global instead).
    df = read_lcm_output_total(input_file)

    # Boolean membership table: one row per group, one column per user.
    mlb = MultiLabelBinarizer(sparse_output=True)
    membership = mlb.fit_transform(df.user_ids.tolist()).astype(bool)
    membership = pd.DataFrame(membership.toarray(),columns=mlb.classes_)

    # Keep users that belong to more than 3 groups.
    groups_per_user = membership.sum()
    membership = membership[groups_per_user[groups_per_user>3].index]

    # For each remaining user, split the positions of their groups by period
    # and keep users seen in more than 5 periods.
    periods = df["period"].values
    user_paths = {}
    for user_id in membership.columns:
        group_positions = np.where(membership[user_id].values)[0]
        by_period = tuple(list(z) for _,z in groupby(group_positions,lambda y: periods[y]))
        if len(by_period)>5:
            user_paths[user_id] = by_period
    if not user_paths:
        raise ValueError(f"no user of {input_file} belongs to more than 3 groups over more than 5 periods")

    # Links: every combination of one group per period is a path, and each
    # pair of neighbouring groups on a path is a link owned by the user.
    # Built in one pass; summing Python lists re-copies them at every step.
    link_rows = [
        [source,target,user_id]
        for user_id,by_period in user_paths.items()
        for path in product(*by_period)
        for source,target in zip(path,path[1:])
    ]
    links = pd.DataFrame(link_rows,columns=["source","target","user_id"]).drop_duplicates()
    file = f'{links_folder}/{input_file}'
    links.groupby(["source","target"])["user_id"].apply(lambda x: ','.join(str(i) for i in x)).to_frame().to_csv(file)

    # Stats: per demographic value, the number and the list of linked users.
    users_stats = links[["user_id"]].drop_duplicates().merge(users,left_on="user_id",right_on="CUST_ID")[users.columns]
    stats = {}
    for demographic in demographics:
        stats[demographic] = [
            {
                # numpy scalars are not JSON serialisable: convert to built-ins
                "name": key.item() if hasattr(key,"item") else key,
                "value": members.CUST_ID.shape[0],
                "users": ",".join(str(user) for user in members.CUST_ID),
            }
            for key,members in users_stats.groupby(demographic)
        ]
    file = f'{stats_folder}/{input_file}'
    with open(file, 'w') as outfile:
        json.dump(stats, outfile)

    # Users
    file = f'plots/users/{input_file}'
    users_stats.to_csv(file)

    # Groups: filter to the ones appearing in the links.
    file = f'{groups_folder}/{input_file}'
    linked_groups = np.union1d(links.source.unique(),links.target.unique())
    df_reduced_filtred = df.loc[linked_groups].dropna()
    df_reduced_filtred['depth'] = le.fit_transform(df_reduced_filtred.period)/df_reduced_filtred.shape[0]
    df_reduced_filtred['size'] = df_reduced_filtred.user_ids.apply(lambda x : len(x))
    df_reduced_filtred.to_csv(file)
    return df_reduced_filtred

In [12]:
def read_lcm_output_total(input_name,folder="lcm_results"):
    """Load one LCM result file whose user ids are stored as a bracketed,
    comma-separated list.

    The headerless CSV at ``{folder}/{input_name}`` is read, its five columns
    are named, ``period`` is parsed to datetimes and every ``user_ids`` cell
    becomes a numpy array of ints.
    """
    def _parse_ids(raw):
        # Drop the enclosing brackets, then the double quotes around each id;
        # empty tokens are skipped.
        return np.array([int(token.replace('"',"")) for token in raw[1:-1].split(",") if token != ""])

    path = f'{folder}/{input_name}'
    lcm = pd.read_csv(path,header=None)
    lcm.columns = ["user_ids","support","itemsets","period","property_values"]
    lcm["period"] = pd.to_datetime(lcm["period"])
    lcm["user_ids"] = lcm["user_ids"].apply(_parse_ids)
    return lcm

df = read_lcm_output_total('M-5-[2-5000]-[SEX]-lcm.out')


In [57]:
df

Unnamed: 0,user_ids,support,itemsets,period,property_values
0,"[519495, 520696, 528218, 529470, 533033, 58238...",11,13600075 6026,2017-03-01,F
1,"[187912, 274183, 390331, 431037, 469899, 51201...",11,4190175 19419902,2017-03-01,F
2,"[144075, 428515, 497218, 524779, 584635, 651557]",6,7476355 7445935,2017-03-01,F
3,"[286240, 336929, 359880, 584635, 740226]",5,5461018 7445935,2017-03-01,F
4,"[393518, 478153, 522436, 529454, 529463, 54185...",8,6033 6026,2017-03-01,F
...,...,...,...,...,...
101207,"[441811, 479778, 513416, 570510, 1209550]",5,4461002 4756741,2019-10-01,M
101208,"[113854, 171050, 664713, 1077838, 1237929]",5,11421001 11462027,2019-10-01,M
101209,"[124875, 235613, 813046, 971143, 978909, 1037289]",6,11430044 11421002,2019-10-01,M
101210,"[122586, 156442, 326976, 449825, 958235]",5,11430218 11421002,2019-10-01,M


In [61]:
inputut_file = "2M-5-[2-5000]-[DEPARTEMENT]-lcm.out"
g = sankey_preprocessing(inputut_file)

Unnamed: 0,CUST_ID,SEX,AGE,DEPARTEMENT
0,108938,M,50-65,59
1,110899,M,35-49,59
2,114666,M,35-49,59
3,118478,M,50-65,59
4,123777,M,50-65,59
...,...,...,...,...
61,1167207,M,35-49,59
62,1168767,M,35-49,59
63,1171591,M,50-65,59
64,1198352,M,35-49,59


In [16]:
users = pd.read_csv("datasets/Total/users.csv",sep=";")

In [49]:
# NOTE(review): this merge needs `g` to carry a `user_id` column, as in the
# links table displayed below. The execution counts of these cells are out of
# order, so confirm which cell produced that `g` before re-running.
a = g.merge(users,left_on="user_id",right_on="CUST_ID")
g

Unnamed: 0,source,target,user_id
0,150,171,108938
1,171,195,108938
2,195,201,108938
3,201,213,108938
4,213,221,108938
...,...,...,...
788983,178,197,1205586
788984,197,205,1205586
788985,205,212,1205586
788986,212,224,1205586


In [135]:
stats

{'DEPARTEMENT': [{'value': '59',
   'size': 26,
   'users': '108938,110899,114666,118478,123777,127790,133892,187833,235297,244827,270220,292347,314647,368080,403445,421137,425954,427096,427250,464331,483533,602580,607784,612824,616689,739860'},
  {'value': '62', 'size': 3, 'users': '156634,460228,552846'},
  {'value': '78',
   'size': 13,
   'users': '263913,294115,428202,468837,471775,519469,519885,555714,584447,619437,715341,719763,735292'}],
 'SEX': [{'value': 'M',
   'size': 42,
   'users': '108938,110899,114666,118478,123777,127790,133892,156634,187833,235297,244827,263913,270220,292347,294115,314647,368080,403445,421137,425954,427096,427250,428202,460228,464331,468837,471775,483533,519469,519885,552846,555714,584447,602580,607784,612824,616689,619437,715341,719763,735292,739860'}],
 'AGE': [{'value': '35-49',
   'size': 19,
   'users': '110899,114666,156634,244827,292347,294115,421137,425954,427096,427250,460228,468837,483533,519885,555714,607784,616689,619437,719763'},
  {'valu

In [140]:
# Write the `stats` dictionary to stats.json.
# NOTE(review): no visible cell assigns `stats` at notebook level (it is only
# a local inside sankey_preprocessing), so this presumably relies on earlier
# kernel state -- confirm before a fresh run.
with open('stats.json', 'w') as outfile:
    json.dump(stats, outfile)

In [137]:
!cat stats

,0,1,2,3
0,"{'value': '59', 'size': 26, 'users': '108938,110899,114666,118478,123777,127790,133892,187833,235297,244827,270220,292347,314647,368080,403445,421137,425954,427096,427250,464331,483533,602580,607784,612824,616689,739860'}","{'value': '62', 'size': 3, 'users': '156634,460228,552846'}","{'value': '78', 'size': 13, 'users': '263913,294115,428202,468837,471775,519469,519885,555714,584447,619437,715341,719763,735292'}",
1,"{'value': 'M', 'size': 42, 'users': '108938,110899,114666,118478,123777,127790,133892,156634,187833,235297,244827,263913,270220,292347,294115,314647,368080,403445,421137,425954,427096,427250,428202,460228,464331,468837,471775,483533,519469,519885,552846,555714,584447,602580,607784,612824,616689,619437,715341,719763,735292,739860'}",,,
2,"{'value': '35-49', 'size': 19, 'users': '110899,114666,156634,244827,292347,294115,421137,425954,427096,427250,460228,468837,483533,519885,555714,607784,616689,619437,719763'}","{'value': '50-65', 'size': 15, 'users': '1089