In [1]:
import pandas as pd
from IPython.core import display as ICD
from multiprocessing import Pool
import numpy as np
from datetime import timedelta
import re

#### Overview of all results 

In [79]:
files = !ls *.out
def format_name(name):
    return name.split("/")[1].split("-")
res = []
for i in files:
    a= pd.read_csv(i,sep=",",header=None)
    a.columns = ["itemsets","support","user_ids","period"]
    a["properties"]= a["period"].apply(lambda x : " ".join(str(z) for z in x.split("_")[1:]))
    mean = a["properties"].value_counts().mean()
    unique = a["properties"].nunique()
    res.append((i,mean,unique,a.shape))
df = pd.DataFrame(res)
df.columns = ["filename","mean properties occurence","unique property","file shape"]
df

Unnamed: 0,filename,mean properties occurence,unique property,file shape
0,"21D-13-[5-3500]-[user_age,user_gender]-groups.out",14017.4,5,"(70087, 5)"
1,21D-20-[5-3500]-[user_age]-groups.out,11946.5,4,"(47786, 5)"
2,"21D-6-[5-3500]-[user_gender,user_occupation]-g...",4668.703704,27,"(126055, 5)"
3,"2M-10-[5-3500]-[user_gender,user_occupation]-g...",4553.863636,22,"(100185, 5)"
4,"2M-24-[5-3500]-[user_age,user_gender]-groups.out",7737.4,5,"(38687, 5)"
5,2M-37-[5-3500]-[user_age]-groups.out,7948.5,4,"(31794, 5)"
6,7D-10-[5-3500]-[user_age]-groups.out,12274.5,6,"(73647, 5)"
7,"7D-7-[5-3500]-[user_age,user_gender]-groups.out",9511.3,10,"(95113, 5)"
8,"M-16-[5-3500]-[user_age,user_gender]-groups.out",8180.857143,7,"(57266, 5)"
9,M-25-[5-3500]-[user_age]-groups.out,9937.5,4,"(39750, 5)"


#### Choosen file : M-5-[5-5000]-[user_age]-groups.out

In [3]:
df = pd.read_csv("7D-10-[5-3500]-[user_age]-groups.out",header=None)
df = pd.concat([df.drop(3,axis=1),df[3].str.split("_",expand=True)],axis=1)
df.columns = ["itemsets","support","user_ids","period","user_age"]
df["period"] = pd.to_datetime(df["period"])
df.reset_index(inplace = True)

In [4]:
df["user_ids"] = df.user_ids.apply(lambda x : np.array([np.int(z) for z in x.split(" ") if z != ""]))
df.head()

Unnamed: 0,index,itemsets,support,user_ids,period,user_age
0,0,5955 5975 5983 5984 5996 6021 6022 6030 6035 ...,10,"[1200, 480, 1196, 1097, 1240]",2000-04-25,3
1,1,5955 5976 5983 5984 5996 6021 6022 6030 6035 ...,10,"[1270, 1200, 1196, 1097, 1240]",2000-04-25,3
2,2,5950 5955 5983 5984 5996 6021 6022 6030 6035 ...,10,"[1270, 1200, 480, 1097, 1240]",2000-04-25,3
3,3,5955 5983 5984 5996 5997 6021 6022 6030 6035 ...,10,"[1270, 1200, 480, 1196, 1240]",2000-04-25,3
4,4,5852 5858 5869 5874 5875 5886 5888 5911 5920 ...,10,"[589, 1210, 1196, 260, 2762]",2000-05-02,3


###  Compute distances

In [13]:
def jaccard_similarity(a, b):
    """Compute Jaccard distance between two arrays
    Inputs :
        a: n element array
        b: m element array
    """
    return len(a[0].intersection(b[0]))/len(a[0].union(b[0]))


def similar_groups_union(groups,g_index,threashold):
    """
    Create new groups from given ones,
    each group is affected to the new group having the minimal jaccard distance with (comparing to all new groups)
    if the minimal jaccard distance found is under the threashold, the group is added as a new group 
    Parameters
    ----------
    groups : list of tuples of str
        Each tuple contain user_ids 
    g_index: list of int
        list of corresponding indexes in initial dataset
    Returns
    -------
    list of new groups and their corresponding indexes in the given dataset
    """
    new_groups = [[groups[0],[g_index[0]]],]
    for group,g_idx in zip(groups[1:],g_index[1:]):
        group = set(group)
        found = False
        for i in new_groups:
            distance = 1-len(group.intersection(i[0]))/len(group.union(i[0]))
            if distance<threashold:
                group_n = tuple(group.union(i[0]))
                for z in new_groups:
                    if z[0]== group_n:
                        z[1].append(g_idx)
                        new_groups.remove(i)
                        break
                i[0] = tuple(group.union(i[0]))
                i[1].append(g_idx)
                found = True
        if not found:
            new_groups.append([tuple(group),[g_idx]])
    return new_groups

def recompute_groups(df,properties,threashold):
    """
    Regroup similar groups of each period to new groups according to the Jaccard distance between them
    Parameters
    ----------
        df: DataFrame 
        threashold: int  
            Minimal Jaccard distance under which two groups are supposed similar
    Result
    ------
        Dataframe 
    """
    groups = pd.DataFrame()
    for (period,values),i in df.groupby(properties):
        g =  [tuple(str(z) for z in x) for x in i.user_ids]
        res = pd.DataFrame(similar_groups_union(g,i.index,threashold))
        res["properties"] = values
        res["period"] = period
        groups = pd.concat([groups,res],axis=0,sort=False)
    return groups.reset_index(drop="True")

In [36]:
df_reduced = recompute_groups(df,["period","user_age"],0.2 )     
df_reduced.columns = ["user_ids","groups_ids","properties","period"]
df_reduced

Unnamed: 0,user_ids,groups_ids,properties,period
0,"(1200, 480, 1196, 1097, 1240)",[0],3,2000-04-25
1,"(1270, 1097, 1240, 1200, 1196)",[1],3,2000-04-25
2,"(1270, 1097, 1240, 1200, 480)",[2],3,2000-04-25
3,"(1270, 1240, 1200, 480, 1196)",[3],3,2000-04-25
4,"(589, 1210, 1196, 260, 2762)",[4],3,2000-05-02
...,...,...,...,...
27188,"(733, 110, 380, 1580, 736, 1196, 1210)",[59641],2,2000-12-12
27189,"(733, 110, 592, 1580, 736, 377)",[59642],2,2000-12-12
27190,"(733, 110, 1610, 1580, 736)",[59643],2,2000-12-12
27191,"(733, 110, 1580, 736, 2006)",[59644],2,2000-12-12


In [37]:
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer
lb = MultiLabelBinarizer()

def compute_distance(x):
    return 1-len(x["user_ids_x"].intersection(x["user_ids_y"]))/len(x["user_ids_x"].union(x["user_ids_x"]))
users = lb.fit_transform(df_reduced.user_ids.tolist()).astype(bool)


In [38]:
df["key"]=1
result = pd.DataFrame()
comp_df = res
!rm result
for (period,properties),i in df_reduced.groupby(["period","properties"]):
    comp_idx = df_reduced[(df_reduced.period>period )&(df_reduced.period-period < timedelta(60))&(df_reduced.properties==properties)][:100].index
    if comp_idx.shape==(0,):
        continue
    res = pd.DataFrame(pairwise_distances(users[i.index],users[comp_idx],metric="jaccard",n_jobs=-1),index=i.index,columns=comp_idx) 
    res = res[res<0.6].stack().reset_index()
    res.to_csv("result",header=False,index=False,mode="a")
    result = pd.concat([result,res],axis=0)

In [51]:
df_trans = pd.read_csv("result",header=None).merge(df,left_on=0,right_on="index").merge(df,left_on=1,right_on="index")
df_trans["changes"] = df_trans[["user_ids_x","user_ids_y"]].apply(lambda x : set(x["user_ids_x"]).intersection(x["user_ids_y"]),axis=1)
df_trans["#changes"] = df_trans["changes"].apply(lambda x: len(x))
df_trans = df_trans[df_trans["#changes"]>2]
df_trans.user_age_x.unique(),df_trans.user_age_y.unique()
df_trans

Unnamed: 0,0,1,2,index_x,itemsets_x,support_x,user_ids_x,period_x,user_age_x,key_x,index_y,itemsets_y,support_y,user_ids_y,period_y,user_age_y,key_y,changes,#changes
0,4,107,0.500000,4,5852 5858 5869 5874 5875 5886 5888 5911 5920 ...,10,"[589, 1210, 1196, 260, 2762]",2000-05-02,3,1,107,5842 5862 5865 5869 5874 5875 5886 5908 5911 ...,11,"[1214, 1240, 1210, 1196, 589]",2000-05-02,3,1,"{1210, 1196, 589}",3
1,5,107,0.500000,5,5852 5858 5874 5875 5886 5888 5911 5920 5921 ...,10,"[589, 1210, 1198, 1196, 2762]",2000-05-02,3,1,107,5842 5862 5865 5869 5874 5875 5886 5908 5911 ...,11,"[1214, 1240, 1210, 1196, 589]",2000-05-02,3,1,"{1210, 1196, 589}",3
2,6,107,0.555556,6,5842 5852 5858 5862 5874 5875 5886 5888 5908 ...,12,"[589, 1210, 1198, 1196, 260]",2000-05-02,3,1,107,5842 5862 5865 5869 5874 5875 5886 5908 5911 ...,11,"[1214, 1240, 1210, 1196, 589]",2000-05-02,3,1,"{1210, 1196, 589}",3
3,10,107,0.500000,10,5842 5852 5874 5875 5886 5888 5903 5908 5911 ...,11,"[2571, 589, 1210, 1196, 260]",2000-05-02,3,1,107,5842 5862 5865 5869 5874 5875 5886 5908 5911 ...,11,"[1214, 1240, 1210, 1196, 589]",2000-05-02,3,1,"{1210, 1196, 589}",3
7,47,107,0.500000,47,5842 5852 5858 5862 5874 5875 5886 5888 5908 ...,10,"[1291, 2628, 589, 1210, 1196, 260, 1198]",2000-05-02,3,1,107,5842 5862 5865 5869 5874 5875 5886 5908 5911 ...,11,"[1214, 1240, 1210, 1196, 589]",2000-05-02,3,1,"{1210, 1196, 589}",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47316,25466,27170,0.555556,25466,3400 3414 3416 3456 3491 3493 3499 3507 3539 ...,10,"[780, 1580, 480, 1200, 1210, 1240, 2571, 1196,...",2000-08-22,3,1,27170,3414 3416 3423 3447 3491 3493 3499 3507 3513 ...,10,"[3175, 1198, 1580, 1196, 589, 1210, 1270, 2571...",2000-08-22,3,1,"{480, 2571, 1196, 1580, 589, 1210}",6
47317,25536,27170,0.571429,25536,3414 3416 3456 3491 3493 3499 3507 3539 3564 ...,11,"[780, 1097, 1580, 1200, 1240, 1270, 480, 2571,...",2000-08-22,3,1,27170,3414 3416 3423 3447 3491 3493 3499 3507 3513 ...,10,"[3175, 1198, 1580, 1196, 589, 1210, 1270, 2571...",2000-08-22,3,1,"{480, 2571, 1580, 589, 1270}",5
47318,25615,27170,0.555556,25615,3414 3416 3422 3423 3462 3493 3499 3507 3513 ...,10,"[780, 2028, 2916, 1270, 2571, 1210, 589, 1196]",2000-08-22,3,1,27170,3414 3416 3423 3447 3491 3493 3499 3507 3513 ...,10,"[3175, 1198, 1580, 1196, 589, 1210, 1270, 2571...",2000-08-22,3,1,"{2571, 1196, 589, 1270, 1210}",5
47319,25616,27170,0.555556,25616,3400 3414 3422 3423 3462 3493 3499 3507 3513 ...,10,"[780, 2028, 2804, 2571, 1210, 589, 1196, 2916]",2000-08-22,3,1,27170,3414 3416 3423 3447 3491 3493 3499 3507 3513 ...,10,"[3175, 1198, 1580, 1196, 589, 1210, 1270, 2571...",2000-08-22,3,1,"{1210, 2571, 1196, 589}",4


In [77]:
for i in df_trans.user_age_x.unique():
    e = df_trans[df_trans.user_age_x==str(i)]
    genSankey(e,["index_x","index_y"],value_cols="#changes",title="i")
    fig.show()



In [68]:
for i in df_trans.user_age_x.unique():
    e = df_trans[df_trans.user_age_x==str(i)][:1000]
    ICD.display(e.index_y.value_counts().to_frame())
    continue
    import plotly.graph_objects as go
    print(i)
    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = e.index_x,
          color = "blue"
        ),
        link = dict(
          source = e.index_x, # indices correspond to labels, eg A1, A2, A2, B1, ...
          target = e.index_y,
          value = e["#changes"]
      ))])

    fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
    
fig.show()

Unnamed: 0,index_y
1485,66
1579,49
1483,37
1580,31
1481,30
...,...
1496,1
160,1
1513,1
1150,1


Unnamed: 0,index_y
11738,74
11780,64
15221,46
11736,30
11766,29
...,...
9789,1
9787,1
11791,1
11763,1


Unnamed: 0,index_y
24448,30
24464,28
24487,18
24472,15
24432,15
...,...
24414,1
24486,1
24479,1
24424,1


Unnamed: 0,index_y
9871,57
9842,57
9867,46
9881,42
9830,37
...,...
8736,1
8735,1
8717,1
8733,1


In [70]:
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    # maximum of 6 value cols -> 6 colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp
        
    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))
    
    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        colorList = colorList + [colorPalette[idx]]*colorNum
        
    # transform df into a source-target pair
    for i in range(len(cat_cols)-1):
        if i==0:
            sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            sourceTargetDf.columns = ['source','target','count']
        else:
            tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            tempDf.columns = ['source','target','count']
            sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
        sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
        
    # add index for source-target pair
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
    sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
    
    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )
    
    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )
       
    fig = dict(data=[data], layout=layout)
    return fig