In [1]:
import pandas as pd
from IPython.core import display as ICD
from multiprocessing import Pool
import numpy as np
from datetime import timedelta
import re

#### Overview of all results 

In [2]:
files = !ls *.out
def format_name(name):
    return name.split("/")[1].split("-")
res = []
for i in files:
    a= pd.read_csv(i,sep=",",header=None)
    a.columns = ["itemsets","support","user_ids","period"]
    a["properties"]= a["period"].apply(lambda x : " ".join(str(z) for z in x.split("_")[1:]))
    mean = a["properties"].value_counts().mean()
    unique = a["properties"].nunique()
    res.append((i,mean,unique,a.shape))
df = pd.DataFrame(res)
df.columns = ["filename","mean properties occurence","unique property","file shape"]
df

Unnamed: 0,filename,mean properties occurence,unique property,file shape
0,"21D-13-[5-3500]-[user_age,user_gender]-groups.out",14017.4,5,"(70087, 5)"
1,21D-20-[5-3500]-[user_age]-groups.out,11946.5,4,"(47786, 5)"
2,"21D-6-[5-3500]-[user_gender,user_occupation]-g...",4668.703704,27,"(126055, 5)"
3,"2M-10-[5-3500]-[user_gender,user_occupation]-g...",4553.863636,22,"(100185, 5)"
4,"2M-24-[5-3500]-[user_age,user_gender]-groups.out",7737.4,5,"(38687, 5)"
5,2M-37-[5-3500]-[user_age]-groups.out,7948.5,4,"(31794, 5)"
6,7D-10-[5-3500]-[user_age]-groups.out,12274.5,6,"(73647, 5)"
7,"7D-7-[5-3500]-[user_age,user_gender]-groups.out",9511.3,10,"(95113, 5)"
8,"M-16-[5-3500]-[user_age,user_gender]-groups.out",8180.857143,7,"(57266, 5)"
9,M-25-[5-3500]-[user_age]-groups.out,9937.5,4,"(39750, 5)"


#### Choosen file : M-16-[5-3500]-[user_age,user_gender]-groups.out	

In [7]:
df = pd.read_csv("M-16-[5-3500]-[user_age,user_gender]-groups.out",header=None)
df = pd.concat([df.drop(3,axis=1),df[3].str.split("_",expand=True)],axis=1)
df.columns = ["itemsets","support","user_ids","period","user_age","user_gender"]
df["period"] = pd.to_datetime(df["period"])
df.reset_index(inplace = True)
df.head()

Unnamed: 0,index,itemsets,support,user_ids,period,user_age,user_gender
0,0,5523 5540 5550 5553 5596 5667 5675 5678 5679 ...,16,1240 1200 2571 1580 260,2000-05-01,4,M
1,1,5519 5523 5540 5550 5553 5596 5631 5675 5678 ...,16,3471 924 589 1196 260,2000-05-01,4,M
2,2,5099 5167 5187 5219 5225 5264 5283 5319 5332 ...,18,1240 260 589 2628 1196,2000-06-01,2,M
3,3,5167 5187 5219 5225 5264 5283 5318 5319 5332 ...,17,1240 260 480 2628 1196,2000-06-01,2,M
4,4,5167 5187 5219 5225 5264 5283 5319 5332 5339 ...,18,1240 260 480 589 1196,2000-06-01,2,M


In [8]:
df["user_ids"] = df.user_ids.apply(lambda x : np.array([np.int(z) for z in x.split(" ") if z != ""]))
df.head()

Unnamed: 0,index,itemsets,support,user_ids,period,user_age,user_gender
0,0,5523 5540 5550 5553 5596 5667 5675 5678 5679 ...,16,"[1240, 1200, 2571, 1580, 260]",2000-05-01,4,M
1,1,5519 5523 5540 5550 5553 5596 5631 5675 5678 ...,16,"[3471, 924, 589, 1196, 260]",2000-05-01,4,M
2,2,5099 5167 5187 5219 5225 5264 5283 5319 5332 ...,18,"[1240, 260, 589, 2628, 1196]",2000-06-01,2,M
3,3,5167 5187 5219 5225 5264 5283 5318 5319 5332 ...,17,"[1240, 260, 480, 2628, 1196]",2000-06-01,2,M
4,4,5167 5187 5219 5225 5264 5283 5319 5332 5339 ...,18,"[1240, 260, 480, 589, 1196]",2000-06-01,2,M


###  Compute distances

In [9]:
def jaccard_similarity(a, b):
    """Compute Jaccard distance between two arrays
    Inputs :
        a: n element array
        b: m element array
    """
    return len(a[0].intersection(b[0]))/len(a[0].union(b[0]))


def similar_groups_union(groups,g_index,threashold):
    """
    Create new groups from given ones,
    each group is affected to the new group having the minimal jaccard distance with (comparing to all new groups)
    if the minimal jaccard distance found is under the threashold, the group is added as a new group 
    Parameters
    ----------
    groups : list of tuples of str
        Each tuple contain user_ids 
    g_index: list of int
        list of corresponding indexes in initial dataset
    Returns
    -------
    list of new groups and their corresponding indexes in the given dataset
    """
    new_groups = [[groups[0],[g_index[0]]],]
    for group,g_idx in zip(groups[1:],g_index[1:]):
        group = set(group)
        found = False
        for i in new_groups:
            distance = 1-len(group.intersection(i[0]))/len(group.union(i[0]))
            if distance<threashold:
                group_n = tuple(group.union(i[0]))
                for z in new_groups:
                    if z[0]== group_n:
                        z[1].append(g_idx)
                        new_groups.remove(i)
                        break
                i[0] = tuple(group.union(i[0]))
                i[1].append(g_idx)
                found = True
        if not found:
            new_groups.append([tuple(group),[g_idx]])
    return new_groups

def recompute_groups(df,properties,threashold):
    """
    Regroup similar groups of each period to new groups according to the Jaccard distance between them
    Parameters
    ----------
        df: DataFrame 
        threashold: int  
            Minimal Jaccard distance under which two groups are supposed similar
    Result
    ------
        Dataframe 
    """
    groups = pd.DataFrame()
    for (period,values),i in df.groupby(properties):
        g =  [tuple(str(z) for z in x) for x in i.user_ids]
        res = pd.DataFrame(similar_groups_union(g,i.index,threashold))
        res["properties"] = values
        res["period"] = period
        groups = pd.concat([groups,res],axis=0,sort=False)
    return groups.reset_index(drop="True")

In [10]:
df_reduced = recompute_groups(df,["period","user_age"],0.2 )     
df_reduced.columns = ["user_ids","groups_ids","properties","period"]
df_reduced

Unnamed: 0,user_ids,groups_ids,properties,period
0,"(2762, 589, 1198, 1196, 260, 2628, 1240)","[291, 299, 299]",3,2000-05-01
1,"(2762, 2858, 589, 1198, 1196, 260, 2628)","[409, 415, 415, 426, 426, 426]",3,2000-05-01
2,"(1196, 2762, 2858, 589, 1198, 2628)",[423],3,2000-05-01
3,"(2858, 589, 1196, 260, 2628, 2571, 1240)","[444, 450, 450, 464, 464, 464]",3,2000-05-01
4,"(1196, 1240, 2762, 2858, 589, 1198, 1210, 2571)","[445, 453, 454, 454, 474, 474]",3,2000-05-01
...,...,...,...,...
14904,"(480, 1580, 589, 457, 2916, 3256)","[46761, 46762]",4,2000-12-01
14905,"(480, 1580, 589, 1370, 2916)",[46763],4,2000-12-01
14906,"(480, 1200, 589, 2916, 2021)",[46764],4,2000-12-01
14907,"(1210, 1196, 260, 1198, 480)",[36938],6,2000-12-01


In [11]:
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MultiLabelBinarizer
lb = MultiLabelBinarizer()

def compute_distance(x):
    return 1-len(x["user_ids_x"].intersection(x["user_ids_y"]))/len(x["user_ids_x"].union(x["user_ids_x"]))
users = lb.fit_transform(df_reduced.user_ids.tolist()).astype(bool)


In [12]:
df["key"]=1
result = pd.DataFrame()
comp_df = res
!rm result
for (period,properties),i in df_reduced.groupby(["period","properties"]):
    comp_idx = df_reduced[(df_reduced.period>period )&(df_reduced.period-period < timedelta(60))&(df_reduced.properties==properties)][:100].index
    if comp_idx.shape==(0,):
        continue
    res = pd.DataFrame(pairwise_distances(users[i.index],users[comp_idx],metric="jaccard",n_jobs=-1),index=i.index,columns=comp_idx) 
    res = res[res<0.6].stack().reset_index()
    res.to_csv("result",header=False,index=False,mode="a")
    result = pd.concat([result,res],axis=0)

In [13]:
df_trans = pd.read_csv("result",header=None).merge(df,left_on=0,right_on="index").merge(df,left_on=1,right_on="index")
df_trans["changes"] = df_trans[["user_ids_x","user_ids_y"]].apply(lambda x : set(x["user_ids_x"]).intersection(x["user_ids_y"]),axis=1)
df_trans["#changes"] = df_trans["changes"].apply(lambda x: len(x))
df_trans = df_trans[df_trans["#changes"]>2]
df_trans.user_age_x.unique(),df_trans.user_age_y.unique()
df_trans

Unnamed: 0,0,1,2,index_x,itemsets_x,support_x,user_ids_x,period_x,user_age_x,user_gender_x,...,index_y,itemsets_y,support_y,user_ids_y,period_y,user_age_y,user_gender_y,key_y,changes,#changes
10,98,211,0.555556,98,5501 5543 5592 5614 5617 5627 5634 5636 5645 ...,32,"[589, 2762, 260, 1196, 1210]",2000-05-01,3,M,...,211,5501 5592 5614 5617 5627 5634 5636 5648 5654 ...,30,"[1240, 2628, 589, 2762, 1210]",2000-05-01,3,M,1,"{1210, 2762, 589}",3
13,105,211,0.400000,105,5501 5592 5614 5617 5627 5634 5636 5645 5648 ...,31,"[2628, 2762, 260, 1196, 1210]",2000-05-01,3,M,...,211,5501 5592 5614 5617 5627 5634 5636 5648 5654 ...,30,"[1240, 2628, 589, 2762, 1210]",2000-05-01,3,M,1,"{1210, 2762, 2628}",3
14,109,211,0.583333,109,5501 5554 5592 5614 5617 5627 5634 5636 5645 ...,31,"[2628, 589, 2762, 260, 1196]",2000-05-01,3,M,...,211,5501 5592 5614 5617 5627 5634 5636 5648 5654 ...,30,"[1240, 2628, 589, 2762, 1210]",2000-05-01,3,M,1,"{2762, 2628, 589}",3
15,113,211,0.555556,113,5501 5592 5614 5617 5627 5634 5636 5645 5648 ...,31,"[2628, 2571, 2762, 260, 1210]",2000-05-01,3,M,...,211,5501 5592 5614 5617 5627 5634 5636 5648 5654 ...,30,"[1240, 2628, 589, 2762, 1210]",2000-05-01,3,M,1,"{1210, 2762, 2628}",3
17,116,211,0.555556,116,5497 5501 5570 5574 5576 5592 5614 5617 5624 ...,41,"[2628, 2571, 589, 1196, 1210]",2000-05-01,3,M,...,211,5501 5592 5614 5617 5627 5634 5636 5648 5654 ...,30,"[1240, 2628, 589, 2762, 1210]",2000-05-01,3,M,1,"{1210, 2628, 589}",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20033,11036,14908,0.500000,11036,4461 4501 4508 4509 4512 4524 4560 4578 4591 ...,22,"[1097, 1240, 1200, 1198, 2571, 1196]",2000-07-01,3,M,...,14908,3389 3391 3411 3425 3487 3512 3558 3605 3729 ...,17,"[2716, 296, 1198, 110, 2028, 2571, 1196]",2000-08-01,2,M,1,"{2571, 1196, 1198}",3
20034,11037,14908,0.500000,11037,4461 4501 4508 4509 4512 4560 4578 4591 4626 ...,19,"[1097, 1240, 1200, 1198, 2571, 1196, 260]",2000-07-01,3,M,...,14908,3389 3391 3411 3425 3487 3512 3558 3605 3729 ...,17,"[2716, 296, 1198, 110, 2028, 2571, 1196]",2000-08-01,2,M,1,"{2571, 1196, 1198}",3
20035,11041,14908,0.500000,11041,4461 4501 4508 4509 4512 4560 4578 4591 4658 ...,17,"[1097, 1240, 1200, 1198, 1210, 2571, 260, 1196]",2000-07-01,3,M,...,14908,3389 3391 3411 3425 3487 3512 3558 3605 3729 ...,17,"[2716, 296, 1198, 110, 2028, 2571, 1196]",2000-08-01,2,M,1,"{2571, 1196, 1198}",3
20056,11452,14908,0.500000,11452,4480 4508 4509 4560 4578 4626 4658 4702 4708 ...,21,"[1617, 2028, 1198, 593, 1196, 260]",2000-07-01,3,M,...,14908,3389 3391 3411 3425 3487 3512 3558 3605 3729 ...,17,"[2716, 296, 1198, 110, 2028, 2571, 1196]",2000-08-01,2,M,1,"{1196, 2028, 1198}",3


In [15]:
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    # maximum of 6 value cols -> 6 colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp
        
    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))
    
    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        colorList = colorList + [colorPalette[idx]]*colorNum
        
    # transform df into a source-target pair
    for i in range(len(cat_cols)-1):
        if i==0:
            sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            sourceTargetDf.columns = ['source','target','count']
        else:
            tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            tempDf.columns = ['source','target','count']
            sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
        sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
        
    # add index for source-target pair
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
    sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
    
    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )
    
    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )
       
    fig = dict(data=[data], layout=layout)
    return fig

In [18]:
for i in df_trans.user_age_x.unique():
    e = df_trans[df_trans.user_age_x==str(i)]
    print(e.shape)
    genSankey(e,["index_x","index_y"],value_cols="#changes",title="i")
    fig.show()



(5314, 21)


AttributeError: 'dict' object has no attribute 'show'