### IMPORTS

In [1]:
!pip install --user cdlib

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
!pip install --user leidenalg 

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

import networkx as nx
from cdlib import algorithms
from networkx.algorithms import community
from community import community_louvain
from networkx.algorithms import bipartite
from tqdm import tqdm
import time
from operator import itemgetter


Note: to be able to use all crisp methods, you need to install some additional packages:  {'graph_tool', 'karateclub', 'infomap'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'ASLPAw', 'karateclub'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'infomap'}


### DF OVERVIEW (not real data)

In [12]:
df[['Client ID','Page ID','Data']].head(10)

Unnamed: 0,Client ID,Page ID,Data
4,1429741971.1648664,Page 0,20220430
6,1910315733.1611671,Page 1,20220126
8,894077043.1646217,Page 2,20220302
9,1675573705.1641805,Page 3,20220114
10,530234264.16554475,Page 4,20220722
11,1413248295.165108,Page 5,20220427
12,1556507839.158645,Page 6,20220614
13,1556507839.158645,Page 7,20220615
14,1404851356.1645088,Page 8,20220623
15,1660435339.164865,Page 9,20220404


- Cleaning and preprocessing

In [37]:
df['Client ID']=df['Client ID'].astype(str)

le = preprocessing.LabelEncoder()
df['Client ID']=le.fit_transform(df['Client ID'])

def month_extr(inpt):
    return str(inpt)[4:6]
df['month']=df['Data'].apply(month_extr)

##  Bipartite Graph

In [42]:
df=df[df['month'].isin(['01','02','03','04','05','06','07','08'])]
df=df[['Client ID','Page ID','month']]
df.columns=['Client ID','Page ID','month']

- The nodes are both the Client ID and the Page ID

#### EDGES CREATION

In [44]:
sources = []
targets = []
edges = {}
for row in tqdm(df.iterrows()):
    source = str(row[1]["Client ID"])
    targets = row[1]["Page ID"]
    source = source.lower()
    if targets != []:
        weight = (source, str(targets).lower())
        if weight in edges:
            edges[weight] += 1
        else:
            edges[weight] = 1

334990it [00:24, 13948.71it/s]


#### GRAPH CREATION WITH NETWORKX

In [45]:
DG = nx.Graph()
list_0=[]
list_1=[]
for k, v in edges.items():
    source = k[0]
    list_0.append(source)
    target = k[1]
    list_1.append(target)
    weight = v
    DG.add_edge(source, target, weight = weight)

- Pairs of Nodes and links assignment

In [46]:
G = nx.DiGraph() #valutare se sia direzionato o no
G.add_nodes_from(list_0, bipartite=0) 
G.add_nodes_from(list_1, bipartite=1)
for k, v in edges.items():
    source = k[0]
    target = k[1]
    weight = v
    G.add_edge(source, target, weight = weight)

In [47]:
G = G.to_undirected()

In [48]:
nx.info(G)

'Graph with 155948 nodes and 269984 edges'

## Community detection

 Cdlib https://cdlib.readthedocs.io/en/latest/reference/cd_algorithms/node_clustering.html 


#### Louvain

In [49]:
partition = community_louvain.best_partition(G)

# Get a set of the communities
#communities_louvain = set(partition.values())

# Create a dictionary mapping community number to nodes within that community
#communities_louvain_dict = {c: [k for k, v in partition.items() if v == c] for c in communities_louvain}

# Assign each to an attribute in your network
nx.set_node_attributes(G, partition, 'community_louvain')

#### CORE
- Apply core to have a cleaner vision of the clusters.
- Instead of forcing a cluster to each node like Louvain, Core assigns the label "-1" to a generic cluster. Moreover only the unique pages for each cluster are stored in order to avoid overlapping within multiple clusters

In [50]:
coms = algorithms.core_expansion(G)

In [52]:
def remove_common(a, b):
 
    a = [i for i in a if i not in b]
    
    return a


for tnode in list(G.nodes):
    G.nodes[tnode].pop('coreValue',None)

core_community=[]

for xx in tqdm(range(0,len(coms.communities))):
    core = coms.communities[xx]
    for jj in range(0,len(coms.communities)):
        if (xx != jj):

            core = remove_common(core, coms.communities[jj]) 
            
    core_community.append(core)
    
com_dict={}
for i,c in enumerate(core_community):
    com_dict[i]=c

part={}
for k in com_dict.keys():
    for v in com_dict[k]:
        part[v]=int(k)
        
 
 
main_list = list( set(list(G.nodes)) - set(list(part.keys())) )

for k in main_list:
    part[k] = -1
    


100%|██████████| 457/457 [10:45<00:00,  1.41s/it]  


In [54]:
nx.set_node_attributes(G, part, 'core')

#### TO EXPORT

In [55]:
nx.write_gexf(G, "FILENAME.gexf")