<a href="https://colab.research.google.com/github/Andreaierardi/SocialNetworkAnalysis-project/blob/main/project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Social Network Analysis Project**

> ## **University of Milan** - **DataScience and Economics**

<br>

#### Author : **Andrea Ierardi**

<br>


---




 <br>






## Dataset
### Description

Data collected about Facebook pages (November 2017). These datasets represent blue verified Facebook page networks of different categories. Nodes represent the pages and edges are mutual likes among them.








 <br>






In [608]:
import pandas as pd
nodes = pd.read_csv('https://raw.githubusercontent.com/Andreaierardi/SocialNetworkAnalysis-project/main/dataset/fb-pages-food.csv',sep=";")

edges = pd.read_csv('https://raw.githubusercontent.com/Andreaierardi/SocialNetworkAnalysis-project/main/dataset/fb-pages-food.edges')


In [609]:
print(nodes)
print(edges)

                         name,id
0                 Josh Marks,386
1    Blue Ribbon Restaurants,473
2                    Pat Neely,1
3                 La Griglia,542
4                Jose Garces,189
..                           ...
615               Jumia Food,163
616              Luke Thomas,381
617          Clodagh McKenna,140
618  Chef Michelle Bernstein,157
619               SORTEDfood,114

[620 rows x 1 columns]
      source  target
0          0     276
1          0      58
2          0     132
3          0     603
4          0     398
...      ...     ...
2097     597     611
2098     601     603
2099     601     616
2100     603     616
2101     311     613

[2102 rows x 2 columns]


In [595]:
nodes

Unnamed: 0,"name,id"
0,"Josh Marks,386"
1,"Blue Ribbon Restaurants,473"
2,"Pat Neely,1"
3,"La Griglia,542"
4,"Jose Garces,189"
...,...
615,"Jumia Food,163"
616,"Luke Thomas,381"
617,"Clodagh McKenna,140"
618,"Chef Michelle Bernstein,157"


In [596]:
from collections import Counter # Counter counts the number of occurrences of each item
from itertools import tee, count
def uniquify(seq, suffs = count(1)):
    """Make all the items unique by adding a suffix (1, 2, etc).

    `seq` is mutable sequence of strings.
    `suffs` is an optional alternative suffix iterable.
    """
    not_unique = [k for k,v in Counter(seq).items() if v>1] # so we have: ['name', 'zip']
    # suffix generator dict - e.g., {'name': <my_gen>, 'zip': <my_gen>}
    suff_gens = dict(zip(not_unique, tee(suffs, len(not_unique))))  
    for idx,s in enumerate(seq):
        try:
            suffix = str(next(suff_gens[s]))
        except KeyError:
            # s was unique
            continue
        else:
            seq[idx] += suffix
    

In [597]:
names = list(nodes.name)
uniquify(names, (f'_{x!s}' for x in range(1, 100)))
names[0:10]

AttributeError: ignored

In [None]:
# Find the name of the column by index
n = nodes.columns[0]

# Drop that column
nodes.drop(n, axis = 1, inplace = True)

# Put whatever series you want in its place
nodes[n] = names

list(nodes.name)[0:20]

In [None]:
# Nodes as dictionary
nodes_dict = {}
for (n, id) in zip(nodes["name"],nodes["id"]):
 # print(n, "+", id)
  nodes_dict[id] = n

In [None]:
nodes_dict.values()

\





## Network Building



In [None]:
import networkx as nx
from statsmodels.distributions.empirical_distribution import ECDF
import numpy as np
import matplotlib.pyplot as plt

In [None]:
G = nx.Graph()

for e in edges.values.tolist():
  G.add_edge(e[0],e[1])

In [None]:
H = nx.relabel_nodes(G, nodes_dict)
len(sorted(H))

G = H

In [None]:
print('Number of nodes: {} - Number of links:{}'.format(G.order(),G.size()))


\

# Degree Analysis

In [None]:
G.nodes()

In [None]:
density = nx.density(G)
print('Density: {}'.format(density))

In [None]:
degree = list(dict(G.degree()).values())
degree[0:20]

In [None]:
print('Standard deviation: {}'.format(np.std(degree)))
print('Mean: {}'.format(np.mean(degree)))
print('Median: {}'.format(np.median(degree)))
print('Min: {}'.format(np.min(degree)))
print('Max: {}'.format(np.max(degree)))


print("Assortativity coefficient: " + str(nx.degree_assortativity_coefficient(G)))

\

### ECDF

In [None]:
# ECDF linear scale
cdf = ECDF(degree)
x = np.unique(degree)
y = cdf(x)
fig_cdf = plt.figure(figsize=(8,4))
axes = fig_cdf.gca()
axes.plot(x,y,marker='o',ms=6, linestyle='None')
axes.set_xlabel('Degree',size=20)
axes.set_ylabel('ECDF Food', size = 20)

In [None]:

# ECDF loglog scale
cdf = ECDF(degree)
x = np.unique(degree)
y = cdf(x)
fig_cdf = plt.figure(figsize=(8,4))
axes = fig_cdf.gca()
axes.loglog(x,y,marker='o',ms=8, linestyle='--')
axes.set_xlabel('Degree',size=20)
axes.set_ylabel('ECDF Food', size = 20)


In [None]:
# ECCDF
cdf = ECDF(degree)
x = np.unique(degree)
y = cdf(x)
fig_cdf = plt.figure(figsize=(8,4))
axes = fig_cdf.gca()
axes.loglog(x,1-y,marker='o',ms=8, linestyle='--')
axes.set_xlabel('Degree',size=20)
axes.set_ylabel('ECCDF FOOD', size = 20)

In [None]:
p = density
random_graph = nx.fast_gnp_random_graph(G.order(),p)


In [None]:
print('Number of nodes: {}'.format(random_graph.order()))
print('Number of links: {}'.format(random_graph.size()))

In [None]:
random_degree = list(dict(random_graph.degree()).values())
print('Random Net Standard deviation: {}'.format(np.std(random_degree)))
print('Random Net Mean: {}'.format(np.mean(random_degree)))
print('Random Net Median: {}'.format(np.median(random_degree)))
print('Random Net Min: {}'.format(np.min(random_degree)))
print('Random Net Max: {}'.format(np.max(random_degree)))

In [None]:
cdf = ECDF(degree)
x = np.unique(degree)
y = cdf(x)

cdf_random = ECDF(random_degree)
x_random = np.unique(random_degree)
y_random = cdf_random(x_random)

fig_cdf_fb = plt.figure(figsize=(8,4))
axes = fig_cdf_fb.gca()
axes.set_xscale('log')
axes.set_yscale('log')
axes.loglog(x,1-y,marker='o',ms=8, linestyle='--')
axes.loglog(x_random,1-y_random,marker='+',ms=10, linestyle='--')
axes.set_xlabel('Degree',size=20)
axes.set_ylabel('ECCDF', size = 20)

\

## HUBS

In [None]:
percentile_99 = np.percentile(degree,99)
print(percentile_99)

In [None]:
hub_nodi = [k for k,v in dict(G.degree()).items() if v>= percentile_99]


In [None]:
print(len(hub_nodi))
print(list(hub_nodi))

In [None]:
print(list(nx.isolates(G)))


\

## Connectivity

In [None]:
print(nx.is_connected(G))
print(nx.number_connected_components(G))

In [None]:
import math

In [None]:
#nx.draw_networkx(H,)

# draw the graph
pos = nx.spring_layout(G)
# color the nodes according to their partition
nx.draw_networkx_nodes(G, pos, node_size=20, label = list(G.nodes()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()


In [None]:
nx.number_connected_components(G)

In [None]:
#dir(G)

In [None]:
nodes

In [None]:
len(nx.triangles(H))

\

\

## Clusters

In [None]:
clusters = nx.average_clustering(G)
clusters

In [None]:
type(clusters)

In [None]:
nx.transitivity(G)


In [None]:
local_cluster = nx.clustering(G)
sorted(local_cluster.items(), key=lambda item: item[1], reverse= True)


In [None]:
triangle = nx.triangles(G)
triangle

\

## Centrality



In [None]:
deg_centr = nx.degree_centrality(G)


In [None]:
eigen = nx.eigenvector_centrality(G)
#eigen

In [None]:
pagerank = nx.pagerank(G)
#pagerank

In [None]:
betweenesCentrality = nx.betweenness_centrality(G)
#betweenesCentrality

\

## Communities


In [None]:
import networkx.algorithms.community as nx_comm
list_community_sets_greedy = list(nx_comm.greedy_modularity_communities(G))
print(list_community_sets_greedy)

In [None]:
partition_greedy = {}
for i, comm in enumerate(list_community_sets_greedy):
    print("Community:", i)
    print(i,comm)
    for n in comm:
        partition_greedy[n]=i

In [None]:
print(partition_greedy)

In [None]:
# draw the graph
pos = nx.spring_layout(G)
# color the nodes according to their partition
cmap = cm.get_cmap('viridis', max(partition_greedy.values()) + 1)
nx.draw_networkx_nodes(G, pos, partition_greedy.keys(), node_size=40,
                       cmap=cmap, node_color=list(partition_greedy.values()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()

In [None]:
import community as community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx

In [None]:
partition_library = community_louvain.best_partition(G)

In [None]:
print(partition_library)

In [None]:
# draw the graph
pos = nx.spring_layout(G)
# color the nodes according to their partition
cmap = cm.get_cmap('viridis', max(partition_library.values()) + 1)
nx.draw_networkx_nodes(G, pos, partition_library.keys(), node_size=40,
                       cmap=cmap, node_color=list(partition_library.values()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()

In [None]:
comms = set(partition_library.values())
comms



In [None]:
list_community_sets_library = [ set() for i in range(len(comms)) ]

In [None]:
for n, comm in partition_library.items():
    list_community_sets_library[comm].add(n)

list_community_sets_library

In [None]:
for my_list in [list_community_sets_greedy,  list_community_sets_library]:
    
    #print("Coverage")
    print("Coverage", nx_comm.coverage(G, my_list))
    #print("Modularity")
    print("Modularity", nx_comm.modularity(G, my_list, weight='weight'))
    #print("Performance")
    print("Performance", nx_comm.performance(G, my_list))
    
    print("---")

In [None]:
list_community_sets_library

In [None]:


pairs = []
for i, nodes in enumerate(list_community_sets_library):
    print(i,len(nodes))
    comm_size = (i,len(nodes))
    pairs.append(comm_size)



In [None]:
pairs

In [None]:
community_index = []
number_of_nodes = []

for comm, n_nodes in pairs:
    community_index.append(str(comm))
    number_of_nodes.append(n_nodes)



In [None]:
plt.bar(community_index,number_of_nodes)
plt.xlabel("Community")
plt.ylabel("Number of nodes")