
This project uses the Python library NetworkX to analyze the GitHub Social Network dataset.




SNA Group Assignment

Importing Libraries

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

Importing Dataset

users = pd.read_csv('musae_git_target.csv')
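
The target file maps each node id to a GitHub username and a binary ml_target label (per the dataset, 1 for machine-learning developers and 0 for web developers). A quick peek, as a sketch not in the original run:

# Sketch: inspect the first rows; the columns used later are `name` and
# `ml_target` (1 = ML developer, 0 = web developer)
print(users.head())
print(users.shape)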

Reading the Graph

# Read the edge list, skipping the header row
with open('musae_git_edges.csv', 'r') as data:
    next(data, None)  # skip the first line in the input file
    followers = nx.parse_edgelist(data, delimiter=',', create_using=nx.Graph(), nodetype=int)

Finding Local Clustering (each node's clustering coefficient)

local_cluster = nx.clustering(followers)
sorted_local_cluster = {k: v for k, v in sorted(local_cluster.items(), key=lambda item: item[1])}
sorted_local_cluster
{0: 0,
 34526: 0,
 34035: 0,
 6067: 0,
 ...
 33455: 0,
 13958: 0,
 7764: 0,
 ...}
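
As a sanity check (not part of the original run), the coefficient for any single node can be recomputed from its triangle count, since C = 2T / (deg · (deg − 1)):

# Sketch: verify NetworkX's value for one node against the definition
# C = 2 * triangles / (deg * (deg - 1)); the highest-degree node is picked
# only so that deg >= 2 is guaranteed
degrees = dict(followers.degree)
node = max(degrees, key=degrees.get)
manual_c = 2 * nx.triangles(followers, node) / (degrees[node] * (degrees[node] - 1))
print(manual_c, local_cluster[node])  # the two values should agree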

Global Clustering with count_zeros=True (default)

global_cluster = nx.average_clustering(followers, count_zeros=True)
global_cluster
0.16753704480107323
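
Note that count_zeros=True keeps the many nodes with a coefficient of exactly zero in the average. Excluding them should give a noticeably higher figure; a quick sketch to compare:

# Sketch: average only over nodes with a nonzero clustering coefficient
global_cluster_nonzero = nx.average_clustering(followers, count_zeros=False)
print(global_cluster, global_cluster_nonzero)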

Eccentricity (the maximum shortest-path distance from a node to any other node)

# The eccentricity computation is expensive; a previously saved result can be loaded instead:
# import pickle
# with open('eccentricity.pkl', 'rb') as f:
#     eccentricity = pickle.load(f)

eccentricity = nx.eccentricity(followers)

# Sorting nodes by eccentricity in descending order
sorted_eccentricity = sorted(eccentricity.items(), key=lambda item: item[1], reverse=True)

# Split into node ids and eccentricity values
sorted_eccentricity_indexes = [x[0] for x in sorted_eccentricity]
sorted_eccentricity_values = [x[1] for x in sorted_eccentricity]

# Creating dataframe
top_sorted_eccentricity = pd.DataFrame({'Name':users.iloc[sorted_eccentricity_indexes].name.tolist(), 
                                      'Eccentricity': sorted_eccentricity_values, 
                                      'ml_target':users.iloc[sorted_eccentricity_indexes].ml_target.tolist()})

top_sorted_eccentricity.groupby('Eccentricity').count()['ml_target'].sort_values(ascending=False)
# top_sorted_eccentricity
Eccentricity
7     27399
8      8677
6      1113
9       480
10       27
11        4
Name: ml_target, dtype: int64
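
The same eccentricity dict also gives the graph's center and periphery directly: the center is every node whose eccentricity equals the radius (6, so 1113 nodes per the counts above) and the periphery every node whose eccentricity equals the diameter (11, so 4 nodes). A sketch that reuses the precomputed values instead of recomputing all shortest paths:

# Sketch: reuse the eccentricity dict computed above
center_nodes = nx.center(followers, e=eccentricity)
periphery_nodes = nx.periphery(followers, e=eccentricity)
print(len(center_nodes), len(periphery_nodes))  # expected 1113 and 4 from the counts above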

Radius of Followers Graph

radius_of_graph = nx.radius(followers)
radius_of_graph
6

Diameter of Followers Graph

diameter_of_graph = nx.diameter(followers)
diameter_of_graph
11

For verification: the diameter equals the maximum eccentricity, matching the value 11 in the eccentricity counts above.

Density of Followers Graph

density_of_graph = nx.density(followers)
density_of_graph
0.0004066878203117068
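
For an undirected simple graph, density is 2m / (n(n − 1)); recomputing it by hand (a sketch) confirms the NetworkX value:

# Sketch: density of an undirected graph is 2m / (n * (n - 1))
n = followers.number_of_nodes()
m = followers.number_of_edges()
print(2 * m / (n * (n - 1)))  # should match density_of_graph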

Degree Distribution up to Degree 50

degree_distribution = nx.degree_histogram(followers)
fig = plt.figure(figsize=(15, 10))
x = list(range(50))
plt.bar(x, degree_distribution[:50])
plt.show()

[Figure: bar chart of the degree distribution for degrees 0–49]
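
The bar chart is truncated at degree 50, so it cannot show the tail. Follower-style social networks are typically heavy-tailed; a log-log plot of the full histogram (a sketch, not in the original notebook) makes that visible:

# Sketch: full degree distribution on log-log axes (degree 0 is skipped
# because log(0) is undefined)
degree_counts = nx.degree_histogram(followers)
plt.figure(figsize=(8, 6))
plt.loglog(range(1, len(degree_counts)), degree_counts[1:], 'o', markersize=3)
plt.xlabel('Degree')
plt.ylabel('Number of nodes')
plt.show()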

Connected Components

print(nx.is_connected(followers))
print(nx.number_connected_components(followers))
True
1

Average Path Length

average_short_path_length = nx.average_shortest_path_length(followers)
average_short_path_length
3.2464090056353823
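
nx.average_shortest_path_length runs a BFS from every node, which makes it the slowest single call so far. If an estimate is enough, averaging over a random sample of source nodes gets close at a fraction of the cost (a sketch; the sample size of 200 is an arbitrary choice):

# Sketch: estimate the average shortest path length from 200 random sources
import random
random.seed(42)
total, count = 0, 0
for source in random.sample(list(followers.nodes), 200):
    lengths = nx.single_source_shortest_path_length(followers, source)
    total += sum(lengths.values())
    count += len(lengths) - 1  # exclude the zero-length path to the source
print(total / count)  # should land near 3.246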

# Creating dataframe
single_value_calc = pd.DataFrame({'Radius': [radius_of_graph], 'Diameter': [diameter_of_graph], 'Density':[density_of_graph], 
                                  'Connected Component': [nx.number_connected_components(followers)],
                                  'Average Path Length': [average_short_path_length]})

# single_value_calc = pd.DataFrame({'Radius': [6], 'Diameter': [11],  'Density': [0.0004066878203117068], 
#                                   'Connected Component': [1], 'Average Path Length': [3.2464090056353823]})

single_value_calc.rename(index={0:'Overall Graph Measures'})
                        Radius  Diameter  Density   Connected Component  Average Path Length
Overall Graph Measures  6       11        0.000407  1                    3.246409
local_clustering_sort = list((k,v) for k, v in sorted(local_cluster.items(), key=lambda item: item[0], reverse=True))
eccentricity_sort = list((k,v)for k, v in sorted(eccentricity.items(), key=lambda item: item[0], reverse=True))

# Split into node ids and per-node values
sorted_indexes = [x[0] for x in local_clustering_sort]
local_values_sort = [x[1] for x in local_clustering_sort]
eccentricity_values_sort = [x[1] for x in eccentricity_sort]

# Creating dataframe
eccentricity_and_local_cluster = pd.DataFrame({'Name':users.iloc[sorted_indexes].name.tolist(), 
                                      'Eccentricity': eccentricity_values_sort, 
                                      'Local Clustering': local_values_sort})

eccentricity_and_local_cluster
       Name            Eccentricity  Local Clustering
0      caseycavanagh   7             0.333333
1      Injabie3        8             0.000000
2      qpautrat        7             0.000000
3      kris-ipeh       7             0.000000
4      shawnwanderson  8             0.000000
...    ...             ...           ...
37695  sunilangadi2    8             0.000000
37696  SuhwanCha       7             0.000000
37697  JpMCarrilho     8             0.000000
37698  shawflying      8             0.178571
37699  Eiryyy          8             0.000000

37700 rows × 3 columns


Centrality Measures

Degree Centrality

# Computing degree centrality with NetworkX
deg_centrality = nx.degree_centrality(followers)

# Sorting nodes by degree centrality in descending order
sorted_deg_centrality = sorted(deg_centrality.items(), key=lambda item: item[1], reverse=True)

# Split into node ids and centrality values
sorted_deg_centrality_indexes = [x[0] for x in sorted_deg_centrality]
sorted_deg_centrality_values = [x[1] for x in sorted_deg_centrality]

# Creating dataframe
top_degree_centrality = pd.DataFrame({'Name':users.iloc[sorted_deg_centrality_indexes].name.tolist(), 
                                      'Degree Centrality': sorted_deg_centrality_values, 
                                      'ml_target':users.iloc[sorted_deg_centrality_indexes].ml_target.tolist()})

top_degree_centrality
       Name               Degree Centrality  ml_target
0      dalinhuang99       0.250882           0
1      nfultz             0.187936           0
2      addyosmani         0.088172           0
3      Bunlong            0.078464           0
4      gabrielpconceicao  0.065466           0
...    ...                ...                ...
37695  chrisryancarter    0.000027           0
37696  kcakdemir          0.000027           1
37697  chadmazilly        0.000027           0
37698  orionblastar       0.000027           1
37699  jovanidash21       0.000027           0

37700 rows × 3 columns
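
NetworkX normalizes degree centrality by dividing each degree by n − 1, so the table above can be cross-checked directly against raw degrees (a sketch):

# Sketch: degree centrality is degree / (n - 1)
top_node = sorted_deg_centrality[0][0]
n = followers.number_of_nodes()
print(followers.degree[top_node] / (n - 1), deg_centrality[top_node])  # should match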

Closeness Centrality

# Optionally load a cached pickle of the closeness centrality output:
# import pickle
# with open('closness_centrality.pkl', 'rb') as f:
#     closeness_centrality = pickle.load(f)

# Computing closeness centrality with NetworkX
closeness_centrality = nx.closeness_centrality(followers)

# Sorting nodes by closeness centrality in descending order
sorted_closeness_centrality = sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True)

# Split into node ids and centrality values
sorted_closeness_centrality_indexes = [x[0] for x in sorted_closeness_centrality]
sorted_closeness_centrality_values = [x[1] for x in sorted_closeness_centrality]

# Creating dataframe
top_closeness_centrality = pd.DataFrame({'Name':users.iloc[sorted_closeness_centrality_indexes].name.tolist(), 
                                      'Closeness Centrality': sorted_closeness_centrality_values, 
                                      'ml_target':users.iloc[sorted_closeness_centrality_indexes].ml_target.tolist()})

top_closeness_centrality
       Name               Closeness Centrality  ml_target
0      nfultz             0.523081              0
1      dalinhuang99       0.517787              0
2      Bunlong            0.466324              0
3      addyosmani         0.450342              0
4      gabrielpconceicao  0.447461              0
...    ...                ...                   ...
37695  haochenli          0.155371              0
37696  abhishekpopli      0.153934              1
37697  scandeiro          0.147439              0
37698  jazzchipc          0.145151              0
37699  SOUMAJYOTI         0.141389              1

37700 rows × 3 columns
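
Closeness centrality of a node u in a connected graph is (n − 1) divided by the sum of shortest-path distances from u to all other nodes. A one-node cross-check (sketch):

# Sketch: recompute closeness for the top node from its distance sums
top_node = sorted_closeness_centrality[0][0]
distances = nx.single_source_shortest_path_length(followers, top_node)
print((len(distances) - 1) / sum(distances.values()), closeness_centrality[top_node])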

Betweenness Centrality

# Optionally load a cached pickle of the betweenness centrality output:
# import pickle
# with open('betweeness_centrality.pkl', 'rb') as f:
#     betweenness_centrality = pickle.load(f)

# Computing betweenness centrality with NetworkX
betweenness_centrality = nx.betweenness_centrality(followers)

# Sorting nodes by betweenness centrality in descending order
sorted_between_centrality = sorted(betweenness_centrality.items(), key=lambda item: item[1], reverse=True)

# Split into node ids and centrality values
sorted_between_centrality_indexes = [x[0] for x in sorted_between_centrality]
sorted_between_centrality_values = [x[1] for x in sorted_between_centrality]

# Creating dataframe
top_between_centrality = pd.DataFrame({'Name': users.iloc[sorted_between_centrality_indexes].name.tolist(),
                                       'Betweenness Centrality': sorted_between_centrality_values,
                                       'ml_target': users.iloc[sorted_between_centrality_indexes].ml_target.tolist()})

top_between_centrality
       Name               Betweenness Centrality  ml_target
0      dalinhuang99       0.269599                0
1      nfultz             0.240541                0
2      Bunlong            0.055323                0
3      addyosmani         0.043408                0
4      gabrielpconceicao  0.035337                0
...    ...                ...                     ...
37695  chrisryancarter    0.000000                0
37696  kcakdemir          0.000000                1
37697  chadmazilly        0.000000                0
37698  orionblastar       0.000000                1
37699  jovanidash21       0.000000                0

37700 rows × 3 columns
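
Exact betweenness is expensive (roughly O(nm) via Brandes' algorithm), which is presumably why the pickle cache exists above. NetworkX can also approximate it by sampling k pivot nodes; a sketch with an arbitrary k of 500:

# Sketch: approximate betweenness using 500 sampled pivots with a fixed seed
approx_betweenness = nx.betweenness_centrality(followers, k=500, seed=42)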

fig = plt.figure(figsize=(30, 5))

plt.subplot(1, 3, 1)
plt.xlabel('Centrality Values')
plt.title('Degree Centrality')
top_degree_centrality['Degree Centrality'].plot.hist(ylim=(0, 200))

plt.subplot(1, 3, 2)
plt.xlabel('Centrality Values')
plt.title('Closeness Centrality')
top_closeness_centrality['Closeness Centrality'].plot.hist()

plt.subplot(1, 3, 3)
plt.xlabel('Centrality Values')
plt.title('Betweenness Centrality')
top_between_centrality['Betweenness Centrality'].plot.hist()

plt.show()

[Figure: side-by-side histograms of degree, closeness, and betweenness centrality]

Visualizing the Two Highest Degree Centrality Nodes in the Graph

followers_network = pd.read_csv('musae_git_edges.csv')
users = pd.read_csv('musae_git_target.csv')

# Keep only nodes with degree greater than 50
nodes_with_deg_gt_50 = {k: v for k, v in dict(followers.degree).items() if v > 50}

# Check whether a node id belongs to the high-degree set
def check_if_exist(node_id):
    return 1 if node_id in nodes_with_deg_gt_50 else 0

followers_network['first_id'] = followers_network['id_1'].apply(check_if_exist)
# followers_network['second_id'] = followers_network['id_2'].apply(check_if_exist)
followers_test = followers_network[(followers_network['first_id'] == 1) & (followers_network['id_2'] == 31890)]
G = nx.from_pandas_edgelist(followers_test, 'id_1', 'id_2')

# Visualizing the region around the two highest degree centrality nodes
fig = plt.figure(figsize=(10, 10))
plt.margins(0,0)

nodes_sizes = []
nodes_color = []

for each in list(G.nodes):
    if each == sorted_deg_centrality[0][0]:
        nodes_sizes.append(4000)
        nodes_color.append('#f02b79')

    elif each == sorted_deg_centrality[1][0]:
        nodes_sizes.append(1000)
        nodes_color.append('#f02b79')
    else:
        nodes_sizes.append(20)
        nodes_color.append('#02A4DE')

print('Highest Degree Centrality Node Lies in Red Shaded Region')
nx.draw(G, node_size=nodes_sizes, node_color=nodes_color, edge_color='#dce8e0')
Highest Degree Centrality Node Lies in Red Shaded Region

[Figure: subgraph with the two highest degree centrality nodes highlighted in red]
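
By default nx.draw lays the graph out with a spring layout seeded randomly, so the picture differs between runs. Passing an explicit layout with a fixed seed (a sketch) makes the figure reproducible:

# Sketch: fix the layout seed so repeated runs give the same picture
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, node_size=nodes_sizes, node_color=nodes_color, edge_color='#dce8e0')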

Visualizing the Two Highest Closeness Centrality Nodes in the Graph

followers_network = pd.read_csv('musae_git_edges.csv')
users = pd.read_csv('musae_git_target.csv')

# Keep only nodes with degree greater than 50
nodes_with_deg_gt_50 = {k: v for k, v in dict(followers.degree).items() if v > 50}

# Check whether a node id belongs to the high-degree set
def check_if_exist(node_id):
    return 1 if node_id in nodes_with_deg_gt_50 else 0

followers_network['first_id'] = followers_network['id_1'].apply(check_if_exist)
# followers_network['second_id'] = followers_network['id_2'].apply(check_if_exist)
followers_test = followers_network[(followers_network['first_id'] == 1) & (followers_network['id_2'] == 31890)]
G = nx.from_pandas_edgelist(followers_test, 'id_1', 'id_2')

# Visualizing the region around the two highest closeness centrality nodes
fig = plt.figure(figsize=(10, 10))
plt.margins(0,0)

nodes_sizes = []
nodes_color = []

for each in list(G.nodes):
    if each == sorted_closeness_centrality[0][0]:
        nodes_sizes.append(4000)
        nodes_color.append('#f02b79')

    elif each == sorted_closeness_centrality[1][0]:
        nodes_sizes.append(1000)
        nodes_color.append('#f02b79')
    else:
        nodes_sizes.append(20)
        nodes_color.append('#02A4DE')


print('Highest Closeness Centrality Node Lies in Red Shaded Region')
nx.draw(G, node_size=nodes_sizes, node_color=nodes_color, edge_color='#dce8e0')
Highest Closeness Centrality Node Lies in Red Shaded Region

[Figure: subgraph with the two highest closeness centrality nodes highlighted in red]

Visualizing the Two Highest Betweenness Centrality Nodes in the Graph

followers_network = pd.read_csv('musae_git_edges.csv')
users = pd.read_csv('musae_git_target.csv')

# Keep only nodes with degree greater than 50
nodes_with_deg_gt_50 = {k: v for k, v in dict(followers.degree).items() if v > 50}

# Check whether a node id belongs to the high-degree set
def check_if_exist(node_id):
    return 1 if node_id in nodes_with_deg_gt_50 else 0

followers_network['first_id'] = followers_network['id_1'].apply(check_if_exist)
# followers_network['second_id'] = followers_network['id_2'].apply(check_if_exist)
followers_test = followers_network[(followers_network['first_id'] == 1) & (followers_network['id_2'] == 31890)]
G = nx.from_pandas_edgelist(followers_test, 'id_1', 'id_2')

# Visualizing the region around the two highest betweenness centrality nodes
fig = plt.figure(figsize=(10, 10))
plt.margins(0,0)

nodes_sizes = []
nodes_color = []

for each in list(G.nodes):
    if each == sorted_between_centrality[0][0]:
        nodes_sizes.append(4000)
        nodes_color.append('#f02b79')

    elif each == sorted_between_centrality[1][0]:
        nodes_sizes.append(1000)
        nodes_color.append('#f02b79')
    else:
        nodes_sizes.append(20)
        nodes_color.append('#02A4DE')


print('Highest Betweenness Centrality Node Lies in Red Shaded Region')
nx.draw(G, node_size=nodes_sizes, node_color=nodes_color, edge_color='#dce8e0')
Highest Betweenness Centrality Node Lies in Red Shaded Region

[Figure: subgraph with the two highest betweenness centrality nodes highlighted in red]
