In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from pathlib import Path
import matplotlib.pyplot as plt
# import seaborn as sns

import sys
sys.path.append("../../") # root-dir of the project
from src.features.util import calculate_key_figures

# Setup & Import

Paths and filenames

In [2]:
DATA_DIR = Path("../../data/")
filename_relationships = "Following_Ignoring_Relationships_01052019_31052019.csv"
filename_users = "user.csv"

# use this output-path for saving figures
FIG_OUTPUT_DIR = Path("../../reports/figures/follows/")

Load the data

In [3]:
relationships = pd.read_csv(DATA_DIR / "raw" / filename_relationships, sep=';')
users = pd.read_csv(DATA_DIR / "processed" / filename_users)

In [4]:
users.isnull().sum() / len(users)

ID_CommunityIdentity    0.000000
UserGender              0.479897
account_age             0.368144
dtype: float64

# Basic Exploration

In [5]:
relationships.head()

Unnamed: 0,ID_CommunityIdentity,ID_CommunityIdentityConnectedTo,ID_CommunityConnectionType
0,1778,246490,1
1,5872,5872,1
2,9030,23875,1
3,9030,508504,1
4,10569,10569,1


In [6]:
relationships.ID_CommunityConnectionType.value_counts()

ID_CommunityConnectionType
1    57117
2    29659
Name: count, dtype: int64

This dataset contains 57117 follows and 29659 ignore relationships

Rename the columns for convenience

In [7]:
relationships.columns = ["source_user_id", "target_user_id", "type"]

In [8]:
relationships.isnull().sum() / relationships.isnull().count()

source_user_id    0.0
target_user_id    0.0
type              0.0
dtype: float64

No missing values which is nice.

In [9]:
len(relationships[relationships.source_user_id == relationships.target_user_id])

3148

3148 Users follow themselves which is possible but does not convey any meaningful information from a social-science perspective and only distorts the graphs with unnecessary circles.
We therefore remove them.

**Note:** Ignoring yourself is not possible on this platform

In [10]:
relationships = relationships[~(relationships.source_user_id == relationships.target_user_id)]

## Merge with attributes df

Merge the relationships dataframe with the user dataframe, which contains the additional attributes Gender, Date of Creation for each user-id (if present)

In [11]:
relationships.head()

Unnamed: 0,source_user_id,target_user_id,type
0,1778,246490,1
2,9030,23875,1
3,9030,508504,1
5,10569,91850,1
6,10569,547472,1


In [12]:
users_source = users.rename(columns={'ID_CommunityIdentity': 'source_user_id', 'UserGender': 'source_user_gender', 'account_age': 'source_user_account_age'})
users_target = users.rename(columns={'ID_CommunityIdentity': 'target_user_id', 'UserGender': 'target_user_gender', 'account_age': 'target_user_account_age'})

In [13]:
intermediate_df = pd.merge(relationships, users_source, on='source_user_id', how='left')

In [14]:
relationships_enriched = pd.merge(intermediate_df, users_target, on='target_user_id', how='left')

To further investigate the homophily by the attributes gender and date of creation we create seperate dataframes where all the rows with missing values are removed for the specific attribute

In [15]:
relationships_enriched.head()

Unnamed: 0,source_user_id,target_user_id,type,source_user_gender,source_user_account_age,target_user_gender,target_user_account_age
0,1778,246490,1,,,m,19.0
1,9030,23875,1,,,m,18.0
2,9030,508504,1,,,w,12.0
3,10569,91850,1,,,m,22.0
4,10569,547472,1,,,,


In [16]:
relationships_with_gender = relationships_enriched.dropna(subset=['source_user_gender', 'target_user_gender'])
relationships_with_date = relationships_enriched.dropna(subset=['source_user_account_age', 'target_user_account_age'])

Split the data to create separate graphs for each type of relationship

In [17]:
following_relationships = relationships_enriched[relationships_enriched['type'] == 1].drop(columns=['type'])
ignoring_relationships = relationships_enriched[relationships_enriched['type'] == 2].drop(columns=['type'])

following_relationships_gender = relationships_with_gender[relationships_with_gender['type'] == 1].drop(columns=['type'])
ignoring_relationships_gender = relationships_with_gender[relationships_with_gender['type'] == 2].drop(columns=['type'])

following_relationships_with_date = relationships_with_date[relationships_with_date['type'] == 1].drop(columns=['type'])
ignoring_relationships_with_date = relationships_with_date[relationships_with_date['type'] == 2].drop(columns=['type'])

In [18]:
len(following_relationships)

53969

In [19]:
len(following_relationships_with_date)

35533

In [20]:
len(following_relationships_gender)

27357

# Sample

In [21]:
'''
n_follow = "full-"
n_ignore = "full-"
following_relationships_sample = following_relationships  #.sample(n=n_follow, random_state=1040)
ignoring_relationships_sample = ignoring_relationships  #.sample(n=n_ignore, random_state=1040)
''';

# Numeric Analysis

## Assortative Mixing by Degree

Here we use the full graph since we only need the nodes and the edges of the graph and no additional information

Construct graphs

In [63]:
G_follow = nx.from_pandas_edgelist(following_relationships,
                            source='source_user_id',
                            target='target_user_id',
                            create_using=nx.DiGraph)

G_ignore = nx.from_pandas_edgelist(ignoring_relationships,
                                   source='source_user_id',
                                   target='target_user_id',
                                   create_using=nx.DiGraph)

Calculate key figures

In [64]:
key_figures_follow = calculate_key_figures(G_follow, display=True)

Number of nodes: 14497
Number of edges: 53969
Average degree: 7.445540456646203
Average clustering coefficient: 0.03222674311027298
Degree assortativity coefficient: -0.11808474766165833


 * **Connectivity:**
The average degree is about 7.45, which suggests that on average, each user follows around 7-8 other users. This is a moderate level of connectivity which may indicate that users have selective preferences or do not see the benefits in following other users even if they use the platform regularly (No News-Feed).<br>

* **Low Clustering:**
The average clustering coefficient is quite low at approximately 0.032. This indicates that the users' followers are not highly interconnected; that is, a follower of one user is not likely to be a follower of another user that they follow. It reflects a lack of tightly-knit communities or cliques within the network.<br>

* **Disassortative Mixing:**
The negative degree assortativity coefficient (-0.118) suggests that the network exhibits disassortative mixing by degree, meaning users with many followers are more likely to follow users with fewer followers and vice versa. This is common in social networks where popular individuals (celebrities, influencers) are followed by many but themselves follow few.

**Note:** Interpretation was made for the results of the unfiltered graph

In [65]:
key_figures_ignore = calculate_key_figures(G_ignore, display=True)

Number of nodes: 8694
Number of edges: 29659
Average degree: 6.822866344605475
Average clustering coefficient: 0.0068942632903669695
Degree assortativity coefficient: -0.21685760911155277


* **Connectivity:**
The average degree is around 6.82, which means that, on average, each user has blocked nearly 7 other users.
Its interesting that users almost block as much users on average as they follow.

* **Low Clustering:**
The average clustering coefficient is extremely low (~0.000037), nearly zero. This suggests that users who block one user are not likely to block the same users as each other. In other words, there's minimal tendency for 'cliques' of mutually blocking users to form. This could imply that reasons for blocking are very individual and not influenced by communal or shared grievances.

* **Disassortative Mixing:**
A negative degree assortativity coefficient of -0.2169 is even more pronounced than in the following network. It shows a tendency for users to block users with dissimilar degrees. High-degree users (those who block many others) tend to block users who block fewer users.


## Assortative Mixing by a discrete attribute (Gender)

Create graphs

In [66]:
G_follow_gender = nx.from_pandas_edgelist(following_relationships_gender,
                                   source='source_user_id',
                                   target='target_user_id',
                                   create_using=nx.DiGraph)

G_ignore_gender = nx.from_pandas_edgelist(ignoring_relationships_gender,
                                   source='source_user_id',
                                   target='target_user_id',
                                   create_using=nx.DiGraph)

Add attributes

In [67]:
for _, row in users.iterrows():
    # Check if node exists in the graph, then add the gender attribute
    if row['ID_CommunityIdentity'] in G_follow_gender:
        nx.set_node_attributes(G_follow_gender, {row['ID_CommunityIdentity']: row['UserGender']}, 'gender')
    if row['ID_CommunityIdentity'] in G_ignore_gender:
        nx.set_node_attributes(G_ignore_gender, {row['ID_CommunityIdentity']: row['UserGender']}, 'gender')

Following-Graph: Key-figures

In [68]:
calculate_key_figures(G_follow_gender, display=True);
gender_assortativity = nx.attribute_assortativity_coefficient(G_follow_gender, 'gender')
print(f"Gender Assortativity: {gender_assortativity}")

Number of nodes: 7032
Number of edges: 27357
Average degree: 7.780716723549488
Average clustering coefficient: 0.03633332905792894
Degree assortativity coefficient: -0.1453662068784821
Gender Assortativity: 0.08494605011747595


Ignore-Graph: Key-figures

In [69]:
calculate_key_figures(G_ignore_gender, display=True);
gender_assortativity = nx.attribute_assortativity_coefficient(G_ignore_gender, 'gender')
print(f"Gender Assortativity: {gender_assortativity}")

Number of nodes: 4670
Number of edges: 15160
Average degree: 6.492505353319058
Average clustering coefficient: 0.007868070902570152
Degree assortativity coefficient: -0.2261378257159407
Gender Assortativity: 0.04227104861199373


## Assortative Mixing by a continuous attribute (Age of Account)

Create graphs

In [70]:
G_follow_age = nx.from_pandas_edgelist(following_relationships_with_date,
                                          source='source_user_id',
                                          target='target_user_id',
                                          create_using=nx.DiGraph)

G_ignore_age = nx.from_pandas_edgelist(ignoring_relationships_gender,
                                          source='source_user_id',
                                          target='target_user_id',
                                          create_using=nx.DiGraph)

Add attributes

In [71]:
for _, row in users.iterrows():
    if row['ID_CommunityIdentity'] in G_follow_age:
        nx.set_node_attributes(G_follow_age, {row['ID_CommunityIdentity']: row['account_age']}, 'age')

for _, row in users.iterrows():
    if row['ID_CommunityIdentity'] in G_ignore_age:
        nx.set_node_attributes(G_ignore_age, {row['ID_CommunityIdentity']: row['account_age']}, 'age')

Following-Graph: Key-figures

In [72]:
calculate_key_figures(G_follow_age, display=True);
age_assortativity = nx.numeric_assortativity_coefficient(G_follow_age, 'age')
print(f"Age Assortativity: {age_assortativity}")

Number of nodes: 8847
Number of edges: 35533
Average degree: 8.032779473267775
Average clustering coefficient: 0.03506913540503731
Degree assortativity coefficient: -0.13635769989821428
Age Assortativity: 0.12252669716232956


Ignore-Graph: Key-figures

In [73]:
calculate_key_figures(G_ignore_age, display=True);
age_assortativity = nx.numeric_assortativity_coefficient(G_ignore_age, 'age')
print(f"Age Assortativity: {age_assortativity}")

Number of nodes: 4670
Number of edges: 15160
Average degree: 6.492505353319058
Average clustering coefficient: 0.007868070902570152
Degree assortativity coefficient: -0.2261378257159407
Age Assortativity: -0.015724069469989773


# Visual Analysis

## Assortative Mixing by Degree

### Following-Network: Visualization

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_follow)
plt.savefig(FIG_OUTPUT_DIR / f"follow_by_degree.png", format="PNG")
plt.show()

### Ignore-Network: Visualization

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_ignore)
plt.savefig(FIG_OUTPUT_DIR / f"ignore_by_degree.png", format="PNG")
plt.show()

## Assortative Mixing by a discrete attribute (Gender)

### Following-Network: Visualization

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_follow_gender)
plt.savefig(FIG_OUTPUT_DIR / f"follow_by_gender.png", format="PNG")
plt.show()

### Ignore-Network: Visualization

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_ignore_gender)
plt.savefig(FIG_OUTPUT_DIR / f"ignore_by_gender.png", format="PNG")
plt.show()

## Assortative Mixing by a continuous attribute (Age of the Account)

### Following-Network: Visualization

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_follow_age)
plt.savefig(FIG_OUTPUT_DIR / f"follow_by_age.png", format="PNG")
plt.show()

### Ignore-Network: Visualization

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_ignore_age)
plt.savefig(FIG_OUTPUT_DIR / f"follow_by_age.png", format="PNG")
plt.show()

## Degree Distribution

In [None]:
degree_sequence = sorted([d for n, d in G_follow.in_degree], reverse=True)

In [None]:
# Create the bins for the histogram
min_degree = min(degree_sequence)
max_degree = max(degree_sequence)

# Ensure the minimum and maximum are multiples of 3
min_bin = min_degree - (min_degree % 3)
max_bin = max_degree + (3 - max_degree % 3)

bins = list(range(min_bin, max_bin + 1, 3))  # Create bins from min_bin to max_bin with a step of 3

# Plot the histogram
plt.figure(figsize=(12, 8))
plt.hist(degree_sequence, bins=bins, density=True)  # Specify the custom bins
plt.title("Degree Histogram")
plt.ylabel("Density")
plt.xlabel("Degree")
plt.xticks(bins)  # Set x-ticks to match the bins
plt.show()

Create Log-Log Plot

In [None]:
plt.figure(figsize=(12, 8))
hist, bins = np.histogram(degree_sequence, bins=50, density=True)
logbins = np.logspace(np.log10(bins[0]), np.log10(bins[-1]), len(bins))

plt.plot(logbins[:-1], hist, linestyle='None', marker='.')
plt.xscale('log')  # Set log scale for x-axis
plt.yscale('log')  # Set log scale for y-axis
plt.title("Degree Distribution on Log-Log Scale")
plt.xlabel("Degree (log scale)")
plt.ylabel("Probability Density (log scale)")
plt.show()

# Further Analysis - WIP from here on

We can see that there is a very connected center of users in the graph, while other users do only interact with a few select users and are not connected to the well-connected users.
To further investigate this we take a look at the nodes with higher degrees

In [None]:
# Assuming 'G' is your original NetworkX graph
G_filtered = G_follow.subgraph(node for node, degree in dict(G_follow.degree()).items() if degree >= 5).copy()

In [None]:
fig = plt.figure(figsize=(50,50))
nx.draw_spring(G_filtered)
# plt.savefig(FIG_OUTPUT_PATH / f"ignore_{n_ignore}n.png", format="PNG")
plt.show()

In [None]:
fig = plt.figure(figsize=(50,50))

node_size = [200 * G_filtered.degree(v) for v in G_filtered.nodes()]

nx.draw_spring(G_filtered, node_size=node_size, with_labels=False)
#plt.savefig(FIG_OUTPUT_PATH / f"ignore_{n_ignore}n.png", format="PNG")
plt.show()

# Distributions

In [None]:
'''pos = nx.spring_layout(G_
betCent = nx.betweenness_centrality(G, normalized=True, endpoints=True)
node_color = [20000.0 * G.degree(v) for v in G]
node_size =  [v * 10000 for v in betCent.values()]
plt.figure(figsize=(20,20))
nx.draw_networkx(G, pos=pos, with_labels=False,
                 node_color=node_color,
                 node_size=node_size )
plt.axis('off')
# plt.savefig(FIG_OUTPUT_PATH / "G_sample_5000.png, format="PNG")
plt.show()''';