# Part 2: Email Behaviour Data Analysis

---

### Install Python packages (pip only)

In [None]:
#e.g., %pip install some-package
%pip install networkx
%pip install matplotlib
%pip install numpy
%pip install json

### Import Python packages

In [None]:
#e.g., import some-package
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import json

---

### Task 1 of 2

Examine the file "emails_cmt224.edgelist" which represents email behaviour at an organisation. Each line contains two numbers, 𝑢 and 𝑣, separated by a blank space. Consider each number as an identifier for an individual in an organisation, with the space on each line representing that the individual, 𝑢, sent at least one email to another individual, 𝑣, at some point. Model the data using an appropriate, directed network representation and answer the following questions:

##### Q1. Do the majority of individuals have a higher or lower ratio of mutual connections than average in the network?

In [None]:
#CODE:
# Load the edge list as a directed graph.
file_path = "emails_cmt224.edgelist"
eg = nx.read_edgelist(file_path, create_using=nx.DiGraph())

# Per-node reciprocity, returned as a dict keyed by node id.
reciprocity_per_node = nx.reciprocity(eg, eg.nodes())
print("Individual Reciprocity for 5 example nodes in G (id, reciprocity):")
print(list(reciprocity_per_node.items())[:5])

# Pull the reciprocity values out of the dict and average them over all nodes.
reciprocity_values = list(reciprocity_per_node.values())
mean_reciprocity = np.mean(reciprocity_values)
print(f"Mean reciprocity: {mean_reciprocity:.2f}")

# Count the nodes strictly above and strictly below the mean; a node exactly
# at the mean falls in neither group.
higher_count = sum(1 for value in reciprocity_values if value > mean_reciprocity)
lower_count = sum(1 for value in reciprocity_values if value < mean_reciprocity)

# Express both counts as a share of all nodes in the graph.
total_num = eg.number_of_nodes()
higher_rate = higher_count / total_num
lower_rate = lower_count / total_num
print(f"The number of higher one: {higher_count}, the rate is: {higher_rate:.2f}")
print(f"The number of lower one: {lower_count}, the rate is: {lower_rate:.2f}")

##### Q2. Using the largest strongly connected component (where at least one path exists between each individual and all others), could the connectivity of the component be suggested to be reflective of a small-world phenomenon in comparison to the typical connectivity of 10 comparative random networks?

In [None]:
#CODE:
# Find the largest strongly connected component. max(..., key=len) picks it
# directly, without sorting every component by size first.
list_of_nodes_in_largest_strongly_connected_component = max(
    nx.strongly_connected_components(eg), key=len
)
# Build an independent subgraph from that component.
LSCC = eg.subgraph(list_of_nodes_in_largest_strongly_connected_component).copy()

# Compare the size of the component with the whole graph: node counts ...
print(f"Number of nodes in eg: {eg.number_of_nodes()}")
print(f"Number of nodes in the largest strongly connected component of eg: {LSCC.number_of_nodes()}")
# ... and edge counts.
print(f"Number of edges in eg: {eg.number_of_edges()}")
print(f"Number of edges in the largest strongly connected component of eg: {LSCC.number_of_edges()}")

# Average shortest path length and average clustering coefficient of the component.
eg_sub_average_shortest_path_length = nx.average_shortest_path_length(LSCC)
eg_sub_average_clustering = nx.average_clustering(LSCC)
print(f"Average shortest path length in eg's largest strongly connected component: {eg_sub_average_shortest_path_length:.2f}")
print(f"Average clustering coefficient in eg's largest strongly connected component: {eg_sub_average_clustering:.2f}")

# Small-world check: build 10 random directed networks with the same number of
# nodes and edges as the component and measure the same two statistics.
lscc_node_count = LSCC.number_of_nodes()
lscc_edge_count = LSCC.number_of_edges()
random_average_shortest_path_length = []
random_average_clustering = []
for i in range(10):
    # Seeding with the loop index keeps the baseline reproducible between runs
    # while still producing 10 different random graphs.
    R = nx.gnm_random_graph(lscc_node_count, lscc_edge_count, seed=i, directed=True)
    # Average shortest path length is measured on the random graph's own
    # largest strongly connected component, mirroring the real network.
    R_LSCC_nodes = max(nx.strongly_connected_components(R), key=len)
    R_LSCC = R.subgraph(R_LSCC_nodes).copy()
    # Store both statistics for this random network.
    random_average_shortest_path_length.append(nx.average_shortest_path_length(R_LSCC))
    random_average_clustering.append(nx.average_clustering(R_LSCC))

# Average the random-network results for comparison with the real component.
mean_random_average_shortest_path_length = np.mean(random_average_shortest_path_length)
mean_random_average_clustering = np.mean(random_average_clustering)
print(f"The mean of average shortest path length in 10 random world: {mean_random_average_shortest_path_length:.2f}")
print(f"The mean of average clustering in 10 random world: {mean_random_average_clustering:.2f}")

##### Q3. Are occurrences of induced, connected subgraphs of 3 individuals (triads) with only mutual connections more abundant in the largest, strongly connected component than those with a mixture of asymmetric and mutual connections? What does this suggest about how mutual connections are distributed in the component?

In [None]:
#CODE:
def calculate_normalised_connected_triadic_census(H):
    """Return the share of each connected triad type in directed graph H.

    The triadic census of H is computed with ``nx.triadic_census``. The three
    triad types that are not connected ("003", "012", "102") are dropped, and
    the remaining counts are scaled to sum to 1 and rounded to 2 decimals.

    Args:
        H: A directed networkx graph.

    Returns:
        dict mapping triad type code (e.g. "300") to its rounded share of all
        connected triads. If H has no connected triads, every share is 0.0.
    """
    # Dict of triad type code -> number of occurrences.
    tc = nx.triadic_census(H)
    # Drop the triad types that are not connected.
    for disconnected_type in ("003", "012", "102"):
        del tc[disconnected_type]

    total = sum(tc.values())
    if total == 0:
        # No connected triads at all: avoid dividing by zero.
        return {triad_type: 0.0 for triad_type in tc}

    # Normalise so the shares sum to 1 (before rounding).
    factor = 1.0 / total
    return {triad_type: round(count * factor, 2) for triad_type, count in tc.items()}

def triads_compared(tc):
    """Print the share of only-mutual triads versus mixed triads.

    Triad type codes follow the M-A-N convention: the first character is the
    number of mutual dyads, the second the number of asymmetric dyads and the
    third the number of null dyads.

    * "Only mutual" triads have at least one mutual dyad and no asymmetric
      dyad. Among connected triads these are "201" and "300".
    * "Mixture" triads have at least one mutual AND at least one asymmetric
      dyad ("111D", "111U", "120D", "120U", "120C", "210").

    Purely asymmetric triads (e.g. "021C", "030T") belong to neither group.
    Disconnected types ("003", "012", "102") are ignored if present.

    Args:
        tc: dict mapping triad type code to its (normalised) share or count.
            Missing types are treated as 0.
    """
    disconnected_types = ("003", "012", "102")
    only_mutual = 0.0
    mixture = 0.0
    for triad_type, share in tc.items():
        if triad_type in disconnected_types:
            continue
        has_mutual = triad_type[0] != "0"
        has_asymmetric = triad_type[1] != "0"
        if has_mutual and not has_asymmetric:
            only_mutual += share
        elif has_mutual and has_asymmetric:
            mixture += share
    # Round after summing so float noise from the additions is not printed.
    print(f"The rate of only mutual: {round(only_mutual, 2)}")
    print(f"The rate of those with a mixture of asymmetric and mutual connections: {round(mixture, 2)}")

# Compute the normalised connected-triad census for LSCC (the subgraph)
triadic_census = calculate_normalised_connected_triadic_census(LSCC)
# Print the census result; pay particular attention to the "300" type
print(triadic_census)
# Compare the triad groups
triads_compared(triadic_census)

---
### Task 2 of 2

Examine the JSON file "emails_cmt224_departments.json" (departments file). Keys in the departments file represent individuals using the same ids as in the "emails_cmt224.edgelist" file in Part 2, Task 1 and the values represent a department id that the individual can be attributed to. Using the contents of the departments file in combination with the network in Part 2, Task 1, answer the following questions:

##### Q1. Using the connections that individuals have in the network, are they more likely to mix with others in their department or those with a similar number of outward connections?

In [None]:
#CODE:
# Load the node-id -> department-id mapping (a dict).
with open("emails_cmt224_departments.json", encoding="utf-8") as json_file:
    departments = json.load(json_file)
print(departments)

# Running totals of edges that join same-department / same-out-degree nodes.
department_connections = 0
similar_connections = 0

# The question is about OUTWARD connections, so compare out-degrees rather
# than total (in + out) degree.
degrees = dict(eg.out_degree())
for node in eg.nodes():
    this_department = departments[node]
    this_degree = degrees[node]
    # For every individual this node emailed, check whether they share the
    # department and whether they have the same number of outward connections.
    for other in eg.neighbors(node):
        if departments[other] == this_department:
            department_connections += 1
        if degrees[other] == this_degree:
            similar_connections += 1
print(f"与同一部门的连接数: {department_connections}")
print(f"与连接数相似的人的连接数: {similar_connections}")

# The raw counts above are not normalised (and "similar" is exact equality),
# so also report assortativity coefficients, which put both kinds of mixing
# on the same [-1, 1] scale. Work on a copy so eg itself is left unannotated.
eg_with_departments = eg.copy()
nx.set_node_attributes(eg_with_departments, departments, "department")
department_assortativity = nx.attribute_assortativity_coefficient(
    eg_with_departments, "department"
)
out_degree_assortativity = nx.degree_assortativity_coefficient(eg, x="out", y="out")
print(f"Department assortativity coefficient: {department_assortativity:.2f}")
print(f"Out-degree assortativity coefficient: {out_degree_assortativity:.2f}")

##### Q2. Are all departments with 15 or more members more tightly connected amongst themselves in comparison to all individuals across the overall network irrespective of their department? In this context, 'more tightly connected' is defined as having more mutual AND clustered connections. In addition to answering the overall question as yes or no, provide a list of departments this is true for (if any) and not true for (if any).

In [None]:
#CODE:
# Function: mean per-node reciprocity of a graph
def computing_reciprocity(G):
    """Return the mean of the per-node reciprocity values of G.

    Per-node values come from ``nx.reciprocity(G, G.nodes())``. A node whose
    reciprocity is reported as ``None`` is counted as 0 in the mean.
    """
    per_node = nx.reciprocity(G, G.nodes())
    cleaned_values = [
        0 if value is None else float(value) for value in per_node.values()
    ]
    return np.mean(cleaned_values)

# Function: build the subgraph for one department
def create_subgraph(dep, dep_dict, G):
    """Return a copy of the subgraph of G induced by the members of ``dep``.

    Args:
        dep: Department id to select.
        dep_dict: Mapping of member id -> department id.
        G: Graph providing ``subgraph(nodes)``; the result is ``.copy()``-ed.
    """
    # Collect every member whose department matches, in dict order.
    members = []
    for person, person_dep in dep_dict.items():
        if person_dep == dep:
            members.append(person)
    # Induce the subgraph on those members and detach it from G.
    return G.subgraph(members).copy()

# Find all departments with at least MIN_DEPARTMENT_SIZE members (the "large"
# departments) and keep the remaining ones separately.
MIN_DEPARTMENT_SIZE = 15

# Count the members of each department.
department_sizes = {}
for department in departments.values():
    department_sizes[department] = department_sizes.get(department, 0) + 1

# Split into large and small departments.
large_departments = {
    dep: count for dep, count in department_sizes.items() if count >= MIN_DEPARTMENT_SIZE
}
small_departments = {
    dep: count for dep, count in department_sizes.items() if count < MIN_DEPARTMENT_SIZE
}

# Whole-network baselines: average clustering and mean per-node reciprocity.
total_clustering = nx.average_clustering(eg)
total_mean_reciprocity = computing_reciprocity(eg)

# Per-department results for the large departments.
large_dep_mean_clustering = []
large_dep_mean_reciprocity = []
# The question asks for a verdict per department, so record which departments
# beat the network on BOTH measures and which do not.
tighter_departments = []
not_tighter_departments = []
for department in large_departments:
    # Subgraph containing only this department's members.
    dep_subgraph = create_subgraph(department, departments, eg)
    # Average clustering and mean reciprocity inside the department.
    this_dep_clustering = nx.average_clustering(dep_subgraph)
    this_reciprocity = computing_reciprocity(dep_subgraph)
    large_dep_mean_clustering.append(this_dep_clustering)
    large_dep_mean_reciprocity.append(this_reciprocity)
    # "More tightly connected" = more mutual AND more clustered than the network.
    if this_dep_clustering > total_clustering and this_reciprocity > total_mean_reciprocity:
        tighter_departments.append(department)
    else:
        not_tighter_departments.append(department)

# Averages across all large departments.
mean_large_dep_clustering = np.mean(large_dep_mean_clustering)
mean_large_dep_reciprocity = np.mean(large_dep_mean_reciprocity)

# Compare the pooled department averages with the whole network.
print(f"大部门的平均聚类系数: {mean_large_dep_clustering:.2f}")
print(f"大部门的平均互惠链接性: {mean_large_dep_reciprocity:.2f}")
print(f"全部图的平均聚类系数: {total_clustering:.2f}")
print(f"全部图的平均互惠链接性: {total_mean_reciprocity:.2f}")

# Overall yes/no answer plus the two department lists.
all_large_departments_tighter = bool(large_departments) and not not_tighter_departments
print(f"All large departments more tightly connected than the overall network: {'yes' if all_large_departments_tighter else 'no'}")
print(f"Departments more tightly connected: {tighter_departments}")
print(f"Departments not more tightly connected: {not_tighter_departments}")