In [1]:
from collections import Counter
from tqdm import tqdm
import random
import copy

import networkx as nx
import pandas as pd
import numpy as np

In [2]:
def get_nx_graph(edge_list_path):
    edge_list = pd.read_csv(edge_list_path, index_col = 0)
    aev = edge_list.values
    edges_list_t = [(aev[i][0], aev[i][1]) for i in tqdm(range(len(aev)))]
    edge_to_index = {(aev[i][0], aev[i][1]):i for i in tqdm(range(len(aev)))}
    return edge_to_index, nx.DiGraph((x, y) for (x, y) in tqdm(Counter(edges_list_t)))    

def get_subraph(N, source: int, depth_limit: int = 4):
    nodes = list(nx.dfs_preorder_nodes(N, source = source, depth_limit = depth_limit))
    H = N.subgraph(nodes)
    print("Nodes in subgraph: ", len(nodes), "\nEdges in subgraph: ", len(H.edges()))
    return H

### Notebook navigation


[Co-authorship subgraph extraction](#ca_subgraph_extraction)

[Citation subgraph extraction](#citation_graph_extraction)

[Dataset clear](#dataset_clear)

[Final check](#final_check)

[Save data](#save_data)

### Co-authorship subgraph extraction
<a id='ca_subgraph_extraction'></a>

In [92]:
global_dataset = "SSORC_CS_2010_2021"

In [93]:
edge_to_index_A, A = get_nx_graph("processed_data/" + global_dataset + "_authors_edge_list.csv")

100%|██████████| 30796749/30796749 [00:21<00:00, 1414189.08it/s]
100%|██████████| 30796749/30796749 [00:35<00:00, 858394.00it/s]
100%|██████████| 30796749/30796749 [01:15<00:00, 407047.74it/s]


In [94]:
edge_to_index_G, G = get_nx_graph("processed_data/" + global_dataset + "_papers_edge_list_indexed.csv")

100%|██████████| 17921409/17921409 [00:11<00:00, 1524559.81it/s]
100%|██████████| 17921409/17921409 [00:20<00:00, 882486.28it/s]
100%|██████████| 17921409/17921409 [02:25<00:00, 123114.75it/s]


print("Number of connected components in the initial graphs: ", \
      nx.number_connected_components(A.to_undirected()), \
      nx.number_connected_components(G.to_undirected()))

In [None]:
Acc = sorted(nx.connected_components(A.to_undirected()), key=len, reverse=True)

In [None]:
print("Some CC sizes: ", len(Acc[0]), len(Acc[1]), len(Acc[2]))

Co-authorship network edge list:

In [95]:
authors_edges = pd.read_csv("processed_data/" + global_dataset + "_authors_edge_list.csv", index_col = 0)

Co-authorship network edges papers:

In [96]:
authors_edges_papers = pd.read_csv("processed_data/" + global_dataset + "_authors_edges_papers_indices.csv", index_col = 0, \
                                   converters={"papers_indices": lambda x: x.strip("[]").replace("'","").split(", ")})

In [97]:
source = 144092530 #random.choice(list(A.nodes()))
#source = 46812895 #random.choice(list(A.nodes()))
#source = 153425792 #random.choice(list(A.nodes()))
sub_A = get_subraph(A, source, depth_limit = 5) #A.subgraph(list(Acc[1]))
source

Nodes in subgraph:  1437 
Edges in subgraph:  14972


144092530

In [68]:
print("Subgraph A connected components check: ", nx.number_connected_components(sub_A.to_undirected()))

Subgraph A connected components check:  1


In [98]:
sub_A_edges = list(sub_A.edges())
len(sub_A_edges)

14972

Obtaining papers corresponding to co-authorship graph edges:

In [99]:
authors_edges_papers_sub = [authors_edges_papers["papers_indices"][edge_to_index_A[sub_A_edges[i]]] for i in tqdm(range(len(sub_A_edges)))]

100%|██████████| 14972/14972 [00:00<00:00, 51455.76it/s]


Extracting unique papers:

In [100]:
authors_edges_papers_sub_flat = [int(item) for subarray in authors_edges_papers_sub for item in subarray]
unique_papers = list(set(authors_edges_papers_sub_flat))

In [101]:
print("Total amount of mentioned papers: ", len(authors_edges_papers_sub_flat), \
      "\nUnique papers number: ", len(unique_papers))

Total amount of mentioned papers:  20299 
Unique papers number:  3210


### Citation subgraph extraction
<a id='citation_graph_extraction'></a>

In [102]:
len(G.nodes)

2504381

In [103]:
papers_to_delete_initial = list(set(unique_papers) - set(G.nodes))

In [104]:
len(papers_to_delete_initial )

46

In [105]:
G_sub = G.subgraph(unique_papers)

In [106]:
G_sub_nodes = list(G_sub.nodes())

In [107]:
len(G_sub_nodes)

3164

### Dataset clear
<a id='dataset_clear'></a>

Get connected components of initial citation graph:

In [44]:
Gcc = sorted(nx.connected_components(G_sub.to_undirected()), key=len, reverse=True)

In [45]:
print("Some CC sizes: ", len(Gcc[0]), len(Gcc[1]), len(Gcc[2]))

Some CC sizes:  1276 76 69


In [46]:
remnants = 0
for i in range(1, len(Gcc)):
    remnants += len(Gcc[i])
print ("Number of papers out of GCC: ", remnants)

Number of papers out of GCC:  1888


Get id's of papers that are not presented in the largest connected component:

In [108]:
papers_out_lcc = []

In [602]:
for i in tqdm(range(len(G_sub_nodes))):
    if G_sub_nodes[i] not in Gcc[0]:
        papers_out_lcc.append(G_sub_nodes[i])

100%|██████████| 5339/5339 [00:00<00:00, 1732429.91it/s]


In [603]:
print("Total number of papers to delete: ", len(papers_out_lcc))

Total number of papers to delete:  1372


Removing unwanted papers from G_sub:

In [78]:
G_sub_clear = nx.DiGraph(G_sub)

In [605]:
for i in range(len(papers_out_lcc)):
    node = papers_out_lcc[i]
    G_sub_clear.remove_node(node) 

In [606]:
print(nx.number_connected_components(G_sub_clear.to_undirected()))

1


In [65]:
G_sub_clear_nodes = list(G_sub_clear.nodes)

Obtaining list of disappeared collaborations after papers removing:

In [111]:
papers_out_lcc += papers_to_delete_initial

In [112]:
jj = []
collabs_indices_to_delete = []

In [113]:
for i in tqdm(range(len(papers_out_lcc))):
    for j in range(len(authors_edges_papers_sub)):
#        if str(1745104) in authors_edges_papers_sub[j]:
#            jj.append(j)
        if str(papers_out_lcc[i]) in authors_edges_papers_sub[j]:
            del authors_edges_papers_sub[j][authors_edges_papers_sub[j].index(str(papers_out_lcc[i]))]
            if len(authors_edges_papers_sub[j]) == 0:
                collabs_indices_to_delete.append(j)

100%|██████████| 46/46 [00:00<00:00, 212.41it/s]


In [114]:
collabs_indices_to_delete_copy = copy.deepcopy(collabs_indices_to_delete)

In [115]:
len(collabs_indices_to_delete)

106

In [116]:
A_sub_clear = nx.DiGraph(sub_A)

In [117]:
A_sub_clear_edges = list(A_sub_clear.edges())

In [118]:
for i in tqdm(range(len(collabs_indices_to_delete))):
    edge = A_sub_clear_edges[collabs_indices_to_delete[i]]
    if edge not in A_sub_clear_edges:
        print("error")

    A_sub_clear.remove_edge(*edge) 


100%|██████████| 106/106 [00:00<00:00, 6237.58it/s]


In [119]:
authors_edges_papers_sub_clear = [authors_edges_papers_sub[i] for i in range(len(authors_edges_papers_sub)) if len(authors_edges_papers_sub[i]) > 0]

In [120]:
len(A_sub_clear_edges) -  len(collabs_indices_to_delete), len(authors_edges_papers_sub_clear)

(14866, 14866)

### CHECK

In [121]:
A_sub_clear_edges_check = list(A_sub_clear.edges())

In [122]:
len(A_sub_clear_edges), len(A_sub_clear_edges_check)

(14972, 14866)

In [123]:
authors_edges_papers_sub_2 = [authors_edges_papers["papers_indices"][edge_to_index_A[A_sub_clear_edges_check[i]]] for i in tqdm(range(len(A_sub_clear_edges_check)))]

100%|██████████| 14866/14866 [00:00<00:00, 57772.81it/s]


In [124]:
authors_edges_papers_sub_2 = [authors_edges_papers["papers_indices"][edge_to_index_A[A_sub_clear_edges_check[i]]] for i in tqdm(range(len(A_sub_clear_edges_check)))]
authors_edges_papers_sub_flat_2 = [int(item) for subarray in authors_edges_papers_sub_2 for item in subarray]
unique_papers_2 = list(set(authors_edges_papers_sub_flat_2))

100%|██████████| 14866/14866 [00:00<00:00, 72905.78it/s]


In [125]:
len(authors_edges_papers_sub_flat_2)

20151

In [126]:
len(unique_papers_2),len(G_sub_clear_nodes)

(3164, 3164)

In [626]:
set(unique_papers_2).difference(set(G_sub_clear_nodes))

set()

In [627]:
counter = 0
for i in tqdm(range(len(authors_edges_papers_sub_2))):
    if len(authors_edges_papers_sub_2[i]) == 0:
        counter +=1

counter

100%|██████████| 109642/109642 [00:00<00:00, 2720314.46it/s]


0

In [628]:
len(authors_edges_papers_sub_2), len(A_sub_clear_edges_check)

(109642, 109642)

In [629]:
authors_edges_papers_sub = authors_edges_papers_sub_2

In [630]:
counter = 0
for paper in tqdm(unique_papers_2):
    if paper not in G_sub_clear_nodes:
        counter += 1
        print(paper)
counter

100%|██████████| 3967/3967 [00:00<00:00, 40296.25it/s]


0

Delete unwanted authors that remain separated after collaboration removing:

In [631]:
Gcc_a = sorted(nx.connected_components(A_sub_clear.to_undirected()), key=len, reverse=True)

In [632]:
print("Connected compontents in the clear graph: ", [len(Gcc_a[i]) for i in range(len(Gcc_a))])

Connected compontents in the clear graph:  [1897, 4, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [633]:
for i in range(1, len(Gcc_a)):
    authors_2_delete = list(Gcc_a[i])
    for j in range(len(authors_2_delete)):
        node = authors_2_delete[j]
        A_sub_clear.remove_node(node) 

In [40]:
Gcc_a_f = sorted(nx.connected_components(A_sub_clear.to_undirected()), key=len, reverse=True)

In [41]:
print("Connected compontents in the final graph: ", [len(Gcc_a_f[i]) for i in range(len(Gcc_a_f))])

Connected compontents in the final graph:  [1437]


### Final check
<a id='final_check'></a>

In [636]:
A_nc, G_nc = nx.number_connected_components(A_sub_clear.to_undirected()),\
nx.number_connected_components(G_sub_clear.to_undirected()) 
print("Number of connected components in the final graphs: ", \
      A_nc, \
      G_nc)
assert A_nc == 1 and G_nc == 1, "The number of connected components is not equal to 1 in the graphs"

Number of connected components in the final graphs:  1 1


In [37]:
def get_graph_properties(H):
    print("Nodes in in the final subgraph: ", len(H.nodes()), "\nEdges in the final subgraph: ", len(H.edges()))
#    print("Diameter: ", nx.diameter(H.to_undirected()))
    print("Average clustering coefficient: ", nx.average_clustering(H.to_undirected()))
    return len(H.nodes())

In [127]:
G_sub_clear = G_sub

In [130]:
print("Co-authorship graph properties: ")
p1 = get_graph_properties(A_sub_clear)
print("\nCitation graph properties: ")
p2 = get_graph_properties(G_sub_clear)

Co-authorship graph properties: 
Nodes in in the final subgraph:  1437 
Edges in the final subgraph:  14866
Average clustering coefficient:  0.7756888832623524

Citation graph properties: 
Nodes in in the final subgraph:  3164 
Edges in the final subgraph:  4283
Average clustering coefficient:  0.18015433283481017


### Save data
<a id='save_data'></a>

In [128]:
nodes_author, nodes_citation = p1, p2

In [129]:
dataset_name = "SSORC_CS_10_21_" + str(nodes_author) + "_" + str(nodes_citation) + "_unfiltered" 
dataset_name

'SSORC_CS_10_21_1437_3164_unfiltered'

_Recommended format:_ name_of_SSORC_subset (e. g., SSORC_CS_10_21) + _ + co-authorsip graph node number + _ + citation graph node number + _ + fancy latin word

In [131]:
!mkdir datasets/{dataset_name}

mkdir: cannot create directory ‘datasets/SSORC_CS_10_21_1437_3164_unfiltered’: File exists


In [132]:
for node in G_sub_clear.nodes:
    G.add_edge(node, node)

In [133]:
nx.write_edgelist(G_sub_clear, "datasets/" + dataset_name + "/" + dataset_name + "_" + "papers.edgelist")

In [134]:
nx.write_edgelist(A_sub_clear, "datasets/" + dataset_name + "/" + dataset_name + "_" + "authors.edgelist")

In [135]:
authors_edges_papers.to_csv("datasets/" + dataset_name + "/" + dataset_name + "_" + "authors_edges_papers_indices.csv")