# Selection Sampling
In this section, articles are selected from a prepared Arepo (Article Repository) according to a given criterion; a graph is then built from the selected articles and analyzed.
In this example, the Arepo is a MongoDB database.

## Import Libraries


In [67]:
import hashlib
import itertools
import json
import re
import sys
import time

import click
import networkx as nx
from pymongo import MongoClient

from triplea.config.settings import SETTINGS
from triplea.service.click_logger import logger
from triplea.schemas.article import Article
from triplea.schemas.node import Node
from triplea.service.graph.analysis.info import info
import triplea.service.repository.persist as persist
import triplea.service.graph.export.export as gexport
import triplea.service.graph.analysis.ganalysis as ganaliz


## Prepare Connection String

In [41]:
# Connection details are read from the triplea settings object rather than
# being written into the notebook.
connection_url = SETTINGS.AAA_MONGODB_CONNECTION_URL
database_name = SETTINGS.AAA_MONGODB_DB_NAME

# client -> database -> collection; `col_article` is the handle queried by
# the selection cell below.
client = MongoClient(connection_url)
db = client[database_name]
col_article = db["articledata"]

## Select Specific Article

In [61]:
# Select every article whose "Topics" field contains "biobank".
# The match is case-insensitive, so a single pattern covers "biobank",
# "Biobank", "UK Biobank", ... A regex filter is an unanchored search, so no
# leading/trailing ".*" is needed.
myquery = {"Topics": re.compile('biobank', re.IGNORECASE)}

# Only the PMID is needed downstream, so project everything else away.
cursor = col_article.find(myquery, projection={"PMID": "$PMID", "_id": 0})

# PMIDs of the selected articles, in the order the cursor returns them.
l_pmid = [doc['PMID'] for doc in cursor]
logger.DEBUG(f"{len(l_pmid)} Article(s) Selected.")

[32m1917 Article(s) Selected.[0m


## Generate Graph From Selected Article

In [None]:
from triplea.schemas.node import Edge
from triplea.service.nlp.topic_extract import extract_textrank

# Build a graph dictionary from the selected articles:
#   * one "Article" node per PMID,
#   * one "Topic" node per phrase returned by extract_textrank() for the
#     article abstract whose rank is above MIN_TOPIC_RANK,
#   * one "TOPIC" edge Article -> Topic, weighted by the phrase rank.
# Topic nodes are appended once per article that mentions them, so `nodes`
# may contain repeated topic entries.
# NOTE(review): presumably the NetworkX export merges repeated identifiers -
# confirm against export_networkx_from_graphdict.

# Phrases ranked at or below this value are ignored.
MIN_TOPIC_RANK = 0.08

total_article_in_current_state = len(l_pmid)
number_of_article_move_forward = 0

refresh_point = 0
nodes = []
edges = []
for pmid in l_pmid:
    try:
        number_of_article_move_forward = number_of_article_move_forward + 1

        # Progress report: emitted each time the counter reaches 500, then reset.
        if refresh_point == 500:
            refresh_point = 0
            print()
            logger.INFO(
                f"There are {str(total_article_in_current_state - number_of_article_move_forward)} article(s) left ... ",
                forecolore="yellow",
            )
        else:
            refresh_point = refresh_point + 1

        raw_article = persist.get_article_by_pmid(pmid)
        try:
            updated_article = Article(**raw_article.copy())
        except Exception as parse_error:
            print()
            # The logger emits the message itself, so it is not wrapped in print().
            logger.ERROR(f"Error in parsing article. PMID = {pmid}")
            # Keep the original failure attached as the cause.
            raise Exception("Article Not Parsed.") from parse_error

        # ------------------ Extract Topic Graph
        node_article = Node()
        node_article.Identifier = updated_article.PMID
        node_article.Name = updated_article.PMID
        node_article.Type = "Article"
        nodes.append(node_article.dict())

        abstract = updated_article.Abstract
        if abstract is None or abstract == "":
            # No abstract: the article stays in the graph as a node without topics.
            continue

        topic_list_phrase = extract_textrank(abstract)
        if topic_list_phrase is None:
            continue

        for t in topic_list_phrase:
            if t.rank > MIN_TOPIC_RANK:
                # Topics are identified by their lower-cased text, so the same
                # phrase found in different articles shares one identifier.
                node_topic = Node()
                node_topic.Identifier = t.text.lower()
                node_topic.Name = t.text.lower()
                node_topic.Type = "Topic"
                nodes.append(node_topic.dict())

                edge = Edge()
                edge.SourceID = node_article.Identifier
                edge.DestinationID = node_topic.Identifier
                edge.Type = "TOPIC"
                edge.Weight = t.rank
                # The built-in hash() of a str is salted per interpreter
                # process, so it cannot serve as a reproducible identifier;
                # a SHA-256 digest of the same key is stable across runs.
                edge_key = edge.SourceID + edge.DestinationID + edge.Type
                edge.HashID = hashlib.sha256(edge_key.encode("utf-8")).hexdigest()
                edges.append(edge.dict())
        # ------------------ Extract Topic Graph
    except Exception:
        # Best effort: report the failing article and carry on with the next one.
        exc_type, exc_value, exc_tb = sys.exc_info()
        print()
        print(exc_tb.tb_lineno)
        logger.ERROR(f"Error {exc_type}")
        logger.ERROR(f"Error {exc_value}")

graphdict = {"nodes": nodes, "edges": edges}
logger.INFO("Graph Generated.")

## Convert GraphDict Format to NetworkX

In [63]:
# Turn the node/edge dictionary into a directed NetworkX graph, then print
# the summary report for it.
G = gexport.export_networkx_from_graphdict(
    graphdict,
    graph_type="directed",
)
info(G)

Report Time : 1687162679.4307654
Elapsed Time Calculation Report : 0.4747304916381836
Graph Type: Directed
Graph Nodes: 12737
Graph Edges: 16588
Graph Average Degree : 1.302347491560022
Graph Density : 0.00010225718369660977
Graph Transitivity : 0
Graph max path length : 1
Graph Average Clustering Coefficient : 0.0
Graph Degree Assortativity Coefficient : -0.06897886758729954
Graph Radius : NaN Found infinite path length because the digraph is not strongly connected
SCC: 12737
WCC: 52
Reciprocity : 0.0
Graph Diameter : Can not calculate in directed graph.
Number of Components : Can not calculate in directed graph.


In [79]:
# Rank the nodes by degree centrality and keep the 80 highest-scoring ones
# (ties resolved by first occurrence).
dcs = ganaliz.sorted_degree_centrality(G).nlargest(n=80, keep='first')
print(dcs)

# Keys of the top-80 result, i.e. the names of the highest-ranked nodes.
rl = list(dcs.to_dict())

uk biobank               0.183360
biobanks                 0.142870
research                 0.087557
biobanking               0.069860
biobank                  0.063838
                           ...   
health research          0.008431
biobank participation    0.008339
biobank governance       0.008061
rare variants            0.008061
women                    0.007968
Length: 80, dtype: float64


## Remove Isolated Node

In [70]:
import networkx as nx

# Materialise the isolated nodes before removing anything, because G is
# modified in place by remove_nodes_from().
isolates = [node for node in nx.isolates(G)]
isolate_count = len(isolates)
print(f"Number of isolated nodes : {isolate_count}")

G.remove_nodes_from(isolates)
info(G)

Number of isolated nodes : 0
Report Time : 1687163441.2271411
Elapsed Time Calculation Report : 1.871528148651123
Graph Type: Directed
Graph Nodes: 10820
Graph Edges: 64290
Graph Average Degree : 5.94177449168207
Graph Density : 0.0005491981229024929
Graph Transitivity : 0.06775609088500188
Graph max path length : NaN
Graph Average Clustering Coefficient : 0.47533769082157207
Graph Degree Assortativity Coefficient : -0.06145435727843715
Graph Radius : NaN Found infinite path length because the digraph is not strongly connected
SCC: 5865
WCC: 52
Reciprocity : 0.02093638201897651
Graph Diameter : Can not calculate in directed graph.
Number of Components : Can not calculate in directed graph.


## Remove low degree node

In [76]:
# Prune weakly connected nodes from G in place:
#   * Topic nodes with fewer than 3 incident edges,
#   * Article nodes with exactly 1 incident edge.
# The degree is looked up at the moment each node is visited, so earlier
# removals in this same pass influence later decisions.
removed_topics = 0
removed_articles = 0

# Iterate over a snapshot because nodes are removed during the loop.
for name, attrs in list(G.nodes(data=True)):
    current_degree = G.degree(name)
    kind = attrs['Type']
    if kind == 'Topic':
        if current_degree < 3:
            removed_topics += 1
            G.remove_node(name)
    elif kind == 'Article':
        if current_degree == 1:
            removed_articles += 1
            G.remove_node(name)

print(f"Remove {removed_topics} Keywords and {removed_articles} Articles")
info(G)

Remove 0 Keywords and 0 Articles
Report Time : 1687163534.9168267
Elapsed Time Calculation Report : 1.7365355491638184
Graph Type: Directed
Graph Nodes: 10794
Graph Edges: 64252
Graph Average Degree : 5.952566240503984
Graph Density : 0.0005515210081074755
Graph Transitivity : 0.06796540843070914
Graph max path length : NaN
Graph Average Clustering Coefficient : 0.4753863440616496
Graph Degree Assortativity Coefficient : -0.061390302383558214
Graph Radius : NaN Found infinite path length because the digraph is not strongly connected
SCC: 5843
WCC: 50
Reciprocity : 0.020948764240801842
Graph Diameter : Can not calculate in directed graph.
Number of Components : Can not calculate in directed graph.


## Generate Topic Graph

### Add Degree to each node

In [65]:
# Store each node's current degree as a 'degree' node attribute; the drawing
# cell reads it to size the nodes.
for name, node_degree in G.degree():
    G.nodes[name]['degree'] = node_degree

### Remove Article Node

In [69]:
# Collapse the Article -> Topic graph into a topic-only graph, in place:
# every Article node is removed and each pair of its neighbours is linked
# directly, so two topics become connected when they share an article.
# G is directed, so neighbors() yields the article's successors, and each new
# edge points from the earlier neighbour to the later one in that listing.

# Iterate over a snapshot because nodes are removed during the loop.
for node, attrs in list(G.nodes(data=True)):
    if attrs['Type'] != 'Article':
        continue
    # Capture the neighbours before the node (and its edges) disappears.
    neighbors = list(G.neighbors(node))
    G.remove_node(node)
    # combinations() yields (neighbors[i], neighbors[j]) for i < j, in order.
    G.add_edges_from(itertools.combinations(neighbors, 2))

info(G)

Report Time : 1687163281.1617606
Elapsed Time Calculation Report : 1.778926134109497
Graph Type: Directed
Graph Nodes: 10820
Graph Edges: 64290
Graph Average Degree : 5.94177449168207
Graph Density : 0.0005491981229024929
Graph Transitivity : 0.06775609088500188
Graph max path length : NaN
Graph Average Clustering Coefficient : 0.47533769082157207
Graph Degree Assortativity Coefficient : -0.06145435727843715
Graph Radius : NaN Found infinite path length because the digraph is not strongly connected
SCC: 5865
WCC: 52
Reciprocity : 0.02093638201897651
Graph Diameter : Can not calculate in directed graph.
Number of Components : Can not calculate in directed graph.


### Draw with edge

In [None]:
import matplotlib.pyplot as plt

# Node positions from the Kamada-Kawai layout.
layout = nx.kamada_kawai_layout(G)

fig = plt.figure(figsize=(16, 16))

# Each node is sized by the 'degree' attribute stored on it earlier.
node_sizes = [attrs['degree'] for _, attrs in G.nodes(data=True)]
nx.draw_networkx(G, pos=layout, node_size=node_sizes)

plt.show()
fig.savefig('output.jpg', bbox_inches='tight')

### Export Json File for VOSviewer
VOSviewer can draw the topic graph of the selected articles. The following cell produces a JSON file that can be imported into VOSviewer.

In [77]:
# Export G as a VOSviewer network JSON file ("VOS.json"):
#   {"network": {"items": [{"id", "label"}, ...],
#                "links": [{"source_id", "target_id"}, ...]}}

# Give every node a sequential integer id, stored as the 'id' node attribute,
# so links can refer to nodes by number.
for vos_id, node in enumerate(G.nodes):
    G.nodes[node]['id'] = vos_id

# One item per node; nodes without a 'Name' attribute get the label "NAN".
items = [
    {'id': attrs['id'], 'label': attrs.get('Name', "NAN")}
    for _, attrs in G.nodes(data=True)
]

# One link per edge, expressed with the integer ids assigned above.
links = [
    {'source_id': G.nodes[source]['id'], 'target_id': G.nodes[target]['id']}
    for source, target in G.edges()
]

vos_network = {'network': {'items': items, 'links': links}}

with open("VOS.json", "w", encoding="utf-8") as file:
    json.dump(vos_network, file)

![](../docs/assets/img/topic-graph-biobank.png)