# Network analysis

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.error import HTTPError

import pandas as pd
import numpy as np

from threading import Thread
from numba import jit

# from progressbar import ProgressBar
# from jyquickhelper import add_notebook_menu

In [3]:
# Render the notebook's table of contents.
# NOTE(review): the only import of add_notebook_menu in this notebook is commented out in the
# imports cell, so on a fresh kernel this call raises NameError -- restore that import or drop this cell.
add_notebook_menu()

## Crawl the domain

Structure des url

- *root* : `https://ideas.repec.org/`
- *article* : `a/`
- *editeur* : `xxx/` (ex: `oup` - Oxford university Press)
- *journal* : `xxxxxx/` (ex: `qjecon` - Quarterly Journal of Economics)
- *id* : `xxxx.html` 


| Structure | Exemple | Contenu |
|:----------|:--------|:--------|
|*root* | <https://ideas.repec.org/>| page d'accueil |
|~ + *article* | <https://ideas.repec.org/a/>| liste des répertoires d'éditeurs |
|~ + *editeur* | <https://ideas.repec.org/a/oup/>| liste des répertoires de journaux |
|~ + *journal* | <https://ideas.repec.org/a/oup/qjecon/>| liste des articles du journal |
|~ + *id*      | <https://ideas.repec.org/a/oup/qjecon/v1y1886i1p1-27..html> | page d'article |


Pour passer en revue l'ensemble des articles, il suffit donc de collecter l'ensemble des urls d'articles  en explorant successivement ces différentes couches. 

In [2]:
# Crawl configuration: article URLs are built as root + article + <relative path>.
article = "a/"                      # sub-directory holding the article listings
root = "https://ideas.repec.org/"   # site entry point

### Get editors

In [197]:
# Collect the publisher directories linked from the articles index page.
# A link is kept purely on its length: four characters, e.g. "oup/".
ed_list = []
pbar = ProgressBar()

html = urlopen(root + article)
bsObj = BeautifulSoup(html, "lxml")

for ed in pbar(bsObj.findAll({"a": "href"})):
    href = ed.attrs["href"]
    if len(href) != 4:
        continue
    ed_list.append(href)

### Get journals

In [198]:
# Collect the journal directories of every publisher.

journ_list = []      # journal directory alone, e.g. "qjecon/"
edjourn_list = []    # publisher + journal path, e.g. "oup/qjecon/"
pbar = ProgressBar()

for ed in pbar(ed_list):
    html = urlopen(root + article + ed)
    bsObj = BeautifulSoup(html, "lxml")

    # A link is kept purely on its length: seven characters, e.g. "qjecon/".
    for journ in bsObj.findAll({"a": "href"}):
        href = journ.attrs["href"]
        if len(href) == 7:
            journ_list.append(href)
            edjourn_list.append(ed + href)


100% |########################################################################|


### Get articles

In [209]:
# Get articles list
art_list = []          # article file names
edjournart_list = []   # publisher/journal/article paths, relative to root + article
pbar = ProgressBar()
i = 0                  # number of journal pages that answered with an HTTP error

for edj in pbar(edjourn_list):
    try:
        html = urlopen(root + article + edj)
    except HTTPError:
        # Count the failure and move on to the next journal. Without the `continue`
        # the stale `html` of a previous request would be handed to the parser
        # (or a NameError raised if no request had succeeded yet).
        i += 1
        continue

    bsObj = BeautifulSoup(html, "lxml")
    for art in bsObj.findAll({"a": "href"}):
        # The selector matches every <a> tag, so tolerate anchors without an href.
        href = art.attrs.get("href", "")
        if ".html" in href:
            art_list.append(href)
            edjournart_list.append(edj + href)

100% |########################################################################|


In [210]:
# Summary of the crawl
summary = "There are {0} editors, {1} journals and {2} articles"
print(summary.format(len(ed_list), len(journ_list), len(art_list)))

There are 655 editors, 3024 journals and 1678916 articles


In [212]:
# Save lists (.csv): one value per line, without index or header row.

lists_to_save = [
    ('IR_scrap/ed_list.csv', ed_list),
    ('IR_scrap/journ_list.csv', journ_list),
    ('IR_scrap/art_list.csv', art_list),
    ('IR_scrap/edjourn_list.csv', edjourn_list),
    ('IR_scrap/edjournart_list.csv', edjournart_list),
]
for csv_path, values in lists_to_save:
    pd.Series(values).to_csv(csv_path, index=False, header=False)

## Parse article pages

In [5]:
# Load dataset: the saved article paths, returned as a flat array
edjournart_list = (
    pd.read_csv('IR_scrap/edjournart_list.csv', header=None)
    .values
    .flatten()
)

In [6]:
# Peek at the first five article paths
edjournart_list[:5]

array(['abp/hehehe/v1y1998i1p3-30.html', 'abp/hehehe/v1y1998i1p31-63.html',
       'abp/hehehe/v1y1998i1p65-87.html',
       'abp/hehehe/v1y1998i1p89-108.html',
       'abp/hehehe/v1y1998i1p109-44.html'], dtype=object)

### Get references

In [2]:
def get_refs(eja, root="https://ideas.repec.org/a/"):
    """
    Return the references listed on the page of one article.

    [in]  eja  : article path relative to `root`
                 (e.g. "oup/qjecon/v132y2017i4p1553-1592..html")
          root : URL prefix prepended to `eja`
    [out] 1-D object array of [eja, id_ref] pairs, where id_ref is the
          "publisher/journal/id.html" path rebuilt from each handle of the
          references tab; None when the page answers with an HTTPError,
          has no references tab, or lists no usable handle.
    """
    try:
        # The response is closed as soon as the page has been parsed.
        with urlopen(root + eja) as html:
            bsObj = BeautifulSoup(html, "lxml")
    except HTTPError:
        return None

    try:
        handles = bsObj.find("div", {"aria-labelledby": "refs-tab"})\
            .find("input").attrs["value"].split("#")
    except (AttributeError, KeyError):
        # tab or <input> missing (AttributeError), or <input> without a value (KeyError)
        return None

    pairs = []
    for handle in handles:
        parts = handle.split(":")
        if len(parts) < 3:
            # e.g. an empty value attribute: there is no publisher/journal to rebuild
            continue
        # "<prefix>:<publisher>:<journal>:<id pieces...>" -> "publisher/journal/id.html"
        pairs.append([eja, parts[1] + "/" + parts[2] + "/" + "".join(parts[3:]) + ".html"])

    if not pairs:
        return None
    # A Series of lists keeps the result a flat object array of pairs.
    return pd.Series(pairs).values

### Get citations

In [3]:
def get_cits(eja, root="https://ideas.repec.org/a/"):
    """
    Return the citations pointing to one article, as listed on its page.

    [in]  eja  : article path relative to `root`
                 (e.g. "oup/qjecon/v132y2017i4p1553-1592..html")
          root : URL prefix prepended to `eja`
    [out] 1-D object array of [eja, id_cit] pairs, where id_cit is the
          "publisher/journal/id.html" path rebuilt from each handle of the
          citations tab; None when the page answers with an HTTPError,
          has no citations tab, or lists no usable handle.
    """
    try:
        # The response is closed as soon as the page has been parsed.
        with urlopen(root + eja) as html:
            bsObj = BeautifulSoup(html, "lxml")
    except HTTPError:
        return None

    try:
        handles = bsObj.find("div", {"aria-labelledby": "cites-tab"})\
            .find("input").attrs["value"].split("#")
    except (AttributeError, KeyError):
        # tab or <input> missing (AttributeError), or <input> without a value (KeyError)
        return None

    pairs = []
    for handle in handles:
        parts = handle.split(":")
        if len(parts) < 3:
            # e.g. an empty value attribute: there is no publisher/journal to rebuild
            continue
        # "<prefix>:<publisher>:<journal>:<id pieces...>" -> "publisher/journal/id.html"
        pairs.append([eja, parts[1] + "/" + parts[2] + "/" + "".join(parts[3:]) + ".html"])

    if not pairs:
        return None
    # A Series of lists keeps the result a flat object array of pairs.
    return pd.Series(pairs).values

In [9]:
# Usage example: references and citations of a single article
eja = "oup/qjecon/v132y2017i4p1553-1592..html"
refs_list, cits_list = get_refs(eja), get_cits(eja)
print("list of references \n", refs_list[:5])
print("list of citations \n", cits_list[:5])

list of references 
 [ ['oup/qjecon/v132y2017i4p1553-1592..html', 'bla/randje/v47y2016i3p463-497.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'aea/jecper/v24y2010i2p3-30.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'aea/jeclit/v48y2010i2p356-98.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'ucp/jpolec/doi10.1086/666588.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'ecm/emetrp/v63y1995i4p841-90.html']]
list of citations 
 [['oup/qjecon/v132y2017i4p1553-1592..html', 'nbr/nberwo/20695.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'eee/indorg/v43y2015icp189-207.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'bla/randje/v46y2015i3p625-649.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'feb/framed/00392.html']]


### Get stack

In [4]:
def get_stack(itr_list, i, j, meth):
    """
    Pile up the references/citations of a run of articles.

    [in]  itr_list : sequence of article paths, as accepted by get_refs / get_cits
          i, j     : bounds of the slice itr_list[i:j] to process (j is excluded)
          meth     : "ref" for references
                     "cit" for citations
    [out] 1-D array stacking the [id_art, id_ref_or_cit] pairs of every
          processed article, in input order; an empty array when nothing
          was collected.
    [raises] ValueError when meth is neither "cit" nor "ref".
    """
    valid = {"cit", "ref"}
    if meth not in valid:
        raise ValueError("results: meth must be one of %r." % valid)

    fetch = get_cits if meth == "cit" else get_refs

    chunks = []
    for eja in itr_list[i:j]:
        # Fetch each page exactly once: every call is a full HTTP round trip.
        pairs = fetch(eja)
        if pairs is None:
            continue
        if np.ndim(pairs) != 1:
            # Only flat arrays of pairs can be stacked together; anything else is
            # skipped rather than aborting the whole run.
            continue
        chunks.append(pairs)

    if not chunks:
        return np.empty(0)
    # A single concatenation at the end avoids re-copying the stack for every article.
    return np.concatenate(chunks, axis=0)

In [5]:
import time

# Local working directories (machine-specific absolute Windows paths).
path = "C://Users//Dimitri//Desktop//ENSAE3A//NetworkData//"
out_path = path + "Output//"

In [6]:
# Article paths to crawl: first column of the headerless CSV, as a Python list
sub_paths_frame = pd.read_csv(path + "sub_edjournart.csv", header=None)
edjournart_list = sub_paths_frame[0].tolist()

In [7]:
# Cut the article positions into n_splits contiguous batches
n_splits = 20
ind_list = list(range(len(edjournart_list)))
# ind_list = list(range(10000, 10100))
splits = np.array_split(ind_list, n_splits)

In [8]:
# Crawl the references batch by batch and write one CSV per batch.
# The loop starts at batch 15; batches 0-14 are not processed by this cell.
for i in range(15, n_splits):
    if len(splits[i]) == 0:
        # np.array_split yields empty batches when there are fewer articles than batches
        continue
    # get_stack processes itr_list[start:stop], so `stop` must be one past the last
    # position of the batch; passing splits[i][-1] itself would leave the last
    # article of every batch out.
    refs = get_stack(edjournart_list, splits[i][0], splits[i][-1] + 1, "ref")
    pd.Series(refs).to_csv(out_path + "refs_" + str(i) + ".csv")

In [11]:
# Usage example: stack the references, then the citations, of the first 100 articles
test_ref = get_stack(edjournart_list, 0, 100, meth="ref")
test_cit = get_stack(edjournart_list, 0, 100, meth="cit")

In [12]:
# Show the first five stacked pairs of each kind
print("References",test_ref[:5])
print("Citations",test_cit[:5])

References [['acb/agenda/v1y1994i1p5-12.html', 'ags/joagco/120726.html']
 ['acb/agenda/v1y1994i1p5-12.html', 'sae/envira/v25y1993i12p1853-1856.html']
 ['acb/agenda/v1y1994i1p5-12.html', 'cep/cepdps/dp0174.html']
 ['acb/agenda/v1y1994i1p5-12.html', 'ags/ajaeau/22738.html']
 ['acb/agenda/v1y1994i1p5-12.html', 'ags/remaae/9621.html']]
Citations [['acb/agenda/v1y1994i1p5-12.html', 'acb/agenda/v4y1997i1p101-111.html']
 ['acb/agenda/v1y1994i1p13-23.html', 'acb/agenda/v2y1995i2p233-240.html']
 ['acb/agenda/v1y1994i1p13-23.html', 'acb/agenda/v1y1994i2p167-178.html']
 ['acb/agenda/v1y1994i1p71-79.html', 'acb/agenda/v1y1994i2p253-255.html']
 ['acb/agenda/v1y1994i1p71-79.html', 'acb/agenda/v1y1994i2p250-253.html']]


Avec `if get_refs(eja) is not None:`

    CPU times: user 4.41 s, sys: 128 ms, total: 4.54 s
    Wall time: 31.2 s
    
En précompilant (`@jit`)

    CPU times: user 4.41 s, sys: 121 ms, total: 4.53 s
    Wall time: 22.5 s

### Get attributes

In [6]:
def _meta_content(soup, name):
    """Return the `content` of the <meta name=...> tag, or np.nan when unavailable."""
    try:
        return soup.find("meta", {"name": name}).attrs["content"]
    except (AttributeError, KeyError):
        # no such tag (find returned None) or tag without a content attribute
        return np.nan


def get_attrs(url):
    """
    Return the attributes of interest of one article page.

    [in]  url : full address of the article page
    [out] tuple (url, title, authors, date, jel_code, keywords), each value
          being read from the matching <meta> tag of the page and replaced
          by np.nan when the tag is unavailable;
          None when the page answers with an HTTPError.
    """
    try:
        # The response is closed as soon as the page has been parsed.
        with urlopen(url) as html:
            bsObj = BeautifulSoup(html, "lxml")
    except HTTPError:
        return None

    title = _meta_content(bsObj, "citation_title")
    authors = _meta_content(bsObj, "citation_authors")
    date = _meta_content(bsObj, "date")
    jel_code = _meta_content(bsObj, "jel_code")
    keywords = _meta_content(bsObj, "keywords")

    return url, title, authors, date, jel_code, keywords

In [14]:
# Usage example: attributes of the first 100 articles
attrs = [get_attrs(root + article + eja) for eja in edjournart_list[:100]]

Pour 100 articles :

    CPU times: user 3.68 s, sys: 162 ms, total: 3.84 s
    Wall time: 20.1 s
    
ie approx 95 heures pour 1,7 millions ...

Options :

- multithreading
- sélection
- mixte des deux

In [15]:
# Additional attributes derived from the article URL and the publication date

# get_attrs returns None for pages that answered with an HTTP error; such entries
# cannot be laid out on the six columns below, so they are left out.
attr_rows = [row for row in attrs if row is not None]
db_attrs = pd.DataFrame(attr_rows,
                        columns=["url", "title", "authors", "date", "jel_code", "keywords"])

# URLs look like https://ideas.repec.org/a/<editor>/<journal>/<article_id>
url_parts = db_attrs.url.str.split("/")
db_attrs["editor"] = url_parts.str[4]
db_attrs["journal"] = url_parts.str[5]
db_attrs["article_id"] = url_parts.str[-1]
db_attrs["year"] = pd.DatetimeIndex(db_attrs.date).year

In [16]:
# Preview of the dataset
db_attrs.head()

Unnamed: 0,url,title,authors,date,jel_code,keywords,editor,journal,article_id,year
0,https://ideas.repec.org/a/abp/hehehe/v1y1998i1...,Tavares Bastos e a questão agrária no Império,Ligia Osorio Silva,1998-02-02,,,abp,hehehe,v1y1998i1p3-30.html,1998
1,https://ideas.repec.org/a/abp/hehehe/v1y1998i1...,O mercado de trabalho mineiro no século XIX,Sérgio de Oliveira Birchal,1998-02-02,,,abp,hehehe,v1y1998i1p31-63.html,1998
2,https://ideas.repec.org/a/abp/hehehe/v1y1998i1...,Encilhamento: controvérsia e efeitos sobre a i...,Maria Teresa Ribeiro de Oliveira,1998-02-02,,,abp,hehehe,v1y1998i1p65-87.html,1998
3,https://ideas.repec.org/a/abp/hehehe/v1y1998i1...,Entre a lavoura e a indústria: tensões e polêm...,Maria Izilda Santos de Matos,1998-02-02,,,abp,hehehe,v1y1998i1p89-108.html,1998
4,https://ideas.repec.org/a/abp/hehehe/v1y1998i1...,A Sadia e o pioneirismo industrial na agroindú...,Armando Dalla Costa,1998-02-02,,,abp,hehehe,v1y1998i1p109-44.html,1998


## Production

### Subset

In [17]:
def get_rankj(url, lb):
    """
    Return the "publisher/journal" paths of the first journals listed in the
    ranking tab of a ranking page.

    [in]  url : address of the ranking page
          lb  : collection stops once more than `lb` names were gathered,
                i.e. up to lb + 1 paths are returned
                NOTE(review): confirm whether lb + 1 (rather than lb) is intended.
    [out] 1-D object array of "publisher/journal" strings, in page order.
          Empty when the page has no ranking tab; shorter than lb + 1 when
          the tab does not hold that many named anchors.
    """
    with urlopen(url) as html:
        bsObj = BeautifulSoup(html, "lxml")

    ranking_tab = bsObj.find("div", {"aria-labelledby": "ranking-tab"})
    if ranking_tab is None:
        # Nothing to collect; retrying the lookup in a loop would never terminate.
        return np.array([], dtype=object)

    rankj_list = []
    # The anchors are looked up once; the first anchor of the tab is not considered.
    for anchor in ranking_tab.findAll("a")[1:]:
        if len(rankj_list) > lb:
            break
        handle = anchor.attrs.get("name")
        if handle is not None:   # anchors without a `name` attribute are skipped
            rankj_list.append(handle)

    # "<prefix>:<publisher>:<journal>" -> "publisher/journal"
    return np.array([h.split(":")[1] + "/" + h.split(":")[2] for h in rankj_list],
                    dtype=object)

In [18]:
# Get the 30 best ranked journals
#rankj_30=get_rankj("https://ideas.repec.org/top/top.journals.all.html",30)
#pd.Series(rankj_30).to_csv('IR_scrap/rankj_30.csv', index=False, header=False)

In [26]:
# Load rankj_30: the saved journal paths, as a flat array
rankj_30 = (
    pd.read_csv('IR_scrap/rankj_30.csv', header=None)
    .values
    .flatten()
)

In [23]:
# edjournart_db: the article paths as a one-column frame named "eja"
edjournart_db = pd.read_csv(
    'IR_scrap/edjournart_list.csv',
    header=None,
    names=["eja"],
)
edjournart_db.head()

Unnamed: 0,eja
0,abp/hehehe/v1y1998i1p3-30.html
1,abp/hehehe/v1y1998i1p31-63.html
2,abp/hehehe/v1y1998i1p65-87.html
3,abp/hehehe/v1y1998i1p89-108.html
4,abp/hehehe/v1y1998i1p109-44.html


In [28]:
# Subset of articles (only those published in one of the 30 most renowned journals)

# Row positions of the matching articles, gathered journal by journal in the order of rankj_30.
hits = [np.empty(0)]   # float seed: the stack is float until the final integer cast
for journal in rankj_30:
    is_match = edjournart_db.eja.str.contains(journal) == True
    hits.append(np.where(is_match)[0])
sub_eja = np.concatenate(hits).astype(int)

In [29]:
# Subset of interest
# sub_eja holds row positions computed on edjournart_db, so that frame is indexed here.
# The name `edjournart_list` is avoided on purpose: another cell re-binds it to a plain
# Python list, which cannot be indexed with an array of positions.
sub_edjournart_list = edjournart_db.eja.values[sub_eja]
pd.Series(sub_edjournart_list).to_csv('IR_scrap/sub_edjournart.csv', index=False, header=False)

In [33]:
# Load sub_edjournart: the saved subset of article paths, as a flat array
sub_edjournart = (
    pd.read_csv('IR_scrap/sub_edjournart.csv', header=None)
    .values
    .flatten()
)

### Multithreading

In [7]:
# Thread for collecting attributes
class thrd_attrs(Thread):
    """Worker thread that fetches the attributes of a list of articles."""

    def __init__(self, itr_list):
        super().__init__()
        self.itr_list = itr_list   # article paths, relative to root + article
        self.attrs = []            # one get_attrs result per processed article

    def run(self):
        """Call get_attrs on every article URL and append each result to self.attrs."""
        root = "https://ideas.repec.org/"
        article = "a/"
        prefix = root + article

        for pos in range(len(self.itr_list)):
            self.attrs.append(get_attrs(prefix + self.itr_list[pos]))

In [8]:
# Thread for collecting references / citations

class thrd_stack(Thread):
    """Worker thread that stacks the references or citations of a list of articles."""

    def __init__(self, itr_list, meth):
        super().__init__()
        self.itr_list = itr_list
        self.meth = meth            # forwarded to get_stack as its `meth` argument
        self.stack = np.empty(1)    # placeholder until run() has completed

    def run(self):
        """Process the whole list with get_stack and keep the result in self.stack."""
        n_items = len(self.itr_list)
        self.stack = get_stack(self.itr_list, 0, n_items, self.meth)

### Here we are

In [9]:
# Load sub_edjournart and derive the size of one tenth of it
sub_edjournart = (
    pd.read_csv('IR_scrap/sub_edjournart.csv', header=None)
    .values
    .flatten()
)
step = len(sub_edjournart) // 10

In [10]:
# Attributes of the first tenth of the subset
pbar = ProgressBar()
first_chunk = sub_edjournart[:step]
attrs = [get_attrs(root + article + eja) for eja in pbar(first_chunk)]

100% |########################################################################|


In [15]:
# get_attrs returns None for pages that answered with an HTTP error; such entries
# cannot be laid out on the six columns below, so they are left out.
db_attrs1_sub = pd.DataFrame([row for row in attrs if row is not None],
                             columns=["url", "title", "authors", "date", "jel_code", "keywords"])

In [17]:
# Save the attributes table; note that the target file name carries no extension.
db_attrs1_sub.to_csv("IR_scrap/attrs1_sub")

## Snippets

In [18]:
# Usage example: references and citations of a single article
eja = "oup/qjecon/v132y2017i4p1553-1592..html"
refs_list = get_refs(eja)
cits_list = get_cits(eja)
for label, pairs in (("list of references \n", refs_list), ("list of citations \n", cits_list)):
    print(label, pairs[:5])

list of references 
 [ ['oup/qjecon/v132y2017i4p1553-1592..html', 'bla/randje/v47y2016i3p463-497.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'aea/jecper/v24y2010i2p3-30.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'aea/jeclit/v48y2010i2p356-98.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'ucp/jpolec/doi10.1086/666588.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'ecm/emetrp/v63y1995i4p841-90.html']]
list of citations 
 [['oup/qjecon/v132y2017i4p1553-1592..html', 'nbr/nberwo/20695.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'eee/indorg/v43y2015icp189-207.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'bla/randje/v46y2015i3p625-649.html']
 ['oup/qjecon/v132y2017i4p1553-1592..html', 'feb/framed/00392.html']]


In [43]:
# Instantiate threads

# Attributes threads: one per tenth of the subset; only the first four are created here.
attrs_1, attrs_2, attrs_3, attrs_4 = [
    thrd_attrs(sub_edjournart[step * k:step * (k + 1)]) for k in range(4)
]

# No thread is instantiated in this cell for the remaining tenths, nor for the
# references (thrd_stack(..., meth="ref")) or the citations (thrd_stack(..., meth="cit")).

In [44]:
# Drop the four attribute-thread objects (they are instantiated again in the next cell)
del attrs_1, attrs_2, attrs_3, attrs_4

In [45]:
# Create the first four attribute threads, then launch them all without waiting
attrs_1, attrs_2, attrs_3, attrs_4 = [
    thrd_attrs(sub_edjournart[step * k:step * (k + 1)]) for k in range(4)
]

for worker in (attrs_1, attrs_2, attrs_3, attrs_4):
    worker.start()

In [24]:
# 2 by 2, otherwise, kernel dies ...

# Build one attribute thread per tenth of the subset; the last one also takes the remainder.
n_chunks = 10
bounds = [step * k for k in range(n_chunks)] + [len(sub_edjournart)]
attr_threads = [
    thrd_attrs(sub_edjournart[bounds[k]:bounds[k + 1]]) for k in range(n_chunks)
]
(attrs_1, attrs_2, attrs_3, attrs_4, attrs_5,
 attrs_6, attrs_7, attrs_8, attrs_9, attrs_10) = attr_threads

# A Thread object can be started only once (a second start() raises RuntimeError),
# so each thread is started exactly once and every pair is joined before the next
# pair is launched.
for k in range(0, n_chunks, 2):
    pair = attr_threads[k:k + 2]
    for worker in pair:
        worker.start()
    for worker in pair:
        worker.join()


 21% |###############                                                         |

KeyboardInterrupt: 

 22% |###############                                                         |