# Languages pivot table

In [1]:
from datetime import datetime
start = datetime.utcnow() # For measuring the total processing time

In [2]:
import json
from urllib.request import urlopen
import pandas as pd
import numpy as np

  return f(*args, **kwds)


## Get collection information from ArticleMeta 

In [3]:
AMC_URL = "http://articlemeta.scielo.org/api/v1/collection/identifiers/"
amc_data = pd.DataFrame(json.load(urlopen(AMC_URL)))

print("Number of collections: " + str(amc_data.shape[0]+1))
amc_data.head(2)

Number of collections: 34


Unnamed: 0,acron,acron2,code,document_count,domain,has_analytics,is_active,journal_count,name,original_name,status,type
0,arg,ar,arg,39006.0,www.scielo.org.ar,True,True,"{'deceased': 22, 'current': 125}","{'en': 'Argentina', 'pt': 'Argentina', 'es': '...",Argentina,certified,journals
1,chl,cl,chl,63467.0,www.scielo.cl,True,True,"{'deceased': 13, 'suspended': 1, 'current': 105}","{'en': 'Chile', 'pt': 'Chile', 'es': 'Chile'}",Chile,certified,journals


##### Filtering valid collections and renames 'code' to 'collection'

Some collections won't be analyzed, mainly to avoid duplicates
(there are articles in more than one collection).
The `spa` (*Public Health* collection) should have part of it
kept in the result, but it's not a collection
whose journals/articles are assigned to a single country.
The collections below are linked to a single country:

In [4]:
dont_evaluate = ["bio", "cci", "cic", "ecu", "psi", "pry", "rve", "rvo", "rvt", "sss", "spa", "wid"]
amc_names_map = {"code": "collection"}
amc_pairs = amc_data[(amc_data["acron2"].str.len() == 2) & 
                     ~amc_data["code"].isin(dont_evaluate)]\
                    .rename(columns=amc_names_map)\
#                    "~amc_data["code"].isin(dont_evaluate)]" is denying the list "dont_evaluate"
print("Number of collections: " + str(amc_pairs.shape[0]+1))

collections = amc_pairs[['collection']].copy()
collections

Number of collections: 15


Unnamed: 0,collection
0,arg
1,chl
2,col
3,cub
4,esp
5,mex
6,prt
8,scl
11,sza
12,ven


## ISSN selection from `spa`

These journals in the `spa` collection have the following countries:

In [5]:
spa_issn_country = pd.DataFrame([
    ("0021-2571"),
    ("0042-9686"),
    ("1020-4989"),
    ("1555-7960"),
], columns=["issn"])
spa_issn_country # For collection = "spa", only!

Unnamed: 0,issn
0,0021-2571
1,0042-9686
2,1020-4989
3,1555-7960


## Languages dataset

This dataset is the
[Network spreadsheet/CSV pack](https://static.scielo.org/tabs/tabs_network.zip)
 which can be found in the
[SciELO Analytics report](https://analytics.scielo.org/w/reports)
web page.
The first two rows of it are:

#### Unzip the CSV file

In [6]:
import zipfile

# Use the Zip file in jcatalog/data/scielo
with zipfile.ZipFile('../../data/scielo/tabs_network_190210.zip', 'r') as zip_ref:
    zip_ref.extract('documents_languages.csv', 'csv_files')


In [7]:
dataset = pd.read_csv("csv_files/documents_languages.csv", keep_default_na=False)
dataset.shape

(877068, 26)

#### Simplify the column names

In [8]:
names_map = {
    "ISSN SciELO": "issn",
    "collection": "collection",
    "title at SciELO": "title",
    "document publishing ID (PID SciELO)": "docs",
    "document type":"type",
    "document languages": "languages",
    "document is citable": "is_citable",
    "document publishing year": "year",
    "document pt": "document_pt",
    "document es": "document_es",
    "document en": "document_en",
    "document other languages": "document_other_languages"
}
df0 = dataset[list(names_map.keys())].rename(columns=names_map)
df0.head(2)

Unnamed: 0,issn,collection,title,docs,type,languages,is_citable,year,document_pt,document_es,document_en,document_other_languages
0,0100-879X,scl,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800006,research-article,en,1,1998,0,0,1,0
1,0100-879X,scl,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800011,rapid-communication,en,1,1998,0,0,1,0


### Creates a new DataFrame: filtering SPA and discarding those collections that are not analyzable

In [9]:
df = pd.concat([
    pd.merge(df0[df0["collection"] != "spa"], collections,      how="inner", on="collection"),
    pd.merge(df0[df0["collection"] == "spa"], spa_issn_country, how="inner", on="issn"),
])

In [10]:
df.head(2)

Unnamed: 0,issn,collection,title,docs,type,languages,is_citable,year,document_pt,document_es,document_en,document_other_languages
0,0100-879X,scl,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800006,research-article,en,1,1998,0,0,1,0
1,0100-879X,scl,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800011,rapid-communication,en,1,1998,0,0,1,0


In [11]:
# compare
df0.shape

(877068, 12)

In [12]:
df.shape

(793648, 12)

In [13]:
set(df.collection)

{'arg',
 'bol',
 'chl',
 'col',
 'cri',
 'cub',
 'esp',
 'mex',
 'per',
 'prt',
 'scl',
 'spa',
 'sza',
 'ury',
 'ven'}

#### Add pub_year (ate_1996)

In [14]:
# df["pub_year"] = np.where(df['year'] <= 1996, 'ate_1996', df["year"])
df["pub_year"] = np.where(df['year'] <= 1996, 'anterior', df["year"])

#### Add review type

In [15]:
df["tipo_review"] = np.where(df['type'] == "review-article", 1, 0)

#### Add citable_(language)

In [16]:
df["citable_pt"] = np.where((df['document_pt'] == 1) & (df['is_citable'] == 1), 1, 0)
df["citable_es"] = np.where((df['document_es'] == 1) & (df['is_citable'] == 1), 1, 0)
df["citable_en"] = np.where((df['document_en'] == 1) & (df['is_citable'] == 1), 1, 0)
df["citable_other_lang"] = np.where((df['document_other_languages'] == 1) & (df['is_citable'] == 1), 1, 0)

In [17]:
df['sum_to_2_more_lang'] = np.sum([df['document_en'], df['document_pt'], df['document_es'], df['document_other_languages']], axis=0)
df[(df['sum_to_2_more_lang'] == 3)].T

Unnamed: 0,31760,32728,34448,36347,37268,37270,37272,39283,42191,42453,...,681017,681019,681020,681025,681028,681029,2915,3247,3249,3257
issn,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,...,2007-6835,2007-6835,2007-6835,2007-6835,2007-6835,2007-6835,1020-4989,1020-4989,1020-4989,1020-4989
collection,scl,scl,scl,scl,scl,scl,scl,scl,scl,scl,...,mex,mex,mex,mex,mex,mex,spa,spa,spa,spa
title,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,...,Revista ALCONPAT,Revista ALCONPAT,Revista ALCONPAT,Revista ALCONPAT,Revista ALCONPAT,Revista ALCONPAT,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública
docs,S0104-11692003000400001,S0104-11692003000600001,S0104-11692004000200001,S0104-11692003000500001,S0104-11692004000700001,S0104-11692004000700002,S0104-11692004000700003,S0104-11692004000400001,S0104-11692004000500001,S0104-11692004000600001,...,S2007-68352015000200138,S2007-68352015000300162,S2007-68352015000300190,S2007-68352015000300203,S2007-68352015000200097,S2007-68352015000200151,S1020-49892012001000007,S1020-49892016000800080,S1020-49892016000800076,S1020-49892016000800078
type,editorial,editorial,editorial,editorial,editorial,research-article,research-article,editorial,editorial,editorial,...,review-article,research-article,research-article,research-article,research-article,review-article,research-article,undefined,editorial,editorial
languages,en;pt;es,en;pt;es,en;pt;es,en;pt;es,en;pt;es,en;pt;es,en;pt;es,en;pt;es,en;pt;es,en;pt;es,...,en;es;pt,en;es;pt,en;es;pt,en;es;pt,en;es;pt,en;es;pt,en;pt;es,fr;en;es,fr;en;es,fr;en;es
is_citable,0,0,0,0,0,1,1,0,0,0,...,1,1,1,1,1,1,1,0,0,0
year,2003,2003,2004,2003,2004,2004,2004,2004,2004,2004,...,2015,2015,2015,2015,2015,2015,2012,2016,2016,2016
document_pt,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,0,0
document_es,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


#### Add 2 or more lang

In [18]:
df["doc_2_more_lang"] = np.where(((df['sum_to_2_more_lang']) >= 2), 1, 0)
df["citable_doc_2_more_lang"] = np.where((df['sum_to_2_more_lang'] >= 2) & (df['is_citable'] == 1), 1, 0)
# remove sum_to_2_more_lang column
del df['sum_to_2_more_lang']

df[(df["doc_2_more_lang"] == 1)].T

Unnamed: 0,11706,11710,11711,11713,11718,11721,11725,31273,31274,31275,...,3135,3247,3249,3256,3257,3320,3325,3479,3485,3668
issn,0102-311X,0102-311X,0102-311X,0102-311X,0102-311X,0102-311X,0102-311X,0066-782X,0066-782X,0066-782X,...,1020-4989,1020-4989,1020-4989,1020-4989,1020-4989,1020-4989,1020-4989,1020-4989,1020-4989,1020-4989
collection,scl,scl,scl,scl,scl,scl,scl,scl,scl,scl,...,spa,spa,spa,spa,spa,spa,spa,spa,spa,spa
title,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Arquivos Brasileiros de Cardiologia,Arquivos Brasileiros de Cardiologia,Arquivos Brasileiros de Cardiologia,...,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública,Revista Panamericana de Salud Pública
docs,S0102-311X1998000200011,S0102-311X1998000200015,S0102-311X1998000200016,S0102-311X1998000200018,S0102-311X1998000200024,S0102-311X1998000200003,S0102-311X1998000200007,S0066-782X2003001400001,S0066-782X2003001400004,S0066-782X2003001400005,...,S1020-49892015000600001,S1020-49892016000800080,S1020-49892016000800076,S1020-49892016000800085,S1020-49892016000800078,S1020-49892016000500215,S1020-49892016000500213,S1020-49892018000100101,S1020-49892018000100100,S1020-49892018000100106
type,research-article,research-article,research-article,brief-report,article-commentary,research-article,research-article,research-article,research-article,research-article,...,research-article,undefined,editorial,undefined,editorial,undefined,editorial,editorial,editorial,editorial
languages,en;pt,pt;es,pt;es,en;pt,es;pt,en;pt,pt;es,en;pt,en;pt,en;pt,...,en;es,fr;en;es,fr;en;es,en;es,fr;en;es,en;es,en;es,en;es,en;es,en;es
is_citable,1,1,1,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
year,1998,1998,1998,1998,1998,1998,1998,2003,2003,2003,...,2015,2016,2016,2016,2016,2016,2016,2018,2018,2018
document_pt,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
document_es,0,1,1,0,1,0,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [19]:
df.columns

Index(['issn', 'collection', 'title', 'docs', 'type', 'languages',
       'is_citable', 'year', 'document_pt', 'document_es', 'document_en',
       'document_other_languages', 'pub_year', 'tipo_review', 'citable_pt',
       'citable_es', 'citable_en', 'citable_other_lang', 'doc_2_more_lang',
       'citable_doc_2_more_lang'],
      dtype='object')

In [20]:
values_list = [
    "docs",
    "is_citable", 
    "tipo_review",
    "document_pt", 
    "document_es", 
    "document_en",
    "document_other_languages",
    "doc_2_more_lang",
    "citable_pt",
    "citable_es",
    "citable_en",
    "citable_other_lang",
    "citable_doc_2_more_lang"]

td = df.pivot_table(
     index=["issn"],
     values=values_list,
     columns=["pub_year"],
     aggfunc=np.count_nonzero,
     fill_value=0)

In [21]:
td[:12].T

Unnamed: 0_level_0,issn,0001-3714,0001-3765,0001-6002,0001-6365,0002-0591,0002-192X,0002-7014,0003-2573,0004-0592,0004-0614,0004-0622,0004-0649
Unnamed: 0_level_1,pub_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
citable_doc_2_more_lang,1997,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,1998,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,1999,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2000,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2001,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2002,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2003,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2004,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2005,0,0,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang,2006,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
td.columns.levels

FrozenList([['citable_doc_2_more_lang', 'citable_en', 'citable_es', 'citable_other_lang', 'citable_pt', 'doc_2_more_lang', 'docs', 'document_en', 'document_es', 'document_other_languages', 'document_pt', 'is_citable', 'tipo_review'], ['1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', 'anterior']])

### Renames the labels for CSV

In [23]:
td.keys()
for k in td.keys():
    print(k)

('citable_doc_2_more_lang', '1997')
('citable_doc_2_more_lang', '1998')
('citable_doc_2_more_lang', '1999')
('citable_doc_2_more_lang', '2000')
('citable_doc_2_more_lang', '2001')
('citable_doc_2_more_lang', '2002')
('citable_doc_2_more_lang', '2003')
('citable_doc_2_more_lang', '2004')
('citable_doc_2_more_lang', '2005')
('citable_doc_2_more_lang', '2006')
('citable_doc_2_more_lang', '2007')
('citable_doc_2_more_lang', '2008')
('citable_doc_2_more_lang', '2009')
('citable_doc_2_more_lang', '2010')
('citable_doc_2_more_lang', '2011')
('citable_doc_2_more_lang', '2012')
('citable_doc_2_more_lang', '2013')
('citable_doc_2_more_lang', '2014')
('citable_doc_2_more_lang', '2015')
('citable_doc_2_more_lang', '2016')
('citable_doc_2_more_lang', '2017')
('citable_doc_2_more_lang', '2018')
('citable_doc_2_more_lang', '2019')
('citable_doc_2_more_lang', 'anterior')
('citable_en', '1997')
('citable_en', '1998')
('citable_en', '1999')
('citable_en', '2000')
('citable_en', '2001')
('citable_en', '2

In [24]:
newlabel = []
for k in td.keys():
    newlabel.append(k[0]+'_'+k[1])

In [25]:
newlabel

['citable_doc_2_more_lang_1997',
 'citable_doc_2_more_lang_1998',
 'citable_doc_2_more_lang_1999',
 'citable_doc_2_more_lang_2000',
 'citable_doc_2_more_lang_2001',
 'citable_doc_2_more_lang_2002',
 'citable_doc_2_more_lang_2003',
 'citable_doc_2_more_lang_2004',
 'citable_doc_2_more_lang_2005',
 'citable_doc_2_more_lang_2006',
 'citable_doc_2_more_lang_2007',
 'citable_doc_2_more_lang_2008',
 'citable_doc_2_more_lang_2009',
 'citable_doc_2_more_lang_2010',
 'citable_doc_2_more_lang_2011',
 'citable_doc_2_more_lang_2012',
 'citable_doc_2_more_lang_2013',
 'citable_doc_2_more_lang_2014',
 'citable_doc_2_more_lang_2015',
 'citable_doc_2_more_lang_2016',
 'citable_doc_2_more_lang_2017',
 'citable_doc_2_more_lang_2018',
 'citable_doc_2_more_lang_2019',
 'citable_doc_2_more_lang_anterior',
 'citable_en_1997',
 'citable_en_1998',
 'citable_en_1999',
 'citable_en_2000',
 'citable_en_2001',
 'citable_en_2002',
 'citable_en_2003',
 'citable_en_2004',
 'citable_en_2005',
 'citable_en_2006',
 'ci

In [26]:
newlabel[::24]

['citable_doc_2_more_lang_1997',
 'citable_en_1997',
 'citable_es_1997',
 'citable_other_lang_1997',
 'citable_pt_1997',
 'doc_2_more_lang_1997',
 'docs_1997',
 'document_en_1997',
 'document_es_1997',
 'document_other_languages_1997',
 'document_pt_1997',
 'is_citable_1997',
 'tipo_review_1997']

In [27]:
td.columns = newlabel

In [28]:
td.T

issn,0001-3714,0001-3765,0001-6002,0001-6365,0002-0591,0002-192X,0002-7014,0003-2573,0004-0592,0004-0614,...,2504-3145,2518-4431,2520-9868,2526-8910,2531-0488,2531-1379,2545-7756,2594-1321,2595-3192,2619-6573
citable_doc_2_more_lang_1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_1999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
td.to_csv("output/td_documents_languages_network.csv")
#  td.to_csv("output/td_documents_languages_bra_190123.csv")

In [30]:
print(f"Notebook processing duration: {datetime.utcnow() - start}")

Notebook processing duration: 0:00:09.775706
