# Languages pivot table

In [1]:
import json
import zipfile
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import pandas as pd

  return f(*args, **kwds)


## Get collection information from ArticleMeta 

In [2]:
# ArticleMeta endpoint that lists the identifiers of every SciELO collection.
AMC_URL = "http://articlemeta.scielo.org/api/v1/collection/identifiers/"
# Use the response as a context manager so the HTTP connection is closed
# as soon as the JSON payload has been parsed.
with urlopen(AMC_URL) as amc_response:
    amc_data = pd.DataFrame(json.load(amc_response))

In [3]:
# Preview the first six rows of the collection table loaded from ArticleMeta.
amc_data.head(6)

Unnamed: 0,acron,acron2,code,document_count,domain,has_analytics,is_active,journal_count,name,original_name,status,type
0,arg,ar,arg,39007.0,www.scielo.org.ar,True,True,"{'current': 125, 'deceased': 22}","{'pt': 'Argentina', 'en': 'Argentina', 'es': '...",Argentina,certified,journals
1,chl,cl,chl,63467.0,www.scielo.cl,True,True,"{'current': 105, 'deceased': 13, 'suspended': 1}","{'pt': 'Chile', 'en': 'Chile', 'es': 'Chile'}",Chile,certified,journals
2,col,co,col,69725.0,www.scielo.org.co,True,True,"{'current': 226, 'suspended': 7}","{'pt': 'Colombia', 'en': 'Colombia', 'es': 'Co...",Colombia,certified,journals
3,cub,cu,cub,33492.0,scielo.sld.cu,True,True,"{'current': 61, 'deceased': 2, 'suspended': 4}","{'pt': 'Cuba', 'en': 'Cuba', 'es': 'Cuba'}",Cuba,certified,journals
4,esp,es,esp,37862.0,scielo.isciii.es,True,True,"{'current': 43, 'deceased': 6, 'suspended': 11}","{'pt': 'Espanha', 'en': 'Spain', 'es': 'España'}",España,certified,journals
5,mex,mx,mex,64406.0,www.scielo.org.mx,True,True,"{'current': 159, 'deceased': 12, 'suspended': 44}","{'pt': 'Mexico', 'en': 'Mexico', 'es': 'Mexico'}",Mexico,certified,journals


Some collections won't be analyzed, mainly to avoid duplicates
(there are articles in more than one collection).
The `spa` (*Public Health* collection) should have part of it
kept in the result, but it's not a collection
whose journals/articles are assigned to a single country.
The collections below are linked to a single country:

In [4]:
# Collection codes that are left out of the analysis.
dont_evaluate = [
    "bio", "cci", "cic", "ecu", "psi", "pry",
    "rve", "rvo", "rvt", "sss", "spa", "wid",
]
# ArticleMeta column name -> column name used in this notebook.
amc_names_map = {"code": "collection", "acron2": "origin"}

# Keep collections with a two-letter acronym that are not excluded above.
has_two_letter_acronym = amc_data["acron2"].str.len() == 2
is_evaluated = ~amc_data["code"].isin(dont_evaluate)

# Select and rename the two columns; the origin code is upper-cased.
amc_pairs = (
    amc_data.loc[has_two_letter_acronym & is_evaluated, list(amc_names_map)]
    .rename(columns=amc_names_map)
    .assign(origin=lambda pairs: pairs["origin"].str.upper())
)
amc_pairs

Unnamed: 0,collection,origin
0,arg,AR
1,chl,CL
2,col,CO
3,cub,CU
4,esp,ES
5,mex,MX
6,prt,PT
8,scl,BR
11,sza,ZA
12,ven,VE


## ISSN selection from `spa`

These journals in the `spa` collection have the following countries:

In [5]:
# (ISSN, country code) pair for each `spa` journal listed here.
spa_issn_origin_rows = [
    ("0021-2571", "IT"),
    ("0042-9686", "CH"),
    ("1020-4989", "US"),
    ("1555-7960", "US"),
]
spa_issn_country = pd.DataFrame(
    spa_issn_origin_rows,
    columns=["issn", "origin"],
)
spa_issn_country # For collection = "spa", only!

Unnamed: 0,issn,origin
0,0021-2571,IT
1,0042-9686,CH
2,1020-4989,US
3,1555-7960,US


## Languages dataset

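This dataset comes from a SciELO spreadsheet/CSV pack, such as the
[Network spreadsheet/CSV pack](https://static.scielo.org/tabs/tabs_network.zip),
which can be found in the
[SciELO Analytics report](https://analytics.scielo.org/w/reports)
web page.
The cells below extract `documents_languages.csv` from a local copy of a pack
(currently `tabs_bra_190123.zip`) and show its first three rows, transposed: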

#### Unzip the CSV file

In [6]:
# Zip pack to read, kept in jcatalog/data/scielo.
# For the whole network, point this to '../../data/scielo/tabs_network_190128.zip'.
TABS_ZIP_PATH = '../../data/scielo/tabs_bra_190123.zip'

# Extract only the languages table; it is written to csv_files/documents_languages.csv.
with zipfile.ZipFile(TABS_ZIP_PATH, 'r') as zip_ref:
    zip_ref.extract('documents_languages.csv', 'csv_files')


In [7]:
# Load the extracted CSV. keep_default_na=False turns off pandas' default
# list of strings that would otherwise be parsed as NaN.
dataset = pd.read_csv(
    "csv_files/documents_languages.csv",
    keep_default_na=False,
)
# First three records, transposed so each CSV column becomes a row.
dataset.iloc[:3].T

Unnamed: 0,0,1,2
extraction date,2019-01-22,2019-01-22,2019-01-22
study unit,document,document,document
collection,scl,scl,scl
ISSN SciELO,0100-879X,0100-879X,0100-879X
ISSN's,0100-879X;1414-431X,0100-879X;1414-431X,0100-879X;1414-431X
title at SciELO,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...
title thematic areas,Biological Sciences;Health Sciences,Biological Sciences;Health Sciences,Biological Sciences;Health Sciences
title is agricultural sciences,0,0,0
title is applied social sciences,0,0,0
title is biological sciences,1,1,1


#### Simplify the column names

In [8]:
# Original CSV header -> short column name used in the rest of the notebook.
names_map = {
    "ISSN SciELO": "issn",
    "title at SciELO": "title",
    "document publishing ID (PID SciELO)": "pid",
    "document type": "type",
    "document languages": "languages",
    "document is citable": "is_citable",
    "document publishing year": "year",
    "document pt": "document_pt",
    "document es": "document_es",
    "document en": "document_en",
    "document other languages": "document_other_languages",
}
# Keep only the mapped columns, then apply the short names.
selected_columns = list(names_map)
df = dataset[selected_columns].rename(columns=names_map)
df.head(5).T

Unnamed: 0,0,1,2,3,4
issn,0100-879X,0100-879X,0100-879X,0100-879X,0100-879X
title,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...
pid,S0100-879X1998000800006,S0100-879X1998000800011,S0100-879X1998000800005,S0100-879X1998000800009,S0100-879X1998000800010
type,research-article,rapid-communication,research-article,rapid-communication,rapid-communication
languages,en,en,en,en,en
is_citable,1,1,1,1,1
year,1998,1998,1998,1998,1998
document_pt,0,0,0,0,0
document_es,0,0,0,0,0
document_en,1,1,1,1,1


#### Add pub_year (years up to 2007 are grouped as `anterior`)

In [9]:
# Publishing years up to and including this one share the single label 'anterior';
# later years keep their own label.
LAST_GROUPED_YEAR = 2007

# Convert the years to text explicitly: the result mixes them with the string
# 'anterior', and np.where would otherwise promote the integers to strings
# silently. Either way the resulting labels are strings such as '2008'.
df["pub_year"] = np.where(
    df["year"] <= LAST_GROUPED_YEAR,
    "anterior",
    df["year"].astype(str),
)

#### Add review type

In [10]:
# 1 when the document type is "review-article", 0 otherwise.
is_review_article = df["type"] == "review-article"
df["tipo_review"] = np.where(is_review_article, 1, 0)

#### Add citable_(language)

In [11]:
# For each language flag, add a column that is 1 only when the document
# has that flag set and is also citable.
citable_flag_sources = [
    ("citable_pt", "document_pt"),
    ("citable_es", "document_es"),
    ("citable_en", "document_en"),
    ("citable_other_lang", "document_other_languages"),
]
for citable_column, language_column in citable_flag_sources:
    df[citable_column] = np.where(
        (df[language_column] == 1) & (df["is_citable"] == 1), 1, 0
    )

In [12]:
# Temporary helper column: row-wise sum of the four language flag columns.
language_flag_columns = [
    'document_en',
    'document_pt',
    'document_es',
    'document_other_languages',
]
df['sum_to_2_more_lang'] = np.sum(
    [df[flag_column] for flag_column in language_flag_columns],
    axis=0,
)
# Documents whose flags add up to exactly 3, shown transposed.
sums_to_three = df['sum_to_2_more_lang'] == 3
df[sums_to_three].T

Unnamed: 0,31760,32728,34448,36347,37268,37270,37272,39283,42191,42453,...,376802,376804,376805,376807,376951,376957,376963,376966,376975,377601
issn,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,0104-1169,...,1983-8042,1983-8042,1983-8042,1983-8042,0080-6234,0080-6234,0080-6234,0080-6234,0080-6234,0102-311X
title,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,Revista Latino-Americana de Enfermagem,...,Revista Bioética,Revista Bioética,Revista Bioética,Revista Bioética,Revista da Escola de Enfermagem da USP,Revista da Escola de Enfermagem da USP,Revista da Escola de Enfermagem da USP,Revista da Escola de Enfermagem da USP,Revista da Escola de Enfermagem da USP,Cadernos de Saúde Pública
pid,S0104-11692003000400001,S0104-11692003000600001,S0104-11692004000200001,S0104-11692003000500001,S0104-11692004000700001,S0104-11692004000700002,S0104-11692004000700003,S0104-11692004000400001,S0104-11692004000500001,S0104-11692004000600001,...,S1983-80422018000400597,S1983-80422018000400484,S1983-80422018000400606,S1983-80422018000400506,S0080-62342018000100101,S0080-62342018000100472,S0080-62342018000100477,S0080-62342018000100470,S0080-62342018000100471,S0102-311X2019000100101
type,editorial,editorial,editorial,editorial,editorial,research-article,research-article,editorial,editorial,editorial,...,research-article,rapid-communication,research-article,rapid-communication,editorial,research-article,research-article,research-article,research-article,editorial
languages,en;pt;es,en;es;pt,en;es;pt,en;es;pt,en;pt;es,en;es;pt,en;pt;es,en;pt;es,en;pt;es,en;pt;es,...,en;es;pt,en;es;pt,en;es;pt,en;es;pt,en;pt;es,en;pt;es,en;es;pt,en;pt;es,en;pt;es,en;pt;es
is_citable,0,0,0,0,0,1,1,0,0,0,...,1,1,1,1,0,1,1,1,1,0
year,2003,2003,2004,2003,2004,2004,2004,2004,2004,2004,...,2018,2018,2018,2018,2018,2018,2018,2018,2018,2019
document_pt,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
document_es,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
document_en,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


#### Add flags for documents in 2 or more languages

In [13]:
# Flag documents whose language flags add up to two or more, and the citable
# ones among them.
in_two_or_more_languages = df['sum_to_2_more_lang'] >= 2
df["doc_2_more_lang"] = np.where(in_two_or_more_languages, 1, 0)
df["citable_doc_2_more_lang"] = np.where(
    in_two_or_more_languages & (df['is_citable'] == 1), 1, 0
)

# The helper sum is no longer needed once the two flags exist.
del df['sum_to_2_more_lang']

# Documents flagged as having two or more languages, shown transposed.
df[df["doc_2_more_lang"] == 1].T

Unnamed: 0,11706,11710,11711,11713,11718,11721,11725,31273,31274,31275,...,377684,377686,377692,377700,377704,377710,377714,377716,377726,377730
issn,0102-311X,0102-311X,0102-311X,0102-311X,0102-311X,0102-311X,0102-311X,0066-782X,0066-782X,0066-782X,...,2237-2660,1983-1447,1414-8145,2237-2660,2237-2660,1983-1447,1983-1447,2237-2660,1983-1447,2237-2660
title,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Cadernos de Saúde Pública,Arquivos Brasileiros de Cardiologia,Arquivos Brasileiros de Cardiologia,Arquivos Brasileiros de Cardiologia,...,Revista Brasileira de Estudos da Presença,Revista Gaúcha de Enfermagem,Escola Anna Nery,Revista Brasileira de Estudos da Presença,Revista Brasileira de Estudos da Presença,Revista Gaúcha de Enfermagem,Revista Gaúcha de Enfermagem,Revista Brasileira de Estudos da Presença,Revista Gaúcha de Enfermagem,Revista Brasileira de Estudos da Presença
pid,S0102-311X1998000200011,S0102-311X1998000200015,S0102-311X1998000200016,S0102-311X1998000200018,S0102-311X1998000200024,S0102-311X1998000200003,S0102-311X1998000200007,S0066-782X2003001400001,S0066-782X2003001400004,S0066-782X2003001400005,...,S2237-26602019000100204,S1983-14472019000200407,S1414-81452019000100207,S2237-26602019000100400,S2237-26602019000100202,S1983-14472019000200402,S1983-14472019000200406,S2237-26602019000100200,S1983-14472019000200405,S2237-26602019000100201
type,research-article,research-article,research-article,brief-report,article-commentary,research-article,research-article,research-article,research-article,research-article,...,research-article,research-article,research-article,research-article,research-article,research-article,research-article,research-article,research-article,research-article
languages,en;pt,pt;es,es;pt,en;pt,pt;es,en;pt,pt;es,en;pt,en;pt,en;pt,...,en;pt,en;pt,en;pt,en;pt,en;pt,en;pt,en;pt,en;pt,en;pt,en;pt
is_citable,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
year,1998,1998,1998,1998,1998,1998,1998,2003,2003,2003,...,2019,2019,2019,2019,2019,2019,2019,2019,2019,2019
document_pt,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
document_es,0,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
document_en,1,0,0,1,0,1,0,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [14]:
# New frame with the "pid" column renamed to "docs"; df itself is left unchanged.
df2 = df.rename(columns={"pid":"docs"})

In [15]:
# List the columns available in df2.
df2.columns

Index(['issn', 'title', 'docs', 'type', 'languages', 'is_citable', 'year',
       'document_pt', 'document_es', 'document_en', 'document_other_languages',
       'pub_year', 'tipo_review', 'citable_pt', 'citable_es', 'citable_en',
       'citable_other_lang', 'doc_2_more_lang', 'citable_doc_2_more_lang'],
      dtype='object')

In [16]:
# Columns aggregated for each journal (ISSN) and pub_year label.
values_list = [
    "docs",
    "is_citable",
    "tipo_review",
    "document_pt",
    "document_es",
    "document_en",
    "document_other_languages",
    "doc_2_more_lang",
    "citable_pt",
    "citable_es",
    "citable_en",
    "citable_other_lang",
    "citable_doc_2_more_lang",
]

# One row per ISSN and one column per (measure, pub_year) pair. Each cell holds
# the number of non-zero values in that group; missing combinations become 0.
# NOTE(review): "docs" holds PID strings, so presumably every non-empty PID
# counts as one document; verify.
td = df2.pivot_table(
    values=values_list,
    index=["issn"],
    columns=["pub_year"],
    aggfunc=np.count_nonzero,
    fill_value=0,
)
# Document counts for the first 11 journals, one row per pub_year label.
td["docs"].iloc[:11].T

issn,0001-3714,0001-3765,0002-0591,0004-2730,0004-2749,0004-2803,0004-282X,0006-8705,0011-5258,0020-3874,0021-7557
pub_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008,0,65,0,206,204,65,252,125,32,0,108
2009,0,78,0,160,165,71,265,124,31,0,102
2010,0,101,0,136,118,68,217,150,32,0,98
2011,0,116,0,116,107,56,243,127,32,0,99
2012,0,108,24,109,99,65,271,70,32,28,93
2013,0,163,33,112,103,55,248,65,33,29,93
2014,0,160,31,140,102,63,239,57,40,38,109
2015,0,190,0,0,105,71,255,73,36,39,99
2016,0,205,0,0,118,56,173,49,36,48,108
2017,0,291,0,0,100,71,201,79,32,51,103


In [17]:
# Inspect the levels of the pivot table's column index.
td.columns.levels

FrozenList([['citable_doc_2_more_lang', 'citable_en', 'citable_es', 'citable_other_lang', 'citable_pt', 'doc_2_more_lang', 'docs', 'document_en', 'document_es', 'document_other_languages', 'document_pt', 'is_citable', 'tipo_review'], ['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', 'anterior']])

### Rename the column labels for the CSV

In [18]:
# Print every column key of the pivot table, one per line.
# The former bare `td.keys()` statement was dropped: it was not the last
# expression of the cell, so its value was never displayed.
for column_key in td.keys():
    print(column_key)

('citable_doc_2_more_lang', '2008')
('citable_doc_2_more_lang', '2009')
('citable_doc_2_more_lang', '2010')
('citable_doc_2_more_lang', '2011')
('citable_doc_2_more_lang', '2012')
('citable_doc_2_more_lang', '2013')
('citable_doc_2_more_lang', '2014')
('citable_doc_2_more_lang', '2015')
('citable_doc_2_more_lang', '2016')
('citable_doc_2_more_lang', '2017')
('citable_doc_2_more_lang', '2018')
('citable_doc_2_more_lang', '2019')
('citable_doc_2_more_lang', 'anterior')
('citable_en', '2008')
('citable_en', '2009')
('citable_en', '2010')
('citable_en', '2011')
('citable_en', '2012')
('citable_en', '2013')
('citable_en', '2014')
('citable_en', '2015')
('citable_en', '2016')
('citable_en', '2017')
('citable_en', '2018')
('citable_en', '2019')
('citable_en', 'anterior')
('citable_es', '2008')
('citable_es', '2009')
('citable_es', '2010')
('citable_es', '2011')
('citable_es', '2012')
('citable_es', '2013')
('citable_es', '2014')
('citable_es', '2015')
('citable_es', '2016')
('citable_es', '20

In [19]:
# Join the first two elements of each column key with an underscore,
# giving one flat label per pivot table column.
newlabel = [column_key[0] + '_' + column_key[1] for column_key in td.keys()]

In [20]:
# Show the full list of flattened column labels.
newlabel

['citable_doc_2_more_lang_2008',
 'citable_doc_2_more_lang_2009',
 'citable_doc_2_more_lang_2010',
 'citable_doc_2_more_lang_2011',
 'citable_doc_2_more_lang_2012',
 'citable_doc_2_more_lang_2013',
 'citable_doc_2_more_lang_2014',
 'citable_doc_2_more_lang_2015',
 'citable_doc_2_more_lang_2016',
 'citable_doc_2_more_lang_2017',
 'citable_doc_2_more_lang_2018',
 'citable_doc_2_more_lang_2019',
 'citable_doc_2_more_lang_anterior',
 'citable_en_2008',
 'citable_en_2009',
 'citable_en_2010',
 'citable_en_2011',
 'citable_en_2012',
 'citable_en_2013',
 'citable_en_2014',
 'citable_en_2015',
 'citable_en_2016',
 'citable_en_2017',
 'citable_en_2018',
 'citable_en_2019',
 'citable_en_anterior',
 'citable_es_2008',
 'citable_es_2009',
 'citable_es_2010',
 'citable_es_2011',
 'citable_es_2012',
 'citable_es_2013',
 'citable_es_2014',
 'citable_es_2015',
 'citable_es_2016',
 'citable_es_2017',
 'citable_es_2018',
 'citable_es_2019',
 'citable_es_anterior',
 'citable_other_lang_2008',
 'citable_o

In [21]:
# Show every 24th label, a shorter sample of the list.
newlabel[::24]

['citable_doc_2_more_lang_2008',
 'citable_en_2019',
 'citable_other_lang_2017',
 'doc_2_more_lang_2015',
 'document_en_2013',
 'document_other_languages_2011',
 'is_citable_2009',
 'tipo_review_anterior']

In [22]:
# Replace the pivot table's column keys with the flat labels in newlabel.
# NOTE(review): after this assignment the column keys are plain strings, so
# re-running the cell that builds newlabel would index characters of each
# label instead of (measure, pub_year) pairs; re-create td first.
td.columns = newlabel

In [23]:
# Show the table transposed: one row per column label, one column per ISSN.
td.T

issn,0001-3714,0001-3765,0002-0591,0004-2730,0004-2749,0004-2803,0004-282X,0006-8705,0011-5258,0020-3874,...,2359-3997,2359-5647,2446-4740,2447-536X,2448-167X,2448-2455,2526-8910,2531-0488,2531-1379,2595-3192
citable_doc_2_more_lang_2008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2010,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2011,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2012,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2013,0,0,0,6,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2014,0,0,0,2,0,0,1,13,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2015,0,0,0,0,0,0,0,13,0,0,...,1,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2016,0,0,0,0,0,0,0,17,0,0,...,0,0,0,0,0,0,0,0,0,0
citable_doc_2_more_lang_2017,0,0,0,0,0,0,0,5,0,0,...,0,71,0,0,0,0,0,0,0,0


In [24]:
# Destination of the pivot table CSV.
# For the network dataset, use "output/td_documents_languages_network.csv" instead.
output_csv_path = Path("output/td_documents_languages_bra_190123.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
output_csv_path.parent.mkdir(parents=True, exist_ok=True)
td.to_csv(output_csv_path)