# Languages pivot table

In [1]:
from datetime import datetime
# Reference timestamp (naive UTC) taken before any processing; the timing
# cells further down print `datetime.utcnow() - start` as the duration.
start = datetime.utcnow()

In [2]:
import json
import os
import zipfile
from urllib.request import urlopen

import numpy as np
import pandas as pd

  return f(*args, **kwds)


## Get collection information from ArticleMeta 

In [3]:
# ArticleMeta endpoint queried for the collection identifiers (JSON payload).
AMC_URL = "http://articlemeta.scielo.org/api/v1/collection/identifiers/"

# Use the response as a context manager so the HTTP connection is closed
# as soon as the JSON payload has been parsed, instead of leaking it.
with urlopen(AMC_URL) as amc_response:
    amc_data = pd.DataFrame(json.load(amc_response))

In [4]:
# Preview of the first six collection records loaded from ArticleMeta.
amc_data.head(6)

Unnamed: 0,acron,acron2,code,document_count,domain,has_analytics,is_active,journal_count,name,original_name,status,type
0,arg,ar,arg,38216.0,www.scielo.org.ar,True,True,"{'current': 125, 'deceased': 22}","{'es': 'Argentina', 'en': 'Argentina', 'pt': '...",Argentina,certified,journals
1,chl,cl,chl,62623.0,www.scielo.cl,True,True,"{'current': 105, 'suspended': 1, 'deceased': 13}","{'es': 'Chile', 'en': 'Chile', 'pt': 'Chile'}",Chile,certified,journals
2,col,co,col,68549.0,www.scielo.org.co,True,True,"{'current': 225, 'suspended': 7}","{'es': 'Colombia', 'en': 'Colombia', 'pt': 'Co...",Colombia,certified,journals
3,cub,cu,cub,33492.0,scielo.sld.cu,True,True,"{'current': 61, 'suspended': 4, 'deceased': 2}","{'es': 'Cuba', 'en': 'Cuba', 'pt': 'Cuba'}",Cuba,certified,journals
4,esp,es,esp,37648.0,scielo.isciii.es,True,True,"{'current': 43, 'suspended': 11, 'deceased': 6}","{'es': 'España', 'en': 'Spain', 'pt': 'Espanha'}",España,certified,journals
5,mex,mx,mex,61585.0,www.scielo.org.mx,True,True,"{'current': 156, 'suspended': 47, 'deceased': 12}","{'es': 'Mexico', 'en': 'Mexico', 'pt': 'Mexico'}",Mexico,certified,journals


Some collections won't be analyzed, mainly to avoid duplicates
(there are articles in more than one collection).
The `spa` (*Public Health* collection) should have part of it
kept in the result, but it's not a collection
whose journals/articles are assigned to a single country.
The collections below are linked to a single country:

In [5]:
# Collection codes left out of the collection -> country mapping.
dont_evaluate = [
    "bio", "cci", "cic", "ecu", "psi", "pry",
    "rve", "rvo", "rvt", "sss", "spa", "wid",
]
# ArticleMeta column name -> column name used in this notebook.
amc_names_map = {
    "code": "collection",
    "acron2": "origin",
}

# Keep only collections with a two-letter acronym that are not in the
# exclusion list, then expose (collection, upper-cased country code) pairs.
has_two_letter_acronym = amc_data["acron2"].str.len().eq(2)
is_excluded = amc_data["code"].isin(dont_evaluate)
amc_pairs = (
    amc_data
    .loc[has_two_letter_acronym & ~is_excluded, list(amc_names_map)]
    .rename(columns=amc_names_map)
    .assign(origin=lambda pairs: pairs["origin"].str.upper())
)
amc_pairs

Unnamed: 0,collection,origin
0,arg,AR
1,chl,CL
2,col,CO
3,cub,CU
4,esp,ES
5,mex,MX
6,prt,PT
7,scl,BR
10,sza,ZA
11,ven,VE


## ISSN selection from `spa`

These journals in the `spa` collection have the following countries:

In [6]:
# Journal ISSN -> country code, column by column (rows stay aligned by
# position): one entry per journal of the `spa` collection handled here.
spa_issn_country = pd.DataFrame({
    "issn": ["0021-2571", "0042-9686", "1020-4989", "1555-7960"],
    "origin": ["IT", "CH", "US", "US"],
})
spa_issn_country # For collection = "spa", only!

Unnamed: 0,issn,origin
0,0021-2571,IT
1,0042-9686,CH
2,1020-4989,US
3,1555-7960,US


## Languages dataset

This dataset is the
[Network spreadsheet/CSV pack](https://static.scielo.org/tabs/tabs_network.zip)
 which can be found in the
[SciELO Analytics report](https://analytics.scielo.org/w/reports)
web page.
The first three rows of its `documents_languages.csv` file are shown (transposed) below:

#### Unzip the CSV file

In [7]:
# `zipfile` is imported in the notebook's import cell at the top.
# Use the Zip file in jcatalog/data/scielo; only the languages CSV is
# extracted from the archive, into the local "csv_files" directory.
with zipfile.ZipFile('../../data/scielo/tabs_network_181203.zip', 'r') as zip_ref:
    zip_ref.extract('documents_languages.csv', 'csv_files')

In [8]:
# keep_default_na=False: empty / "NA"-like fields are NOT converted to NaN,
# they are kept as the literal strings found in the CSV.
dataset = pd.read_csv("csv_files/documents_languages.csv", keep_default_na=False)
# Transposed preview of the first three rows, so columns are listed as rows.
dataset.head(3).T

Unnamed: 0,0,1,2
extraction date,2018-11-10,2018-11-10,2018-11-10
study unit,document,document,document
collection,scl,scl,scl
ISSN SciELO,0100-879X,0100-879X,0100-879X
ISSN's,0100-879X;1414-431X,0100-879X;1414-431X,0100-879X;1414-431X
title at SciELO,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...,Brazilian Journal of Medical and Biological Re...
title thematic areas,Biological Sciences;Health Sciences,Biological Sciences;Health Sciences,Biological Sciences;Health Sciences
title is agricultural sciences,0,0,0
title is applied social sciences,0,0,0
title is biological sciences,1,1,1


We won't need all the information,
and we can simplify the column names
for the columns we need:

In [26]:
# Original CSV header -> short column name; only these columns are kept.
names_map = {
    # journal / document identification
    "ISSN SciELO": "issn",
    "title at SciELO":"title",
    "document publishing ID (PID SciELO)": "pid",
    # document attributes
    "document is citable": "is_citable",
    "document publishing year": "year",
    # per-language flags
    "document pt": "document_pt",
    "document es": "document_es",
    "document en": "document_en",
    "document other languages": "document_other_languages"
}
selected_columns = list(names_map)
df = dataset.loc[:, selected_columns].rename(columns=names_map)
df.head()

Unnamed: 0,issn,title,pid,is_citable,year,document_pt,document_es,document_en,document_other_languages
0,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800006,1,1998,0,0,1,0
1,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800011,1,1998,0,0,1,0
2,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800005,1,1998,0,0,1,0
3,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800009,1,1998,0,0,1,0
4,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800010,1,1998,0,0,1,0


#### Add `pub_year` (years up to 1996 are grouped as `ate_1996`)

In [28]:
# Every year up to and including 1996 collapses into the label "ate_1996";
# later years keep their own value.
is_1996_or_earlier = df["year"] <= 1996
df["pub_year"] = np.where(is_1996_or_earlier, "ate_1996", df["year"])
df.head()

Unnamed: 0,issn,title,pid,is_citable,year,document_pt,document_es,document_en,document_other_languages,pub_year
0,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800006,1,1998,0,0,1,0,1998
1,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800011,1,1998,0,0,1,0,1998
2,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800005,1,1998,0,0,1,0,1998
3,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800009,1,1998,0,0,1,0,1998
4,0100-879X,Brazilian Journal of Medical and Biological Re...,S0100-879X1998000800010,1,1998,0,0,1,0,1998


In [30]:
# (rows, columns) of the simplified languages frame, including `pub_year`.
df.shape

(855107, 10)

## Adding journal country as `origin`

The `country` column of the `cdf` dataframe is the affiliation country,
not the journal/article origin country.
Let's add the latter as a new `origin` column,
grabbing it from the collection
or from the ISSN (when collection is `spa`):

In [10]:
# NOTE(review): `cdf` is not created by any cell visible in this notebook
# (and this cell's execution count is out of sequence) — confirm where the
# frame with the `country` column is supposed to be loaded before this cell.
#
# Attach the journal country (`origin`) to every row of `cdf`:
#   * rows outside `spa` get it from their collection code (`amc_pairs`);
#   * `spa` rows get it from the journal ISSN (`spa_issn_country`).
# Both joins are inner joins, so rows without a matching origin are dropped.
#
# ignore_index=True: each merge result carries its own 0..n-1 index, so a
# plain concat would leave duplicated index labels in `cdfwof`.
cdfwof = pd.concat([
    pd.merge(cdf[cdf["collection"] != "spa"], amc_pairs,        how="inner", on="collection"),
    pd.merge(cdf[cdf["collection"] == "spa"], spa_issn_country, how="inner", on="issn"),
], ignore_index=True)
cdfwof[610_000::80_000] # wof stands for "With Origin, Filtered"

Unnamed: 0,pid,country,is_citable,issn,collection,year,origin
610000,S0104-11692014000500755,BR,1,0104-1169,scl,2014,BR
690000,S2237-96222016000300499,BR,1,2237-9622,scl,2016,BR
770000,S0034-71672018000700704,BR,1,0034-7167,scl,2018,BR
850000,S1851-300X2012000400008,AR,0,1851-300X,arg,2012,AR
930000,S0717-75182010000300001,BR,1,0717-7518,chl,2010,CL
1010000,S0034-74931996000200001,,1,0034-7493,cub,1996,CU
1090000,S0123-93922011000100008,CO,1,0123-9392,col,2011,CO
1170000,S0123-59232017000200153,CO,1,0123-5923,col,2017,CO
1250000,S0212-71992005000900004,,1,0212-7199,esp,2005,ES
1330000,S1870-34532013000200032,MX,1,1870-3453,mex,2013,MX


The rows without an assignable origin have been removed; compare `cdf.shape` and `cdfwof.shape` below.

### Add years

In [11]:
# Years up to and including 1996 share the single label "ate_1996";
# every later year keeps its own value.
year_is_1996_or_earlier = cdfwof["year"] <= 1996
cdfwof["years"] = np.where(year_is_1996_or_earlier, "ate_1996", cdfwof["year"])

In [12]:
# Size of the input frame, before rows were joined to an origin country.
cdf.shape

(1725496, 6)

In [13]:
# Size after the inner joins, with the extra `origin` and `years` columns.
cdfwof.shape

(1554496, 8)

In [14]:
# Spot check of a single article: a PID can span several rows
# (presumably one per affiliation — confirm against the source data).
cdfwof[(cdfwof["pid"] == "S0004-27302009000900010")]

Unnamed: 0,pid,country,is_citable,issn,collection,year,origin,years
287004,S0004-27302009000900010,BR,1,0004-2730,scl,2009,BR,2009
287005,S0004-27302009000900010,BR,1,0004-2730,scl,2009,BR,2009
287006,S0004-27302009000900010,BR,1,0004-2730,scl,2009,BR,2009


## Country summary

Are the affiliations countries and the journal/origin country always the same?
The goal now is to create a summary of the affiliation countries
by comparing them to the journal/origin country.

In [15]:
# Row-wise flag: the affiliation country equals the journal's origin country.
origin_country = cdfwof["country"].eq(cdfwof["origin"])

In [16]:
# Per-article (PID) affiliation summary.
#
# 1. Flag every row as origin country / other country / no country
#    (an empty `country` string means the country is unknown).
# 2. Sum per PID. numeric_only=True restricts the sum to numeric/boolean
#    columns, so string columns (issn, collection, origin, ...) are left out
#    explicitly instead of relying on pandas silently dropping them.
#    `is_citable` and `year` are summed too, so for PIDs with several rows
#    they hold a multiple of the real value and are not meaningful here.
# 3. Derive the boolean `has_*` / `all_no` flags from the per-PID counts.
# 4. Cast everything to integers (True -> 1, False -> 0); `astype(int)`
#    replaces the element-wise, deprecated `applymap(int)`.
result = (
    cdfwof.assign(
        origin_country=origin_country,
        other_country=~(origin_country | (cdfwof["country"] == "")),
        no_country=cdfwof["country"] == "",
    )
    .groupby("pid")
    .sum(numeric_only=True)
    .assign(
        has_origin=lambda counts: counts["origin_country"].astype(bool),
        has_other=lambda counts: counts["other_country"].astype(bool),
        has_no=lambda counts: counts["no_country"].astype(bool),
    )
    .assign(
        has_both=lambda flags: flags["has_origin"] & flags["has_other"],
        all_no=lambda flags: ~(flags["has_origin"] | flags["has_other"]),
    )
    .astype(int)
)

In [17]:
# Sample: every 2,500th row among the first 20,000 PIDs.
result[:20_000:2_500]

Unnamed: 0_level_0,is_citable,year,origin_country,other_country,no_country,has_origin,has_other,has_no,has_both,all_no
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
S0001-37141998000300001,1,1998,0,0,1,0,0,1,0,1
S0001-60022001000300006,1,2001,0,0,1,0,0,1,0,1
S0003-25732005000500009,1,2005,0,0,1,0,0,1,0,1
S0004-06222003000200001,1,2003,0,0,1,0,0,1,0,1
S0004-27302009000900010,3,6027,3,0,0,1,0,0,0,0
S0004-27492007000400003,3,6021,3,0,0,1,0,0,0,0
S0004-28032017000400356,1,2017,1,0,0,1,0,0,0,0
S0004-282X1992000100015,1,1992,1,0,0,1,0,0,0,0


Each row has an affiliation summary for a single article,
identified by its PID.
A brief explanation of the columns:

* `origin_country`: Number of affiliations whose country is the origin country;
* `other_country`: Number of affiliations whose country isn't the origin country;
* `no_country`: Number of affiliations whose country is unknown;
* `has_origin`: This article has at least one affiliation whose country is the origin country;
* `has_other`: This article has at least one affiliation whose country isn't the origin country;
* `has_no`: This article has at least one affiliation whose country is unknown;
* `has_both`: This article has affiliations from both the origin country and another country;
* `all_no`: All affiliations are from unknown countries.

The trailing columns are represented by the integers
`1` (meaning `True`) and `0` (meaning `False`).

## Final result

Let's join the ISSN, collection and origin information to our analysis:

In [18]:
# Distinct journal/collection attributes for each PID found in `cdfwof`.
pid_attributes = cdfwof[
    ["pid", "issn", "collection", "origin", "is_citable", "years"]
].drop_duplicates()

# Left-join those attributes onto the per-PID summary. `is_citable` exists on
# both sides, so pandas suffixes it: `is_citable_x` comes from `result`
# (summed per PID) and `is_citable_y` is the flag as stored in `cdfwof`.
full_result = (
    result.reset_index()
    .merge(pid_attributes, how="left", on="pid")
    .set_index("pid")
    .sort_index()
)
full_result[7_500::30_000]

Unnamed: 0_level_0,is_citable_x,year,origin_country,other_country,no_country,has_origin,has_other,has_no,has_both,all_no,issn,collection,origin,is_citable_y,years
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
S0004-06222003000200001,1,2003,0,0,1,0,0,1,0,1,0004-0622,ven,VE,1,2003
S0026-17422016000400056,0,2016,0,0,1,0,0,1,0,1,0026-1742,mex,MX,0,2016
S0034-89102014000600925,3,6042,2,1,0,1,1,0,1,0,0034-8910,scl,BR,1,2014
S0065-17372003000100008,2,4006,0,0,2,0,0,1,0,1,0065-1737,mex,MX,1,2003
S0100-204X2013000200011,2,4026,2,0,0,1,0,0,0,0,0100-204X,scl,BR,1,2013
S0100-879X2002000100002,1,2002,1,0,0,1,0,0,0,0,0100-879X,scl,BR,1,2002
S0102-311X2009000800005,4,8036,4,0,0,1,0,0,0,0,0102-311X,scl,BR,1,2009
S0103-21862011000100013,0,2011,0,0,1,0,0,1,0,1,0103-2186,scl,BR,0,2011
S0104-07072005000400014,1,2005,0,1,0,0,1,0,0,0,0104-0707,scl,BR,1,2005
S0120-28042014000300002,4,8056,4,0,0,1,0,0,0,0,0120-2804,col,CO,1,2014


### Check

In [19]:
# Positional window: the first 70 rows starting at position 153234.
full_result[153234:154000].iloc[:70]

Unnamed: 0_level_0,is_citable_x,year,origin_country,other_country,no_country,has_origin,has_other,has_no,has_both,all_no,issn,collection,origin,is_citable_y,years
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
S0100-83581995000100002,2,3990,0,2,0,0,1,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100003,2,3990,2,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100004,2,3990,2,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100005,3,5985,3,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100006,2,3990,1,0,1,1,0,1,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100007,1,1995,1,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100008,2,3990,2,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000100009,3,5985,3,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000200001,2,3990,2,0,0,1,0,0,0,0,0100-8358,scl,BR,1,ate_1996
S0100-83581995000200002,4,7980,3,0,1,1,0,1,0,0,0100-8358,scl,BR,1,ate_1996


## Checking the result

There should be neither more nor fewer affiliations than what we had when we started:

In [20]:
# The three per-PID counters must add up to the number of rows in `cdfwof`.
affiliation_counters = full_result[["origin_country", "other_country", "no_country"]]
affiliation_counters.values.sum() == len(cdfwof)

True

In [21]:
# (rows, columns) of the per-PID summary joined with the journal attributes.
full_result.shape

(774180, 15)

In [22]:
# Elapsed time since `start` was taken, up to this point of the notebook.
print(f"Notebook processing duration: {datetime.utcnow() - start}")

Notebook processing duration: 0:00:21.680409


## Pivot Table

###### filter by is_citable

In [23]:
# Keep only the articles whose citable flag (`is_citable_y`) is 1.
is_citable_article = full_result["is_citable_y"] == 1
filter_citables = full_result[is_citable_article]
filter_citables.shape

(683694, 15)

In [24]:
# Per-article 0/1 flags to be counted in the pivot table.
values_list = ["has_origin", "has_other", "has_no", "has_both", "all_no"]

# One row per ISSN, one column per (flag, years) pair; each cell counts the
# citable articles whose flag is non-zero, with 0 for missing combinations.
td = pd.pivot_table(
    filter_citables,
    values=values_list,
    index=["issn"],
    columns=["years"],
    aggfunc=np.count_nonzero,
    fill_value=0,
)

In [25]:
# Transposed view: one row per (flag, years) pair, one column per ISSN.
td.T

Unnamed: 0_level_0,issn,0001-3714,0001-3765,0001-6002,0001-6365,0002-0591,0002-192X,0002-7014,0003-2573,0004-0592,0004-0614,...,2451-6600,2468-9963,2469-0961,2504-3145,2518-4431,2520-9868,2531-0488,2531-1379,2545-7756,2595-3192
Unnamed: 0_level_1,years,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
all_no,1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,1998,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,1999,0,0,0,40,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,2000,0,12,20,45,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,2001,0,20,22,45,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,2002,0,1,3,48,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,2003,0,1,4,50,0,14,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,2004,0,1,17,40,0,7,0,25,0,0,...,0,0,0,0,0,0,0,0,0,0
all_no,2005,0,1,26,36,0,9,2,39,0,24,...,0,0,0,0,0,0,0,0,0,0
all_no,2006,0,0,12,54,0,10,0,35,0,14,...,0,0,0,0,0,0,0,0,0,0


#### Rename the column labels for the CSV

In [26]:
# r (rename): pivoted flag name -> column-name prefix used in the CSV.
r = {
    "has_origin": "pais_",
    "has_other": "estrang_",
    "has_no": "nao_ident_",
    "has_both": "pais_estrang_",
    "all_no": "nao_ident_todos_",
}
# The pivot table's columns are (flag, years) pairs; flatten each pair into
# a single "<prefix><years>" label, in column order.
newlabel = [r[flag] + years_label for flag, years_label in td.keys()]

In [27]:
# Every 24th label; in this run that is the first label of each flag group.
newlabel[::24]

['nao_ident_todos_1997',
 'pais_estrang_1997',
 'nao_ident_1997',
 'pais_1997',
 'estrang_1997']

In [28]:
# Replace the two-level (flag, years) columns with the flat labels.
# After this assignment the columns are plain strings, so the cell that
# builds `newlabel` from `td.keys()` cannot be re-run without rebuilding `td`.
td.columns = newlabel

In [29]:
# Preview of the renamed pivot table: first nine ISSNs.
td.head(9)

Unnamed: 0_level_0,nao_ident_todos_1997,nao_ident_todos_1998,nao_ident_todos_1999,nao_ident_todos_2000,nao_ident_todos_2001,nao_ident_todos_2002,nao_ident_todos_2003,nao_ident_todos_2004,nao_ident_todos_2005,nao_ident_todos_2006,...,estrang_2011,estrang_2012,estrang_2013,estrang_2014,estrang_2015,estrang_2016,estrang_2017,estrang_2018,estrang_2019,estrang_ate_1996
issn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001-3714,0,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0001-3765,0,0,0,12,20,1,1,1,1,0,...,30,16,29,24,46,48,48,64,0,0
0001-6002,0,0,0,20,22,3,4,17,26,12,...,5,0,0,0,0,1,3,3,0,0
0001-6365,0,0,40,45,45,48,50,40,36,54,...,0,0,0,0,0,0,0,0,0,0
0002-0591,0,0,0,0,0,0,0,0,0,0,...,0,4,4,3,0,0,0,0,0,0
0002-192X,0,0,0,0,0,1,14,7,9,10,...,3,1,1,0,0,0,0,0,0,0
0002-7014,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
0003-2573,0,0,0,0,0,0,0,25,39,35,...,10,16,11,16,6,10,8,0,0,0
0004-0592,0,0,0,0,0,0,0,0,0,0,...,114,57,60,61,0,0,0,0,0,0


In [30]:
# DataFrame.to_csv does not create missing directories, so make sure the
# "output" folder exists before writing (no-op when it is already there).
os.makedirs("output", exist_ok=True)
td.to_csv("output/td_languages_network.csv")

In [31]:
# Total elapsed time since `start`, including the pivot table and CSV export.
print(f"Notebook processing duration: {datetime.utcnow() - start}")

Notebook processing duration: 0:00:23.641049
