# Dates pivot table

In [1]:
from datetime import datetime
start = datetime.utcnow() # For measuring the total processing time

In [2]:
import json
from urllib.request import urlopen
import pandas as pd
import numpy as np

  return f(*args, **kwds)


## Get collection information from ArticleMeta 

In [3]:
AMC_URL = "http://articlemeta.scielo.org/api/v1/collection/identifiers/"
amc_data = pd.DataFrame(json.load(urlopen(AMC_URL)))

print("Number of collections: " + str(amc_data.shape[0]+1))
amc_data.head(2)

Number of collections: 34


Unnamed: 0,acron,acron2,code,document_count,domain,has_analytics,is_active,journal_count,name,original_name,status,type
0,arg,ar,arg,39006.0,www.scielo.org.ar,True,True,"{'deceased': 22, 'current': 125}","{'en': 'Argentina', 'pt': 'Argentina', 'es': '...",Argentina,certified,journals
1,chl,cl,chl,63467.0,www.scielo.cl,True,True,"{'deceased': 13, 'suspended': 1, 'current': 105}","{'en': 'Chile', 'pt': 'Chile', 'es': 'Chile'}",Chile,certified,journals


##### Filtering valid collections and renames 'code' to 'collection'

Some collections won't be analyzed, mainly to avoid duplicates
(there are articles in more than one collection).
The `spa` (*Public Health* collection) should have part of it
kept in the result, but it's not a collection
whose journals/articles are assigned to a single country.
The collections below are linked to a single country:

In [4]:
dont_evaluate = ["bio", "cci", "cic", "ecu", "psi", "pry", "rve", "rvo", "rvt", "sss", "spa", "wid"]
amc_names_map = {"code": "collection"}
amc_pairs = amc_data[(amc_data["acron2"].str.len() == 2) & 
                     ~amc_data["code"].isin(dont_evaluate)]\
                    .rename(columns=amc_names_map)\
#                    "~amc_data["code"].isin(dont_evaluate)]" is denying the list "dont_evaluate"
print("Number of collections: " + str(amc_pairs.shape[0]+1))

collections = amc_pairs[['collection']].copy()
collections

Number of collections: 15


Unnamed: 0,collection
0,arg
1,chl
2,col
3,cub
4,esp
5,mex
6,prt
8,scl
11,sza
12,ven


## ISSN selection from `spa`

These journals in the `spa` collection have the following countries:

In [5]:
spa_issn_country = pd.DataFrame([
    ("0021-2571"),
    ("0042-9686"),
    ("1020-4989"),
    ("1555-7960"),
], columns=["issn"])
spa_issn_country # For collection = "spa", only!

Unnamed: 0,issn
0,0021-2571
1,0042-9686
2,1020-4989
3,1555-7960


## Dates dataset

This dataset is the
[Network spreadsheet/CSV pack](https://static.scielo.org/tabs/tabs_network.zip)
 which can be found in the
[SciELO Analytics report](https://analytics.scielo.org/w/reports)
web page.
The first two rows of it are:

#### Unzip the CSV file

In [6]:
import zipfile

# Use the Zip file in jcatalog/data/scielo
with zipfile.ZipFile( "../../data/scielo/tabs_network_190210.zip", 'r') as zip_ref:
    zip_ref.extract('documents_dates.csv', 'csv_files')

In [7]:
df0 = pd.read_csv('csv_files/documents_dates.csv', keep_default_na=False, low_memory=False)
df0.shape

(877068, 49)

#### Simplify the column names

In [8]:
names_map = {
    "ISSN SciELO": "issn",
    "title at SciELO":"title",
    "document publishing ID (PID SciELO)": "docs",
    "document type":"type",
    "document is citable": "is_citable",
    "document publishing year": "year"
}
#  df[list(names_map.keys())].rename(columns=names_map, inplace=True)
df0.rename(columns=names_map, inplace=True)
df0.head(2)

Unnamed: 0,extraction date,study unit,collection,issn,ISSN's,title,title thematic areas,title is agricultural sciences,title is applied social sciences,title is biological sciences,...,document published at month,document published at day,document published in SciELO at,document published in SciELO at year,document published in SciELO at month,document published in SciELO at day,document updated in SciELO at,document updated in SciELO at year,document updated in SciELO at month,document updated in SciELO at day
0,2019-02-10,document,scl,0100-879X,0100-879X;1414-431X,Brazilian Journal of Medical and Biological Re...,Biological Sciences;Health Sciences,0,0,1,...,8,,1998-09-21,1998,9,21,2016-06-30,2016,6,30
1,2019-02-10,document,scl,0100-879X,0100-879X;1414-431X,Brazilian Journal of Medical and Biological Re...,Biological Sciences;Health Sciences,0,0,1,...,8,,1998-09-21,1998,9,21,2016-06-30,2016,6,30


### Creates a new DataFrame: filtering SPA and discarding those collections that are not analyzable

In [9]:
df = pd.concat([
    pd.merge(df0[df0["collection"] != "spa"], collections,      how="inner", on="collection"),
    pd.merge(df0[df0["collection"] == "spa"], spa_issn_country, how="inner", on="issn"),
])

In [10]:
df.head(2)

Unnamed: 0,extraction date,study unit,collection,issn,ISSN's,title,title thematic areas,title is agricultural sciences,title is applied social sciences,title is biological sciences,...,document published at month,document published at day,document published in SciELO at,document published in SciELO at year,document published in SciELO at month,document published in SciELO at day,document updated in SciELO at,document updated in SciELO at year,document updated in SciELO at month,document updated in SciELO at day
0,2019-02-10,document,scl,0100-879X,0100-879X;1414-431X,Brazilian Journal of Medical and Biological Re...,Biological Sciences;Health Sciences,0,0,1,...,8,,1998-09-21,1998,9,21,2016-06-30,2016,6,30
1,2019-02-10,document,scl,0100-879X,0100-879X;1414-431X,Brazilian Journal of Medical and Biological Re...,Biological Sciences;Health Sciences,0,0,1,...,8,,1998-09-21,1998,9,21,2016-06-30,2016,6,30


In [11]:
# compare
df0.shape

(877068, 49)

In [12]:
df.shape

(793648, 49)

In [13]:
set(df.collection)

{'arg',
 'bol',
 'chl',
 'col',
 'cri',
 'cub',
 'esp',
 'mex',
 'per',
 'prt',
 'scl',
 'spa',
 'sza',
 'ury',
 'ven'}

### Add pub_year (ate_1996)

In [14]:
df["pub_year"] = np.where(df['year'] <= 1996, 'ate_1996', df["year"])

### Convert strint to int

In [15]:
df['document published at year'] = pd.to_numeric(df['document published at year'], errors='coerce')
df['document published at month'] = pd.to_numeric(df['document published at month'], errors='coerce')

df['document accepted at year'] = pd.to_numeric(df['document accepted at year'], errors='coerce')
df['document accepted at month'] = pd.to_numeric(df['document accepted at month'], errors='coerce')

df['document submitted at year'] = pd.to_numeric(df['document submitted at year'], errors='coerce')
df['document submitted at month'] = pd.to_numeric(df['document submitted at month'], errors='coerce')

### Get the current Year

In [16]:
current_year = datetime.now().year
print(current_year)

2019


### Insert columns for checking

In [17]:
df['check_doc_pub_scielo'] = np.where(
    (df['document published in SciELO at year'] >= 1997) & 
    (df['document published in SciELO at year'] <= current_year) & 
    (df['document published in SciELO at month'] >= 1) & 
    (df['document published in SciELO at month'] <= 12) &
    (df['document published in SciELO at day'] >= 1) & 
    (df['document published in SciELO at day'] <= 31), 0,1)

In [18]:
df['check_doc_pub'] = np.where(
    (df['document published at year'] >= 1997) & 
    (df['document published at year'] <= current_year) & 
    (df['document published at month'] >= 1) & 
    (df['document published at month'] <= 12), 0, 1)

In [19]:
df['check_doc_accepted'] = np.where(
    (df['document accepted at year'] >= 1997) & 
    (df['document accepted at year'] <= current_year) & 
    (df['document accepted at month'] >= 1) & 
    (df['document accepted at month'] <= 12), 0, 1)

In [20]:
df['check_doc_submitted'] = np.where(
    (df['document submitted at year'] >= 1997) & 
    (df['document submitted at year'] <= current_year) & 
    (df['document submitted at month'] >= 1) & 
    (df['document submitted at month'] <= 12), 0, 1)

### Insert columns with calcule of months

In [21]:
df['meses_sub_aprov'] = np.where(
    (df.check_doc_submitted == 0) & (df.check_doc_accepted == 0),
    (df['document accepted at year'] * 12 + df['document accepted at month']) - 
    (df['document submitted at year'] * 12 + df['document submitted at month']), np.nan)

In [22]:
df['meses_aprov_pub'] = np.where(
    (df.check_doc_accepted == 0) & (df.check_doc_pub == 0),
    (df['document published at year'] * 12 + df['document published at month']) - 
    (df['document accepted at year'] * 12 + df['document accepted at month']), np.nan)

In [23]:
df['meses_sub_pub'] = np.where(
    (df.check_doc_submitted == 0) & (df.check_doc_pub == 0),
    (df['document published at year'] * 12 + df['document published at month']) - 
    (df['document submitted at year'] * 12 + df['document submitted at month']), np.nan)

In [24]:
df['meses_aprov_pub_scielo'] = np.where(
    (df.check_doc_accepted == 0) & (df.check_doc_pub_scielo == 0),
    (df['document published in SciELO at year'] * 12 + df['document published in SciELO at month']) - 
    (df['document accepted at year'] * 12 + df['document accepted at month']), np.nan)

In [25]:
df['meses_sub_pub_scielo'] = np.where(
    (df.check_doc_submitted == 0) & (df.check_doc_pub_scielo == 0),
    (df['document published in SciELO at year'] * 12 + df['document published in SciELO at month']) - 
    (df['document submitted at year'] * 12 + df['document submitted at month']), np.nan)

### Filter citables documents

In [26]:
dfcit = df[df['is_citable'] == 1]
dfcit.shape

(700756, 59)

### Pivot Table

In [27]:
values_list = ['meses_sub_aprov', 
               'meses_aprov_pub', 
               'meses_sub_pub', 
               'meses_aprov_pub_scielo', 
               'meses_sub_pub_scielo']

td = dfcit.pivot_table(
     index=["issn"],
     values=values_list,
     columns=["pub_year"],
     aggfunc=[np.nanmean, np.nanstd],
     fill_value="")

  f = lambda x: func(x, *args, **kwargs)
  keepdims=keepdims)


In [28]:
td.head(10).T

Unnamed: 0_level_0,Unnamed: 1_level_0,issn,0001-3714,0001-3765,0001-6002,0001-6365,0002-0591,0002-192X,0002-7014,0003-2573,0004-0592,0004-0614
Unnamed: 0_level_1,Unnamed: 1_level_1,pub_year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
nanmean,meses_aprov_pub,1997,,,,,,,,,,
nanmean,meses_aprov_pub,1998,1.46875,,,,,,,,,
nanmean,meses_aprov_pub,1999,1.42553,,,,,,,,,
nanmean,meses_aprov_pub,2000,,5.10638,4.22222,6.16667,,,,,,
nanmean,meses_aprov_pub,2001,,5.71739,2.15,6.8125,,,,,,
nanmean,meses_aprov_pub,2002,,5.16,3.45455,6.95238,,,,,,
nanmean,meses_aprov_pub,2003,,4.675,2.5,8.5,,,,,,
nanmean,meses_aprov_pub,2004,,5.10976,3.26087,11.7826,,,,,,
nanmean,meses_aprov_pub,2005,,6.91667,3.26087,11.9286,,,10.0357,,,
nanmean,meses_aprov_pub,2006,,7.70968,4.56667,16.5484,,,11.2041,,,


In [29]:
td.columns.levels

FrozenList([['nanmean', 'nanstd'], ['meses_aprov_pub', 'meses_aprov_pub_scielo', 'meses_sub_aprov', 'meses_sub_pub', 'meses_sub_pub_scielo'], ['1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', 'ate_1996']])

### Renames the labels for CSV

In [30]:
td.keys()
for k in td.keys():
    print(k)

('nanmean', 'meses_aprov_pub', '1997')
('nanmean', 'meses_aprov_pub', '1998')
('nanmean', 'meses_aprov_pub', '1999')
('nanmean', 'meses_aprov_pub', '2000')
('nanmean', 'meses_aprov_pub', '2001')
('nanmean', 'meses_aprov_pub', '2002')
('nanmean', 'meses_aprov_pub', '2003')
('nanmean', 'meses_aprov_pub', '2004')
('nanmean', 'meses_aprov_pub', '2005')
('nanmean', 'meses_aprov_pub', '2006')
('nanmean', 'meses_aprov_pub', '2007')
('nanmean', 'meses_aprov_pub', '2008')
('nanmean', 'meses_aprov_pub', '2009')
('nanmean', 'meses_aprov_pub', '2010')
('nanmean', 'meses_aprov_pub', '2011')
('nanmean', 'meses_aprov_pub', '2012')
('nanmean', 'meses_aprov_pub', '2013')
('nanmean', 'meses_aprov_pub', '2014')
('nanmean', 'meses_aprov_pub', '2015')
('nanmean', 'meses_aprov_pub', '2016')
('nanmean', 'meses_aprov_pub', '2017')
('nanmean', 'meses_aprov_pub', '2018')
('nanmean', 'meses_aprov_pub', '2019')
('nanmean', 'meses_aprov_pub', 'ate_1996')
('nanmean', 'meses_aprov_pub_scielo', '1997')
('nanmean', 'm

In [31]:
newlabel = []
for k in td.keys():
    newlabel.append(k[0]
                    .replace('nanmean', 'media')
                    .replace('nanstd', 'desvp')+'_'+k[1]+'_'+k[2])

In [32]:
newlabel

['media_meses_aprov_pub_1997',
 'media_meses_aprov_pub_1998',
 'media_meses_aprov_pub_1999',
 'media_meses_aprov_pub_2000',
 'media_meses_aprov_pub_2001',
 'media_meses_aprov_pub_2002',
 'media_meses_aprov_pub_2003',
 'media_meses_aprov_pub_2004',
 'media_meses_aprov_pub_2005',
 'media_meses_aprov_pub_2006',
 'media_meses_aprov_pub_2007',
 'media_meses_aprov_pub_2008',
 'media_meses_aprov_pub_2009',
 'media_meses_aprov_pub_2010',
 'media_meses_aprov_pub_2011',
 'media_meses_aprov_pub_2012',
 'media_meses_aprov_pub_2013',
 'media_meses_aprov_pub_2014',
 'media_meses_aprov_pub_2015',
 'media_meses_aprov_pub_2016',
 'media_meses_aprov_pub_2017',
 'media_meses_aprov_pub_2018',
 'media_meses_aprov_pub_2019',
 'media_meses_aprov_pub_ate_1996',
 'media_meses_aprov_pub_scielo_1997',
 'media_meses_aprov_pub_scielo_1998',
 'media_meses_aprov_pub_scielo_1999',
 'media_meses_aprov_pub_scielo_2000',
 'media_meses_aprov_pub_scielo_2001',
 'media_meses_aprov_pub_scielo_2002',
 'media_meses_aprov_pub_

In [33]:
newlabel[0::24]

['media_meses_aprov_pub_1997',
 'media_meses_aprov_pub_scielo_1997',
 'media_meses_sub_aprov_1997',
 'media_meses_sub_pub_1997',
 'media_meses_sub_pub_scielo_1997',
 'desvp_meses_aprov_pub_1997',
 'desvp_meses_aprov_pub_scielo_1997',
 'desvp_meses_sub_aprov_1997',
 'desvp_meses_sub_pub_1997',
 'desvp_meses_sub_pub_scielo_1997']

In [34]:
td.columns = newlabel

In [35]:
td.T

issn,0001-3714,0001-3765,0001-6002,0001-6365,0002-0591,0002-192X,0002-7014,0003-2573,0004-0592,0004-0614,...,2504-3145,2518-4431,2520-9868,2526-8910,2531-0488,2531-1379,2545-7756,2594-1321,2595-3192,2619-6573
media_meses_aprov_pub_1997,,,,,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_1998,1.46875,,,,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_1999,1.42553,,,,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_2000,,5.10638,4.22222,6.16667,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_2001,,5.71739,2.15,6.8125,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_2002,,5.16,3.45455,6.95238,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_2003,,4.675,2.5,8.5,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_2004,,5.10976,3.26087,11.7826,,,,,,,...,,,,,,,,,,
media_meses_aprov_pub_2005,,6.91667,3.26087,11.9286,,,10.0357,,,,...,,,,,,,,,,
media_meses_aprov_pub_2006,,7.70968,4.56667,16.5484,,,11.2041,,,,...,,,,,,,,,,


In [36]:
td.to_csv("output/td_documents_dates_network.csv")

In [37]:
print(f"Notebook processing duration: {datetime.utcnow() - start}")

Notebook processing duration: 0:00:35.673077
