# Monthly Update
October 2, 2017

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes (e.g. so edits to local packages are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import date
import numpy as np
import os
import pandas as pd
import sys
# Absolute path two directory levels above the current working directory.
# It is appended to sys.path (once) so that the imports below can be resolved
# — presumably this is the directory that contains the Biblio_Reader package.
ancestor = os.path.abspath(os.path.join(os.pardir, os.pardir))
if ancestor not in sys.path:
    sys.path.append(ancestor)
from Biblio_Reader import manager as mg
from Biblio_Reader.biblio_reader import pdf_utils
from Biblio_Reader.biblio_reader import pub_med
from Biblio_Reader.biblio_reader.text_tools import convertToText, text_tools
# Baseline 'DateAdded' value stamped on every row when the data is loaded from
# the original FCP_DATA.csv rather than from a yearly update file.
initial_date = date(2017, 3, 22)

Load existing data…

In [None]:
# Path of this year's running update file.
ytd = '../outputs/FCP_DATA_updated_{0}.csv'.format(date.today().year)
if os.path.exists(ytd):
    # A yearly update already exists: continue from it.
    existing = pd.read_csv(ytd)
else:
    # First update of the year: start from the original export and stamp
    # every row with the baseline date.
    existing = pd.read_csv('../outputs/FCP_DATA.csv')
    existing['DateAdded'] = initial_date
# Discard rows in which every column is empty.
existing = existing.dropna(axis=0, how='all')

Load today's data…

In [None]:
# Today's export is named after the ISO date, e.g. ../outputs/2017-10-02.csv.
today_csv = os.path.join('../outputs', '{0}.csv'.format(date.today()))
today = pd.read_csv(today_csv)
# Stamp every freshly loaded row with the date of this update.
today['DateAdded'] = date.today()

Lowercase, stripped titles for comparison…

In [None]:
# Work on copies so the original title casing in `today` / `existing` is kept.
today_lower = today.copy()
existing_lower = existing.copy()
# Normalise titles (strip surrounding whitespace, lowercase) for comparison.
# The vectorised .str accessor propagates missing titles as NaN, whereas a
# Python-level `x.strip()` raises AttributeError on a NaN (float) title.
today_lower['Title'] = today_lower['Title'].str.strip().str.lower()
existing_lower['Title'] = existing_lower['Title'].str.strip().str.lower()

Drop duplicates…

In [None]:
# Duplicates inside today's export: keep the first row per URL, then per title.
today_lower = today_lower.drop_duplicates(subset=['URL'])
today_lower = today_lower.drop_duplicates(subset=['Title'])
# Duplicates against the existing data: drop rows whose URL or (normalised)
# title is already present.
today_lower = today_lower.loc[~today_lower['URL'].isin(existing_lower['URL'])]
today_lower = today_lower.loc[~today_lower['Title'].isin(existing_lower['Title'])]
# Keep the surviving rows of the original-case frame, matched on the
# 'Unnamed: 0' column. The explicit .copy() makes `today` an independent frame:
# later cells add columns to it, which on a filtered slice triggers
# SettingWithCopyWarning.
today = today[today['Unnamed: 0'].isin(today_lower['Unnamed: 0'])].copy()

Visually check for straggling duplicates:

In [None]:
# Alphabetically sorted, de-duplicated titles for eyeballing near-duplicates.
sorted_titles = today['Title'].sort_values()
list(sorted_titles.unique())

See what preprints have since been published…

In [None]:
# Existing records categorised as 'Other' or 'Preprint', and the preprints alone.
candidate_categories = ['Other', 'Preprint']
preprints_and_others = existing_lower[
    existing_lower['Journal Category'].isin(candidate_categories)
]
preprints = existing_lower[existing_lower['Journal Category'] == 'Preprint']

# New rows that share a title or an author string with one of those records.
same_title = today_lower['Title'].isin(preprints_and_others['Title'])
same_authors = today_lower['Authors'].isin(preprints_and_others['Authors'])
potentials = list(
    set(today_lower.loc[same_title, 'Title'])
    | set(today_lower.loc[same_authors, 'Title'])
)
print(potentials)

In [None]:
# Number the new rows consecutively after the largest existing id.
# Series.max() skips NaN (the built-in max() is order-dependent when NaN is
# present) and int() keeps range() working when the 'i' column was read back
# as float.
first_new_i = int(existing['i'].max()) + 1
today['i'] = range(first_new_i, first_new_i + today.shape[0])
# errors='ignore' lets this cell be re-run after the column is already gone.
today = today.drop('Unnamed: 0', axis=1, errors='ignore')
# Index the frame by the new id so later cells can look rows up by id.
today.index = today['i']

In [None]:
# Per-run working directory <ancestor>/Biblio_Reader/<YYYY-MM-DD>.
today_dir = os.path.abspath(os.path.join(ancestor, 'Biblio_Reader', str(date.today())))
# exist_ok=True replaces the separate exists() check and its check-then-create gap.
os.makedirs(today_dir, exist_ok=True)

PubMed IDs:

In [None]:
# NOTE(review): presumably looks up a PubMed Central id per row — confirm in pub_med.get_ids.
today['PMCID'] = pub_med.get_ids(today)
# Dated directory under the project's working path for the bibliography files.
today_pubmed = os.path.join(mg.WORKING_PATH, str(date.today()))
# exist_ok=True replaces the separate exists() check and its check-then-create gap.
os.makedirs(today_pubmed, exist_ok=True)
pub_med.write_bib(today, today_pubmed)
# The parsed output path is the directory path with a '_bibs.csv' suffix.
pub_med.parse_bib(today_pubmed, "{0}_bibs.csv".format(today_pubmed))

In [None]:
# Pair each new record id with its URL.
id_url_pairs = zip(today['i'], today['URL'])
# NOTE(review): presumably locates/downloads PDFs into today_dir — confirm in pdf_utils.pdffinder.
d = pdf_utils.pdffinder(id_url_pairs, today_dir)

In [None]:
# Hand the whole frame of new records and the per-run directory to the helper.
# NOTE(review): presumably fetches PDFs for the rows into today_dir — confirm in pdf_utils.pdfopener.
d = pdf_utils.pdfopener(today, today_dir)

In [None]:
# Pair each new record id with its URL.
id_url_pairs = zip(today['i'], today['URL'])
# NOTE(review): presumably handles arXiv-hosted articles — confirm in pdf_utils.arxiv_open.
d = pdf_utils.arxiv_open(id_url_pairs, today_dir)

In [None]:
# Pair each new record id with its URL.
id_url_pairs = zip(today['i'], today['URL'])
# NOTE(review): presumably handles PLOS-hosted articles — confirm in pdf_utils.plos_open.
d = pdf_utils.plos_open(id_url_pairs, today_dir)

In [None]:
# Pair each new record id with its URL.
id_url_pairs = zip(today['i'], today['URL'])
# NOTE(review): presumably handles Liebert-hosted articles — confirm in pdf_utils.liebert_open.
d = pdf_utils.liebert_open(id_url_pairs, today_dir)

In [None]:
# Pair each new record id with its URL.
id_url_pairs = zip(today['i'], today['URL'])
# NOTE(review): presumably handles CiteSeer-hosted articles — confirm in pdf_utils.citeseer_open.
d = pdf_utils.citeseer_open(id_url_pairs, today_dir)

Manually download as many of these as you can:

In [None]:
# NOTE(review): presumably the ids whose downloaded PDF is unreadable — confirm in pdf_utils.find_corrupted.
corrupted = set(pdf_utils.find_corrupted(today_dir))
# Ids for which a '<id>.pdf' file exists in today_dir.
# os.path.splitext removes the extension properly; str.rstrip('.pdf') strips any
# trailing '.', 'p', 'd' or 'f' characters rather than the suffix. Files whose
# stem is not a number are skipped instead of crashing int().
haves = {
    int(stem)
    for stem, ext in map(os.path.splitext, os.listdir(today_dir))
    if ext == '.pdf' and stem.isdecimal()
}
# Still missing: new ids without a PDF on disk, plus the corrupted ones.
have_nots = {i for i in today['i'] if i not in haves} | corrupted

In [None]:
# Strip the Google Scholar host prefix from URLs that carry it. This is computed
# once, outside the loop, because it does not depend on the id being printed.
scholar_prefix = 'http://scholar.google.com/'
path = today['URL'].apply(
    lambda x: x[len(scholar_prefix):] if x.startswith(scholar_prefix) else x
)
# One line per missing article: "<id> : <url> : <title>".
for i in have_nots:
    print(i, path[i], today['Title'][i], sep=" : ")

Check again after the manual downloads…

In [None]:
# NOTE(review): presumably the ids whose downloaded PDF is unreadable — confirm in pdf_utils.find_corrupted.
corrupted = set(pdf_utils.find_corrupted(today_dir))
# Ids for which a '<id>.pdf' file exists in today_dir.
# os.path.splitext removes the extension properly; str.rstrip('.pdf') strips any
# trailing '.', 'p', 'd' or 'f' characters rather than the suffix. Files whose
# stem is not a number are skipped instead of crashing int().
haves = {
    int(stem)
    for stem, ext in map(os.path.splitext, os.listdir(today_dir))
    if ext == '.pdf' and stem.isdecimal()
}
# Still missing: new ids without a PDF on disk, plus the corrupted ones.
have_nots = {i for i in today['i'] if i not in haves} | corrupted

In [None]:
# Strip the Google Scholar host prefix from URLs that carry it. This is computed
# once, outside the loop, because it does not depend on the id being printed.
scholar_prefix = 'http://scholar.google.com/'
path = today['URL'].apply(
    lambda x: x[len(scholar_prefix):] if x.startswith(scholar_prefix) else x
)
# One line per missing article: "<id> : <url> : <title>".
for i in have_nots:
    print(i, path[i], today['Title'][i], sep=" : ")

After downloading pdfs, convert to plaintext:

In [None]:
# Plain-text output goes in a 'txt' subdirectory of the per-run directory.
txt_dir = os.path.join(today_dir, 'txt')
# exist_ok=True replaces the separate exists() check and its check-then-create gap.
os.makedirs(txt_dir, exist_ok=True)
# NOTE(review): presumably walks today_dir and writes a text file per PDF into txt_dir — confirm in convertToText.walkAndText.
convertToText.walkAndText(today_dir, txt_dir)

Collect relevant paragraphs:

In [None]:
# Search the converted texts for every weighted and unweighted term set.
search_sets = [*mg.WEIGHTED_SETS, *mg.UNWEIGHTED_SETS]
excerpt_dir = os.path.join(today_dir, 'excerpts')
# NOTE(review): assumes find_paragraphs returns a mapping of record id -> list
# of paragraph strings, keyed compatibly with today's index — confirm.
paragraphs = pd.Series(text_tools.find_paragraphs(txt_dir, search_sets, excerpt_dir))
# Join each record's paragraphs with blank lines; assignment aligns on the index.
today['Excerpt'] = paragraphs.apply("\n\n".join)

In [None]:
# NOTE(review): assumes assoc_sets returns a mapping keyed by record id — confirm in text_tools.
sets = text_tools.assoc_sets(
    today,
    txt_dir,
    mg.WEIGHTED_SETS,
    mg.UNWEIGHTED_SETS
)
# Smallest id among today's rows, computed once instead of once per key.
first_new_i = today['i'].min()
# Keep only entries at or above that id; assignment aligns on today's index.
today['Sets'] = pd.Series(
    {k: sets[k] for k in sets if k >= first_new_i}
)

Update csv:

In [None]:
# Outer-merge the new rows into the existing data (on the shared columns) and
# write the result back to the yearly file.
# `today` carries 'i' both as its index name and as a column; recent pandas
# raises "'i' is both an index level and a column label" when merging such a
# frame, so the index is dropped first. The merge result does not use it.
# NOTE(review): to_csv also writes the row index, which reads back as an
# 'Unnamed: 0' column — confirm that is intended.
existing.merge(today.reset_index(drop=True), how='outer').to_csv(ytd)

Vet what's left on the web:

In [None]:
# Pair each new record id (as a string) with its URL.
id_url_pairs = zip(today['i'].astype(str), today['URL'])
# NOTE(review): presumably reports articles with no text file in txt_dir — confirm in pdf_utils.find_articles_left.
pdf_utils.find_articles_left(id_url_pairs, txt_dir)

Human-readable excerpt txt file:

In [None]:
# Write one "<id>, blank line, <excerpt>" entry per new record, with entries
# separated by three blank lines.
excerpts_path = os.path.join(today_dir, 'excerpts.txt')
# An explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose
# default encoding cannot represent characters in the extracted text.
with open(excerpts_path, 'w', encoding='utf-8') as f:
    f.writelines(
        "{0}\n\n{1}\n\n\n\n".format(i, j)
        for i, j in zip(today.index, today['Excerpt'])
    )

Come back after all the manual vetting and run the numbers!

In [7]:
# Show the column labels of the existing data, as a reference for the counts below.
existing.columns

Index(['Unnamed: 0', 'i', 'Title', 'URL', 'Year', 'Citations', 'Versions',
       'Cluster ID', 'PDF link', 'Citations list', 'Versions list',
       'Citation link', 'Excerpt', 'Journal', 'Authors', 'Publisher',
       'Citations Per Year', 'Journal Category', 'PMCID', 'Affiliations',
       'Qualifiers', 'Data Use', 'Sets', 'Contributor', 'Duplicate of',
       'DateAdded'],
      dtype='object')

In [None]:
# (rows, columns) of the existing data.
existing.shape

In [None]:
# Count rows with Data Use 'Y' or 'N' whose Contributor value is 'Contributor'.
data_use_yn = existing['Data Use'].isin(['Y', 'N'])
by_contributor = existing['Contributor'] == 'Contributor'
len(existing.loc[data_use_yn & by_contributor])

In [None]:
# Count journal rows with Data Use 'Y' whose Contributor value is 'Contributor'.
in_journal = existing['Journal Category'] == 'Journal'
data_use_y = existing['Data Use'].isin(['Y'])
by_contributor = existing['Contributor'] == 'Contributor'
len(existing.loc[in_journal & data_use_y & by_contributor])

In [None]:
# Count rows with Data Use 'Y' or 'N' whose Contributor value is 'Not a Contributor'.
data_use_yn = existing['Data Use'].isin(['Y', 'N'])
by_non_contributor = existing['Contributor'] == 'Not a Contributor'
len(existing.loc[data_use_yn & by_non_contributor])

In [15]:
# Count rows that are either in a journal or qualified as 'peer-reviewed', have
# Data Use 'Y', 'S' or 'N', and whose Contributor value is 'Not a Contributor'.
journal_or_peer_reviewed = (
    (existing['Journal Category'] == 'Journal')
    | (existing['Qualifiers'] == 'peer-reviewed')
)
data_use_ysn = existing['Data Use'].isin(['Y', 'S', 'N'])
by_non_contributor = existing['Contributor'] == 'Not a Contributor'
len(existing.loc[journal_or_peer_reviewed & data_use_ysn & by_non_contributor])

923

In [16]:
# Count rows that are either in a journal or qualified as 'peer-reviewed' and
# have Data Use 'Y', 'S' or 'N'.
journal_or_peer_reviewed = (
    (existing['Journal Category'] == 'Journal')
    | (existing['Qualifiers'] == 'peer-reviewed')
)
data_use_ysn = existing['Data Use'].isin(['Y', 'S', 'N'])
len(existing.loc[journal_or_peer_reviewed & data_use_ysn])

1060

In [6]:
# Count rows with Data Use 'Y' or 'N'.
data_use_yn = existing['Data Use'].isin(['Y', 'N'])
len(existing.loc[data_use_yn])

1442

In [5]:
# Count journal rows with Data Use 'Y' or 'N'.
in_journal = existing['Journal Category'] == 'Journal'
data_use_yn = existing['Data Use'].isin(['Y', 'N'])
len(existing.loc[in_journal & data_use_yn])

993