### Data Pre-processing and Cleaning

In [1]:
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook

In [2]:
import math

def isOpen(line):
    """Label a PMC-id cell as ``'open'`` or ``'closed'``.

    Parameters
    ----------
    line : object
        A single cell value, e.g. a PMC id string or a float NaN for a
        missing id.

    Returns
    -------
    str
        ``'closed'`` when ``line`` is NaN, ``'open'`` for every other value.
        This includes non-NaN numbers, which used to be returned unchanged
        instead of being labelled.
    """
    try:
        missing = math.isnan(line)
    except TypeError:
        # math.isnan() only accepts real numbers; anything else (such as an
        # id string) is a present value, so the paper counts as open.
        return 'open'
    return 'closed' if missing else 'open'

# Create a new column that checks whether the paper is open or not
#data['isopen'] = data.pmd.apply(lambda l:isOpen(l))

#### Tried this bash approach 

This did not work: the `join` command could not handle the tab-separated output of `sort`.

In [4]:
%%bash
# Join the PMID -> PMC lookup with the parsed journal/date rows on their first
# (PMID) column. ../Data/CleanedPubidJournalYear.txt is written by the Python
# parsing cell of this notebook, so that cell must have been run beforehand.
#
# Both inputs are tab-separated and their fields may contain spaces. join
# splits fields on blanks unless told otherwise, which is why a plain `join`
# fails on these files; -t makes sort and join split on tabs only.
# join also needs both inputs sorted on the join field under the same
# collation, so pin the C locale (en_EN is not a valid locale name) and sort
# on column 1 explicitly.
export LC_ALL=C
TAB=$'\t'

sort -t "$TAB" -k1,1 ../Data/CleanedPubidJournalYear.txt \
>../Data/CleanedPubidJournalYearSorted.txt

sort -t "$TAB" -k1,1 ../Data/pmid_pmc_check.txt \
>../Data/pmid_pmc_check_sorted.txt

join -t "$TAB" ../Data/pmid_pmc_check_sorted.txt \
../Data/CleanedPubidJournalYearSorted.txt \
>../Data/CleanedPubidJournalYearPmic.txt

#### So I wrote Python code to parse the abstracts for useful information

In [5]:
# Scan the abstracts dump and write one "pmid<TAB>journal<TAB>date" row per
# record whose citation line could be split into journal and date.
# The input is opened first so that a missing abstracts file raises before the
# output file is created/truncated.
with open('../Data/abstracts.txt') as abstract, \
        open('../Data/CleanedPubidJournalYear.txt', 'w') as clean:
    # tag is True between a successfully parsed citation line and the
    # "PMID:" line that closes the same record.
    tag = False
    for line in abstract:
        # A record header is "<number>. ..." at the start of the line. Using
        # partition() accepts a record number of any length (not just 1-3
        # digits) and requires the whole prefix to be digits.
        number, separator, _ = line.partition('. ')
        if separator and number.isdigit():
            if tag:
                # Still waiting for the PMID of the current record; ignore
                # numbered lines that appear inside it.
                continue
            # Treat ';' and ':' like '.', so the citation line splits into
            # [number, journal, date, ...].
            fields = line.replace(';', '.').replace(':', '.').split('.')
            try:
                date = fields[2]
                journal = fields[1]
                tag = True
            except IndexError:
                # Not enough '.'-separated parts to be a citation line; show
                # it for manual inspection and skip it.
                print(line)
                tag = False
        if tag and line.startswith('PMID:'):
            pubid = line.split()[1]
            tag = False
            clean.write('%s\t%s\t%s\n' % (pubid, journal, date.strip()))
            

11. Providing Sustainable Mental and Neurological Health Care in Ghana and

9. RETRACTED ARTICLE

40. RETRACTED ARTICLE

68. RETRACTED ARTICLE

84. RETRACTED ARTICLE



Keep in mind that 4 papers had been retracted; their details were therefore not parsed correctly and they were not included in the analysis.

I used the script below to re-check the articles that were published in August 2018. Most had not yet been assigned a PMC ID even though they were open.

In [6]:
%%bash
# Fetch the PubMed XML record for each PMID and print columns 1 and 4 of its
# ArticleIdList/ArticleId values, one line per PMID.
for pmid in 30106283 30105967 30105965 30105964 30105251; do
    efetch -db pubmed -id "$pmid" -format xml \
        | xtract -pattern ArticleIdList -element ArticleId \
        | cut -f1,4
done

30106283
30105967
30105965
30105964
30105251


#### Merge the data 

In [7]:
# Load the headerless PMID -> PMC id lookup table, then label every row as
# 'open' or 'closed' from its PMC id cell.
pmc_pmid = pd.read_table(
    '../Data/pmid_pmc_check.txt',
    header=None,
    names=['pmid', 'pmcid'],
)
pmc_pmid['isopen'] = pmc_pmid['pmcid'].apply(isOpen)

In [8]:
# Load the parsed (pmid, journal, date) rows; the file has no header line.
journal_year_path = '../Data/CleanedPubidJournalYear.txt'
journal_year = pd.read_table(journal_year_path,
                             header=None,
                             names=['pmid', 'journal', 'date'])

In [9]:
# Combine the open-access flags with the journal/date rows on their shared
# 'pmid' column (default inner join) and use the PMID as the index.
data = pmc_pmid.merge(journal_year, on="pmid").set_index('pmid')

#### Convert the date column to datetime format

In [10]:
# Parse the date strings once; values that cannot be parsed become NaT
# (errors='coerce'). Year and month are then derived from the parsed column.
parsed_dates = pd.to_datetime(data['date'], errors='coerce')
data['date'] = parsed_dates
data['year'] = parsed_dates.dt.year
data['month'] = parsed_dates.dt.month

### Save the data in a csv for future re-use

In [11]:
# Persist the merged table; the 'pmid' index is written as the first column.
data.to_csv('../Data/PMID_PMC_Journal_Year.csv', index=True)