In [93]:
import requests
import pandas as pd
from pathlib import Path
from argparse import ArgumentParser
import re
import numpy as np

In [94]:
inputfile = Path("../oaa-2018/data/applicants.csv")
df = pd.read_csv(inputfile, na_values=[""])

headers = {'Accept': 'application/json'} 
issn_url = "https://doaj.org/api/v1/search/journals/issn:{}"
name_url = "https://doaj.org/api/v1/search/journals/title:{}"

In [95]:
# check duplicate submission
print("*** Checking summit IDs")
unique_cols = ['Name (First)', 'Name (Last)', 'Email', 'Student ID#', 'Article Title', 'Please provide the ID of your paper in the SFU Research Summit repository']
df['duplicate_submission'] = df.duplicated(subset=unique_cols)
print("{} duplicate submissions were found among the applications.".format(df['duplicate_submission'].sum()))

*** Checking summit IDs
0 duplicate submissions were found among the applications.


In [96]:
print("*** Checking summit IDs")
df['uploaded_to_summit'] = df['Please provide the ID of your paper in the SFU Research Summit repository'].notna()
print("{} submission did not provide a Summit ID".format((~df['uploaded_to_summit']).sum()))

*** Checking summit IDs
1 submission did not provide a Summit ID


In [103]:
print("*** Checking journals")

i = 1
application_count = len(df)
for ix, row in df[["Journal Name","Journal ISSN"]].iterrows():
    name = row["Journal Name"]
    issn = row["Journal ISSN"]
    
    is_open_access = None
    
    print("{}/{} - {} ({})".format(i, application_count, name, issn))
    is_issn = re.match("[\S]{4}\-[\S]{4}", issn)
    
    if is_issn:
        r = requests.get(issn_url.format(issn), headers=headers)
        total = r.json()['total']
        print("Lookup by ISSN... found {} journals via ISSN".format(0))
        if total == 1:
            is_open_access = r.json()['results'][0]['bibjson']['license'][0]['open_access']
        elif total < 1:
            r = requests.get(name_url.format(name), headers=headers)
            total = r.json()['total']
            print("ISSN lookup no results. Lookup by name... found {} journals via name".format(0))
            if total == 1:
                is_open_access = r.json()['results'][0]['bibjson']['license'][0]['open_access']
    else:
        r = requests.get(name_url.format(name), headers=headers)
        total = r.json()['total']
        print("ISSN invalid. Lookup by name... found {} journals via name".format(0))
        if total == 1:
            is_open_access = r.json()['results'][0]['bibjson']['license'][0]['open_access']
    
    error_msg = None
    if is_open_access is True:
        print("Open Access: Yes")
    elif is_open_access is False:
        print("Open Access: No")
    else:
        error_msg = "Found {} possible candidate journals in DOAJ.".format(total)
        print("Requiring manual attention: " + error_msg)
        
    df.loc[ix, 'found_on_DOAJ'] = is_open_access
    df.loc[ix, 'doaj_error'] = error_msg
    print("")
    i = i + 1

print("Found in DOAJ: {},requiring attention: {}".format(df['found_on_DOAJ'].sum(), (df['found_on_DOAJ'] != True).sum()))

*** Checking journals
1/33 - Scientific Reports (2045-2322)
Lookup by ISSN... found 0 journals via ISSN
Open Access: Yes

2/33 - Intech Open (978-1-78923-365-0)
ISSN invalid. Lookup by name... found 0 journals via name
Requiring manual attention:Found 0 possible candidate journals in DOAJ.

3/33 - Ecosphere (2150-8925)
Lookup by ISSN... found 0 journals via ISSN
Open Access: Yes

4/33 - Global Qualitative Nursing Research  (DOI: 10.1177/2333393618785095)
ISSN invalid. Lookup by name... found 0 journals via name
Open Access: Yes

5/33 - International Journal of Qualitative Methods   (DOI: 10.1177/1609406918769444)
ISSN invalid. Lookup by name... found 0 journals via name
Open Access: Yes

6/33 - International Journal of Engineering Pedagogy (iJEP) (2192-4880)
Lookup by ISSN... found 0 journals via ISSN
Open Access: Yes

7/33 - Remote sensing journal/MDPI (2072-4292)
Lookup by ISSN... found 0 journals via ISSN
Open Access: Yes

8/33 - Frontiers in Physiology (1664-042X)
Lookup by ISSN...

In [117]:
df['eligible'] = df['found_on_DOAJ'] & df['uploaded_to_summit']
df.loc[df['found_on_DOAJ'].isna(), 'eligible'] = ""

In [119]:
df['eligible']

0     False
1          
2      True
3      True
4      True
5      True
6      True
7      True
8          
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16         
17         
18     True
19         
20         
21     True
22     True
23         
24     True
25         
26     True
27     True
28     True
29     True
30     True
31     True
32     True
Name: eligible, dtype: object

In [120]:
df.to_csv("applications_validate.csv")

In [None]:
inputfile = Path("../oaa-2018/data/verified_applicants.csv")
df = pd.read_csv(inputfile, na_values=[""])