In [None]:
import base64
import hashlib
import json
import os
import re
import subprocess as sp
import sys
import time
import urllib
import urllib.request
from datetime import datetime

import pandas as pd
import requests
from github import Github
from pyDataverse.api import Api, NativeApi
from pyDataverse.models import Datafile, Dataset

from config import DV_ALIAS, BASE_URL, API_TOKEN, REPO, GITHUB_TOKEN, PARSABLE_EXTENSIONS

# Utils

In [None]:
def extract_urls(content: str) -> list:
    """Collect the URL-like substrings of ``content``.

    A candidate starts at the literal text ``http`` and runs until the first
    whitespace character, single quote, double quote or backslash. Any run of
    trailing characters that are neither word characters nor ``/`` (closing
    brackets, commas, full stops, ...) is then stripped from each candidate.

    Args:
        content: Text to scan.

    Returns:
        The cleaned candidates, in order of appearance (duplicates are kept).
    """
    trailing_junk = re.compile(r"([^/\w]+)$")
    cleaned = []
    for candidate in re.findall(r"(http[^\s'\"\\]+)", content):
        cleaned.append(trailing_junk.sub("", candidate))
    return cleaned

In [None]:
def decode_github_content(content: str) -> str:
    """Turn a base64-encoded payload into text.

    Args:
        content: Base64 data; it is decoded and the resulting bytes are
            interpreted as UTF-8.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    raw_bytes = base64.b64decode(content)
    return str(raw_bytes, "utf-8")

In [None]:
def make_dataset_id(repo_name):
    """Hash a repository name with MD5.

    Args:
        repo_name: Repository name; it is UTF-8 encoded before hashing.

    Returns:
        The ``hashlib`` MD5 hash object itself (not a digest string); call
        ``.hexdigest()`` on it to obtain the hexadecimal form.
    """
    digest = hashlib.md5()
    digest.update(repo_name.encode("utf-8"))
    return digest

In [None]:
def make_default_dataset(data, repo_name):
    """Register a default dataset entry for ``repo_name`` in ``data``.

    ``data`` is mutated in place: a new item is stored whose value is
    ``{'metadata': make_dataset_metadata(repo_name)}``.

    NOTE(review): the key is the hash *object* returned by
    ``make_dataset_id``, not its digest, so a second call for the same
    repository produces a different key. Confirm whether a digest string was
    intended before relying on lookups.

    Args:
        data: Mapping that receives the new entry.
        repo_name: Repository name used for both the key and the metadata.

    Returns:
        The same ``data`` object that was passed in.
    """
    key = make_dataset_id(repo_name)
    entry = {'metadata': make_dataset_metadata(repo_name)}
    data[key] = entry
    return data

In [None]:
def make_dataset_metadata(repo_name):
    """Build the dataset-level metadata dictionary for a repository.

    Only the title, the author name and the contact name depend on
    ``repo_name``; every other field is a fixed value.

    Args:
        repo_name: Repository name, interpolated into the title, used as the
            author name, and appended to ``https://github.com/`` for the
            contact name.

    Returns:
        A new dict with the keys ``termsOfAccess``, ``title``, ``subtitle``,
        ``author``, ``dsDescription``, ``subject`` and ``datasetContact``.
    """
    title = 'Automatic uploads from {} github repository'.format(repo_name)
    author = {"authorName": repo_name, "authorAffiliation": "Coronawhy"}
    contact = {
        'datasetContactName': 'https://github.com/{}'.format(repo_name),
        'datasetContactEmail': '',
    }

    return {
        'termsOfAccess': '',
        'title': title,
        'subtitle': '',
        'author': [author],
        'dsDescription': [{'dsDescriptionValue': ''}],
        'subject': ['Medicine, Health and Life Sciences'],
        'datasetContact': [contact],
    }

In [None]:
def make_file_metadata(repo_name, file, url):
    """Build the metadata dictionary describing one file.

    NOTE(review): ``datafile_id`` and ``dataset_id`` hold MD5 hash *objects*,
    not digest strings, so the result is not JSON-serialisable as is. Confirm
    whether ``.hexdigest()`` was intended.

    Args:
        repo_name: Repository name; hashed (UTF-8, MD5) for ``dataset_id``.
        file: Stored unchanged under ``description``.
        url: Stored unchanged under ``filename`` and hashed (UTF-8, MD5) for
            ``datafile_id``.

    Returns:
        A new dict with the keys ``description``, ``filename``,
        ``datafile_id`` and ``dataset_id``.
    """
    return {
        'description': file,
        'filename': url,
        'datafile_id': hashlib.md5(url.encode("utf-8")),
        'dataset_id': hashlib.md5(repo_name.encode("utf-8")),
    }

In [None]:
def create_dataset(api, ds, dv_alias, mapping_dsid2pid, ds_id, base_url):
    """Create a dataset through ``api`` and record its persistent id.

    On success the persistent id found at ``['data']['persistentId']`` in the
    JSON response is stored in ``mapping_dsid2pid`` under ``ds_id`` and the
    URL of the draft dataset page is printed. On failure a diagnostic is
    printed and the mapping is left untouched.

    Args:
        api: Object exposing ``create_dataset(alias, json_payload)``.
        ds: Object exposing ``json()``, which supplies the request payload.
        dv_alias: Alias of the target dataverse.
        mapping_dsid2pid: Dict updated in place with ``ds_id -> pid``.
        ds_id: Key under which the persistent id is stored.
        base_url: Server base URL, used only to print the dataset link.

    Returns:
        ``(resp, mapping_dsid2pid)``. ``resp`` is ``None`` when the request
        itself raised before any response was obtained.
    """
    # Bound up front so the error path can tell "no response at all" apart
    # from "response without a persistent id".
    resp = None
    try:
        resp = api.create_dataset(dv_alias, ds.json())
        pid = resp.json()['data']['persistentId']
    except Exception as err:
        if resp is None:
            print('Dataset creation request failed: {}'.format(err))
        else:
            print(resp.content)
        return resp, mapping_dsid2pid

    mapping_dsid2pid[ds_id] = pid
    # Short pause before continuing; presumably gives the server time to
    # finish creating the draft -- TODO confirm it is still needed.
    time.sleep(1)
    print('{0}/dataset.xhtml?persistentId={1}&version=DRAFT'.format(base_url,
                                                                    pid))
    return resp, mapping_dsid2pid

In [None]:
# Implementation adapted from http://guides.dataverse.org/en/latest/api/native-api.html#id62
def upload_datafile(server, api_key, p_id, repo_name, filename, repo_file, url):
    """Upload a local file to an existing Dataverse dataset.

    The file is posted to ``<server>/api/datasets/:persistentId/add``. The
    response body (JSON when it parses, raw bytes otherwise) and the status
    code are printed.

    Args:
        server: Base URL of the Dataverse server.
        api_key: API token, sent in the ``X-Dataverse-key`` header so it does
            not end up in the request URL.
        p_id: Persistent id of the target dataset.
        repo_name: ``owner/name`` repository string; the part after the first
            ``/`` is used as the file category.
        filename: Path of the local file to upload.
        repo_file: Stored as the file description.
        url: Source URL; its last ``/``-separated segment becomes the
            uploaded file name.

    Returns:
        The ``requests`` response object.

    Raises:
        IndexError: If ``repo_name`` contains no ``/``.
        OSError: If ``filename`` cannot be opened.
    """
    params = dict(description=repo_file,
                  categories=[repo_name.split('/')[1]])
    payload = dict(jsonData=json.dumps(params))

    endpoint = '%s/api/datasets/:persistentId/add' % server

    print('-' * 40)
    print('making request')
    # The context manager closes the handle even if the request raises.
    with open(filename, 'rb') as handle:
        files = {'file': (url.split('/')[-1], handle)}
        # Passing the persistent id through ``params`` lets requests
        # percent-encode it instead of splicing it raw into the URL.
        r = requests.post(endpoint,
                          params={'persistentId': p_id},
                          headers={'X-Dataverse-key': api_key},
                          data=payload,
                          files=files)

    print('-' * 40)
    try:
        print(r.json())
    except ValueError:
        # Body is not JSON (e.g. an HTML error page); show it raw.
        print(r.content)
    print(r.status_code)
    return r

# GitHub scraping

In [None]:
# GitHub API client built from GITHUB_TOKEN (imported from the local config module).
g = Github(GITHUB_TOKEN)

## Find urls in selected file extensions

In [None]:
# Walk the whole repository tree and collect, per file path, the URLs that
# appear in files whose extension is listed in PARSABLE_EXTENSIONS (an empty
# PARSABLE_EXTENSIONS means every file is scanned).
repo = g.get_repo(REPO)
contents = repo.get_contents("")
urls_found = {}
while contents:
    file_content = contents.pop(0)
    if file_content.type == "dir":
        # Queue the directory's children; they are visited after the entries
        # already waiting in the list.
        contents.extend(repo.get_contents(file_content.path))
        continue

    extension = file_content.name.split('.')[-1]
    if PARSABLE_EXTENSIONS and extension not in PARSABLE_EXTENSIONS:
        continue

    try:
        text = decode_github_content(file_content.content)
    except (TypeError, ValueError) as err:
        # TypeError: no inline content to decode (presumably files GitHub
        # does not inline -- TODO confirm). ValueError covers invalid base64
        # and bytes that are not UTF-8, i.e. binary files. One such file
        # should not abort the whole crawl.
        print('Skipping {}: {}'.format(file_content.path, err))
        continue

    urls = extract_urls(text)
    if urls:
        urls_found[file_content.path] = urls

print('Found {} URLs in {} files'.format(
    sum(len(file_urls) for file_urls in urls_found.values()), len(urls_found)))

## Create the dataset in Dataverse

In [None]:
# pyDataverse native-API client for the server at BASE_URL, using API_TOKEN (both from config).
native_api = NativeApi(BASE_URL, API_TOKEN)

In [None]:
# Derive a short dataset key from the repository name: take the MD5 hex
# digest, read it as a base-16 integer, and keep the first six decimal digits
# of that integer. The result is a string, not an int.
repo_digest_hex = make_dataset_id(REPO).hexdigest()
ds_id = str(int(repo_digest_hex, 16))[:6]

# Dataset-level metadata for the repository being mirrored.
metadata = make_dataset_metadata(REPO)

In [None]:
# Maps our local dataset key (ds_id) to the persistent id returned by Dataverse.
mapping_dsid2pid = {}
# Populate a pyDataverse Dataset model from the metadata dict built earlier.
ds = Dataset()
ds.set(metadata)
# displayName is assigned explicitly from the title after set().
ds.displayName=metadata['title']
# create_dataset fills mapping_dsid2pid[ds_id] only on success; on failure it
# prints a diagnostic and returns the mapping unchanged, so the upload loop's
# lookup of mapping_dsid2pid[ds_id] will then raise KeyError.
resp, mapping_dsid2pid = create_dataset(native_api, ds, DV_ALIAS, mapping_dsid2pid, ds_id, BASE_URL)

### Uploading files for the dataset

In [None]:
# For every URL discovered in the repository: download it, keep it only if
# pandas can parse it as CSV, and attach it to the dataset created above.
for file, urls in urls_found.items():
    for url in urls:
        try:
            try:
                # Download into a temporary file; unreachable or malformed
                # URLs are expected here and simply skipped.
                tmp_path, _headers = urllib.request.urlretrieve(url)
            except Exception:
                continue

            try:
                # TODO: try gzipped datasets as well
                # Parsing is only a validity check; the frame is discarded.
                pd.read_csv(tmp_path)
            except Exception:
                continue

            print('- uploading the following dataset {}'.format(url))
            upload_datafile(BASE_URL, API_TOKEN, mapping_dsid2pid[ds_id], REPO, tmp_path, file, url)
        finally:
            # Remove the temporary files urlretrieve created so a long crawl
            # does not fill the temp directory.
            urllib.request.urlcleanup()
