In [341]:
import re
import numpy as np
import pandas as pd
import json
from io import StringIO
import os

# Project root under the current user's home directory.
# expanduser("~") replaces os.getenv("HOME"): when HOME is unset (e.g. on
# Windows) getenv returns None and the old code built the path "None/fair-datool".
basedir = os.path.join(os.path.expanduser("~"), "fair-datool")
# Directory the notebook reads its CSV inputs from.
datadir = os.path.join(basedir, "data")
# File name of the XML document written by to_xml() below.
provenancefile = "questionare.xml"

In [342]:
def to_xml(df, filename=None, mode='w'):
    """Serialise every row of ``df`` as a ``<prov:entity>`` XML element.

    Each row becomes one ``<prov:entity prov:id="questionnaire">`` element
    containing a ``<field name="COLUMN">VALUE</field>`` child per column.
    All entities are wrapped in a single ``<prov:document>`` root element.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to serialise; an empty frame yields an empty document.
    filename : str, optional
        When given, the XML text is written to this path and the function
        returns None.
    mode : str, optional
        Mode passed to ``open`` when ``filename`` is given (default ``'w'``).

    Returns
    -------
    str or None
        The XML text when ``filename`` is None, otherwise None.
    """
    # Function-scope import so the cell does not depend on the import cell.
    from xml.sax.saxutils import escape

    def row_to_xml(row):
        xml = ['<prov:entity prov:id="questionnaire">']
        for col_name, value in zip(row.index, row.values):
            # Escape markup characters so that question texts or references
            # containing &, < or > (or " in a column name) keep the XML valid.
            safe_name = escape('{0}'.format(col_name), {'"': '&quot;'})
            safe_value = escape('{0}'.format(value))
            xml.append('  <field name="{0}">{1}</field>'.format(safe_name, safe_value))
        xml.append('</prov:entity>')
        return '\n'.join(xml)

    # iterrows (rather than DataFrame.apply) so an empty frame gives an empty body.
    res = '\n'.join(row_to_xml(row) for _, row in df.iterrows())

    # NOTE(review): the "prov" prefix is never bound to a namespace URI, so
    # namespace-aware parsers will reject the document -- confirm intended use.
    xmlres = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><prov:document>\n" + res
    # The closing root tag previously lacked its final '>'.
    xmlres = xmlres + "\n</prov:document>"

    if filename is None:
        return xmlres
    with open(filename, mode) as f:
        f.write(xmlres)

In [343]:
# Read Metrics.csv from the data directory as a list of raw lines and show it.
with open("%s/%s" % (datadir, "Metrics.csv"), "r") as myfile:
    filedata = list(myfile)
filedata

['F1a,F1b,F2a,F2b,F3a,F3b,F3c,F4a,F4b,F5a,F5b,F\r\n',
 'No,,,,,,No,,,,,1\r\n',
 'No,,,,,,Yes,,,,,3\r\n',
 'Yes,,,,,No,,,,,,2\r\n',
 'Yes,,,,,Yes,,,,Yes,,5\r\n',
 'Yes,,,,,Yes,,,,No,,4\r\n',
 ',,,,,,,,,,,\r\n',
 'A1a,A1b,A2a,A2b,A3a,A3b,A4a,A4b,A5a,A5b,,A\r\n',
 'No,No,,,,,,,,,,1\r\n',
 'No,Yes,,,,,,,,,,2\r\n',
 'Yes,,,,No,,,,,,,5\r\n',
 'Yes,,,,Yes,,No,,,,,3\r\n',
 'Yes,,,,Yes,,Yes,,,,,4\r\n',
 ',,,,,,,,,,,\r\n',
 'I0,I1a,I1b,I2a,I2b,I3a,I3b,I4a,I4b,I5a,I5b,I\r\n',
 'Yes,,Yes,,,,,,,,,1\r\n',
 'Yes,,No,Most,,,,,,,,1.5\r\n',
 'Yes,,No,Half,,,,,,,,2\r\n',
 'Yes,,No,Few,,,,,,,,2.5\r\n',
 'Yes,,No,None,,,,,No,,,3\r\n',
 'Yes,,No ,None,,,,,Yes,No,,4\r\n',
 'Yes,,No,None,,,,,Yes,Yes,,5\r\n',
 'No,Yes,,,,,,,,,,1\r\n',
 'No,No,,,,No,,,,,,2\r\n',
 'No,No,,,,Yes,,,No,,,3\r\n',
 'No,No,,,,Yes,,,Yes,Yes,,5\r\n',
 'No,No,,,,Yes,,,Yes,No,,4']

In [344]:
def readmetrics(filename):
    """Load a CSV from ``datadir`` with missing values blanked out.

    ``filename`` is appended to the module-level ``datadir``, parsed with
    ``pd.read_csv``, and every NaN in the resulting DataFrame is replaced by
    an empty string before it is returned.
    """
    path = "%s/%s" % (datadir, filename)
    return pd.read_csv(path).replace(np.nan, '', regex=True)

# One table per letter code, each loaded from its own CSV file
# (entries are evaluated in the order written: F, I, A).
metrics = {
    'F': readmetrics('F.csv'),
    'I': readmetrics('I.csv'),
    'A': readmetrics('A.csv'),
}

In [345]:
def readcodes(filename):
    """Load a header-less CSV from ``datadir`` as a three-column table.

    The columns are labelled ``Question``, ``Code`` and ``Reference`` (the
    file itself is read without a header row because ``names`` is supplied).
    Returns the resulting DataFrame.
    """
    column_names = ['Question', 'Code', 'Reference']
    return pd.read_csv("%s/%s" % (datadir, filename), names=column_names)

# Question / code / reference table.
codes = readcodes('Codes.csv')

# Rows of that table belonging to one specific question.
q1 = "Is the data file in a proprietary format?"
x = codes.loc[codes['Question'] == q1]

# Write the whole table as XML to <basedir>/provenance/<provenancefile>.
to_xml(codes, filename="%s/provenance/%s" % (basedir, provenancefile))

In [346]:
# Semicolon-separated responses file, read from the data directory.
filename = "sampedataset2.csv"
dataframe = pd.read_csv("%s/%s" % (datadir, filename), sep=';')

# Reading Excel as source
#filename = "sampedataset2.xlsx"
#dataframe = pd.ExcelFile("%s/%s" % (datadir, filename))
#dataframe = pd.read_excel("%s/%s" % (datadir, filename), sheetname=0, header=None, skiprows=0, encoding = 'utf8')

def get_mappings(dataframe, codebook=None):
    """Translate question-text column names into their short codes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column labels are looked up in the ``Question`` column
        of the code table.
    codebook : pandas.DataFrame, optional
        Table with ``Question`` and ``Code`` columns. Defaults to the
        module-level ``codes`` so existing one-argument calls keep working;
        passing it explicitly removes the dependency on notebook state.

    Returns
    -------
    (dict, list)
        ``mapping`` from each matched column label to its code, and
        ``newcolumns``, the full list of labels in original order with
        matched labels replaced by their code and unmatched labels kept.
    """
    if codebook is None:
        codebook = codes

    mapping = {}
    newcolumns = []
    for q in dataframe.columns:
        matched = codebook.loc[codebook['Question'] == q, 'Code']
        # A label is only mapped when at least one matching row carries a
        # non-empty code; the first matching row's code is the one used.
        if matched.any():
            mapping[q] = matched.values[0]
        newcolumns.append(mapping.get(q, q))
    return (mapping, newcolumns)

# Replace the column labels of the responses frame with the labels returned
# by get_mappings (codes where a match exists, the original label otherwise).
mapping, newcols = get_mappings(dataframe)
dataframe.columns = newcols

In [347]:
# PID of the dataset to look up. Only one assignment can take effect, so the
# previously overwritten values are kept as commented alternatives:
#   http://dx.doi.org/10.17632/crnmszmb8h.1
#   http://dx.doi.org/10.17632/yjrpmr5mwn.1
doi = "http://dx.doi.org/10.17632/nhtjgdkft4.1"
def find_datasets(doi):
    """Return the rows of the module-level ``dataframe`` whose PID column equals ``doi``.

    The PID column is identified by its full question text (``doikey``).
    """
    doikey = "Please enter the PID of the dataset you are going to review:(i.e. https://doi.org/10.1000/xyz123)"
    is_requested = dataframe[doikey] == doi
    return dataframe[is_requested]

# Rows of the responses frame whose PID column matches `doi`.
thisdata = find_datasets(doi)

def ratedata(data=None, metric_tables=None):
    """Extract, per letter code, the answer columns that also appear in its metrics table.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Rows to rate. Defaults to the module-level ``thisdata``.
    metric_tables : dict, optional
        Maps ``'F'``, ``'A'`` and ``'I'`` to the corresponding metrics
        DataFrame. Defaults to the module-level ``metrics``.

    Returns
    -------
    dict
        ``{code: DataFrame}`` where each frame holds the rows of ``data``
        restricted to the columns shared with that code's metrics table,
        with NaN replaced by an empty string.

    Raises
    ------
    IndexError
        If ``data`` has no rows. The original code raised the same exception
        type implicitly through a no-op ``.ix`` lookup of the first row;
        ``.ix`` no longer exists in current pandas, so the check is explicit.
    """
    if data is None:
        data = thisdata
    if metric_tables is None:
        metric_tables = metrics

    if len(data.index) == 0:
        raise IndexError("no rows to rate: the selected data is empty")

    result = {}
    for code in ['F', 'A', 'I']:
        # Keep only the metrics columns that are present in the data.
        mcol = [col for col in metric_tables[code].columns if col in data.columns]
        result[code] = data[mcol].fillna('')
    return result

In [348]:
# Show the column labels of the selected rows.
thisdata.columns

Index([u'Respondent ID', u'Collector ID', u'Start Date', u'End Date',
       u'IP Address', u'Email Address', u'First Name', u'Last Name',
       u'Custom Data',
       u'Please enter the PID of the dataset you are going to review:(i.e. https://doi.org/10.1000/xyz123)',
       u'Name of the reviewer: (this is just for the review process)',
       u'Name of the repository:', u'Date of review', u'F1a',
       u'Any remarks about scoring the dataset at this level:', u'F3b',
       u'Any remarks about scoring the dataset at this level:.1',
       u'Is there sufficient metadata available?.1',
       u'Any remarks about scoring the dataset at this level:.2', u'F5a',
       u'Any remarks about scoring the dataset at this level:.3', u'A1a',
       u'Any remarks about scoring the dataset at this level:.4', u'A1b',
       u'Any remarks about scoring the dataset at this level:.5', u'A3a',
       u'Any remarks about scoring the dataset at this level:.6', u'A4a',
       u'Any remarks about scoring 

In [349]:
# Per-code answer frames for the selected rows.
stars = ratedata()
metricscodes = ['F', 'A', 'I']
# For every code, the slice of its metrics table restricted to the answered
# columns plus the column named after the code itself.
result = {}
for code in metricscodes:
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statement is a SyntaxError on Python 3.
    print(stars[code])
    newindex = list(stars[code].columns) + [code]
    result[code] = metrics[code][newindex]

    F1a F3b F5a
15  Yes  No    
    A1a A1b A3a A4a
15  Yes      No    
    I0 I1a I1b I2a  I3a I4b I5a
15  No  No          Yes  No    


In [350]:
def getstars(code, assessment=None, answers=None):
    """Average the star values of all metrics rows that match the given answers.

    A metrics row matches an answer row when the two are equal in every
    answer column. The value in the metrics column named ``code`` is
    collected for each match and the mean of the collected values is returned.

    Parameters
    ----------
    code : str
        Letter code; also the name of the star column in the metrics table.
    assessment : pandas.DataFrame, optional
        Metrics table for ``code``. Defaults to ``metrics[code]``.
    answers : pandas.DataFrame, optional
        Answer rows to score. Defaults to ``stars[code]``.

    Returns
    -------
    float
        Mean of the matched star values, or NaN when nothing matches.
    """
    if assessment is None:
        assessment = metrics[code]
    if answers is None:
        answers = stars[code]

    # The original added a dummy ``code`` column to the shared answers frame
    # just to select it from the metrics table, mutating ``stars[code]`` on
    # every call. Work from a column list instead; a leftover ``code`` column
    # from an earlier run is ignored.
    answer_cols = [col for col in answers.columns if col != code]
    match = assessment[answer_cols + [code]]

    thisstars = []
    for _, given in answers[answer_cols].iterrows():
        # Row-wise comparison aligned on column labels (replaces the removed
        # ``.ix`` indexer); with no answer columns every metrics row matches.
        hits = match[answer_cols].eq(given, axis='columns').all(axis=1)
        # NOTE(review): int() truncates fractional star values (e.g. 1.5 -> 1);
        # kept as in the original -- confirm whether half stars should survive.
        thisstars.extend(int(star) for star in match.loc[hits, code])

    if not thisstars:
        # Same result as the mean of an empty array, without the RuntimeWarning.
        return np.nan
    return np.array(thisstars).mean()

# Score each code once and reuse the values for the overall mean (the
# original called getstars twice per code). print() with a single argument
# behaves the same on Python 2 and 3.
findable = getstars('F')
accessible = getstars('A')
interoperable = getstars('I')
print("(F) Findable: %s stars" % findable)
print("(A) Accessible: %s stars" % accessible)
print("(I) Interoperable: %s stars" % interoperable)
# The value reported as (R) is the mean of the three scores above.
rank = np.array([findable, accessible, interoperable]).mean()
print("(R) Reusability: %s stars" % rank)

(F) Findable: 2.0 stars
(A) Accessible: 5.0 stars
(I) Interoperable: 3.0 stars
(R) Reusability: 3.33333333333 stars


In [351]:
# Caption, then the 'F' metrics table as the cell's displayed value.
# print() with one argument works on both Python 2 and 3.
print("Findability matrix")
metrics['F']

Findability matrix


Unnamed: 0,F1a,F1b,F2a,F2b,F3a,F3b,F3c,F4a,F4b,F5a,F5b,F
0,No,,,,,,No,,,,,1
1,No,,,,,,Yes,,,,,3
2,Yes,,,,,No,,,,,,2
3,Yes,,,,,Yes,,,,Yes,,5
4,Yes,,,,,Yes,,,,No,,4


In [352]:
# Caption, then the 'A' metrics table as the cell's displayed value.
# print() with one argument works on both Python 2 and 3.
print("Accessability matrix")
metrics['A']

Accessability matrix


Unnamed: 0,A1a,A1b,A2a,A2b,A3a,A3b,A4a,A4b,A5a,A5b,Unnamed: 10,A
0,No,No,,,,,,,,,,1
1,No,Yes,,,,,,,,,,2
2,Yes,,,,No,,,,,,,5
3,Yes,,,,Yes,,No,,,,,3
4,Yes,,,,Yes,,Yes,,,,,4


In [353]:
# Caption, then the 'I' metrics table as the cell's displayed value.
# print() with one argument works on both Python 2 and 3.
print("Interoperability matrix")
metrics['I']

Interoperability matrix


Unnamed: 0,I0,I1a,I1b,I2a,I2b,I3a,I3b,I4a,I4b,I5a,I5b,I
0,Yes,,Yes,,,,,,,,,1.0
1,Yes,,No,Most,,,,,,,,1.5
2,Yes,,No,Half,,,,,,,,2.0
3,Yes,,No,Few,,,,,,,,2.5
4,Yes,,No,,,,,,No,,,3.0
5,Yes,,No,,,,,,Yes,No,,4.0
6,Yes,,No,,,,,,Yes,Yes,,5.0
7,No,Yes,,,,,,,,,,1.0
8,No,No,,,,No,,,,,,2.0
9,No,No,,,,Yes,,,No,,,3.0
