In [75]:
import re
import numpy as np
import pandas as pd
import json
from io import StringIO
import os

basedir = "%s/%s" % (os.getenv("HOME"), "fair-datool")
datadir = "%s/data" % basedir

In [76]:
with open ("%s/%s" % (datadir, "Metrics.csv"), "r") as myfile:
    filedata=myfile.readlines()
filedata

['F1a,F1b,F2a,F2b,F3a,F3b,F3c,F4a,F4b,F5a,F5b,F\r\n',
 'No,,,,,,No,,,,,1\r\n',
 'No,,,,,,Yes,,,,,3\r\n',
 'Yes,,,,,No,,,,,,2\r\n',
 'Yes,,,,,Yes,,,,Yes,,5\r\n',
 'Yes,,,,,Yes,,,,No,,4\r\n',
 ',,,,,,,,,,,\r\n',
 'A1a,A1b,A2a,A2b,A3a,A3b,A4a,A4b,A5a,A5b,,A\r\n',
 'No,No,,,,,,,,,,1\r\n',
 'No,Yes,,,,,,,,,,2\r\n',
 'Yes,,,,No,,,,,,,5\r\n',
 'Yes,,,,Yes,,No,,,,,3\r\n',
 'Yes,,,,Yes,,Yes,,,,,4\r\n',
 ',,,,,,,,,,,\r\n',
 'I0,I1a,I1b,I2a,I2b,I3a,I3b,I4a,I4b,I5a,I5b,I\r\n',
 'Yes,,Yes,,,,,,,,,1\r\n',
 'Yes,,No,Most,,,,,,,,1.5\r\n',
 'Yes,,No,Half,,,,,,,,2\r\n',
 'Yes,,No,Few,,,,,,,,2.5\r\n',
 'Yes,,No,None,,,,,No,,,3\r\n',
 'Yes,,No ,None,,,,,Yes,No,,4\r\n',
 'Yes,,No,None,,,,,Yes,Yes,,5\r\n',
 'No,Yes,,,,,,,,,,1\r\n',
 'No,No,,,,No,,,,,,2\r\n',
 'No,No,,,,Yes,,,No,,,3\r\n',
 'No,No,,,,Yes,,,Yes,Yes,,5\r\n',
 'No,No,,,,Yes,,,Yes,No,,4']

In [77]:
def readmetrics(filename):
    dataframe = pd.read_csv("%s/%s" % (datadir, filename))
    metrics = dataframe 
    metrics = metrics.replace(np.nan, '', regex=True)
    return metrics

metrics = {}
metrics['F'] = readmetrics('F.csv')
metrics['I'] = readmetrics('I.csv')
metrics['A'] = readmetrics('A.csv')

In [78]:
def readcodes(filename):
    dataframe = pd.read_csv("%s/%s" % (datadir, filename), names=['Question', 'Code','Reference'])
    codes = dataframe 
    return codes

codes = readcodes('Codes.csv')
q1 = "Is the data file in a proprietary format?"
x = codes[codes['Question'] == q1]
codes

Unnamed: 0,Question,Code,Reference
0,Does the dataset have a persistent identifier ...,F1a,https://docs.google.com/document/d/1bRQDN_VFSP...
1,Is there sufficient metadata available?,F3b,https://docs.google.com/document/d/1bRQDN_VFSP...
2,Is there sufficient metadata available?,F3c,https://docs.google.com/document/d/1bRQDN_VFSP...
3,Is there extensive metadata and rich additiona...,F5a,https://docs.google.com/document/d/1bRQDN_VFSP...
4,Does the dataset have a user license?,A1a,https://docs.google.com/document/d/1bRQDN_VFSP...
5,Are the metadata accessible?,A1b,https://docs.google.com/document/d/1bRQDN_VFSP...
6,Does the user license have any user restrictio...,A3a,https://docs.google.com/document/d/1bRQDN_VFSP...
7,Is the dataset available for public access? (i...,A4a,https://docs.google.com/document/d/1bRQDN_VFSP...
8,Is the dataset multi-file?,I0,
9,Is the data file in a proprietary format?,I1a,https://docs.google.com/document/d/1bRQDN_VFSP...


In [79]:
filename = "sampedataset2.csv"
dataframe = pd.read_csv("%s/%s" % (datadir, filename), delimiter=';')

# Reading Excel as source
#filename = "sampedataset2.xlsx"
#dataframe = pd.ExcelFile("%s/%s" % (datadir, filename))
#dataframe = pd.read_excel("%s/%s" % (datadir, filename), sheetname=0, header=None, skiprows=0, encoding = 'utf8')

def get_mappings(dataframe):
    mapping = {}
    newcolumns = []
    for q in dataframe.columns:
        findcode = codes[codes['Question'] == q]
        if findcode['Code'].any():
            mapping[q] = findcode['Code'].values[0]
            newcolumns.append(mapping[q])
        else:
            newcolumns.append(q)
            #print "%s %s" % (q, findcode['Code'].values[0])
    return (mapping, newcolumns)

(mapping, newcols) = get_mappings(dataframe)
dataframe.columns = newcols
dataframe.columns

Index([u'Respondent ID', u'Collector ID', u'Start Date', u'End Date',
       u'IP Address', u'Email Address', u'First Name', u'Last Name',
       u'Custom Data',
       u'Please enter the PID of the dataset you are going to review:(i.e. https://doi.org/10.1000/xyz123)',
       u'Name of the reviewer: (this is just for the review process)',
       u'Name of the repository:', u'Date of review', u'F1a',
       u'Any remarks about scoring the dataset at this level:', u'F3b',
       u'Any remarks about scoring the dataset at this level:.1',
       u'Is there sufficient metadata available?.1',
       u'Any remarks about scoring the dataset at this level:.2', u'F5a',
       u'Any remarks about scoring the dataset at this level:.3', u'A1a',
       u'Any remarks about scoring the dataset at this level:.4', u'A1b',
       u'Any remarks about scoring the dataset at this level:.5', u'A3a',
       u'Any remarks about scoring the dataset at this level:.6', u'A4a',
       u'Any remarks about scoring 

In [80]:
doi = "http://dx.doi.org/10.17632/crnmszmb8h.1"
def find_datasets(doi):
    doikey = "Please enter the PID of the dataset you are going to review:(i.e. https://doi.org/10.1000/xyz123)"
    thisdata = dataframe[dataframe[doikey] == doi]
    #print x.columns
    return thisdata

thisdata = find_datasets(doi)

def ratedata():
    metricscodes = ['F', 'A', 'I']
    result = {}
    for code in metricscodes:
        thismetrics = metrics[code]
        mcol = []
        for col in thismetrics.columns:
            if col in dataframe.columns:
                mcol.append(col)
        tmpdata = thisdata[mcol]
        tmpdata.ix[tmpdata.index[0]]
        #tmpdata = tmpdata.replace(np.nan, '', regex=True)
        result[code] = tmpdata
    return result

In [81]:
thisdata.columns

Index([u'Respondent ID', u'Collector ID', u'Start Date', u'End Date',
       u'IP Address', u'Email Address', u'First Name', u'Last Name',
       u'Custom Data',
       u'Please enter the PID of the dataset you are going to review:(i.e. https://doi.org/10.1000/xyz123)',
       u'Name of the reviewer: (this is just for the review process)',
       u'Name of the repository:', u'Date of review', u'F1a',
       u'Any remarks about scoring the dataset at this level:', u'F3b',
       u'Any remarks about scoring the dataset at this level:.1',
       u'Is there sufficient metadata available?.1',
       u'Any remarks about scoring the dataset at this level:.2', u'F5a',
       u'Any remarks about scoring the dataset at this level:.3', u'A1a',
       u'Any remarks about scoring the dataset at this level:.4', u'A1b',
       u'Any remarks about scoring the dataset at this level:.5', u'A3a',
       u'Any remarks about scoring the dataset at this level:.6', u'A4a',
       u'Any remarks about scoring 

In [82]:
stars = ratedata()
metricscodes = ['F', 'A', 'I']
for code in metricscodes:
    print stars[code]

    F1a  F3b F5a
14  Yes  Yes  No
19  Yes  Yes  No
    A1a  A1b A3a  A4a
14  Yes  NaN  No  NaN
19  Yes  NaN  No  NaN
     I0  I1a  I1b  I2a  I3a  I4b
14   No   No  NaN  NaN   No  NaN
19  Yes  NaN  Yes  NaN  NaN  NaN


In [83]:
metrics['F']

Unnamed: 0,F1a,F1b,F2a,F2b,F3a,F3b,F3c,F4a,F4b,F5a,F5b,F
0,No,,,,,,No,,,,,1
1,No,,,,,,Yes,,,,,3
2,Yes,,,,,No,,,,,,2
3,Yes,,,,,Yes,,,,Yes,,5
4,Yes,,,,,Yes,,,,No,,4


In [84]:
metrics['A']

Unnamed: 0,A1a,A1b,A2a,A2b,A3a,A3b,A4a,A4b,A5a,A5b,Unnamed: 10,A
0,No,No,,,,,,,,,,1
1,No,Yes,,,,,,,,,,2
2,Yes,,,,No,,,,,,,5
3,Yes,,,,Yes,,No,,,,,3
4,Yes,,,,Yes,,Yes,,,,,4


In [85]:
metrics['I']

Unnamed: 0,I0,I1a,I1b,I2a,I2b,I3a,I3b,I4a,I4b,I5a,I5b,I
0,Yes,,Yes,,,,,,,,,1.0
1,Yes,,No,Most,,,,,,,,1.5
2,Yes,,No,Half,,,,,,,,2.0
3,Yes,,No,Few,,,,,,,,2.5
4,Yes,,No,,,,,,No,,,3.0
5,Yes,,No,,,,,,Yes,No,,4.0
6,Yes,,No,,,,,,Yes,Yes,,5.0
7,No,Yes,,,,,,,,,,1.0
8,No,No,,,,No,,,,,,2.0
9,No,No,,,,Yes,,,No,,,3.0
