In [63]:
### Import standard modules ###
import numpy as np
import pandas as pd
import json

In [64]:
### Define set of parsing functions ###

def CustomParser(data):
    j1 = json.loads(data)
    return j1

def filter_json(x):
    x=x[0]
    try:
        x['value']=x['value'][0]
    except:
        x['value'] = {u'answers': {}, u'choice': {}, u'filters': {}}
    return x

def extract_choice(x):
    y = []
    y.append((str(x['value']['choice'])))
    return y

def extract_tasks(x):
    x=x['task']
    return x

def extract_answers(x):
    x=x['value']['answers']
    return x

def extract_filters(x):
    x=['value']['filters']
    return x

def extract_zooID(x):
    x=int(list(x.keys())[0])
    return x

def extract_FileName1(x):
    try:
        x = x[list(x.keys())[0]]['Filename1'].split('_')[1]
    except:
        x = ''
    return x

def check_upload(x):
    if len(x.split(';')) == 4:
        x = True
    else:
        x = False
    return x

def check_anno(x):
    if len(x) == 1:
        x = True
    else:
        x = False
    return x

def convert_to_int(x):
    try:
        x=int(x)
    except:
        x=0
    return x

In [65]:
### Read in csv with custom read for those column in JSON format ###

# Define location of classification file
class_file = "gravity-spy-classifications.csv" 

# Create dataframe from csv
data1 = pd.read_csv(class_file,converters={'annotations':CustomParser,'subject_data':CustomParser})

# Change ID to int
data1['user_id']        = data1['user_id'].apply(convert_to_int)
# Doing a mild work around for the json format of the annontation column
data1['annotations']    = data1['annotations'].apply(filter_json)
# Extract choice and making it a column
data1['choice']         = data1['annotations'].apply(extract_choice)
# Extract the task entry and making it a column
data1['tasks']          = data1['annotations'].apply(extract_tasks)
# Extract answers and making it a column
data1['answers']        = data1['annotations'].apply(extract_answers)
# Extract zooniverse ID it gave this subject and making it a column
data1['zooID']          = data1['subject_data'].apply(extract_zooID) 
# Extract uniqueID assigned to the image during image creation and making it a column
data1['imageID']        = data1['subject_data'].apply(extract_FileName1)
# Get cumulative count of number of prior classifications by user
data1['classification_number'] = data1.groupby('user_id').cumcount()
# Check that the subject_ids for a given classification is 4. If not I uploaded the images wrong for that subject
data1['goodUpload']     = data1['subject_ids'].apply(check_upload)
# Check that the number of annotation is of size 1 (i.e. they did not do multiple annotation)
data1['numAnnotations'] = data1['choice'].apply(check_anno)


# Dropping annotations,subject_data, and subject_ids
data1 = data1.drop('annotations',1)
data1 = data1.drop('subject_data',1)
data1 = data1.drop('subject_ids',1)

In [66]:
### Check if workflow version is acceptable ###
versions = [692.102,714.11399999999992] # List of acceptable versions
data1['goodWorkFlow'] = (data1['workflow_version'].isin(versions)) # Add column of booleans, true means acceptable

In [67]:
### Version specific quality checks ###

# Data for converting old to new imageIDs
id_data = pd.read_csv('IDmatchall.txt',delim_whitespace=True,skiprows=1,names=['new_imageID','old_imageID'])

beta_check = ~data1['workflow_version'].isin([692.102, 714.11399999999992]) # Check if classification from beta 2.0
id_check = data1['imageID'].isin(id_data['old_imageID']) # Check if imageID has a new ID

data1['goodID'] = beta_check | id_check # Apply 'bitwise-or' to checks, append to dataframe

In [68]:
### Apply data quality cuts ###
data1 = data1[data1.goodUpload & data1.numAnnotations & data1.goodWorkFlow & data1.goodID & data1.user_id != 0]

# Drop unnecessary columns
data1 = data1.drop('user_ip',1)
data1 = data1.drop('workflow_name',1)
data1 = data1.drop('created_at',1)
data1 = data1.drop('gold_standard',1)
data1 = data1.drop('expert',1)
data1 = data1.drop('tasks',1)
data1 = data1.drop('answers',1)
data1 = data1.drop('goodUpload',1)
data1 = data1.drop('numAnnotations',1)
data1 = data1.drop('goodWorkFlow',1)
data1 = data1.drop('goodID',1)
data1 = data1.drop('metadata',1)

In [69]:
### Convert alpha labels to int labels and old to new imageIDs ###

label_dict = {'45MHZLGHTMDLTN':5,'LGHTMDLTN':5,'50HZ':8,'RCMPRSSR50HZ':8,'BLP':9,'CHRP':2,'XTRMLLD':6,'HLX':14,'KFSH':18,
              'LWFRQNCBRST':1,'LWFRQNCLN':7,'NGLTCH':19,'DNTSGLTCH':19,'NNFTHBV':16,'PRDDVS':11,'60HZPWRLN':10,'60HZPWRMNS':10,
              'PWRLN60HZ':10,'RPTNGBLPS':3,'SCTTRDLGHT':4,'SCRTCH':15,'TMT':12,'VLNHRMNC500HZ':17,'VLNMDHRMNC500HZ':17,
              'HRMNCS':17,'WNDRNGLN':13,'WHSTL':0}

def choice_replace(x):
    return label_dict[x[0]]

old_imageID = list(id_data['old_imageID'])
new_imageID = list(id_data['new_imageID'])
id_dict = {}

for a,b in zip(old_imageID,new_imageID):
    id_dict[a] = b

def imageID_replace(x):
    try:
        x = id_dict[x]
        return x
    except:
        return x
    
data1['choice']      = data1['choice'].apply(choice_replace)
data1['imageID']     = data1['imageID'].apply(imageID_replace)

In [70]:
### Pivot dataframe to make index imageID and get choice, user_id, and workflow_version ###

# Function to aggregate data
def lister(x):
    return list(x)

# Use pandas pivot_table, create columns corresponding to image type and true label
image_values         = ['choice', 'user_id','workflow_version','classification_number','zooID']
images               = pd.pivot_table(data1,index='imageID',values=image_values,aggfunc=lister)
images['zooID']      = images['zooID'].apply(np.unique)
images['type']       = ['T']*len(images)
images['true_label'] = [-1]*len(images)

In [71]:
### Read in ML_scores ###

# Remove Hanford and Livingston designations
def name_clean(x):
    x = x.split('_')[1]
    return x

ML_scores_L       = pd.read_csv('scores_L.csv')
ML_scores_H       = pd.read_csv('scores_H.csv')
ML_scores         = ML_scores_L.append(ML_scores_H)
ML_scores['Name'] = ML_scores['Name'].apply(name_clean)
ML_scores

Unnamed: 0,Name,Label,confidence of class 0,confidence of class 1,confidence of class 2,confidence of class 3,confidence of class 4,confidence of class 5,confidence of class 6,confidence of class 7,...,confidence of class 10,confidence of class 11,confidence of class 12,confidence of class 13,confidence of class 14,confidence of class 15,confidence of class 16,confidence of class 17,confidence of class 18,confidence of class 19
0,006VRLdkqX,12,0.000000e+00,2.210000e-16,3.740000e-25,6.390000e-15,5.890000e-08,2.650000e-11,6.260000e-15,2.940000e-40,...,4.590000e-33,2.940000e-11,8.588120e-01,0.000000e+00,1.840000e-25,0.000000e+00,1.411716e-01,1.710000e-14,1.620000e-05,1.120000e-43
1,00elCEegkz,7,2.460000e-07,1.910000e-05,1.680000e-13,1.240000e-15,9.350000e-11,1.110000e-08,6.500000e-15,9.999712e-01,...,6.550000e-13,9.490000e-11,1.800000e-11,1.810000e-07,3.220000e-10,1.480000e-14,7.560000e-09,1.320000e-06,3.950000e-17,8.000000e-06
2,00laMLoQkR,15,1.040000e-10,1.260000e-18,3.780000e-13,4.780000e-13,1.120000e-18,1.250000e-16,9.160000e-12,5.930000e-11,...,4.880000e-10,4.290000e-18,6.470000e-19,1.800000e-15,2.168229e-03,9.953448e-01,4.860000e-10,5.130000e-18,1.710000e-18,2.487035e-03
3,00vTs3nZOL,19,1.420000e-07,2.440000e-14,9.660000e-11,3.700000e-10,1.580000e-11,3.240000e-14,4.000000e-11,4.980000e-10,...,1.551094e-02,1.050000e-12,3.260000e-13,2.140000e-11,2.225720e-04,1.602482e-03,3.960000e-09,8.170000e-11,5.820000e-15,9.826638e-01
4,010tp7CFjC,19,2.080000e-06,3.910000e-09,2.710000e-09,7.020000e-11,2.030000e-05,4.670000e-10,9.720000e-11,5.604270e-04,...,5.310510e-04,1.150000e-08,6.250000e-09,4.270000e-08,2.110000e-06,3.950000e-09,2.190000e-07,1.320000e-05,1.640000e-13,9.988624e-01
5,01KXBXZ31c,19,1.910000e-06,3.370000e-12,3.960000e-09,9.680000e-11,4.910000e-10,7.500000e-13,2.990000e-12,3.490000e-07,...,3.606546e-02,8.720000e-11,2.180000e-11,1.200000e-09,1.370000e-06,9.540000e-07,1.400000e-09,1.160000e-08,1.480000e-14,9.639298e-01
6,01OELgCIdU,1,5.690000e-13,9.962059e-01,2.150000e-20,3.310000e-21,3.490000e-11,3.960000e-12,9.700000e-17,3.793220e-03,...,5.360000e-24,1.990000e-12,2.170000e-13,1.250000e-13,3.020000e-17,1.010000e-24,2.960000e-09,7.960000e-07,1.830000e-18,5.690000e-12
7,01rOWDF7m4,19,1.873704e-03,2.180000e-08,6.510000e-09,3.440000e-10,9.900000e-09,3.450000e-09,1.600000e-11,1.357293e-02,...,1.660000e-05,8.690000e-09,4.460000e-09,3.930000e-06,1.014310e-04,1.570000e-07,8.690000e-08,1.430000e-05,7.990000e-14,9.844155e-01
8,01T5gHjoRc,1,2.410000e-12,9.999919e-01,1.170000e-20,2.120000e-19,1.610000e-12,1.630000e-12,4.430000e-16,8.160000e-06,...,3.970000e-25,1.100000e-12,1.420000e-11,3.450000e-15,4.330000e-16,8.360000e-26,3.800000e-09,2.080000e-08,2.630000e-16,7.800000e-15
9,01TahHCA8k,18,3.050000e-29,4.060000e-21,7.010000e-14,7.012116e-02,7.100000e-24,4.880000e-07,1.260000e-16,3.780000e-44,...,7.500000e-27,3.720000e-19,3.810000e-10,4.160000e-39,4.500000e-17,6.280000e-29,3.080000e-06,7.870000e-20,9.298661e-01,0.000000e+00


In [72]:
### Append ML_posterior matrix ###

# Get number of classes
classes = len(ML_scores.columns[2:])

# Create posterior matrix from dataframe columns
ML_posterior = ML_scores['confidence of class 0']

# Iterate over columns of dataframe
for i in range(1,classes): 
    ML_posterior = np.vstack((ML_posterior,ML_scores['confidence of class %s' % str(i)]))

ML_posterior = ML_posterior.T
ML_posterior = list(ML_posterior)
imageIDs = list(ML_scores['Name'])

# Map imageID to ML_posterior
ML_dict = {}
for a,b in zip(imageIDs,ML_posterior):
    ML_dict[a] = b
    
def ML_append(x):
    try:
        return ML_dict[x]
    except:
        return []

images_index = pd.Series(images.index)
ML_posterior = images_index.apply(ML_append)

# Append ML_posterior matrix to corresponding imageID
images['ML_posterior'] = list(ML_posterior)

In [73]:
### Get ML_label and ML_confidence ###

# Function to get index of max value in ML_posterior
def max_index(x):
    x = np.array(x)
    try:
        return np.argmax(x)
    except:
        return -1

# Function to get max confidence value in ML_posterior    
def get_max(x):
    x = np.array(x)
    try:
        return max(x)
    except:
        return -1
    
images['ML_label']          = images['ML_posterior'].apply(max_index)
images['ML_confidence']     = images['ML_posterior'].apply(get_max)

In [74]:
### Read classification of golden images ###

goldendata = pd.read_csv('GLabel.csv')

# Map zooID to true_label
gold_dict = {}
for a,b in zip(goldendata['ZooID'],goldendata['Classification']):
    gold_dict[int(a)] = int(b)

# Change type of golden images 
def type_map(x):
    x = int(x)
    if x in list(gold_dict.keys()):
        return 'G'
    else:
        return 'T'

# Change true_label of golden images  
def label_map(x):
    x = int(x)
    try:
        return gold_dict[x]
    except:
        return -1

images['type']       = images['zooID'].apply(type_map)
images['true_label'] = images['zooID'].apply(label_map)

In [75]:
### Code to check label options for each workflow version ###

for iV in versions:
    version = np.unique(data1[data1['workflow_version'] == iV]['choice'])
    print("version {0}".format(iV))
    print("length {0}".format(len(data1[data1['workflow_version'] == iV])))
    print(version)
    print("end")

version 692.102
length 2687
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end
version 714.1139999999999
length 2463
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
end


In [76]:
### CC_classifier ###

