In [310]:
### Import standard modules ###

import numpy as np
import pandas as pd
import json

In [311]:
### Define set of parsing functions ###

def CustomParser(data):
    j1 = json.loads(data)
    return j1

def filter_json(x):
    x=x[0]
    try:
        x['value']=x['value'][0]
    except:
        x['value'] = {u'answers': {}, u'choice': {}, u'filters': {}}
    return x

def extract_choice(x):
    y = []
    y.append((str(x['value']['choice'])))
    return y

def extract_tasks(x):
    x=x['task']
    return x

def extract_answers(x):
    x=x['value']['answers']
    return x

def extract_filters(x):
    x=['value']['filters']
    return x

def extract_zooID(x):
    x=int(list(x.keys())[0])
    return x

def extract_FileName1(x):
    try:
        x = str(x[list(x.keys())[0]]['Filename1'].split('_')[1])
    except:
        x = ''
    return x

def check_upload(x):
    if len(x.split(';')) == 4:
        x = True
    else:
        x = False
    return x

def check_anno(x):
    if len(x) == 1:
        x = True
    else:
        x = False
    return x

def convert_to_int(x):
    try:
        x=int(x)
    except:
        x=0
    return x

In [312]:
### Read in csv with custom read for those column in JSON format ###

# Define location of classification file
class_file = "gravity-spy-classifications.csv" 

# Create dataframe from csv
class_data = pd.read_csv(class_file,converters={'annotations':CustomParser,'subject_data':CustomParser})

# Change ID to int
class_data['userID']                   = class_data['user_id'].apply(convert_to_int)
# Doing a mild work around for the json format of the annontation column
class_data['annotations']              = class_data['annotations'].apply(filter_json)
# Extract choice and making it a column
class_data['choice']                   = class_data['annotations'].apply(extract_choice)
# Extract the task entry and making it a column
class_data['tasks']                    = class_data['annotations'].apply(extract_tasks)
# Extract answers and making it a column
class_data['answers']                  = class_data['annotations'].apply(extract_answers)
# Extract zooniverse ID it gave this subject and making it a column
class_data['zooID']                    = class_data['subject_data'].apply(extract_zooID) 
# Extract uniqueID assigned to the image during image creation and making it a column
class_data['imageID']                  = class_data['subject_data'].apply(extract_FileName1)
# Get cumulative count of number of prior classifications by user
class_data['classification_number']    = class_data.groupby('user_id').cumcount()
# Check that the subject_ids for a given classification is 4. If not I uploaded the images wrong for that subject
class_data['goodUpload']               = class_data['subject_ids'].apply(check_upload)
# Check that the number of annotation is of size 1 (i.e. they did not do multiple annotation)
class_data['numAnnotations']           = class_data['choice'].apply(check_anno)


# Dropping annotations,subject_data, and subject_ids
class_data = class_data.drop('annotations',1)
class_data = class_data.drop('user_id',1)
class_data = class_data.drop('subject_data',1)
class_data = class_data.drop('subject_ids',1)

In [313]:
### Check if workflow version is acceptable ###
all_versions = list(np.unique(class_data['workflow_version']))
good_versions = [714.11399999999992] # List of acceptable versions #692.102 is beta with 18 columns
class_data['goodWorkFlow'] = (class_data['workflow_version'].isin(good_versions)) # Add column of booleans, true means acceptable

In [314]:
### Version specific quality checks ###

# Data for converting old to new imageIDs
id_data = pd.read_csv('IDmatchall.txt',delim_whitespace=True,skiprows=1,names=['new_imageID','old_imageID'])

# Data for bad golden images
bad_data = pd.read_csv('bad_golden_images.csv',header=None)

# Remove Hanford and Livingston designations
def name_clean(x):
    x = x.split('_')[1]
    return x

# List of bad golden images
bad_images = list(bad_data[0].apply(name_clean))
bad_images.append('ulfd56vzbx')
bad_images.append('uV9zDEjP2N')

not_beta_check = ~class_data['workflow_version'].isin([692.102, 714.11399999999992]) # Check if classification from beta 2.0
new_id_check = class_data['imageID'].isin(id_data['old_imageID']) # Check if imageID has a new ID
not_bad_id = ~class_data['imageID'].isin(bad_images)

class_data['goodID'] = (not_beta_check | new_id_check) & not_bad_id # Apply bitwise boolean operators,, append to dataframe

In [315]:
### Apply data quality cuts ###
final_check = class_data.goodUpload & class_data.numAnnotations & class_data.goodWorkFlow & class_data.goodID & class_data.userID != 0
class_data  = class_data[final_check]

# Drop unnecessary columns
class_data = class_data.drop('user_ip',1)
class_data = class_data.drop('workflow_name',1)
class_data = class_data.drop('created_at',1)
class_data = class_data.drop('gold_standard',1)
class_data = class_data.drop('expert',1)
class_data = class_data.drop('tasks',1)
class_data = class_data.drop('answers',1)
class_data = class_data.drop('goodUpload',1)
class_data = class_data.drop('numAnnotations',1)
class_data = class_data.drop('goodWorkFlow',1)
class_data = class_data.drop('goodID',1)
class_data = class_data.drop('metadata',1)

In [316]:
### Convert alpha labels to int labels and old to new imageIDs ###

label_dict = {'45MHZLGHTMDLTN':5,'LGHTMDLTN':5,'50HZ':8,'RCMPRSSR50HZ':8,'BLP':9,'CHRP':2,'XTRMLLD':6,'HLX':14,'KFSH':18,
              'LWFRQNCBRST':1,'LWFRQNCLN':7,'NGLTCH':19,'DNTSGLTCH':19,'NNFTHBV':16,'PRDDVS':11,'60HZPWRLN':10,'60HZPWRMNS':10,
              'PWRLN60HZ':10,'RPTNGBLPS':3,'SCTTRDLGHT':4,'SCRTCH':15,'TMT':12,'VLNHRMNC500HZ':17,'VLNMDHRMNC500HZ':17,
              'HRMNCS':17,'WNDRNGLN':13,'WHSTL':0}

# Convert alpha labels to int labels
def choice_replace(x):
    return label_dict[x[0]]

old_imageID = list(id_data['old_imageID'])
new_imageID = list(id_data['new_imageID'])
id_dict = {}

for a,b in zip(old_imageID,new_imageID):
    id_dict[a] = b

# Convert old to new imageIDs
def imageID_replace(x):
    try:
        x = id_dict[x]
        return x
    except:
        return x
    
class_data['choice']      = class_data['choice'].apply(choice_replace)
class_data['imageID']     = class_data['imageID'].apply(imageID_replace)

In [317]:
### Sort class_data by classification number ###

class_data = class_data.sort_values('classification_id')

In [318]:
### Pivot dataframe to make index imageID and get choice, user_id, and workflow_version ###

# Function to aggregate data
def lister(x):
    return list(x)

# Use pandas pivot_table, create columns corresponding to image type and true label
image_values         = ['choice', 'userID','workflow_version','classification_number','zooID']
images               = pd.pivot_table(class_data,index='imageID',values=image_values,aggfunc=lister)
images['zooID']      = images['zooID'].apply(np.unique)
images['type']       = ['T']*len(images)
images['true_label'] = [-1]*len(images)
images['pp_matrix']  = [0]*len(images)
images['pp_matrix']  = images['pp_matrix'].astype(object)

In [319]:
### Append ML_posterior matrix ###

ML_scores_L       = pd.read_csv('scores_L.csv')
ML_scores_H       = pd.read_csv('scores_H.csv')
ML_scores         = ML_scores_L.append(ML_scores_H)
ML_scores['Name'] = ML_scores['Name'].apply(name_clean)

# Get number of classes
classes = len(ML_scores.columns[2:])

# Create posterior matrix from dataframe columns
ML_posterior = ML_scores['confidence of class 0']

# Iterate over columns of dataframe
for i in range(1,classes): 
    ML_posterior = np.vstack((ML_posterior,ML_scores['confidence of class %s' % str(i)]))

ML_posterior = ML_posterior.T
ML_posterior = list(ML_posterior)
imageIDs = list(ML_scores['Name'])

# Map imageID to ML_posterior
ML_dict = {}
for a,b in zip(imageIDs,ML_posterior):
    ML_dict[a] = b
    
def ML_append(x):
    try:
        return ML_dict[x]
    except:
        return []

images_index = pd.Series(images.index)
ML_posterior = images_index.apply(ML_append)

# Append ML_posterior matrix to corresponding imageID
images['ML_posterior'] = list(ML_posterior)

In [320]:
### Get ML_label and ML_confidence ###

# Function to get index of max value in ML_posterior
def max_index(x):
    x = np.array(x)
    try:
        return np.argmax(x)
    except:
        return -1

# Function to get max confidence value in ML_posterior    
def get_max(x):
    x = np.array(x)
    try:
        return max(x)
    except:
        return -1
    
images['ML_label']          = images['ML_posterior'].apply(max_index)
images['ML_confidence']     = images['ML_posterior'].apply(get_max)

In [321]:
### Read classification of golden images ###

goldendata = pd.read_csv('GLabel.csv')

# Map zooID to true_label
gold_dict = {}
for a,b in zip(goldendata['ZooID'],goldendata['Classification']):
    gold_dict[int(a)] = int(b)

# Change type of golden images 
def type_map(x):
    x = int(x)
    if x in list(gold_dict.keys()):
        return 'G'
    else:
        return 'T'

# Change true_label of golden images  
def label_map(x):
    x = int(x)
    try:
        return gold_dict[x]
    except:
        return -1

images['type']       = images['zooID'].apply(type_map)
images['true_label'] = images['zooID'].apply(label_map)

In [322]:
### CC_classifier ###

# Import standard modules
import numpy as np
import pandas as pd
import pickle as pk
from scipy.io import loadmat
import random

In [323]:
### Initialize constants ###

retired_images = pd.DataFrame({ 'imageID' : [], 'class' : []})
retired_images.set_index('imageID')

r_lim = 4 # Make 23              # Max citizens who can look at image before it is given to upper class if threshold not reached
c = 20                           # Classes
priors = (np.ones((1,c))/c)[0]   # Flat priors b/c we do not know what category the image is in
alpha = .4*np.ones((c,1))        # Threshold vector for user promotion
g_c = .5*np.ones((1,c))          # Threshold vector for updating confusion matrix
t = .7*np.ones((c,1))            # Threshold vector for image retirement

In [324]:
### Function to create blank pp_matrices ###

pp_count = {}
conf_matrices = {}

def make_pp_matrices(x):
    
    if x['type'] == 'T':
        
        pp_count[x.name] = 0
        
        return [np.zeros((c,len(x['userID'])+1))]
    
    else:
        
        pass

def make_conf_matrices(x):
    
    for userID in x:
        
        if userID not in list(conf_matrices.keys()): # If user does not have a confusion matrix
            
            conf_matrices[userID] = np.zeros((c,c)) # Create a blank confusion matrix

a = images['userID'].apply(make_conf_matrices)
images['pp_matrix'] = images[['userID','type']].apply(make_pp_matrices, axis = 1)

In [325]:
for imageID,userID,user_label in zip(class_data['imageID'],class_data['userID'],class_data['choice']):
    
                
    if images.loc[imageID,'type'] == 'G': # If golden image
        
        true_label = images.loc[imageID,'true_label']
                
        conf_matrices[userID][true_label,user_label] += 1 # Update confusion matrix
        
        #print('Confusion matrix updated')        
    
    
    if images.loc[imageID,'type'] == 'T': # If training image
                
        conf_divided,a1,a2,a3 = np.linalg.lstsq(np.diag(np.sum(conf_matrices[userID],axis=1)),conf_matrices[userID])
        
        temp_matrix = priors
        
        if sum(conf_divided[:,user_label]) != 0: # If column of conf_divided corresponding to user label is not blank
        
            temp_matrix = (conf_divided[:,user_label]*priors[user_label])/sum(conf_divided[:,user_label]*priors)
        
        pp_matrix = images.loc[imageID,'pp_matrix'][0]
        pp_matrix[:,pp_count[imageID]] = temp_matrix
        images.set_value(imageID,'pp_matrix',[pp_matrix])
        pp_count[imageID] += 1
        
        #print('Posterior matrix updated')

In [326]:
def decider(x):
    
    x['pp_matrix'][0][:,-1] = np.array(x['ML_posterior'])
    x['pp_matrix'][0].T
    v = np.sum(x['pp_matrix'][0], axis=1)/np.sum(np.sum(x['pp_matrix'][0])) # Create vector of normalized sums of pp_matrix2
    maximum = np.amax(v) # Initialize maximum, max value of v
    maxIdx = np.argmax(v) # Initialize maxIdx, index of max value of v

    if maximum >= t[maxIdx]: # If maximum is above threshold for given class, retire image
            
        true_label = maxIdx
        images.set_value(x.name, 'true_label', true_label)
        images.set_value(x.name, 'type', 'R')
            
        print('Image is retired to class', true_label)
        return 1

    elif len(x['choice']) >= r_lim: # Pass to upper class if more than r_lim annotators and no decision reached
            
        print('Image is given to the upper class')
        return 2
            

    else: # If fewer than r_lim annotators have looked at image, keep image
            
        print('More labels are needed for the image')
        return 3
    
images['decision'] = images[images['type']=='T'][['pp_matrix','ML_posterior','choice']].apply(decider,axis=1)

Image is retired to class 5
Image is retired to class 12
More labels are needed for the image
Image is retired to class 19
More labels are needed for the image
Image is retired to class 1
More labels are needed for the image
More labels are needed for the image
Image is retired to class 7
Image is retired to class 1
Image is retired to class 19
More labels are needed for the image
More labels are needed for the image
Image is retired to class 9
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
Image is retired to class 1
Image is retired to class 1
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
More labels are needed for the image
Image is retired to class 1
More labels are needed for the image
Image is retired to class 1
More labels are needed for the i

In [327]:
images

Unnamed: 0_level_0,choice,classification_number,userID,workflow_version,zooID,type,true_label,pp_matrix,ML_posterior,ML_label,ML_confidence,decision
imageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
001riNKlIk,[5],[153],[1502635],[714.114],[2201499],T,5,"[[[0.0, 0.0], [0.0, 3.49e-16], [0.0, 6.87e-44]...","[0.0, 3.49e-16, 6.87e-44, 1.03e-25, 1.22e-34, ...",5,1.000000,1
00cweQNJbb,[9],[1219],[4261],[714.114],[2199741],T,12,"[[[0.0, 2.04e-22], [0.0, 6.64e-11], [0.0, 1.69...","[2.04e-22, 6.64e-11, 1.69e-10, 1.46e-08, 2.67e...",12,0.995909,1
06QGka2Lah,"[7, 7]","[388, 370]","[5209, 1498433]","[714.114, 714.114]",[2223935],T,-1,"[[[0.05, 0.0, 1.55e-06], [0.05, 0.0, 1.46e-06]...","[1.55e-06, 1.46e-06, 1.9e-12, 7.91e-15, 3.44e-...",7,0.999762,3
06d3eIYFDU,[19],[991],[5209],[714.114],[2208904],T,19,"[[[0.0, 0.000392942], [0.0, 5.73e-06], [0.0, 1...","[0.000392942, 5.73e-06, 1.78e-08, 1.98e-08, 2....",19,0.801026,1
08ClYsqQbW,"[7, 7, 4]","[11, 305, 458]","[59701, 530281, 1498433]","[714.114, 714.114, 714.114]",[2221959],T,-1,"[[[0.0, 0.0, 0.05, 2.08e-06], [0.0, 0.0, 0.05,...","[2.08e-06, 3.38e-05, 1.63e-12, 2.11e-14, 2.67e...",7,0.999937,3
0CcfB9GX0B,[1],[1023],[5209],[714.114],[2220906],T,1,"[[[0.0, 5.13e-20], [1.0, 1.0], [0.0, 2.28e-29]...","[5.13e-20, 1.0, 2.28e-29, 1.67e-27, 3.09e-15, ...",1,1.000000,1
0HkNURMSy1,[10],[647],[5209],[714.114],[2219495],T,-1,"[[[0.0, 9.12e-05], [0.0, 4.03e-11], [0.0, 5.35...","[9.12e-05, 4.03e-11, 5.35e-08, 2.04e-09, 8.09e...",19,0.869200,3
0Hrp6DpX4x,"[10, 15]","[1170, 468]","[4261, 1498433]","[714.114, 714.114]",[2215282],T,-1,"[[[0.05, 0.0, 4.03e-06], [0.05, 0.0, 1.78e-10]...","[4.03e-06, 1.78e-10, 1.29e-07, 6.06e-09, 2.93e...",10,0.619845,3
0J9aCtLSOV,[7],[392],[1498433],[714.114],[2201857],T,7,"[[[0.0, 4.09e-07], [0.0, 0.008218152], [0.0, 1...","[4.09e-07, 0.008218152, 1.65e-14, 7.76e-16, 5....",7,0.991772,1
0K54KYKWVC,"[1, 1, 1]","[24, 1011, 1348]","[1502149, 5209, 4261]","[714.114, 714.114, 714.114]",[2212117],T,1,"[[[0.05, 0.0, 0.0, 0.0], [0.05, 1.0, 1.0, 1.0]...","[0.0, 1.0, 0.0, 0.0, 1e-08, 3.27e-25, 7.11e-28...",1,1.000000,1


In [242]:
import matplotlib.pyplot as plt
from matplotlib import cm

cs=cm.Set1(np.arange(20)/20.)
labels = ['WHSTL','LWFRQNCBRST','CHRP','RPTNGBLPS','SCTTRDLGHT','LGHTMDLTN','XTRMLLD','LWFQNCLN','RCMPRSSR50HZ','BLP',
          'PWRLN60HZ','PRDDVS','TMT','WNDRNGLN','HLX','SCRTCH','NNFTHBV','VLNMDHRMNC500HZ','KFSH','NGLTCH']
true_labels = list(images[images['type'] == 'G']['true_label'])
label_counts = []

for i in list(np.arange(0,20,1)):
        label_counts.append(true_labels.count(i))

plt.pie(label_counts,labels=labels,colors=cs,autopct='%1.11f%%', shadow=True, startangle=90)
plt.axis('equal')

fig = plt.figure()
ax = fig.gca()
ax.set_aspect('equal')
plt.show()

In [236]:
import matplotlib.pyplot as plt
import matplotlib.colors as colors

sample_conf = conf_matrices[4261] # Melina_t confusion matrix
plt.matshow(sample_conf, cmap='viridis',norm=colors.LogNorm(vmin=1, vmax=100))
plt.colorbar()
plt.xlabel('classes')
plt.ylabel('classes')
plt.title('Visualization of confusion matrix \n')
ax = plt.gca()
ax.set_xticks(np.arange(0,20,1))
ax.set_yticks(np.arange(0,20,1))
plt.show()

In [235]:
a = images.loc['10cIO3ngYQ','pp_matrix']
a = a[0]

plt.matshow(a, cmap='viridis',norm=colors.LogNorm(vmin=1e-30, vmax=10))
plt.colorbar()
plt.xlabel('p(i|j)')
plt.ylabel('classes')
ax = plt.gca()
ax.set_yticks(np.arange(0,20,1))
plt.show()

[[  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    1.19000000e-08]
 [  1.00000000e+00   1.00000000e+00   5.00000000e-02   1.00000000e+00
    9.98696744e-01]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    4.71000000e-17]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    3.79000000e-17]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    1.35000000e-12]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    2.35000000e-09]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    4.59000000e-15]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    1.30304800e-03]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    4.30000000e-18]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    3.95000000e-16]
 [  0.00000000e+00   0.00000000e+00   5.00000000e-02   0.00000000e+00
    7.60000000e-20]
 [  0.0000