In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

In [2]:
data = pd.read_csv('./sw_classifications_20160527.csv')
expert_data = pd.read_csv('./sw_classifications_20160526-experts.csv')

In [3]:
expert_data.shape

(578, 7)

We want to analyze the expert answers vs. the other user responses. The first question I want to ask is: how much triplicate response data has expert labels (so we can look at how much that improves answer quality). If there's not enough overlap between expert responses + triplicate responses, we may need to get more data.

In [4]:
gb = data.groupby(['swi_id', 'swj_id'])
groups = gb.groups
more_3 = {key: value for key, value in groups.iteritems() if len(value) >= 3}

In [5]:
def label_fraction(sw_labelset):
    '''
    :param sw_labelset: List of 't' (True) or 'f' (False) labels for a single sidewalk pair.
    :type sw_labelset: list
    
    '''
    tf = data.iloc[sw_labelset, :]['connected']
    fraction_t = sum([1 for label in tf if label == 't']) / float(len(sw_labelset))
    
    return fraction_t

def decide_label(fraction):
    '''Decide how to label a given sidewalk pair given multiple responses.'''
    # If majority is 't', choose 't'
    if fraction > 0.5:
        return 't'
    # If majority is 'f', choose 'f'
    elif fraction < 0.5:
        return 'f'
    # If even split, ignore entirely for now
    else:
        return None

In [6]:
fractions3 = []
labels3 = []
for key, value in more_3.iteritems():
    fraction = label_fraction(value)
    label = decide_label(fraction)
    if label is not None:
        newrow = data.ix[value[0],['swi_id', 'swj_id', 'connected']]
        newrow['connected'] = label
        labels3.append(newrow)
        fractions3.append(fraction)

labels3 = pd.DataFrame(labels3)
labels3.head()

Unnamed: 0,swi_id,swj_id,connected
6366,325902,326474,f
9498,295084,298785,t
826,326184,326185,f
2638,297773,303110,f
2015,322238,330093,f


In [7]:
intersection3 = pd.merge(labels3, expert_data, how='inner', on=['swi_id', 'swj_id'])
intersection3.shape

(142, 8)

There are 142 triplicate-response labels. Hopefully this is enough!

Now we can get into the error rate - we will simply count how often the individual labels disagree with the expert labels, and how often the triplicate labels disagree with the expert labels.

In [8]:
# First, let's look at expert users and make sure they always agree with one another
def agree(array):
    if np.all(array == 't') or np.all(array == 'f'):
        return True
    else:
        return False
    return np.all(array == 't')

expert_agreement = expert_data.groupby(['swi_id', 'swj_id'])['connected'].agg(agree)
expert_agreement.head()

swi_id  swj_id
288896  328952    True
288990  330196    True
288995  297844    True
289052  289054    True
289100  300963    True
Name: connected, dtype: bool

In [9]:
experts_agree_frac = expert_agreement.sum() / float(expert_agreement.shape[0])
print 'Fraction of expert-answered cases with complete agreement: {}'.format(experts_agree_frac)

Fraction of expert-answered cases with complete agreement: 0.982352941176


In [10]:
data_agreement = data.groupby(['swi_id', 'swj_id'])['connected'].agg(agree)
data_agree_frac = data_agreement.sum() / float(data.shape[0])
print 'Fraction of user-answered cases with complete agreement: {}'.format(data_agree_frac)

Fraction of user-answered cases with complete agreement: 0.732213963366


In [11]:
# We will want to compare user responses with cases where experts agreed (our ground truth)
expert_agreed = expert_data.groupby(['swi_id', 'swj_id']).agg(np.all)[expert_agreement].reset_index()
expert_agreed.head()

Unnamed: 0,swi_id,swj_id,swi_geom,swj_geom,connected,username,timestamp
0,288896,328952,010200000002000000EC31ACB3C1965EC0FE445B88EED0...,01020000000200000035E1086DC4965EC0F1E9C078EED0...,f,Nick_expert,2016-05-27 03:28:08.627158+00
1,288990,330196,0102000000020000001FE61A97DC965EC0B5ACD65AEFD4...,01020000000200000079349C5CD2965EC08103A68DE5D4...,f,Vero_expert,2016-05-27 03:23:51.189268+00
2,288995,297844,0102000000020000007681EBBADE965EC0C032038831D4...,010200000002000000192A9A25DF965EC0EDA009F668D4...,f,Nick_expert,2016-05-27 03:48:40.70959+00
3,289052,289054,010200000002000000810705C500945EC0C34B3F1F8ADC...,010200000002000000D5E9E78F00945EC06D30CCCF17DC...,t,Anat_Expert,2016-05-27 03:25:26.055121+00
4,289100,300963,010200000002000000590692A2A4975EC0EC39EDC5AFD7...,010200000002000000E59A8193B3975EC0E66FD4BE76D7...,t,Anat_Expert,2016-05-27 03:24:53.450795+00


In [12]:
# Get the intersection between user responses and expert labels (indexing on user responses)
data_ix_experts = pd.merge(data, expert_agreed, how='inner', on=['swi_id', 'swj_id'])
print data_ix_experts.shape
data_ix_experts.head()

(585, 12)


Unnamed: 0,swi_id,swj_id,swi_geom_x,swj_geom_x,connected_x,username_x,timestamp_x,swi_geom_y,swj_geom_y,connected_y,username_y,timestamp_y
0,309546,309547,0102000000020000006C89F4BFD7935EC02C6725F509D0...,010200000002000000A536BDA100945EC0B47C95AE06D0...,f,VS,2016-05-27 06:37:06.807044+00,0102000000020000006C89F4BFD7935EC02C6725F509D0...,010200000002000000A536BDA100945EC0B47C95AE06D0...,f,Anat_Expert,2016-05-27 03:38:51.136498+00
1,309546,309547,0102000000020000006C89F4BFD7935EC02C6725F509D0...,010200000002000000A536BDA100945EC0B47C95AE06D0...,f,sumitmukherjee2,2016-02-05 20:31:27.269169+00,0102000000020000006C89F4BFD7935EC02C6725F509D0...,010200000002000000A536BDA100945EC0B47C95AE06D0...,f,Anat_Expert,2016-05-27 03:38:51.136498+00
2,309546,309547,0102000000020000006C89F4BFD7935EC02C6725F509D0...,010200000002000000A536BDA100945EC0B47C95AE06D0...,f,Ladan,2016-05-27 06:41:38.33052+00,0102000000020000006C89F4BFD7935EC02C6725F509D0...,010200000002000000A536BDA100945EC0B47C95AE06D0...,f,Anat_Expert,2016-05-27 03:38:51.136498+00
3,333287,333290,01020000000200000066F5A4D558975EC0F57F82FD8DD9...,010200000002000000F81FC2BC55975EC0F60AF49B19D9...,f,Stacey,2016-05-27 06:02:10.534513+00,01020000000200000066F5A4D558975EC0F57F82FD8DD9...,010200000002000000F81FC2BC55975EC0F60AF49B19D9...,f,Anat_Expert,2016-05-27 03:21:17.328162+00
4,333287,333290,01020000000200000066F5A4D558975EC0F57F82FD8DD9...,010200000002000000F81FC2BC55975EC0F60AF49B19D9...,f,lmbrettner,2016-02-09 01:25:13.102414+00,01020000000200000066F5A4D558975EC0F57F82FD8DD9...,010200000002000000F81FC2BC55975EC0F60AF49B19D9...,f,Anat_Expert,2016-05-27 03:21:17.328162+00


In [13]:
matches = data_ix_experts['connected_x'] == data_ix_experts['connected_y']
error = data_ix_experts.shape[0] - matches.sum()

print 'Comparing all user responses directly to experts:'
print 'Number of matching responses: {}'.format(matches.shape[0])
print 'Error rate: {}'.format(error / float(data_ix_experts.shape[0]))

Comparing all user responses directly to experts:
Number of matching responses: 585
Error rate: 0.034188034188


Interesting! 3% is higher than we expected, but then again, we want to rely on triplicate responses from here on out. What's the error rate on the consensus response?

In [14]:
trip_ix_experts = pd.merge(intersection3, expert_agreed, how='inner', on=['swi_id', 'swj_id'])
print trip_ix_experts.shape
trip_ix_experts.head()

(138, 13)


Unnamed: 0,swi_id,swj_id,connected_x,swi_geom_x,swj_geom_x,connected_y,username_x,timestamp_x,swi_geom_y,swj_geom_y,connected,username_y,timestamp_y
0,290405,331910,f,01020000000200000006935F8E21935EC0F7C4FE2642DA...,01020000000200000088EAEFD31E935EC0D86B2F2642DA...,f,Anat_Expert,2016-05-27 03:28:57.776047+00,01020000000200000006935F8E21935EC0F7C4FE2642DA...,01020000000200000088EAEFD31E935EC0D86B2F2642DA...,f,Anat_Expert,2016-05-27 03:28:57.776047+00
1,316621,316877,f,010200000002000000F6BC5726DE935EC05875599998C9...,010200000002000000C397CF1ECB935EC0FD4872999AC9...,t,Vero_expert,2016-05-27 03:26:26.90163+00,010200000002000000F6BC5726DE935EC05875599998C9...,010200000002000000C397CF1ECB935EC0FD4872999AC9...,t,Vero_expert,2016-05-27 03:26:26.90163+00
2,307553,307571,f,0102000000020000006DEC78FC51945EC0347EC7F0C6D6...,0102000000020000002D9DA3967B945EC0BB0B2B37B0D6...,f,Sumit_expert,2016-05-27 03:29:02.683813+00,0102000000020000006DEC78FC51945EC0347EC7F0C6D6...,0102000000020000002D9DA3967B945EC0BB0B2B37B0D6...,f,Sumit_expert,2016-05-27 03:29:02.683813+00
3,324110,326874,t,010200000002000000DBD063B113975EC0120BF3F0EBC2...,010200000002000000CCA16C431B975EC0004D35E2B2C2...,t,Nick_expert,2016-05-27 03:28:16.62483+00,010200000002000000DBD063B113975EC0120BF3F0EBC2...,010200000002000000CCA16C431B975EC0004D35E2B2C2...,t,Nick_expert,2016-05-27 03:28:16.62483+00
4,289052,289054,t,010200000002000000810705C500945EC0C34B3F1F8ADC...,010200000002000000D5E9E78F00945EC06D30CCCF17DC...,t,Anat_Expert,2016-05-27 03:25:26.055121+00,010200000002000000810705C500945EC0C34B3F1F8ADC...,010200000002000000D5E9E78F00945EC06D30CCCF17DC...,t,Anat_Expert,2016-05-27 03:25:26.055121+00


In [15]:
matches3 = trip_ix_experts['connected_x'] == trip_ix_experts['connected_y']
error3 = trip_ix_experts.shape[0] - matches3.sum()

print 'Comparing all user responses directly to experts:'
print 'Number of matching responses: {}'.format(matches3.shape[0])
print 'Error rate: {}'.format(error3 / float(trip_ix_experts.shape[0]))

Comparing all user responses directly to experts:
Number of matching responses: 138
Error rate: 0.036231884058
