<a href="https://colab.research.google.com/github/Alexir/CHANEL/blob/master/Valid_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Authenticate `git` user
Before running this notebook, make sure GitHub access is enabled in your Colab space:
* In Colab, go to `Tools`->`Settings`; click `Request Github access...`
* In GitHub, [generate a token](https://docs.github.com/en/enterprise/2.13/user/articles/creating-a-personal-access-token-for-the-command-line) to enable login under your account.
* Set the `GIT_USER` and `GIT_TOKEN` environment variables to your GitHub id and that token. You can use the file lookup in the cell below, or type the values in directly (but be careful not to share a notebook that contains your token).


In [None]:
#@title Access code & data: auth to github, mount GoogDrive
# Authentication: run this cell once at the start of every session.
import os

import pandas as pd
from google.colab import drive

# Mount Google Drive, forcing a remount if it is already mounted.
drive.mount("/content/drive", force_remount=True)

# The git token is kept in a one-line file on My Drive. read_csv() treats
# that line as the header row, so the token is the first column label.
TOKEN = pd.read_csv("/content/drive/My Drive/CHANEL/DataAnalysis.txt")

# Publish the git identity through the environment so that the shell
# (`!git ...`) lines in later cells can pick it up.
os.environ.update({
    'GIT_TOKEN': TOKEN.columns[0],
    'GIT_USER': 'Alexir',  # <-- your git name here
})

# ***NOTE: the steps above time out at random; keep retrying,
# or simply paste the token in by hand.


## Set up environment and Data Preparation

In [None]:
#@title Set up environment
# Installs the extra packages, imports the analysis libraries and fetches fresh
# clones of the three repositories into /content/.
import sys,os

# NOTE(review): `stats` and `xmltodict` are installed and imported here but are
# not referenced in the visible part of this notebook -- confirm they are needed.
%pip install xmltodict
%pip install stats

import numpy as np
import stats
import statistics
import pandas as pd
import xmltodict  # get the data tables

# widen the text display so wide frames are not wrapped
pd.set_option('display.width', 180)

## Data Preparation  **Fetch the repositories**
# (which should have the current version of things).
# The data here are from the `Data-Collection/` (i.e. RawEval) repository.

# The `GIT_USER` and `GIT_TOKEN` env vars must be set (see the previous cell);
# they are interpolated by the shell into the first clone URL below.
# Start from a clean copy: any earlier clones are removed first.
%cd /content/
%rm -fr Data-Collection ChatEval-AMT-Interface CHANEL
!git --version
!git  clone https://${GIT_USER}:${GIT_TOKEN}@github.com/CHANEL-JSALT-2020/Data-Collection
!git  clone https://github.com/CHANEL-JSALT-2020/ChatEval-AMT-Interface 

!git  clone https://github.com/Alexir/CHANEL 

# linear fitting to duration data 
!pip install pwlf --upgrade --no-deps
!pip install pyDOE --upgrade
import pwlf

# for agreement computations
!pip install Krippendorff
import krippendorff

# convenience function
# dataframe must have:  columns='i',index='c',values='k'
def compute_ordinal_alpha(df) :
  """Return Krippendorff's alpha, at the ordinal level, for a krip-formatted frame.

  `df` is expected in the layout used throughout this notebook (pivoted with
  columns='i', index='c', values='k'); its raw value matrix is handed to
  `krippendorff.alpha` unchanged.
  """
  reliability_matrix = df.values
  return krippendorff.alpha(reliability_matrix, level_of_measurement='ordinal')



print('\nReady')
#

In [None]:
#@title Fetch (session, Raw, Eval) data from `.csv` 
# extract .pkl data into session and make tables

# put put Metrics data up at /content level
!mkdir /content/Metrics
!mkdir /content/Metrics/Raw
!mkdir /content/Metrics/Eval

%cd /content/Data-Collection/
!git checkout analysis  # fetch current code & data form (air)'s branch

# unpickle the Raw data; make _all, _clean and _attn files
!bash scripts/run_extraction.sh raws/030820_first_public.pkl work/
 # get Raw data into colab space, session data with durations
!cp -p /content/Data-Collection/work/*.csv /content/Metrics/Raw/  
!cp -p /content/CHANEL/Analysis_Valid/test_data/*-session.csv /content/Metrics/Raw/ 
# get the Eval data
!cp -p /content/ChatEval-AMT-Interface/data/annotated/030820_first_public_finding_atts.csv   /content/Metrics/Eval/ # get Eval data

print()
!ls -l /content/Metrics/*/  # show data files

# remember file paths (note column orders)
RawAssignment_file = '/content/Metrics/Raw/'+ '030820_first_public-assigmt.csv'  # session data
Session_file = '/content/CHANEL/Analysis_Valid/test_data/030820_first_public-session.csv' # w/ session durations
RawData_file =  '/content/Metrics/Raw/'+  "030820_first_public-raw.csv"   # [i,c,k]
EvalData_file = '/content/Metrics/Eval/'+ "030820_first_public_finding_atts.csv"   # [i,k,c]

#

In [None]:
#@title Process data into local formats 
# -- column order
# -- filter out catch trials
# Note that the Raw and Eval files store their columns in different orders;
# both end up as [i, c, k] (Krippendorff alpha() dimension names).

# --- Raw data and its breakouts -------------------------------------------
RawData_all = pd.read_csv(RawData_file,names=['i','c','k'],delimiter='\t')
RawData_all['k'] = RawData_all['k'].astype(np.float64)  # fix number type
# catch ("ATTENTION") trials vs. the real items; the mask is computed once
raw_is_attention = RawData_all['i'].str.contains("ATTENTION")
RawData_attn  = RawData_all[ raw_is_attention]
RawData_clean = RawData_all[~raw_is_attention]

# --- Eval data and its breakouts ------------------------------------------
# The file is read with explicit names, so its own header line arrives as
# row 0 and is dropped afterwards.
ata = pd.read_csv(EvalData_file,names=['i','k','c'])
hedless = ata.drop(index=[0])
# .copy() makes EvalData_all an independent frame, so the dtype fix below is
# a plain column assignment instead of a write to a slice of `hedless`
# (which triggers SettingWithCopyWarning).
EvalData_all = hedless[['i','c','k']].copy()   # same column order as Raw
EvalData_all['k'] = EvalData_all['k'].astype(np.float64)  # fix number type
eval_is_attention = EvalData_all['i'].str.contains('ATTENTION')
EvalData_attn  = EvalData_all[ eval_is_attention]
EvalData_clean = EvalData_all[~eval_is_attention]

# --- (Raw) session data; there is no Eval session data ----------------------
RawSession = pd.read_csv(Session_file, delimiter='\t', header=0 )


print('done')
#

## Compare with `krippendorff_test.ipynb`
Is the data format handling consistent between the two notebooks? If so, the alphas computed here should match the earlier ones. This is a sanity check.

In [None]:
#@title Krippendorff validation (previous computation)
#@markup check whether the data in the two notebooks are the same, by computing alpha
#

# Krippendorff-style layout: one row per coder (c), one column per item (i),
# the rating (k) as the cell value.
_krip_layout = dict(columns='i', index='c', values='k')

# Raw data goes through pivot_table ...
RawData_krip_all   = RawData_all.pivot_table(**_krip_layout)
RawData_krip_attn  = RawData_attn.pivot_table(**_krip_layout)
RawData_krip_clean = RawData_clean.pivot_table(**_krip_layout)

# ... whereas Eval data goes through the plain pivot.
EvalData_krip_all   = EvalData_all.pivot(**_krip_layout)
EvalData_krip_attn  = EvalData_attn.pivot(**_krip_layout)
EvalData_krip_clean = EvalData_clean.pivot(**_krip_layout)

# The bracketed figures are the values to match from the other notebook.
raw_clean_alpha  = compute_ordinal_alpha(RawData_krip_clean)
eval_clean_alpha = compute_ordinal_alpha(EvalData_krip_clean)
print('"clean" subset:')
print('RawData ordinal  [0.321]:', round(raw_clean_alpha,3))
print('EvalData ordinal [0.389]:', round(eval_clean_alpha,3))


#

## Basic stats for Raw, Eval

Various metrics that allow the data to be characterized in useful ways including:
* For a deployment what is the profile of hit accepts?
* distribution of session durations
* incidence of poor attention (check trials)
* corpus
* etc


---


Key to column names: (Krippendorff nomenclature --- Metrics data format)
* `c --- WorkerId`
* `i --- Excerpts`
* `k --- Rating`
* `H --- hit`












In [None]:
#@title pull Raw, Eval data into local space, drop check trials
# Builds Raw_sessions (per-assignment info), Raw_data and Eval_data (ratings in
# [i, c, k] column order, with the "ATTENTION" catch trials removed).

Raw_sessions = pd.read_csv(RawAssignment_file,delimiter='\t')
print('raw sessions information: \n',Raw_sessions.shape,Raw_sessions.columns,'\n' )

Raw_data = pd.read_csv(RawData_file, delimiter='\t', names=['i','c','k'])
Raw_data = Raw_data[ ~Raw_data['i'].str.contains('ATTENTION')]  # remove catch trials

# The Eval file has a header line; it is read with explicit names, so the
# header arrives as row 0 and is dropped afterwards.
ata = pd.read_csv(EvalData_file,names=['i','k','c'])
hedless = ata.drop(index=[0])
# Rows with a missing item id must go first: str.contains() yields NaN for
# them, and a mask containing NaN cannot be used for boolean indexing.
# (Not in-place: `hedless[[...]]` is a slice and must not be mutated.)
Eval_hless = hedless[['i','c','k']].dropna(subset=['i'])   # column order same as Raw
# .copy() so that the dtype fix below is not a write to a slice of Eval_hless
Eval_data = Eval_hless[ ~Eval_hless['i'].str.contains('ATTENTION')].copy()
# The header line was parsed as a data row, so the ratings column is presumably
# read as text; convert it to integers.
Eval_data['k'] = Eval_data['k'].astype(int)

print()
print('raw excerpts: ',Raw_data.shape,Raw_data.columns)
print('eval excerpts:',Eval_data.shape,Eval_data.columns)
print()
#

In [None]:
#@title compute session durations (Raw data)
# This only works for Raw_data (for which session data is available).
# Builds `workers`: one row per worker who rated at least one (non-catch) item.
#   WorkerId   worker id
#   Sessions   sessions attempted
#   Excerpts   excerpts completed
#   FirstTime  when engagement began (earliest AcceptTime)
#   LastTime   when it ended (latest SubmitTime)
#   Duration   LastTime - FirstTime

worker_ids = pd.unique(Raw_data['c'])  # unique worker ids, in order of appearance
print('number of workers:  ',len(worker_ids))

# counts per worker
session_cnt = Raw_sessions['WorkerId'].value_counts()  # sessions per worker
excerpt_cnt = Raw_data['c'].value_counts()             # excerpts per worker

# Parse the timestamps BEFORE taking min/max, so the comparison is
# chronological rather than a string comparison.
session_times = pd.DataFrame({
    'WorkerId':   Raw_sessions['WorkerId'],
    'AcceptTime': pd.to_datetime(Raw_sessions['AcceptTime']),
    'SubmitTime': pd.to_datetime(Raw_sessions['SubmitTime']),
})
first_accept_by_worker = session_times.groupby('WorkerId')['AcceptTime'].min()
last_submit_by_worker  = session_times.groupby('WorkerId')['SubmitTime'].max()

# Assemble the table with whole-column lookups. Workers without an entry in a
# lookup get NaN/NaT instead of raising, and no chained-indexing assignment
# is involved.
workers = pd.DataFrame({'WorkerId': worker_ids})
workers['Sessions']  = workers['WorkerId'].map(session_cnt)
workers['Excerpts']  = workers['WorkerId'].map(excerpt_cnt)
workers['FirstTime'] = workers['WorkerId'].map(first_accept_by_worker)
workers['LastTime']  = workers['WorkerId'].map(last_submit_by_worker)
workers['Duration']  = workers['LastTime'] - workers['FirstTime']

# global durs
print('min engagement time:', workers['Duration'].min())
print('max engagement time:', workers['Duration'].max())
#

In [None]:
#@title Cumulative items per worker, Raw data (ordered by productivity)
# (not sure why this is interesting; but it's a 1 liner)
# NOTE(review): the values plotted are the per-worker session counts.
sessions_most_to_least = session_cnt.sort_values(ascending=False)
xx = sessions_most_to_least.cumsum().plot()

In [None]:
#@title Stats for the (Raw) session file
#@markdown 
# Range of the per-worker counts, plus the rating variance of the most
# productive worker.
maxi = workers['Excerpts'].max()
print('max sessions:', workers['Sessions'].max())
print('min excepts: ', workers['Excerpts'].min())
print('max exerpts: ', maxi)
print()

# Worker with the most excerpts (the first one, if several tie); selected by
# column name rather than by position in the row.
maxw = workers.loc[workers['Excerpts'] == maxi, 'WorkerId'].iloc[0]
# Variance of that worker's ratings. Only the rating column 'k' is used:
# taking var() of the whole frame would also hit the string columns, which
# raises on recent pandas.
vari = Raw_data.loc[Raw_data['c'] == maxw, 'k'].var(skipna=True)
print("max()'s var:",round(vari,3),' (for hardest worker)')
#

#workers

In [None]:
#@title Distribution of HITs accepted, over time
# much action at the start: it's the new hit on the block 
# steady state thereafter then it peters out; did the repeaters use up their quota?

# pd.to_datetime parses the accept times; a unit-less astype("datetime64")
# is rejected by recent pandas versions.
Raw_sessions['date'] = pd.to_datetime(Raw_sessions['AcceptTime'])
earliest_accept = Raw_sessions['date'].min()
latest_accept   = Raw_sessions['date'].max()
print('range:',[ earliest_accept, latest_accept ], '\nengagement:',latest_accept - earliest_accept,'\n')
print('distribution:')
# trailing ';' keeps the Axes repr out of the cell output
Raw_sessions['date'].hist(bins=36,xrot=45);

In [None]:
#@title compute session durations; (->TimeDelta)
# Adds Raw_sessions['duration'] (submit time - accept time, a Timedelta) and
# summarises the durations in seconds.

# pd.to_datetime instead of a unit-less astype("datetime64"), which recent
# pandas versions reject.
Raw_sessions['duration'] = pd.to_datetime(Raw_sessions['SubmitTime']) - pd.to_datetime(Raw_sessions['AcceptTime'])

# Durations as plain float seconds, so that the histogram and the numeric
# format specs below work regardless of the pandas version.
foo = Raw_sessions['duration'].dt.total_seconds() #/60.0
# plot
foo.hist(bins=60)
print(f'quantile(25,50,75,90): {foo.quantile(0.25):.0f} {foo.quantile(0.5):.0f} {foo.quantile(0.75):.0f} {foo.quantile(0.90):.0f} (sec)')
print(f'mean: {foo.mean():.2f}sec - {(foo.mean()/60.0):.2f}min')
# mode() can return several values; report their mean
print(f'mode: {foo.mode().mean():.1f}sec')

#

In [None]:
#@title Histo for ratings in catch trials
#  does not look so good. 

# Need to have info about individual items: Is it the people or the items?
check_scores = Raw_sessions['CHECK']

# number of non-missing CHECK scores
chksum = check_scores.count()
print(f'count: {chksum} ')

# split the score range into 4 equal-width bins, then turn the bin counts
# into fractions of the total
histog = pd.cut(check_scores, 4).value_counts()
hout = histog.divide(chksum)
print (f'{hout}')

check_scores.hist();

## Item Quality; basic stats
Excerpts should show some consistency of markup across annotators. If they do not, there may be inherent problems in either the item selection or the instructions. Identify such items.

In [None]:
#@title Compute and plot item variances (unsorted view)
import matplotlib.pyplot as plt 
from pprint import pprint
from statistics import mean


def _describe_items(ratings_wide, ratings_long):
  """Build the per-item summary table for one data set.

  ratings_wide -- item x worker table of ratings (NaN where a worker did not
                  rate the item); supplies the item list and the variances.
  ratings_long -- long-format frame with columns 'i', 'c', 'k'; supplies the
                  individual observations for each item.

  Returns a frame indexed like `ratings_wide` with columns
    'var'  variance of the item's ratings, ignoring NaN
    'obs'  list of ratings recorded for the item in `ratings_long`
    'wrk'  list of the workers who gave them (same order as 'obs')
    'cnt'  number of such observations
  """
  records = []
  for item, item_ratings in ratings_wide.iterrows():
    # one lookup per item, shared by 'obs', 'wrk' and 'cnt'
    observations = ratings_long.loc[ratings_long['i'] == item]
    records.append({'var': item_ratings.var(skipna=True),
                    'obs': list(observations['k']),
                    'wrk': list(observations['c']),
                    'cnt': len(observations)})
  # Build the frame in one go rather than filling it cell by cell.
  return pd.DataFrame(records, index=ratings_wide.index,
                      columns=['var','obs','wrk','cnt'])


# item x worker tables (catch trials were already removed from these frames)
y_raw  = pd.pivot_table(Raw_data,  columns='c',index='i',values='k')
y_eval = pd.pivot_table(Eval_data, columns='c',index='i',values='k')

# item details
obs_raw  = _describe_items(y_raw,  RawData_all)
obs_eval = _describe_items(y_eval, EvalData_all)

# the variances alone, in item order, for the plots
datah_raw  = list(obs_raw['var'])
datah_eval = list(obs_eval['var'])

print('obs_raw and obs_eval created.\n')


# Show item scatter across variances (not useful, but why not?)
fig, (ax_raw, ax_eval) = plt.subplots(1, 2, figsize=(15,5))
for ax, title, variances in ((ax_raw, 'Raw', datah_raw), (ax_eval, 'Eval', datah_eval)):
  ax.plot(variances)
  ax.set_title(title)
  ax.set_xlabel('item')
  ax.set_ylabel('variance')
  ax.set_xlim(-50,800)
plt.show()

print()
print('raw mean: ', round(mean(datah_raw),6) )
print('eval mean:', round(mean(datah_eval),6) )

In [None]:
# inspect datasets
# Debug aid: for the 10th through 13th items of obs_eval, print the rows of
# RawData_all for that item next to the item's obs_eval summary row, then
# print the RawData_all rows of one hard-coded item.

cnt = 0
for i in obs_eval.index : 
  cnt+=1
  if (cnt<10) : continue   # skip the first nine items
  print('\n',cnt,i,'------------ \nRawData_all',RawData_all.loc[RawData_all['i']==i])
  print('* obs_eval\n',obs_eval.loc[i])
  #if obs_eval.loc[i]['cnt'] !=5 : print (obs_eval.loc[i])
  if (cnt>12) : break      # stop after the 13th item
print('---',RawData_all.loc[RawData_all['i']=='MPATHY-000003-0004'])

In [None]:
# inspect datasets
# Debug aid: show every RawData_all row for one hard-coded item, first as a
# frame and then as a list of [worker, rating] pairs.
# NOTE: this rebinds the global names `temp` and `foo`.

#RawData_all.describe,
temp = RawData_all.loc[RawData_all['i']=='MPATHY-000025-0003']
#temp = y_raw.loc['MPATHY-000025-0003'].notna()
print(temp)
print()
foo = []
# walk the matching rows by index label and collect the (c, k) pairs
for t in RawData_all.loc[RawData_all['i']=='MPATHY-000025-0003'].index :
  foo.append([temp.at[t,'c'],temp.at[t,'k']])
pprint(foo)
          

In [None]:
#@title Plot cumulative item counts against variance
# The count can be used to choose a variance cutoff: each curve shows how many
# items have a variance at or below the value on the x axis.
# (The yield at specific cutoffs is tabulated in the next cell.)

# rank of each item once the variances are sorted in ascending order
h_r = pd.DataFrame({'items': np.arange(0,len(datah_raw)), 
                    'vars':  np.sort(datah_raw) })
h_e = pd.DataFrame({'items': np.arange(0,len(datah_eval)), 
                    'vars':  np.sort(datah_eval) })

# One explicit Axes per data set. Labels and limits are set on each Axes, so
# no stray full-figure Axes is created and no limit is set only to be
# overridden by a later call.
fig, (ax_raw, ax_eval) = plt.subplots(1, 2, figsize=(10,5))
for ax, table, title in ((ax_raw,  h_r, 'Raw cumulative Variance'),
                         (ax_eval, h_e, 'Eval cumulative Variance')):
  ax.plot(table['vars'], table['items'])
  ax.set_title(title)
  ax.set_xlabel('variance')
  ax.set_ylabel('item count')
  ax.axis([0.0,3.0,0,800])
  ax.grid(True)

plt.show()
#

In [None]:
#@title compute yield of items, given a threshold
#@markdown make lists of all `good` and `bad` items at a given threshold.
#@markdown `{raw,eval}_{good,bad}`
# we want enough data left over...

def _split_at_threshold(obs, thresh, good, bad):
  """File every item of `obs` under `thresh` in either `good` or `bad`.

  Items whose 'var' is strictly below `thresh` are appended to good[thresh],
  all others to bad[thresh]; a list is created only when its first item
  arrives. Returns the variances of the good items.
  """
  kept_variances = []
  for item in obs.index :
    variance = obs.at[item,'var']
    if variance < thresh :
      kept_variances.append(variance)
      good.setdefault(thresh, []).append(item)
    else:
      bad.setdefault(thresh, []).append(item)
  return kept_variances

# item ids keyed by variance threshold
raw_good,  raw_bad  = {}, {}
eval_good, eval_bad = {}, {}

# display the yield for thresholds 3.0, 2.75, ... down to 0.0
for thresh in np.arange(3.0,-0.25,-0.25) :
  short_raw  = _split_at_threshold(obs_raw,  thresh, raw_good,  raw_bad)
  short_eval = _split_at_threshold(obs_eval, thresh, eval_good, eval_bad)

  # the threshold lists are used to compute yields
  raw_total, eval_total = len(datah_raw), len(datah_eval)
  print ( 'raw:  {}<{:4.2f}= {:3d}'.format(raw_total,thresh, len(short_raw)), 
       ' \tkeeps->{:6.1f}%'.format(100.0*(len(short_raw)/raw_total)),'\t',end='' )
  print ( '\teval: {}<{:4.2f}= {:3d}'.format(eval_total,thresh, len(short_eval)), 
       '\tkeeps->{:6.1f}%'.format(100.0*(len(short_eval)/eval_total) ))
  



In [None]:
#@title Sanity check 
# For every threshold, the bad and good item lists should add up to the total
# number of items printed in the header line. Threshold 0.0 is skipped.
_splits = ((len(datah_raw),  raw_bad,  raw_good),
           (len(datah_eval), eval_bad, eval_good))
for position, (total, bad, good) in enumerate(_splits) :
  if position > 0 : print()   # blank line between the two tables
  print(total,'\t bad\t good\t sum')
  for k in bad.keys() :
    if k == 0.0 : continue
    n_bad, n_good = len(bad[k]), len(good[k])
    print(k,'\t',n_bad,'\t',
          n_good,'\t',n_bad+n_good)
  


# Do subsets per threshold and compute alphas
These numbers appear comparable to those from
the worker-based thresholding, but perhaps stricter.



In [None]:
#@title [ set up vars ]
# Accumulators for the alpha plots; every name starts as its own empty list.

# alpha values to plot
raw_plot, eval_plot = [], []
# count of items supporting each alpha
raw_plot_cnt, eval_plot_cnt = [], []
# the same counts, as percentages
raw_plot_perc, eval_plot_perc = [], []
# x-axis values
x_plot = []

#

In [None]:
#@title  print out the datapoints
def _alphas_by_threshold(label, good_items, krip_table, n_items):
  """Compute and print ordinal alpha for the items kept at each threshold.

  label      -- heading printed above the table
  good_items -- dict: threshold -> list of the item ids kept at that threshold
  krip_table -- coder x item table; the kept items are selected as columns
  n_items    -- total number of items, for the percentage column

  Returns (subsets, thresholds, alphas, counts, percents): `subsets` maps each
  threshold to the selected sub-table, and the four lists hold one entry per
  threshold, in the iteration order of `good_items`.
  """
  print(label)
  subsets = {}
  thresholds, alphas, counts, percents = [], [], [], []
  for k, items in good_items.items() :
    subsets[k] = krip_table[items]
    alpha = compute_ordinal_alpha(subsets[k])
    percent = 100.0*(len(items)/n_items)
    print('thresh: {:4.2f} {:3d} {:6.1f}%\talpha: {:4.2f}'.format(k,len(items),
                                                            percent,
                                                            round(alpha,2)) )
    thresholds.append(k)
    alphas.append(alpha)
    counts.append(len(items))
    percents.append(percent)
  return subsets, thresholds, alphas, counts, percents


# The plot lists are rebuilt from scratch here (not appended to), so that
# re-running this cell does not duplicate the data points.
(raw_alpha, x_plot,
 raw_plot, raw_plot_cnt, raw_plot_perc) = _alphas_by_threshold(
    'Raw Data:', raw_good, RawData_krip_all, len(obs_raw.index))

print()
(eval_alpha, eval_thresholds,
 eval_plot, eval_plot_cnt, eval_plot_perc) = _alphas_by_threshold(
    'Eval Data:', eval_good, EvalData_krip_all, len(obs_eval.index))

# Both series are plotted against x_plot, so they must cover the same thresholds.
if eval_thresholds != x_plot :
  print('WARNING: Raw and Eval cover different thresholds; they cannot share an x axis')


In [None]:
#@title plot alpha curves...
# Three panels over the variance threshold (x axis reversed: the strictest
# threshold is on the right): alpha, item count, and item yield in percent.
print()

# (title, y label, raw series, eval series, [xmin, xmax, ymin, ymax])
panels = [
  ('alpha by threshold',             'alpha',      raw_plot,      eval_plot,      [3.2,0.0,  0.2,0.7]),
  ('item yield for given threshold', 'count',      raw_plot_cnt,  eval_plot_cnt,  [3.2,0.0,  100,800]),
  ('% item yield',                   '% of items', raw_plot_perc, eval_plot_perc, [3.2,0.0,  15,110]),
]

fig, axes = plt.subplots(1, 3, figsize=(15,7))
fig.suptitle('effect of dropping high-variance items',fontsize=16)

for ax, (title, ylabel, raw_series, eval_series, limits) in zip(axes, panels) :
  ax.plot(x_plot, raw_series, 'b-o',label='raw')
  ax.plot(x_plot, eval_series,'r-o',label='eval')
  ax.set_title(title)
  ax.set_xlabel('threshold (on variance)')
  ax.set_ylabel(ylabel)
  # The limits are set exactly once. (The earlier three-argument plt.ylim
  # calls were invalid, and were overridden by plt.axis anyway.)
  ax.axis(limits)
  ax.grid(True)
  ax.legend()

plt.show()


Tentative variance threshold:   ` < 1.75`
* This is the point at which alpha starts to improve without lowering the yield too much; the yield is still reasonable (~90%).



## Filtering criteria based on worker behavior.

In [None]:
#@title Removing workers that "fail" the check trial
#@markdown proposed thresh at 2.0; random rating would be 2.5
# 4.0 -> ignores or misunderstands task
# 1.0 -> scrupulously "correct" (as per developer selection)
#

# Summary stats per worker: session count, mean session duration (as a
# Timedelta and in seconds) and mean CHECK score.
sessions_by_worker = Raw_sessions.groupby('WorkerId')
Times = pd.DataFrame()
Times['count'] = sessions_by_worker['WorkerId'].count()
Times['duration'] = sessions_by_worker['duration'].agg(lambda d: d.mean())
# total_seconds() keeps the whole duration; `.dt.seconds` would silently drop
# the days component of any mean duration of a day or longer.
Times['secs'] = Times['duration'].dt.total_seconds()
Times['CHECK_mean'] = sessions_by_worker['CHECK'].mean()

# compute yield for various worker CHECK thresholds
check_thresh = np.arange(1.0,4.25,0.25)
x_check      = np.arange(1.0,4.0,0.25)  # for plot  NOTE(review): not used in this cell

Check_yield = pd.DataFrame(columns=['count'], index=check_thresh)
bogons = {}
Keepers = {}
# NOTE(review): Keepers holds the workers whose mean CHECK score is >= t and
# bogons those <= t, so a worker exactly at t lands in both. Given the key in
# the cell header (1.0 = correct, 4.0 = ignores the task), confirm that
# ">= t" really is the intended "keep" direction.
print('\tt\tKeeper\tCheck_y\tbogons')
for t in check_thresh :  
  print(t,end='\t')
  bogons[t]  = Times.query( "CHECK_mean <= @t" )
  Keepers[t] = Times.query( "CHECK_mean >= @t" )  # rolls over to next section
  # .loc with (row, column): a single assignment, no chained indexing
  Check_yield.loc[t,'count'] = len(Keepers[t])
  print(t,'\t',len(Keepers[t]),'\t',Check_yield.loc[t,'count'],'\t',len(bogons[t]))

# display the yield, with the x axis reversed (4.5 on the left, 0.5 on the right)
plt.figure(figsize=[5.0,5.0])
plt.plot( check_thresh, Check_yield['count'] )  # plot
plt.grid()
plt.axis([4.5,0.5,  0,100])
plt.title('Mean score on check trials, by worker')
plt.xlabel('check score threshold')
plt.ylabel('yield (# workers)')  # the values plotted are worker counts
plt.show()
#

In [None]:
#@title Filter  workers per threshold to get "good" workers; get alphas
#@markdown This doesn't work when there's too many all NaN items.
#@markdown We can filter but then some items have to be re-collected.
#@markdown On the positive side, a thresh of 1.5 gets us to alpha ~= 0.59
#@markdown
#@markdown Things flatten out at ~2.75, probably because there are no new workers in this range. (See previous cell.)
#@markdown There's a blip up when the 4.0 people show up.
#@markdown
#@markdown Note that the all-data alpha is the same as for the earlier validation run. Lucky us.

# RawData_krip_clean has one row per worker (c) and one column per item (i).
Keeper_alpha = {}
flagged_workers = []   # workers with at least one unrated item, in first-seen order
print("thresh\titems\tworkrs\t alpha")
for t in check_thresh :  # compute alpha for each threshold

  # pull out the rows of the workers in the current tranche
  temp = RawData_krip_clean[RawData_krip_clean.index.isin(Keepers[t].index)]

  # coverage stats, computed on whole columns instead of cell by cell:
  # number of items rated by at least one of these workers
  item = int((temp.count() > 0).sum())
  # total number of ratings in this tranche
  cnt = int(temp.notna().to_numpy().sum())

  # remember every worker who left at least one item unrated
  for worker in temp.index[temp.isna().any(axis=1)] :
    if worker not in flagged_workers : flagged_workers.append(worker)

  # finally
  Keeper_alpha[t] = compute_ordinal_alpha(temp)
  print("{:5.2f}\t{}\t{}\t{:6.3f}".format(t,item,len(temp),Keeper_alpha[t]))
print()

# One all-ones column per flagged worker, built in a single step rather than
# by adding columns to the frame inside the loops.
Loser_items = pd.DataFrame(1, index=RawData_krip_clean.index, columns=flagged_workers)

# plot the result: alpha against the CHECK threshold
plt.figure(figsize=[5.0,5.0])
plt.plot( list(Keeper_alpha.keys()), list(Keeper_alpha.values()), lw=3)
plt.grid()
# These are the limits that were actually in effect before; the earlier
# xlim/ylim calls were overridden by this call.
plt.axis([0.5,4.5,  -0.2,1.0])
plt.title('Alpha, removing workers that fail CHECK')
plt.xlabel('CHECK threshold')
plt.ylabel('alpha')
plt.show()

#

In [None]:
# Debug: display `temp` as left behind by the last cell that assigned it.
temp

In [None]:
# what is actually in Keepers?
# Debug aid: for each worker in Keepers[4.0], print every (worker, item)
# pair of `temp` that holds a rating, together with the rating.
# NOTE: rebinds the globals `foo` and `items`, and leaves the loop variables
# `c` and `i` behind.
foo = Keepers[4.0].index  # list of turkers that pass thresh
items = temp.columns  # list of all items
for c in foo :
  print(c)
  for i in items :
    if pd.notnull(temp.loc[c,i]) :
      print( (c,i),temp.loc[c,i] )
#  if temp.loc[ foo[2],i] 
#  print(temp.loc[ foo[2],i])
  # df1.loc['a', 'A']

In [None]:
# Debug: display `temp` again.
temp

In [None]:
# Debug: number of workers kept at threshold 2.0, plus the frame itself.
# NOTE(review): `.describe` is referenced without being called, so what is
# shown is the bound method -- confirm whether `.describe()` was meant.
len(Keepers[2.0]), Keepers[2.0].describe

In [None]:
# Debug: relies on the globals `c`, `i` and `foo` left behind by earlier cells.
c,i,  temp.columns[0] ,foo,len(foo) , temp

In [None]:
 # Debug: the rows of `temp` that hold a rating for the third item in `items`.
 foo,items[2],'\n',temp[temp[items[2]].notna()]

In [None]:
#@title Mean session duration as threshold
#@markdown This might take a while to compute; 
#@markdown sometimes the model fit barfs and there's an error. 
#@markdown But the right result still seems to come out.
#@markdown 
#@markdown Judging from the breakpoints, dropping the fastest 11 workers seem to get us beyond the bogus sessions.
#@markdown 


# make series: mean session duration per worker in ascending order (y)
# against the worker's rank (x)
y = np.array(Times['secs'].sort_values(),dtype=float)
n_workers = len(y)   # taken from the data rather than hard-coded
x = np.arange(0,n_workers,1,dtype=float)

# build model (<5 breaks seems to break the code)
model = pwlf.PiecewiseLinFit(x, y)
breaks = model.fit(6)
# build the piece-wise lines into a plot
x_hat = np.linspace(x.min(), x.max(),num=n_workers)
y_hat = model.predict(x_hat)

plt.figure(figsize=[5.0,5.0])
plt.plot(x, y,'b +')
plt.plot(x_hat, y_hat, 'r--',lw=2)
plt.grid()
plt.xlabel('yield (/{})'.format(n_workers))
plt.ylabel('session duration (sec)')
plt.show()

# Table of the fitted breakpoints: the duration at each break and the
# 1-based rank of the worker found there. The loop variable has its own name
# so that the array `x` is not overwritten.
print('breakpoints')
print('| dur (sec) | workrs |')
print('| --- | --- |')
for brk in breaks :
  print('|    {:3.0f} | {:3d} |'.format(y[int(brk)],int(brk)+1))
print()

#