# Filter all data
This takes the raw results for both experiments and removes the following:
* Empty columns
* Privacy-sensitive columns
* Rejected workers
* Contradicting answers that were not rejected workers
* Workers that did the first experiment twice
* iPhone and Android users
* Rows where the time spent on any scenario is not > 2

In [15]:
import os
import pandas as pd


inputFolder = folder+"/Data/1.raw/"
outputFolder = folder+"/Data/2.filtered/"

for f in os.listdir(inputFolder):
    # ignore non csv files
    if not f.endswith('csv'):
        continue
    print f
    
    # load data
    df = pd.read_csv(inputFolder+f)
            
    # remove empty columns
    print 'Columns:',df.shape[1]
    df.dropna(axis=1, how='all', inplace=True)
    print 'Columns:',df.shape[1],"Empty"
    
    
    # remove rejected workers
    print 'Rows:',df.shape[0]
    df = df[df['_tainted'] == False]
    print 'Rows:',df.shape[0],"Rejected"
    
    # remove contradicting answers that were not rejected workers
    if 'spam' not in df.columns:
        df['spam'] = 0
    df = df[df['spam'] == 0]
    print 'Rows:',df.shape[0],"Contradicting"
    
    # remove workers that did the first experiment twice
    # take their first answer as the true data
    if f == 'exp1_f1232791.csv':
        workers = df['_worker_id']
    if f == 'exp1_f1233325.csv':
        df = df[~df['_worker_id'].isin(workers)]
    print 'Rows:',df.shape[0],"Doubles"
    
    # remove iphone and android users
    df = df[df.apply(lambda row: 'iPhone' not in row['browser'], axis=1)]
    print 'Rows:',df.shape[0],"iPhone"
    df = df[df.apply(lambda row: 'Android' not in row['browser'], axis=1)]
    print 'Rows:',df.shape[0],"Android"
    
    # remove if time is not > 2
    if 'time_none' in df.columns:
        df = df[df['time_none'] > 2]
        df = df[df['time_warning'] > 2]
        df = df[df['time_danger'] > 2]
    else:
        df = df[df['time_suggestion'] > 2]
        df = df[df['time_hours'] > 2]
        df = df[df['time_numerical'] > 2]
    print 'Rows:',df.shape[0],"time"
    
    
    # overwrite privacy sensitive columns
    df['_ip'] = 0
    df['browser'] = 0
    df['_city'] = 0
    df['_region'] = 0
    df['naam'] = 0
    print 'Columns:',df.shape[1],"Privacy"
        
    print 'Rows:',df.shape[0],"time"
    
    df.to_csv(outputFolder+f, index=False)
#df.head()

exp1_f1232791.csv
Columns: 62
Columns: 37 Empty
Rows: 200
Rows: 178 Rejected
Rows: 178 Contradicting
Rows: 178 Doubles
Rows: 176 iPhone
Rows: 176 Android
Rows: 175 time
Columns: 38 Privacy
Rows: 175 time
exp1_f1233325.csv
Columns: 64
Columns: 38 Empty
Rows: 725
Rows: 613 Rejected
Rows: 610 Contradicting
Rows: 471 Doubles
Rows: 470 iPhone
Rows: 462 Android
Rows: 449 time
Columns: 38 Privacy
Rows: 449 time
exp2_f1233802.csv
Columns: 74
Columns: 43 Empty
Rows: 628
Rows: 535 Rejected
Rows: 535 Contradicting
Rows: 535 Doubles
Rows: 533 iPhone
Rows: 522 Android
Rows: 501 time
Columns: 43 Privacy
Rows: 501 time


# Combined results of both experiments

In [None]:
import crowdtruth


class config():
    """Settings object that is instantiated and passed to crowdtruth.load."""

    inputColumns = ['a']
    outputColumns = [
        'experiment1', 'experiment2',
        'alert_suggestion', 'alert_numerical',
        'feeling_danger', 'feeling_hours',
        'feeling_none', 'feeling_numerical',
        'feeling_suggestion', 'feeling_warning',
        'imageorder', 'income', 'indebt',
        'nobuyreason',
        'product', 'regret', 's_danger',
        's_hours', 's_none', 's_numerical',
        's_suggestion', 's_warning',
        'time_danger', 'time_hours', 'time_none',
        'time_numerical', 'time_pre', 'time_suggestion',
        'time_warning', 'warnings',
    ]

    # processing of a closed task
    open_ended_task = False
    annotation_vector = []  # previously ['s_none','s_warning','s_danger']

    def processJudgments(self, judgments):
        """Bring the judgments of either experiment into one shared layout.

        A frame that has an 's_none' column is treated as experiment 1,
        any other frame as experiment 2. For the frame's own three
        scenarios the last character of every 's_*' value is cut off and
        the 'time_*' columns are cast to int. The 's_*', 'time_*' and
        'feeling_*' columns of the other experiment are filled with 0.
        Experiment 1 frames also get 'alert_suggestion' and
        'alert_numerical' set to -1. 'spam' is set to the string '0'.

        The frame is modified in place and returned.
        """
        first_experiment = ('none', 'warning', 'danger')
        second_experiment = ('suggestion', 'hours', 'numerical')

        if 's_none' in judgments.columns:
            own, other = first_experiment, second_experiment
            judgments['experiment1'] = 1
            judgments['experiment2'] = 0
            judgments['alert_suggestion'] = -1
            judgments['alert_numerical'] = -1
        else:
            own, other = second_experiment, first_experiment
            judgments['experiment1'] = 0
            judgments['experiment2'] = 1

        # own scenarios: drop the trailing character of each answer
        for scenario in own:
            answer_col = 's_' + scenario
            judgments[answer_col] = judgments[answer_col].map(lambda x: str(x)[:-1])

        # own scenarios: durations as integers
        for scenario in own:
            time_col = 'time_' + scenario
            judgments[time_col] = judgments[time_col].astype('int')

        # scenarios of the other experiment: constant 0 placeholders
        for prefix in ('s_', 'time_', 'feeling_'):
            for scenario in other:
                judgments[prefix + scenario] = 0

        judgments['spam'] = '0'
        return judgments
    
# Load the filtered CSVs. The directory is built from `folder`, the same base
# path the filtering step writes to and the final save step uses, instead of
# a hard-coded absolute path of one user's machine.
# Note: this rebinds the name `config` from the class to the object returned
# by crowdtruth.load.
data, config = crowdtruth.load(
    directory = folder+"/Data/2.filtered/",
    config = config()
)

In [2]:
# Turn every output column of the judgments into one comma-separated string
# per row (assumes each cell holds an iterable of strings - TODO confirm).
for p in config.output:
    output_col = 'output.' + p
    data['judgments'][output_col] = data['judgments'][output_col].apply(','.join)


In [None]:
import pandas as pd

# aggregate post questions
# key: name of the table stored in `data`, value: judgment output it is built on
posts = {
    'income' : 'income',
    'nobuyreason' : 'nobuyreason',
    'timing_suggestion' : 'alert_suggestion',
    'timing_numerical' : 'alert_numerical',
    'warnings' : 'warnings',
    'affordcheck' : 'indebt',
    'payontime' : 'regret'
}

# the experiment flags are summed below, so make them integers first
for flag_col in ('output.experiment1', 'output.experiment2'):
    data['judgments'][flag_col] = data['judgments'][flag_col].astype('int')

# every scenario together with the experiment counter it is divided by
post_scenarios = [
    ('none', 'experiment1'),
    ('warning', 'experiment1'),
    ('danger', 'experiment1'),
    ('suggestion', 'experiment2'),
    ('hours', 'experiment2'),
    ('numerical', 'experiment2'),
]

for p in posts:
    # one column with the answer, the two experiment flags, six scenario columns
    wanted = ['output.' + posts[p], 'output.experiment1', 'output.experiment2']
    wanted += ['output.s_' + name for name, _ in post_scenarios]
    table = data['judgments'].copy()[wanted]
    table.columns = [p, 'experiment1', 'experiment2'] + [name for name, _ in post_scenarios]

    # 1 when the scenario was submitted, 0 for anything else
    for name, _ in post_scenarios:
        table[name] = table[name].apply(lambda x: 1 if x == 'submit' else 0)

    agg = {
        p : 'count',
        'experiment1' : 'sum',
        'experiment2' : 'sum',
        'none' : 'sum',
        'warning' : 'sum',
        'danger' : 'sum',
        'suggestion' : 'sum',
        'hours' : 'sum',
        'numerical' : 'sum',
    }
    table = table.groupby(p).agg(agg)

    # submit counts -> share of the judgments of the matching experiment
    for name, counter in post_scenarios:
        table[name] = table.apply(
            lambda row, name=name, counter=counter: row[name] / float(row[counter]),
            axis = 1)

    # stored transposed: one column per answer value
    data[p] = table.T



# financial responsibility
# every scenario together with the experiment counter it is divided by
resp_scenarios = [
    ('none', 'experiment1'),
    ('warning', 'experiment1'),
    ('danger', 'experiment1'),
    ('suggestion', 'experiment2'),
    ('hours', 'experiment2'),
    ('numerical', 'experiment2'),
]

resp = data['judgments'].copy()
resp = resp[['output.experiment1', 'output.experiment2', 'output.indebt', 'output.regret']
            + ['output.s_' + name for name, _ in resp_scenarios]]
resp.columns = (['experiment1', 'experiment2', 'affordcheck', 'payontime']
                + [name for name, _ in resp_scenarios])

# responsibility score = number of the two statements answered with 'eens'
for statement in ('affordcheck', 'payontime'):
    resp[statement] = resp[statement].apply(lambda x: 1 if x == 'eens' else 0)
resp['responsibility'] = resp['affordcheck'] + resp['payontime']

# 1 when the scenario was submitted, 0 for anything else
for name, _ in resp_scenarios:
    resp[name] = resp[name].apply(lambda x: 1 if x == 'submit' else 0)

agg = {
    'experiment1' : 'sum',
    'experiment2' : 'sum',
    'none' : 'sum',
    'warning' : 'sum',
    'danger' : 'sum',
    'suggestion' : 'sum',
    'hours' : 'sum',
    'numerical' : 'sum',
}

resp = resp.groupby(['responsibility']).agg(agg)

# submit counts -> share of the judgments of the matching experiment
for name, counter in resp_scenarios:
    resp[name] = resp.apply(
        lambda row, name=name, counter=counter: row[name] / float(row[counter]),
        axis = 1)

# stored transposed: one column per responsibility score
data['responsibility'] = resp.T


In [None]:
import pandas as pd
import numpy as np


#
# aggregate by time exposure
#
# NOTE(review): data['scenarios'] is assigned again by the "# scenarios"
# section further down in this cell, so the table built here is replaced
# before anything is saved - confirm whether it should live under its own key.
exposure_scenarios = ['none','warning','danger','suggestion','hours','numerical']

# one row per scenario, one column per distinct answer value
data['scenarios'] = data['judgments'][['output.s_' + s for s in exposure_scenarios]].apply(pd.Series.value_counts).T
data['scenarios'].index = exposure_scenarios

rows = data['judgments'].index.size

# share of submits among the submit and cancel answers
data['scenarios']['submit_ratio'] = data['scenarios'].apply(lambda row: row['submit'] / (float(row['cancel']) + float(row['submit'])), axis=1)

# mean of the time column of every scenario
# - the column starts as float so the means are not written into an int column
# - builtin float replaces np.float, an alias that NumPy 1.24 removed
data['scenarios']['duration_avg'] = 0.0
for s in exposure_scenarios:
    data['scenarios'].loc[s,'duration_avg'] = np.asarray(data['judgments']['output.time_' + s], dtype=float).mean()


# scenarios
data['scenarios'] = data['judgments'].copy()
data['scenarios'] = data['scenarios'][['output.experiment1','output.experiment2','output.s_none','output.s_warning','output.s_danger','output.s_suggestion','output.s_hours','output.s_numerical']]
data['scenarios'].columns = ['experiment1','experiment2','none','warning','danger','suggestion','hours','numerical']

data['scenarios']['none'] = data['scenarios']['none'].apply(lambda x: 1 if x == 'submit' else 0)
data['scenarios']['warning'] = data['scenarios']['warning'].apply(lambda x: 1 if x == 'submit' else 0)
data['scenarios']['danger'] = data['scenarios']['danger'].apply(lambda x: 1 if x == 'submit' else 0)
data['scenarios']['suggestion'] = data['scenarios']['suggestion'].apply(lambda x: 1 if x == 'submit' else 0)
data['scenarios']['hours'] = data['scenarios']['hours'].apply(lambda x: 1 if x == 'submit' else 0)
data['scenarios']['numerical'] = data['scenarios']['numerical'].apply(lambda x: 1 if x == 'submit' else 0)

# t.tests
import scipy.stats
exp1 = data['scenarios'][data['scenarios']['experiment1'] == 1]
print 'none-warning t-test',scipy.stats.ttest_rel(exp1['none'],exp1['warning'])

agg = {
    'experiment1' : 'sum',
    'experiment2' : 'sum',
    'none' : 'sum',
    'warning' : 'sum',
    'danger' : 'sum',
    'suggestion' : 'sum',
    'hours' : 'sum',
    'numerical' : 'sum',
}

data['scenarios'] = data['scenarios'].groupby(['experiment1']).agg(agg)
data['scenarios']['none'] = data['scenarios'].apply(lambda row: row['none'] / float(row['experiment1']), axis = 1)
data['scenarios']['warning'] = data['scenarios'].apply(lambda row: row['warning'] / float(row['experiment1']), axis = 1)
data['scenarios']['danger'] = data['scenarios'].apply(lambda row: row['danger'] / float(row['experiment1']), axis = 1)
data['scenarios']['suggestion'] = data['scenarios'].apply(lambda row: row['suggestion'] / float(row['experiment2']), axis = 1)
data['scenarios']['hours'] = data['scenarios'].apply(lambda row: row['hours'] / float(row['experiment2']), axis = 1)
data['scenarios']['numerical'] = data['scenarios'].apply(lambda row: row['numerical'] / float(row['experiment2']), axis = 1)
#data['scenarios'] = data['scenarios'].T
#print data['scenarios'].head()







from scipy import stats
anova = data['judgments'].copy()
anova[['output.s_none','output.s_warning','output.s_danger']] = anova[['output.s_none','output.s_warning','output.s_danger']].apply(lambda x: x.replace('cancel',1))
anova[['output.s_none','output.s_warning','output.s_danger']] = anova[['output.s_none','output.s_warning','output.s_danger']].apply(lambda x: x.replace('submit',0))
F, p = stats.f_oneway(anova['output.s_none'], anova['output.s_warning'], anova['output.s_danger'])
print F,p

anova = data['judgments'].copy()
anova[['output.s_suggestion','output.s_hours','output.s_numerical']] = anova[['output.s_suggestion','output.s_hours','output.s_numerical']].apply(lambda x: x.replace('cancel',1))
anova[['output.s_suggestion','output.s_hours','output.s_numerical']] = anova[['output.s_suggestion','output.s_hours','output.s_numerical']].apply(lambda x: x.replace('submit',0))
F, p = stats.f_oneway(anova['output.s_suggestion'], anova['output.s_hours'], anova['output.s_numerical'])
print F,p



# feelings
def pos(feelings):
    """Return 1 if any comma-separated entry of *feelings* is in the positive list, else 0."""
    positive = ('tevreden', 'blij', 'opgewonden', 'opgelucht')
    return int(any(entry in positive for entry in feelings.split(',')))

def neg(feelings):
    """Return 1 if any comma-separated entry of *feelings* is in the negative list, else 0."""
    negative = ('bezorgd', 'schuldig', 'verdrietig', 'boos', 'beschaamd', 'ontevreden')
    return int(any(entry in negative for entry in feelings.split(',')))

def neutral(feelings):
    """Return 1 if any comma-separated entry of *feelings* is 'weetniet', else 0."""
    return int(any(entry in ('weetniet',) for entry in feelings.split(',')))

    
# Per judgment: the answer and the selected feelings of every scenario, plus
# three 0/1 columns per scenario produced by pos(), neg() and neutral().
feeling_groups = (
    ('none', 'warning', 'danger'),
    ('suggestion', 'hours', 'numerical'),
)

# per group: first the three answer columns, then the three feeling columns
feeling_source = []
for group in feeling_groups:
    feeling_source += ['output.s_' + name for name in group]
    feeling_source += ['output.feeling_' + name for name in group]

feelings = data['judgments'].copy()
feelings = feelings[feeling_source]

for group in feeling_groups:
    for name in group:
        selected = feelings['output.feeling_' + name]
        feelings['s_' + name + '_pos'] = selected.apply(pos)
        feelings['s_' + name + '_neg'] = selected.apply(neg)
        feelings['s_' + name + '_neutral'] = selected.apply(neutral)

data['feelings'] = feelings

data['feeling_count'] = pd.DataFrame()
data['feeling_count']['none'] = pd.DataFrame([i for f in data['judgments']['output.feeling_none'].tolist() for i in f.split(',')]).loc[:,0].value_counts()
data['feeling_count']['warning'] = pd.DataFrame([i for f in data['judgments']['output.feeling_warning'].tolist() for i in f.split(',')]).loc[:,0].value_counts()
data['feeling_count']['danger'] = pd.DataFrame([i for f in data['judgments']['output.feeling_danger'].tolist() for i in f.split(',')]).loc[:,0].value_counts()
data['feeling_count']['suggestion'] = pd.DataFrame([i for f in data['judgments']['output.feeling_suggestion'].tolist() for i in f.split(',')]).loc[:,0].value_counts()
data['feeling_count']['hours'] = pd.DataFrame([i for f in data['judgments']['output.feeling_hours'].tolist() for i in f.split(',')]).loc[:,0].value_counts()
data['feeling_count']['numerical'] = pd.DataFrame([i for f in data['judgments']['output.feeling_numerical'].tolist() for i in f.split(',')]).loc[:,0].value_counts()

#data['feeling_count']['none'] = .value_counts()
#data['feeling_count']['warning'] = data['judgments']['output.feeling_warning'].value_counts()
print data['feeling_count']

# Hand the `data` dict (judgments plus the tables added above) and the config
# to crowdtruth.save, with <folder>/Data/3.aggregated/ as the target folder.
crowdtruth.save(data, config, folder+'/Data/3.aggregated/')