In [125]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
# Folder holding the survey CSV files read and written by this notebook (relative path)
datapath = '../Data/Survey/'

First, load the whole dataset and overwrite its first rows with the rows from `HelperIDs.csv`, which carry the generated person ids (a technical problem had to be solved that way).

In [127]:
# Raw survey answers
initial_surv = pd.read_csv(datapath + 'OriginalResults.csv')
# Helper file whose rows replace the matching rows of the raw survey
first_part = pd.read_csv(datapath + 'HelperIDs.csv')
# Overwrite, for every column, the rows of initial_surv that share a row label with first_part
initial_surv.loc[first_part.index, :] = first_part[:]

initial_surv

Unnamed: 0,Horodateur,Age,Gender,Origin,Residence,Percentage of lifetime in a city,Image1 ID,Image2 ID,Winner,Person ID
0,21/11/2021 11:25:45,22,Male,Europe,Lausanne,0.907285,139,116,116,22139116.0
1,21/11/2021 11:25:56,22,Male,Europe,Lausanne,0.907285,10,184,10,22139116.0
2,21/11/2021 11:25:58,22,Male,Europe,Lausanne,0.907285,183,13,13,22139116.0
3,21/11/2021 11:26:05,22,Male,Europe,Lausanne,0.907285,17,339,17,22139116.0
4,21/11/2021 11:26:10,22,Male,Europe,Lausanne,0.907285,281,137,281,22139116.0
...,...,...,...,...,...,...,...,...,...,...
16683,2021/11/28 8:09:03 AM UTC+1,96,Male,Europe,Europe,0.3057395,322,154,154,96174338.0
16684,2021/11/28 8:09:07 AM UTC+1,96,Male,Europe,Europe,0.3057395,82,151,82,96174338.0
16685,2021/11/28 8:09:11 AM UTC+1,96,Male,Europe,Europe,0.3057395,410,423,410,96174338.0
16686,2021/11/28 8:09:13 AM UTC+1,96,Male,Europe,Europe,0.3057395,409,407,409,96174338.0


In [128]:
# Summary statistics of the numeric columns, used to spot implausible values
initial_surv.describe()

Unnamed: 0,Age,Image1 ID,Image2 ID,Winner,Person ID
count,16688.0,16688.0,16688.0,16688.0,16688.0
mean,40.822387,227.919942,230.122064,231.354986,22044700.0
std,96.629425,132.065298,131.262576,130.992137,24357960.0
min,12.0,0.0,0.0,0.0,999.0
25%,20.0,114.0,117.0,113.0,2220863.0
50%,23.0,229.0,233.0,235.5,19356380.0
75%,30.0,342.0,343.0,340.0,24230310.0
max,999.0,457.0,457.0,457.0,96174340.0


Remove the answers whose person id is 999, which were visually identified as coming from spammers.

In [129]:
# Flag the rows whose Person ID is 999, then drop them by row label
is_spammer = initial_surv['Person ID'] == 999
surv_999 = initial_surv.drop(index=initial_surv.loc[is_spammer].index)
surv_999.describe()

Unnamed: 0,Age,Image1 ID,Image2 ID,Winner,Person ID
count,15826.0,15826.0,15826.0,15826.0,15826.0
mean,31.807279,228.011942,229.80134,231.219639,23245370.0
std,20.234352,132.295708,131.309658,131.127368,24448260.0
min,12.0,0.0,0.0,0.0,21606.0
25%,20.0,113.0,117.0,113.0,2299160.0
50%,22.0,230.0,231.0,236.0,19416240.0
75%,30.0,342.0,342.75,339.0,24414210.0
max,96.0,457.0,457.0,457.0,96174340.0


In [130]:
#Renumber the rows from 0 so that the removed rows leave no hole in the index
surv_999 = surv_999.reset_index(drop=True)

A possible cleaning step is to remove each person's last answers (e.g. the last 25% of their answers), because people might answer without thinking as time goes by. This was not used in the end: the threshold is left at its default of 1, which keeps every answer.

In [131]:
# Distinct person ids present in the survey once the 999 rows are gone
ids = surv_999['Person ID'].unique()

def remove_last_answers(survey, think_threshold = 1):
    """ Remove the 100*(1-<think_threshold>) % last answers of every participant in <survey>
        INPUT:
            - (Dataframe) survey: the current survey results to consider. It is not modified.
            - (float) think_threshold: the fraction of each participant's answers to keep, counted
               from their first row in <survey>. The default of 1 keeps every answer.
        OUTPUT:
            - (Dataframe) survey: a new Dataframe without the removed answers, re-indexed from 0
    """
    person_ids = survey['Person ID']

    #Collect the row labels to discard, participant by participant.
    #The participants are read from <survey> itself rather than from a notebook-level variable,
    #so the function gives the right answer whatever Dataframe it is called on.
    rows_to_drop = []
    for person_id in person_ids.unique():
        rows = survey.index[(person_ids == person_id).to_numpy()]

        #The first int(len*threshold) answers are kept, the rest is discarded
        min_rows_size = int(len(rows)*think_threshold)
        rows_to_drop.extend(rows[min_rows_size:])

    #A single drop on a copy: the caller's Dataframe is left as it was
    return survey.drop(rows_to_drop).reset_index(drop=True)

# Called with the default think_threshold of 1: every answer is kept, only the row numbering is refreshed
surv_999 = remove_last_answers(surv_999)
surv_999.describe()

Unnamed: 0,Age,Image1 ID,Image2 ID,Winner,Person ID
count,15826.0,15826.0,15826.0,15826.0,15826.0
mean,31.807279,228.011942,229.80134,231.219639,23245370.0
std,20.234352,132.295708,131.309658,131.127368,24448260.0
min,12.0,0.0,0.0,0.0,21606.0
25%,20.0,113.0,117.0,113.0,2299160.0
50%,22.0,230.0,231.0,236.0,19416240.0
75%,30.0,342.0,342.75,339.0,24414210.0
max,96.0,457.0,457.0,457.0,96174340.0


Now, we want to detect the suspicious answers induced by repetitions/spamming of one side.

In [132]:
def find_suspicious_indices(surv, batch = 25, bound = 18):
    """ Detect the suspicious repetitive answers on one side
        INPUT:
            - (Dataframe) surv: the current survey results to consider. Consecutive rows are treated
               as consecutive answers and are addressed by position (identical to the row label
               when the index runs from 0 to len(surv)-1).
            - (int) batch: the number of consecutive rows to look at from each starting row
            - (int) bound: the number of consecutive answers on the same side above which we consider it's suspicious
        OUTPUT:
            - (Dictionary) suspicious: a collection of pairs (person_id, position of the first row
               from which a suspicious streak is seen for this person)
    """

    #Read each column once; plain lists are far cheaper than one .loc lookup per cell
    person_ids = surv['Person ID'].tolist()
    left_ids = surv['Image1 ID'].tolist()
    right_ids = surv['Image2 ID'].tolist()
    winners = surv['Winner'].tolist()
    nb_answers = len(person_ids)

    #Prepare the output
    suspicious = {}

    #Loop through all survey answers
    for i in range(nb_answers):
        person_id = person_ids[i]

        #Only the first suspicious position of a person is reported, so skip the known ones
        if person_id in suspicious:
            continue

        longest_streak = 0
        current_streak = 0
        current_side = None

        #Look at the <batch> rows starting at i, stopping at the end of the survey.
        #Both limits must hold together; combining them with the bitwise "&" does not express
        #that, because "&" binds tighter than "<".
        for j in range(i, min(i + batch, nb_answers)):

            #Look only at the answers from 1 person (the one who owns answer "i")
            if person_ids[j] != person_id:
                continue

            #The right side is tested first, as a winner equal to both images counts as right
            if winners[j] == right_ids[j]:
                side = 'right'
            elif winners[j] == left_ids[j]:
                side = 'left'
            else:
                #The winner matches neither image: the row neither extends nor breaks a streak
                continue

            if side == current_side:
                current_streak += 1
            else:
                current_side = side
                current_streak = 1

            longest_streak = max(longest_streak, current_streak)

        #A streak longer than <bound> on either side marks the person as suspicious
        if longest_streak > bound:
            suspicious[person_id] = i

    return suspicious

# Run the detection with the default window size (batch=25) and streak bound (18)
suspicious = find_suspicious_indices(surv_999)   
print(suspicious)

{228445.0: 3303, 17377300.0: 5831, 211427.0: 8967}


We see that some people have indeed been flagged as suspicious. They will also be detected by the `check_outliers` function, and therefore deleted.

Before checking for outliers, we remove all duplicated answers. Duplicates can appear when a person clicks too fast and the system doesn't have time to update the battle. This can be intentional (spammer) or unintentional.

In [133]:
#Columns that must all match for two rows to be considered the same answer
duplicate_key = ['Origin', 'Residence', 'Age', 'Percentage of lifetime in a city',
                 'Image1 ID', 'Image2 ID', 'Winner', 'Person ID']

#Keep the first occurrence of each answer, then renumber the rows from 0
surv_drop_dup = surv_999.drop_duplicates(subset=duplicate_key).reset_index(drop=True)
surv_drop_dup.describe()

Unnamed: 0,Age,Image1 ID,Image2 ID,Winner,Person ID
count,15661.0,15661.0,15661.0,15661.0,15661.0
mean,31.880276,228.112381,230.003959,231.2655,23325060.0
std,20.285596,132.310587,131.318448,131.177926,24492280.0
min,12.0,0.0,0.0,0.0,21606.0
25%,20.0,113.0,117.0,113.0,2299160.0
50%,22.0,230.0,232.0,236.0,19416240.0
75%,30.0,342.0,343.0,340.0,24414210.0
max,96.0,457.0,457.0,457.0,96174340.0


Now, look at outliers, i.e. people who made a choice very different from that of the other participants.

First, we need a way to quantify this "very different choice". This is done by looking at the image ranking.

In [134]:
#rank images
def rank_images(clean_surv):
    """ Rank the images according to answers in the survey
        INPUT:
            - (Dataframe) clean_surv: the cleaned survey on which to build the ranking
        OUTPUT:
            - (Dataframe) contest: the ranking, one row per image id found in the survey, sorted by
               image id and numbered from 0. Column 'index' holds the image id and column
               'Win ratio' the share of its battles that the image won (0.0 if it never won).
    """

    #First collect the wins of each image
    wins = clean_surv['Winner'].value_counts()

    #Then the number of occurences of each image, whichever side it was shown on.
    #The two counts are added by image id (not by position), and an image that was only
    #ever shown on one side counts 0 for the other side.
    occurences = clean_surv['Image1 ID'].value_counts().add(
        clean_surv['Image2 ID'].value_counts(), fill_value=0)

    #Compute the win ratio from these informations, image id by image id
    image_ids = occurences.index.union(wins.index)
    win_ratio = wins.reindex(image_ids) / occurences.reindex(image_ids)

    #An image without any win has no entry in <wins>: its ratio is 0
    contest = pd.DataFrame({'index': image_ids.to_numpy(),
                            'Win ratio': win_ratio.fillna(0.0).to_numpy()})

    return contest

Then we actually define the functions to spot and remove these outliers

In [135]:
def check_outliers(survey, ranking, threshold = 0.5):
    """ Spot outliers in the <survey>, based on the <ranking>, and according to the <threshold>
        INPUT:
            - (Dataframe) survey: the survey to consider (any row index is accepted)
            - (Dataframe) ranking: the ranking of images, with a 'Win ratio' column. The image id is
               read from the 'index' column when there is one, from the row label otherwise.
            - (float) threshold: the minimum difference between the win ratio of the losing image
               and that of the winning image for the choice to be considered outlier
        OUTPUT:
            - (Dictionary) outliers: a set of pairs (person_id, number of outlier choices of this person)
            - (List) outlier_indices: the list of row labels of all the outlier choices
        RAISES:
            - KeyError if an image of the survey is absent from the ranking
    """
    outliers = {}
    outlier_indices = []

    #Win ratio of every image, keyed by image id
    if 'index' in ranking.columns:
        win_ratio = dict(zip(ranking['index'].tolist(), ranking['Win ratio'].tolist()))
    else:
        win_ratio = ranking['Win ratio'].to_dict()

    #Walk the answers with their real row labels, so that the labels returned can be
    #dropped from <survey> whatever its index is
    answers = zip(survey.index.tolist(),
                  survey['Person ID'].tolist(),
                  survey['Image1 ID'].tolist(),
                  survey['Image2 ID'].tolist(),
                  survey['Winner'].tolist())

    for label, person_id, id1, id2, winId in answers:
        #The losing image is the second one when the first one won, the first one otherwise
        lossId = id2 if winId == id1 else id1

        rkwin = win_ratio[winId]
        rkloss = win_ratio[lossId]

        #Check if the winning image is ranked way worse than the losing image. In that case, update the outputs with this outlier.
        if(rkloss - rkwin > threshold):
            outliers[person_id] = outliers.get(person_id,0) + 1
            outlier_indices.append(label)

    return outliers, outlier_indices

def remove_outliers(outliers, outlier_indices, survey, threshold = 2, protected_ids = (81198294,)):
    """ Remove the outliers in <survey> according to the indices in <outlier_indices>.
        If a person has at least <threshold> outliers in <outliers>, remove all the answers of this person
        INPUT:
            - (Dictionary) outliers: a set of pairs (person_id, number of outlier choices of this person)
            - (List) outlier_indices: the list of row labels of all the outlier choices
            - (Dataframe) survey: the survey to consider. It is not modified.
            - (int) threshold: the minimum number of outliers for a person to be removed altogether
            - (Collection) protected_ids: the ids of the persons who are never removed altogether,
               however many outliers they have (their individual outliers are still removed).
               The default holds one special case that shouldn't be removed (long story).
        OUTPUT: 
            - (Dataframe) final_data: the survey results with outliers removed.
    """

    #First, detect participants with too many outliers (potential spammers)
    outliers_ids_to_remove = [person_id for person_id, nb_outliers in outliers.items()
                              if nb_outliers >= threshold and person_id not in protected_ids]

    #Remove all the individual outliers at once (one drop per index would copy the Dataframe each time)
    final_data = survey.drop(outlier_indices)

    #Then remove the potential spammers
    final_data = final_data[~final_data['Person ID'].isin(outliers_ids_to_remove)]

    return final_data 

Removing the outliers from the survey will consequently change the ranking, which might reveal new outliers. Therefore we define a function to iteratively remove them by reconstructing the ranking


In [136]:
def iteratively_remove_outliers(survey, difference_threshold = 0.5, outlier_threshold = 2, nb_iterations = 10):
    """ Repeat the outlier removal on <survey>, rebuilding the image ranking before every pass,
        until a pass finds no outlier or <nb_iterations> passes have been done.
        INPUT:
            - (Dataframe) survey: the survey to consider
            - (float) difference_threshold: the minimum win ratio difference between the losing image
               and the winning image for the choice to be considered outlier 
            - (float) outlier_threshold: the minimum number of outliers a participant must have
               to be considered a spammer and deleted altogether
            - (int) nb_iterations: the maximum number of passes to do for outlier detection     
        OUTPUT: 
            - (Dataframe) surv_outliers: the survey results with outliers iteratively removed.
    """
    remaining = survey
    passes_done = 0

    while passes_done < nb_iterations:
        passes_done += 1

        #The ranking depends on the answers still present, so it is rebuilt on every pass
        current_ranking = rank_images(remaining)

        #Spot the outliers of this pass and report them (persons -> counts, then number of persons)
        outliers, outlier_indices = check_outliers(remaining, current_ranking, difference_threshold)
        nb_outliers = len(outliers)
        print(outliers, nb_outliers)

        #Drop them and renumber the rows from 0 for the next pass
        remaining = remove_outliers(outliers, outlier_indices, remaining, outlier_threshold)
        remaining = remaining.reset_index(drop=True)

        #A pass without any outlier ends the procedure
        if nb_outliers == 0:
            break

    return remaining
    
# Run the iterative cleaning with the default thresholds; each printed line is one pass
surv_outliers = iteratively_remove_outliers(surv_drop_dup)
surv_outliers.describe()

{2131299.0: 1, 2331265.0: 1, 19254248.0: 1, 21242425.0: 1, 23444195.0: 2, 24230309.0: 1, 234361.0: 1, 228445.0: 3, 25176195.0: 1, 23423379.0: 1, 20180416.0: 1, 17377300.0: 4, 22362411.0: 1, 2029490.0: 1, 30359428.0: 1, 18321387.0: 1, 21289178.0: 2, 48289156.0: 2, 18367348.0: 1, 81198294.0: 2, 2521896.0: 6, 1832653.0: 1, 20396128.0: 1, 1739376.0: 2, 23118298.0: 1, 1734114.0: 1, 19397394.0: 2} 27
{2331265.0: 2, 22246338.0: 1, 22318216.0: 1, 24230309.0: 1, 2220863.0: 1, 2367228.0: 1, 2236293.0: 1, 48284448.0: 1, 30359428.0: 1, 2029490.0: 1, 22140445.0: 1, 18367348.0: 1, 74389456.0: 1, 2222576.0: 1, 50406193.0: 1, 2517315.0: 1} 16
{2299160.0: 1, 2442397.0: 1, 306341.0: 1, 195031.0: 1} 4
{21261112.0: 1} 1
{} 0


Unnamed: 0,Age,Image1 ID,Image2 ID,Winner,Person ID
count,14254.0,14254.0,14254.0,14254.0,14254.0
mean,32.559632,227.012418,230.548688,230.618353,23957580.0
std,20.90932,132.377288,131.419003,131.196153,25098330.0
min,12.0,0.0,0.0,0.0,21606.0
25%,20.0,112.0,117.0,111.0,2299160.0
50%,22.0,228.0,233.0,235.0,20139200.0
75%,30.0,341.0,344.0,338.75,24436260.0
max,96.0,457.0,457.0,457.0,96174340.0


With the final data, we describe the final ranking, and the final results

In [137]:
#Rebuild the ranking from the cleaned survey and print it from the lowest to the highest win ratio
ranking = rank_images(surv_outliers) 
print(ranking.sort_values(by=['Win ratio']))

     index  Win ratio
304    304   0.015873
136    136   0.061538
6        6   0.070175
127    127   0.079365
413    413   0.080000
..     ...        ...
193    193   0.891304
251    251   0.894737
262    262   0.895522
326    326   0.910714
100    100   0.916667

[458 rows x 2 columns]


In [138]:
# final_data is another name for the same Dataframe as surv_outliers (no copy is made)
final_data = surv_outliers
final_data.describe()

Unnamed: 0,Age,Image1 ID,Image2 ID,Winner,Person ID
count,14254.0,14254.0,14254.0,14254.0,14254.0
mean,32.559632,227.012418,230.548688,230.618353,23957580.0
std,20.90932,132.377288,131.419003,131.196153,25098330.0
min,12.0,0.0,0.0,0.0,21606.0
25%,20.0,112.0,117.0,111.0,2299160.0
50%,22.0,228.0,233.0,235.0,20139200.0
75%,30.0,341.0,344.0,338.75,24436260.0
max,96.0,457.0,457.0,457.0,96174340.0


In [139]:
#Export the results in the data folder
#NOTE(review): to_csv also writes the row index as an extra unnamed column by default,
#on top of the existing 'Unnamed: 0' column - confirm that the readers of this file expect it
final_data.to_csv(datapath + 'CleanedResults.csv')