In [1]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
import os, json
import pandas as pd
import numpy as np
import seaborn as sns
import math
from scipy import stats
from sklearn.linear_model import LinearRegression

def get_ranking_representations_in_order(dict):
    ranking_data = dict["ranking"]
    task_order = dict["treatmentOrder"]
    rankings_per_task = []
    for i in range(0,len(ranking_data)):
        if ranking_data[i]["category"] == "rankingRepresentation":
            rankings_per_task.append(ranking_data[i]["ranking"])
    sorted_rankings_by_task = [0] * 3;
    sorted_rankings_by_task[task_order[0]] = rankings_per_task[0]
    sorted_rankings_by_task[task_order[1]] = rankings_per_task[1]
    sorted_rankings_by_task[task_order[2]] = rankings_per_task[2]
    return sorted_rankings_by_task

def get_task_actions_in_order(dict):
    ranking_data = dict["ranking"]
    task_order = dict["treatmentOrder"]
    actions_per_task = []
    for i in range(0,len(ranking_data)):
        if ranking_data[i]["category"] == "rankingRepresentation":
            actions = []
            for j in range(i+1,len(ranking_data)):
                if ranking_data[j]["category"] == "serverEvent" or ranking_data[j]["category"] == "timeStamps":
                    i = j
                    break
                actions.append(ranking_data[j])
            actions_per_task.append(actions)
    sorted_actions_by_task = [0] * 3;
    sorted_actions_by_task[task_order[0]] = actions_per_task[0]
    sorted_actions_by_task[task_order[1]] = actions_per_task[1]
    sorted_actions_by_task[task_order[2]] = actions_per_task[2]
    return sorted_actions_by_task
        
def filter_for_choices(actions, ranking):
    selected_ids = []
    for action in actions:
        if action["eventName"] == "select":
            selected_ids.append(action["itemID"])
        if action["eventName"] == "unselect":
            selected_ids.remove(action["itemID"])
        if action["eventName"] == "fourthElementSelected":
            selected_ids.remove(action["itemID"])
    choices = []
    for id in selected_ids:
        for cand in ranking:
            try:
                if id == cand["id"]:
                    choices.append(cand)
            except:
                if id == cand[0]["id"]:
                    choices.append(cand[0])
                if id == cand[1]["id"]:
                    choices.append(cand[1])
    return choices

def get_survey_answers(dict):
    survey = dict["survey"]
    answers = []
    for entry in survey:
        if entry["category"] == "surveyQuestionAnswers":
            answers.append(entry)
    return answers

def load_data_as_df(path):
    path_to_jsonfiles = path
    cols = ['uID', 'ranking_type','briefing','task_order','task_1_ranking', 'task_2_ranking', 'task_3_ranking',
               'task_1_actions','task_2_actions','task_3_actions','task_1_choices','task_2_choices',
            'task_3_choices','survey_answers']
    data = pd.DataFrame(columns=cols)
    for file in os.listdir(path_to_jsonfiles):
        full_filename = "%s/%s" % (path_to_jsonfiles, file)
        with open(full_filename,'r') as fi:
            dict = json.load(fi)
            uID = file[0:len(file)-5]
            ranking_type = int(dict["ranking"][0]["itemID"])
            briefing = dict["briefing"]
            task_order = dict["treatmentOrder"]
            task_rankings = get_ranking_representations_in_order(dict)
            task_1_ranking = task_rankings[0]
            task_2_ranking = task_rankings[1]
            task_3_ranking = task_rankings[2]
            task_actions = get_task_actions_in_order(dict)
            task_1_actions = task_actions[0]
            task_2_actions = task_actions[1]
            task_3_actions = task_actions[2]
            task_1_choices = filter_for_choices(task_1_actions, task_1_ranking)
            task_2_choices = filter_for_choices(task_2_actions, task_2_ranking)
            task_3_choices = filter_for_choices(task_3_actions, task_3_ranking)
            survey_answers = get_survey_answers(dict)
            
            data = data.append({'uID': uID, 'ranking_type': ranking_type, 'briefing': briefing, 'task_order': task_order,
                               'task_1_ranking': task_1_ranking, 'task_2_ranking': task_2_ranking, 'task_3_ranking': task_3_ranking,
                               'task_1_actions': task_1_actions, 'task_2_actions': task_2_actions, 'task_3_actions': task_3_actions,
                               'task_1_choices': task_1_choices, 'task_2_choices': task_2_choices, 'task_3_choices': task_3_choices,
                               'survey_answers': survey_answers}, ignore_index=True)
    return data

def get_user_gender_age_income(survey_answers):
    for entry in survey_answers:
        if entry["category"] == "surveyQuestionAnswers" and entry["question"] == 5:
            return entry["answers"]
    return ["not available"] * 3

def get_ranked_candidates_experience_gender_rank_task_rankingtype(data_table):
    cols = ['total_xp','furniture_assembly_xp', 'home_repairs_xp', 'rank', 'task_nr', 'ranking_type', 'gender','uId'
            ,'uGender','uAge','uIncome']
    col_total = []
    col_furniture = []
    col_home = []
    col_rank = []
    col_task = []
    col_type = []
    col_gender = []
    col_uID = []
    col_uGender = []
    col_uAge = []
    col_uIncome = []
    for i in range(0, len(data_table.index)):
        for cand in data_table.iloc[i][4]:
            r_type = data_table.iloc[i][1]
            userDemographics = get_user_gender_age_income(data_table.iloc[i][13])
            uGender = userDemographics[0]
            uAge = userDemographics[1]
            uIncome = userDemographics[2]
            if r_type >= 4:
                for c in cand:
                    col_total.append(c["totalExp"])
                    col_furniture.append(c["tag1"])
                    col_home.append(c["tag2"])
                    col_rank.append(c["rank"])
                    col_task.append(1)
                    col_type.append(int(r_type))
                    col_uID.append(data_table.iloc[i][0])
                    col_uGender.append(uGender)
                    col_uAge.append(uAge)
                    col_uIncome.append(uIncome)
                    if c["gender"] == "f":
                        col_gender.append(0)
                    if c["gender"] == "m":
                        col_gender.append(1)
            else:
                col_total.append(cand["totalExp"])
                col_furniture.append(cand["tag1"])
                col_home.append(cand["tag2"])
                col_rank.append(cand["rank"])
                col_task.append(1)
                col_type.append(int(r_type))
                col_uID.append(data_table.iloc[i][0])
                col_uGender.append(uGender)
                col_uAge.append(uAge)
                col_uIncome.append(uIncome)
                if cand["gender"] == "f":
                    col_gender.append(0)
                if cand["gender"] == "m":
                    col_gender.append(1)
    for i in range(0, len(data_table.index)):
        for cand in data_table.iloc[i][5]:
            r_type = data_table.iloc[i][1]
            userDemographics = get_user_gender_age_income(data_table.iloc[i][13])
            uGender = userDemographics[0]
            uAge = userDemographics[1]
            uIncome = userDemographics[2]
            if r_type >= 4:
                for c in cand:
                    col_total.append(c["totalExp"])
                    col_furniture.append(c["tag1"])
                    col_home.append(c["tag2"])
                    col_rank.append(c["rank"])
                    col_task.append(2)
                    col_type.append(int(r_type))
                    col_uID.append(data_table.iloc[i][0])
                    col_uGender.append(uGender)
                    col_uAge.append(uAge)
                    col_uIncome.append(uIncome)
                    if c["gender"] == "f":
                        col_gender.append(0)
                    if c["gender"] == "m":
                        col_gender.append(1)
            else:
                col_total.append(cand["totalExp"])
                col_furniture.append(cand["tag1"])
                col_home.append(cand["tag2"])
                col_rank.append(cand["rank"])
                col_task.append(2)
                col_type.append(int(r_type))
                col_uID.append(data_table.iloc[i][0])
                col_uGender.append(uGender)
                col_uAge.append(uAge)
                col_uIncome.append(uIncome)
                if cand["gender"] == "f":
                    col_gender.append(0)
                if cand["gender"] == "m":
                    col_gender.append(1)
    for i in range(0, len(data_table.index)):
        for cand in data_table.iloc[i][6]:
            r_type = data_table.iloc[i][1]
            userDemographics = get_user_gender_age_income(data_table.iloc[i][13])
            uGender = userDemographics[0]
            uAge = userDemographics[1]
            uIncome = userDemographics[2]
            if r_type >= 4:
                for c in cand:
                    col_total.append(c["totalExp"])
                    col_furniture.append(c["tag1"])
                    col_home.append(c["tag2"])
                    col_rank.append(c["rank"])
                    col_task.append(3)
                    col_type.append(int(r_type))
                    col_uID.append(data_table.iloc[i][0])
                    col_uGender.append(uGender)
                    col_uAge.append(uAge)
                    col_uIncome.append(uIncome)
                    if c["gender"] == "f":
                        col_gender.append(0)
                    if c["gender"] == "m":
                        col_gender.append(1)
            else:
                col_total.append(cand["totalExp"])
                col_furniture.append(cand["tag1"])
                col_home.append(cand["tag2"])
                col_rank.append(cand["rank"])
                col_task.append(3)
                col_type.append(int(r_type))
                col_uID.append(data_table.iloc[i][0])
                col_uGender.append(uGender)
                col_uAge.append(uAge)
                col_uIncome.append(uIncome)
                if cand["gender"] == "f":
                    col_gender.append(0)
                if cand["gender"] == "m":
                    col_gender.append(1)
    allArr = [col_total, col_furniture, col_home, col_rank, col_task, col_type, col_gender, col_uID, col_uGender, col_uAge, col_uIncome]
    selected = []
    for i in range(0,len(col_total)):
        cand = []
        for j in range(0,len(allArr)):
            cand.append(allArr[j][i])
        selected.append(cand)
    data = data = pd.DataFrame(np.array(selected), columns=cols)
    cols = ['total_xp','furniture_assembly_xp', 'home_repairs_xp', 'rank', 'task_nr', 'ranking_type', 'gender','uId', 'uGender', 'uAge', 'uIncome']
    data["total_xp"] = data["total_xp"].astype(float)
    data["furniture_assembly_xp"] = data["furniture_assembly_xp"].astype(float)
    data["home_repairs_xp"] = data["home_repairs_xp"].astype(float)
    data["rank"] = data["rank"].astype(int)
    data["task_nr"] = data["task_nr"].astype(int)
    data["ranking_type"] = data["ranking_type"].astype(int)
    data["gender"] = data["gender"].astype(int)
    return data

    
        
def get_selected_candidates_experience_gender_rank_task_rankingtype(data_table):
    #gender == 0 ->f
    #gender == 1 ->m
    cols = ['total_xp','furniture_assembly_xp', 'home_repairs_xp', 'rank', 'task_nr', 'ranking_type', 'gender','uId', 'uGender', 'uAge', 'uIncome']
    col_total = []
    col_furniture = []
    col_home = []
    col_rank = []
    col_task = []
    col_type = []
    col_gender = []
    col_uID = []
    col_uGender = []
    col_uAge = []
    col_uIncome = []
    for i in range(0, len(data_table.index)):
        userDemographics = get_user_gender_age_income(data_table.iloc[i][13])
        uGender = userDemographics[0]
        uAge = userDemographics[1]
        uIncome = userDemographics[2]
        for cand in data_table.iloc[i][10]:
            col_total.append(cand["totalExp"])
            col_furniture.append(cand["tag1"])
            col_home.append(cand["tag2"])
            col_rank.append(cand["rank"])
            col_task.append(1)
            r_type = data_table.iloc[i][1]
            col_type.append(int(r_type))
            col_uID.append(data_table.iloc[i][0])
            col_uGender.append(uGender)
            col_uAge.append(uAge)
            col_uIncome.append(uIncome)
            if cand["gender"] == "f":
                col_gender.append(0)
            if cand["gender"] == "m":
                col_gender.append(1)
    for i in range(0, len(data_table.index)):
        userDemographics = get_user_gender_age_income(data_table.iloc[i][13])
        uGender = userDemographics[0]
        uAge = userDemographics[1]
        uIncome = userDemographics[2]
        for cand in data_table.iloc[i][11]:
            col_total.append(cand["totalExp"])
            col_furniture.append(cand["tag1"])
            col_home.append(cand["tag2"])
            col_rank.append(cand["rank"])
            col_task.append(2)
            r_type = data_table.iloc[i][1]
            col_type.append(int(r_type))
            col_uID.append(data_table.iloc[i][0])
            col_uGender.append(uGender)
            col_uAge.append(uAge)
            col_uIncome.append(uIncome)
            if cand["gender"] == "f":
                col_gender.append(0)
            if cand["gender"] == "m":
                col_gender.append(1)
    for i in range(0, len(data_table.index)):
        userDemographics = get_user_gender_age_income(data_table.iloc[i][13])
        uGender = userDemographics[0]
        uAge = userDemographics[1]
        uIncome = userDemographics[2]
        for cand in data_table.iloc[i][12]:
            col_total.append(cand["totalExp"])
            col_furniture.append(cand["tag1"])
            col_home.append(cand["tag2"])
            col_rank.append(cand["rank"])
            col_task.append(3)
            r_type = data_table.iloc[i][1]
            col_type.append(int(r_type))
            col_uID.append(data_table.iloc[i][0])
            col_uGender.append(uGender)
            col_uAge.append(uAge)
            col_uIncome.append(uIncome)
            if cand["gender"] == "f":
                col_gender.append(0)
            if cand["gender"] == "m":
                col_gender.append(1)
    allArr = [col_total, col_furniture, col_home, col_rank, col_task, col_type, col_gender, col_uID, col_uGender, col_uAge, col_uIncome]
    selected = []
    for i in range(0,len(col_total)):
        cand = []
        for j in range(0,len(allArr)):
            cand.append(allArr[j][i])
        selected.append(cand)
    data = data = pd.DataFrame(np.array(selected), columns=cols)
    cols = ['total_xp','furniture_assembly_xp', 'home_repairs_xp', 'rank', 'task_nr', 'ranking_type', 'gender','uId', 'uGender', 'uAge', 'uIncome']
    data["total_xp"] = data["total_xp"].astype(float)
    data["furniture_assembly_xp"] = data["furniture_assembly_xp"].astype(float)
    data["home_repairs_xp"] = data["home_repairs_xp"].astype(float)
    data["rank"] = data["rank"].astype(int)
    data["task_nr"] = data["task_nr"].astype(int)
    data["ranking_type"] = data["ranking_type"].astype(int)
    data["gender"] = data["gender"].astype(int)
    return data
    
def get_probability_distribution_single_rankings_as_array(data="all", task="all", rankingTypes_ordered=[2,3], rankingTypes_not_ordered= [0,1]
                             , xp_used="total", xp_cutoff = 12, prefix="test", uGender="all", uIncome="all", uAge="all"):
    if data=="all":
        path = "./all_test_data"
    else:
        path = "./jsonFiles_valid_third_test"
    
    data_task_start = 4
    data_task_end = 7
    suffix1 = "allTasks"
    if task == "furniture assembly":
        task_nr = 1
        data_task_start = 4
        data_task_end = 5
        suffix1 = "TaskFurnitureAssembly"
    if task == "home repairs":
        task_nr = 2
        data_task_start = 5
        data_task_end = 6
        suffix1 = "TaskHomeRepairs"
    if task == "furniture delivery":
        task_nr = 3
        data_task_start = 6
        data_task_end = 7
        suffix1 = "TaskFurnitureDelivery"
    
    xp_used_field = "totalExp"
    xp_used_index = 0
    suffix2 = "totalExp"
    if xp_used == "furniture":
        xp_used_field = "tag1"
        xp_used_index = 1
        suffix2 = "FurnitureAssemblyExp"
    if xp_used == "home":
        xp_used_field = "tag2"
        xp_used_index = 2
        suffix2 = "HomeRepairsExp"
        
    
    #Ordered Rankings
    data = load_data_as_df(path)
    #data = data[(data['ranking_type'].isin(rankingTypes_ordered))]
    seen_data_ordered = get_ranked_candidates_experience_gender_rank_task_rankingtype(data)
    seen_data_ordered = seen_data_ordered[(seen_data_ordered['ranking_type'].isin(rankingTypes_ordered))]
    if task != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['task_nr']==task_nr)]
    if uGender != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['uGender'] == uGender)]
    if uIncome != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['uIncome'] == uIncome)]
    if uAge != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['uAge'] == uAge)]
    seen_candidates_per_rank = [0]*20
    for i in range(0,20):
        seen_candidates_per_rank[i] = []
    for i in range(0,len(seen_data_ordered.index)):
        rank = seen_data_ordered.iloc[i][3]
        total_xp = seen_data_ordered.iloc[i][xp_used_index]
        if seen_data_ordered.iloc[i][6] == 0:
            gender = 'f'
        else:
            gender = 'm'
        seen_candidates_per_rank[rank-1].append((total_xp,gender))

    
    #Random Rankings
    data = load_data_as_df(path)
    #data = data[(data['ranking_type'].isin(rankingTypes_not_ordered))]
    seen_data_unordered = get_ranked_candidates_experience_gender_rank_task_rankingtype(data)
    seen_data_unordered = seen_data_unordered[(seen_data_unordered['ranking_type'].isin(rankingTypes_not_ordered))]
    if task != "all":
        seen_data_unordered = seen_data_unordered[(seen_data_unordered['task_nr']==task_nr)]
    if uGender != "all":
        seen_data_unordered = seen_data_unordered[(seen_data_unordered['uGender'] == uGender)]
    if uIncome != "all":
        seen_data_unordered = seen_data_unordered[(seen_data_unordered['uIncome'] == uIncome)]
    if uAge != "all":
        seen_data_unordered = seen_data_unordered[(seen_data_unordered['uAge'] == uAge)]
    for i in range(0,len(seen_data_unordered.index)):
        rank = seen_data_unordered.iloc[i][3]
        total_xp = seen_data_unordered.iloc[i][xp_used_index]
        if seen_data_unordered.iloc[i][6] == 0:
            gender = 'f'
        else:
            gender = 'm'
        seen_candidates_per_rank[rank-1].append((total_xp,gender))
    #Side by Side Rankings
    
    data = load_data_as_df(path)
    plot_data = get_selected_candidates_experience_gender_rank_task_rankingtype(data)
    plot_data = plot_data[(plot_data['ranking_type'].isin(rankingTypes_ordered+rankingTypes_not_ordered))]
    if uGender != "all":
        plot_data = plot_data[(plot_data['uGender'] == uGender)]
    if task != "all":
        plot_data = plot_data[(plot_data['task_nr'] == task_nr)]
    if uIncome != "all":
        plot_data = plot_data[(plot_data['uIncome'] == uIncome)]
    if uAge != "all":
        plot_data = plot_data[(plot_data['uAge'] == uAge)]
    selected_candidates_per_rank = [0]*20
    #print("n = "+str(len(plot_data)))
    for i in range(0,20):
        selected_candidates_per_rank[i] = []
    for i in range(0,len(plot_data.index)):
        rank = plot_data.iloc[i][3]
        total_xp = plot_data.iloc[i][xp_used_index]
        if plot_data.iloc[i][6] == 0:
            gender = 'f'
        else:
            gender = 'm'
        selected_candidates_per_rank[rank-1].append((total_xp,gender))

    distribution_data = {}
    
    distribution_overall = [0]*20
    for i in range(0,len(selected_candidates_per_rank)):
        distribution_overall[i]=len(selected_candidates_per_rank[i])/max(1,len(seen_candidates_per_rank[i]))
    
    distribution_data["overall"] = {}
    distribution_data["overall"]["seen"] = seen_candidates_per_rank
    distribution_data["overall"]["selected"] = selected_candidates_per_rank
    distribution_data["overall"]["overall"] = distribution_overall
    
    seen_female_candidates_per_rank = [0]*20
    selected_female_candidates_per_rank = [0]*20
    female_distribution_overall = [0]*20
    for i in range(0,20):
        seen_female_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if cand[1] == 'f']
        selected_female_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if cand[1] == 'f']
    for i in range(0,len(selected_female_candidates_per_rank)):
        female_distribution_overall[i]=len(selected_female_candidates_per_rank[i])/max(1,len(seen_female_candidates_per_rank[i]))

    distribution_data["female"] = {}
    distribution_data["female"]["seen"] = seen_female_candidates_per_rank
    distribution_data["female"]["selected"] = selected_female_candidates_per_rank
    distribution_data["female"]["overall"] = female_distribution_overall    
    
    seen_male_candidates_per_rank = [0]*20
    selected_male_candidates_per_rank = [0]*20
    male_distribution_overall = [0]*20
    for i in range(0,20):
        seen_male_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if cand[1] == 'm']
        selected_male_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if cand[1] == 'm']
    for i in range(0,len(selected_male_candidates_per_rank)):
        male_distribution_overall[i]=len(selected_male_candidates_per_rank[i])/max(1,len(seen_male_candidates_per_rank[i]))   
    
    distribution_data["male"] = {}
    distribution_data["male"]["seen"] = seen_male_candidates_per_rank
    distribution_data["male"]["selected"] = selected_male_candidates_per_rank
    distribution_data["male"]["overall"] = male_distribution_overall  
    
    seen_female_low_xp_candidates_per_rank = [0]*20
    selected_female_low_xp_candidates_per_rank = [0]*20
    female_low_xp_distribution_overall = [0]*20
    for i in range(0,20):
        seen_female_low_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] <xp_cutoff)]
        selected_female_low_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] <xp_cutoff)]
    for i in range(0,len(selected_female_candidates_per_rank)):
        female_low_xp_distribution_overall[i]=len(selected_female_low_xp_candidates_per_rank[i])/max(1,len(seen_female_low_xp_candidates_per_rank[i]))
    
    distribution_data["female_low"] = {}
    distribution_data["female_low"]["seen"] = seen_female_low_xp_candidates_per_rank
    distribution_data["female_low"]["selected"] = selected_female_low_xp_candidates_per_rank
    distribution_data["female_low"]["overall"] = female_low_xp_distribution_overall  
    
    seen_male_low_xp_candidates_per_rank = [0]*20
    selected_male_low_xp_candidates_per_rank = [0]*20
    male_low_xp_distribution_overall = [0]*20
    for i in range(0,20):
        seen_male_low_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] <xp_cutoff)]
        selected_male_low_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] <xp_cutoff)]
    for i in range(0,len(selected_male_candidates_per_rank)):
        male_low_xp_distribution_overall[i]=len(selected_male_low_xp_candidates_per_rank[i])/max(1,len(seen_male_low_xp_candidates_per_rank[i]))
    
    distribution_data["male_low"] = {}
    distribution_data["male_low"]["seen"] = seen_male_low_xp_candidates_per_rank
    distribution_data["male_low"]["selected"] = selected_male_low_xp_candidates_per_rank
    distribution_data["male_low"]["overall"] = male_low_xp_distribution_overall  
    
    seen_female_high_xp_candidates_per_rank = [0]*20
    selected_female_high_xp_candidates_per_rank = [0]*20
    female_high_xp_distribution_overall = [0]*20
    for i in range(0,20):
        seen_female_high_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] >=xp_cutoff)]
        selected_female_high_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] >=xp_cutoff)]
    for i in range(0,len(selected_female_candidates_per_rank)):
        female_high_xp_distribution_overall[i]=len(selected_female_high_xp_candidates_per_rank[i])/max(1,len(seen_female_high_xp_candidates_per_rank[i]))

    distribution_data["female_high"] = {}
    distribution_data["female_high"]["seen"] = seen_female_high_xp_candidates_per_rank
    distribution_data["female_high"]["selected"] = selected_female_high_xp_candidates_per_rank
    distribution_data["female_high"]["overall"] = female_high_xp_distribution_overall  
        
    seen_male_high_xp_candidates_per_rank = [1]*20
    selected_male_high_xp_candidates_per_rank = [0]*20
    male_high_xp_distribution_overall = [0]*20
    for i in range(0,20):
        seen_male_high_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] >=xp_cutoff)]
        selected_male_high_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] >=xp_cutoff)]
    for i in range(0,len(selected_male_candidates_per_rank)):
        male_high_xp_distribution_overall[i]=len(selected_male_high_xp_candidates_per_rank[i])/max(1,len(seen_male_high_xp_candidates_per_rank[i]))

    distribution_data["male_high"] = {}
    distribution_data["male_high"]["seen"] = seen_male_high_xp_candidates_per_rank
    distribution_data["male_high"]["selected"] = selected_male_high_xp_candidates_per_rank
    distribution_data["male_high"]["overall"] = male_high_xp_distribution_overall
        
    return distribution_data

def get_probability_distribution_double_rankings_as_array(data="all", task="all", rankingTypes=[4,5,6,7]
                             , xp_used="total", xp_cutoff = 12, prefix="test", uGender="all", uIncome="all", uAge="all"):
    if data=="all":
        path = "./all_test_data"
    else:
        path = "./jsonFiles_valid_third_test"
    
    data_task_start = 4
    data_task_end = 7
    suffix1 = "allTasks"
    if task == "furniture assembly":
        task_nr = 1
        data_task_start = 4
        data_task_end = 5
        suffix1 = "TaskFurnitureAssembly"
    if task == "home repairs":
        task_nr = 2
        data_task_start = 5
        data_task_end = 6
        suffix1 = "TaskHomeRepairs"
    if task == "furniture delivery":
        task_nr = 3
        data_task_start = 6
        data_task_end = 7
        suffix1 = "TaskFurnitureDelivery"
    
    xp_used_field = "totalExp"
    xp_used_index = 0
    suffix2 = "totalExp"
    if xp_used == "furniture":
        xp_used_field = "tag1"
        xp_used_index = 1
        suffix2 = "FurnitureAssemblyExp"
    if xp_used == "home":
        xp_used_field = "tag2"
        xp_used_index = 2
        suffix2 = "HomeRepairsExp"
        
    data = load_data_as_df(path)
    seen_data_ordered = get_ranked_candidates_experience_gender_rank_task_rankingtype(data)
    seen_data_ordered = seen_data_ordered[(seen_data_ordered['ranking_type'].isin(rankingTypes))]
    if task != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['task_nr']==task_nr)]
    if uGender != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['uGender'] == uGender)]
    if uIncome != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['uIncome'] == uIncome)]
    if uAge != "all":
        seen_data_ordered = seen_data_ordered[(seen_data_ordered['uAge'] == uAge)]
    seen_candidates_per_rank = [0]*10
    for i in range(0,10):
        seen_candidates_per_rank[i] = []
    for i in range(0,len(seen_data_ordered.index)):
        rank = seen_data_ordered.iloc[i][3]
        total_xp = seen_data_ordered.iloc[i][xp_used_index]
        if seen_data_ordered.iloc[i][6] == 0:
            gender = 'f'
        else:
            gender = 'm'
        seen_candidates_per_rank[rank-1].append((total_xp,gender))
    
    data = load_data_as_df(path)
    plot_data = get_selected_candidates_experience_gender_rank_task_rankingtype(data)
    plot_data = plot_data[(plot_data['ranking_type'].isin(rankingTypes))]
    if uGender != "all":
        plot_data = plot_data[(plot_data['uGender'] == uGender)]
    if task != "all":
        plot_data = plot_data[(plot_data['task_nr'] == task_nr)]
    if uIncome != "all":
        plot_data = plot_data[(plot_data['uIncome'] == uIncome)]
    if uAge != "all":
        plot_data = plot_data[(plot_data['uAge'] == uAge)]
    selected_candidates_per_rank = [0]*10
    #print("n = "+str(len(plot_data)))
    for i in range(0,10):
        selected_candidates_per_rank[i] = []
    for i in range(0,len(plot_data.index)):
        rank = plot_data.iloc[i][3]
        total_xp = plot_data.iloc[i][xp_used_index]
        if plot_data.iloc[i][6] == 0:
            gender = 'f'
        else:
            gender = 'm'
        selected_candidates_per_rank[rank-1].append((total_xp,gender))

    distribution_data = {}
    
    distribution_overall = [0]*10
    for i in range(0,len(selected_candidates_per_rank)):
        distribution_overall[i]=len(selected_candidates_per_rank[i])/max(1,len(seen_candidates_per_rank[i]))
    
    distribution_data["overall"] = {}
    distribution_data["overall"]["seen"] = seen_candidates_per_rank
    distribution_data["overall"]["selected"] = selected_candidates_per_rank
    distribution_data["overall"]["overall"] = distribution_overall
    
    seen_female_candidates_per_rank = [0]*10
    selected_female_candidates_per_rank = [0]*10
    female_distribution_overall = [0]*10
    for i in range(0,10):
        seen_female_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if cand[1] == 'f']
        selected_female_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if cand[1] == 'f']
    for i in range(0,len(selected_female_candidates_per_rank)):
        female_distribution_overall[i]=len(selected_female_candidates_per_rank[i])/max(1,len(seen_female_candidates_per_rank[i]))

    distribution_data["female"] = {}
    distribution_data["female"]["seen"] = seen_female_candidates_per_rank
    distribution_data["female"]["selected"] = selected_female_candidates_per_rank
    distribution_data["female"]["overall"] = female_distribution_overall    
    
    seen_male_candidates_per_rank = [0]*10
    selected_male_candidates_per_rank = [0]*10
    male_distribution_overall = [0]*10
    for i in range(0,10):
        seen_male_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if cand[1] == 'm']
        selected_male_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if cand[1] == 'm']
    for i in range(0,len(selected_male_candidates_per_rank)):
        male_distribution_overall[i]=len(selected_male_candidates_per_rank[i])/max(1,len(seen_male_candidates_per_rank[i]))   
    
    distribution_data["male"] = {}
    distribution_data["male"]["seen"] = seen_male_candidates_per_rank
    distribution_data["male"]["selected"] = selected_male_candidates_per_rank
    distribution_data["male"]["overall"] = male_distribution_overall  
    
    seen_female_low_xp_candidates_per_rank = [0]*10
    selected_female_low_xp_candidates_per_rank = [0]*10
    female_low_xp_distribution_overall = [0]*10
    for i in range(0,10):
        seen_female_low_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] <xp_cutoff)]
        selected_female_low_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] <xp_cutoff)]
    for i in range(0,len(selected_female_candidates_per_rank)):
        female_low_xp_distribution_overall[i]=len(selected_female_low_xp_candidates_per_rank[i])/max(1,len(seen_female_low_xp_candidates_per_rank[i]))
    
    distribution_data["female_low"] = {}
    distribution_data["female_low"]["seen"] = seen_female_low_xp_candidates_per_rank
    distribution_data["female_low"]["selected"] = selected_female_low_xp_candidates_per_rank
    distribution_data["female_low"]["overall"] = female_low_xp_distribution_overall  
    
    seen_male_low_xp_candidates_per_rank = [0]*10
    selected_male_low_xp_candidates_per_rank = [0]*10
    male_low_xp_distribution_overall = [0]*10
    for i in range(0,10):
        seen_male_low_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] <xp_cutoff)]
        selected_male_low_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] <xp_cutoff)]
    for i in range(0,len(selected_male_candidates_per_rank)):
        male_low_xp_distribution_overall[i]=len(selected_male_low_xp_candidates_per_rank[i])/max(1,len(seen_male_low_xp_candidates_per_rank[i]))
    
    distribution_data["male_low"] = {}
    distribution_data["male_low"]["seen"] = seen_male_low_xp_candidates_per_rank
    distribution_data["male_low"]["selected"] = selected_male_low_xp_candidates_per_rank
    distribution_data["male_low"]["overall"] = male_low_xp_distribution_overall  
    
    seen_female_high_xp_candidates_per_rank = [0]*10
    selected_female_high_xp_candidates_per_rank = [0]*10
    female_high_xp_distribution_overall = [0]*10
    for i in range(0,10):
        seen_female_high_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] >=xp_cutoff)]
        selected_female_high_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'f' and cand[0] >=xp_cutoff)]
    for i in range(0,len(selected_female_candidates_per_rank)):
        female_high_xp_distribution_overall[i]=len(selected_female_high_xp_candidates_per_rank[i])/max(1,len(seen_female_high_xp_candidates_per_rank[i]))

    distribution_data["female_high"] = {}
    distribution_data["female_high"]["seen"] = seen_female_high_xp_candidates_per_rank
    distribution_data["female_high"]["selected"] = selected_female_high_xp_candidates_per_rank
    distribution_data["female_high"]["overall"] = female_high_xp_distribution_overall  
        
    seen_male_high_xp_candidates_per_rank = [0]*10
    selected_male_high_xp_candidates_per_rank = [0]*10
    male_high_xp_distribution_overall = [0]*10
    for i in range(0,10):
        seen_male_high_xp_candidates_per_rank[i] = [cand for cand in seen_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] >=xp_cutoff)]
        selected_male_high_xp_candidates_per_rank[i] = [cand for cand in selected_candidates_per_rank[i] if (cand[1] == 'm' and cand[0] >=xp_cutoff)]
    for i in range(0,len(selected_male_candidates_per_rank)):
        male_high_xp_distribution_overall[i]=len(selected_male_high_xp_candidates_per_rank[i])/max(1,len(seen_male_high_xp_candidates_per_rank[i]))

    distribution_data["male_high"] = {}
    distribution_data["male_high"]["seen"] = seen_male_high_xp_candidates_per_rank
    distribution_data["male_high"]["selected"] = selected_male_high_xp_candidates_per_rank
    distribution_data["male_high"]["overall"] = male_high_xp_distribution_overall
        
    return distribution_data

def plot_probs_single_ranking(tag, data="all", task="all", rankingTypes_ordered=[2,3], rankingTypes_not_ordered= [0,1]
                             , xp_used="total", xp_cutoff = 12, prefix="test", uGender="all", uIncome="all", uAge="all"):
    
    data = get_probability_distribution_single_rankings_as_array(data = data, task=task, rankingTypes_ordered = rankingTypes_ordered,
                                                                rankingTypes_not_ordered = rankingTypes_not_ordered,
                                                                xp_used = xp_used, xp_cutoff= xp_cutoff, prefix = prefix,
                                                                uGender=uGender,uIncome = uIncome, uAge= uAge)
    seen_male_candidates_rank_1_to_3 = 0
    seen_male_candidates_rank_4_to_6 = 0
    seen_male_candidates_rank_7_to_9 = 0
    seen_male_candidates_rank_10_to_12 = 0
    seen_male_candidates_rank_13_to_15 = 0
    seen_male_candidates_rank_16_to_20 = 0
    
    seen_female_candidates_rank_1_to_3 = 0
    seen_female_candidates_rank_4_to_6 = 0
    seen_female_candidates_rank_7_to_9 = 0
    seen_female_candidates_rank_10_to_12 = 0
    seen_female_candidates_rank_13_to_15 = 0
    seen_female_candidates_rank_16_to_20 = 0
    
    selected_male_candidates_rank_1_to_3 = 0
    selected_male_candidates_rank_4_to_6 = 0
    selected_male_candidates_rank_7_to_9 = 0
    selected_male_candidates_rank_10_to_12 = 0
    selected_male_candidates_rank_13_to_15 = 0
    selected_male_candidates_rank_16_to_20 = 0
    
    selected_female_candidates_rank_1_to_3 = 0
    selected_female_candidates_rank_4_to_6 = 0
    selected_female_candidates_rank_7_to_9 = 0
    selected_female_candidates_rank_10_to_12 = 0
    selected_female_candidates_rank_13_to_15 = 0
    selected_female_candidates_rank_16_to_20 = 0
    log_1_to_3 = 0
    log_4_to_6 = 0
    log_7_to_9 = 0
    log_10_to_12 = 0
    log_13_to_15 = 0
    log_16_to_20 = 0
    overall_seen_1_to_3 = 0
    overall_seen_4_to_6 = 0
    overall_seen_7_to_9 = 0
    overall_seen_10_to_12 = 0
    overall_seen_13_to_15 = 0
    overall_seen_16_to_20 = 0
    for i in range(0,3):
        selected_male_candidates_rank_1_to_3 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_1_to_3 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_1_to_3 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_1_to_3 += len(data["female"+tag]["seen"][i])
        #log_1_to_3 += (1.0/math.log(i+2,2))
        overall_seen_1_to_3 += len(data["overall"]["seen"][i])
        log_1_to_3 += data["overall"]["overall"][i]
    for i in range(3,6):
        overall_seen_4_to_6 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_4_to_6 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_4_to_6 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_4_to_6 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_4_to_6 += len(data["female"+tag]["seen"][i])
        log_4_to_6 += data["overall"]["overall"][i]
    for i in range(6,9):
        overall_seen_7_to_9 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_7_to_9 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_7_to_9 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_7_to_9 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_7_to_9 += len(data["female"+tag]["seen"][i])
        log_7_to_9 += data["overall"]["overall"][i]
    for i in range(9,12):
        overall_seen_10_to_12 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_10_to_12 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_10_to_12 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_10_to_12 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_10_to_12 += len(data["female"+tag]["seen"][i])
        log_10_to_12 += data["overall"]["overall"][i]
    for i in range(12,15):
        overall_seen_13_to_15 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_13_to_15 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_13_to_15 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_13_to_15 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_13_to_15 += len(data["female"+tag]["seen"][i])
        log_13_to_15 += data["overall"]["overall"][i]
    for i in range(15,20):
        overall_seen_16_to_20 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_16_to_20 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_16_to_20 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_16_to_20 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_16_to_20 += len(data["female"+tag]["seen"][i])
        log_16_to_20 += data["overall"]["overall"][i]

    m1 = selected_male_candidates_rank_1_to_3/max(1,seen_male_candidates_rank_1_to_3)
    e_m1 = math.sqrt(m1 * (1-m1) / max(1,seen_male_candidates_rank_1_to_3))

    m2 = selected_male_candidates_rank_4_to_6/max(1,seen_male_candidates_rank_4_to_6)
    e_m2 = math.sqrt(m2 * (1-m2) /max(1,seen_male_candidates_rank_4_to_6))

    m3 = selected_male_candidates_rank_7_to_9/ max(1,seen_male_candidates_rank_7_to_9)
    e_m3 = math.sqrt(m3 * (1-m3)/max(1,seen_male_candidates_rank_7_to_9))
    
    m4 = selected_male_candidates_rank_10_to_12/ max(1,seen_male_candidates_rank_10_to_12)
    e_m4 = math.sqrt(m4 * (1-m4)/max(1,seen_male_candidates_rank_10_to_12))
    
    m5 = selected_male_candidates_rank_13_to_15/ max(1,seen_male_candidates_rank_13_to_15)
    e_m5 = math.sqrt(m5 * (1-m5)/max(1,seen_male_candidates_rank_13_to_15))
    
    m6 = selected_male_candidates_rank_16_to_20/ max(1,seen_male_candidates_rank_16_to_20)
    e_m6 = math.sqrt(m6 * (1-m6)/max(1,seen_male_candidates_rank_16_to_20))

    f1 = selected_female_candidates_rank_1_to_3/max(1,seen_female_candidates_rank_1_to_3)
    e_f1 = math.sqrt(f1 * (1-f1)/max(1,seen_female_candidates_rank_1_to_3))

    f2 = selected_female_candidates_rank_4_to_6/max(1,seen_female_candidates_rank_4_to_6)
    e_f2 = math.sqrt(f2 * (1-f2) /max(1,seen_female_candidates_rank_4_to_6))

    f3 = selected_female_candidates_rank_7_to_9/max(1,seen_female_candidates_rank_7_to_9)
    e_f3 = math.sqrt(f3 * (1-f3)/max(1,seen_female_candidates_rank_7_to_9))
    
    f4 = selected_female_candidates_rank_10_to_12/ max(1,seen_female_candidates_rank_10_to_12)
    e_f4 = math.sqrt(f4 * (1-f4)/max(1,seen_female_candidates_rank_10_to_12))
    
    f5 = selected_female_candidates_rank_13_to_15/ max(1,seen_female_candidates_rank_13_to_15)
    e_f5 = math.sqrt(f5 * (1-f5)/max(1,seen_female_candidates_rank_13_to_15))
    
    f6 = selected_female_candidates_rank_16_to_20/ max(1,seen_female_candidates_rank_16_to_20)
    e_f6 = math.sqrt(f6 * (1-f6)/max(1,seen_female_candidates_rank_16_to_20))

    l1 = log_1_to_3/3
    l2 = log_4_to_6/3
    l3 = log_7_to_9/3
    l4 = log_10_to_12/3
    l5 = log_13_to_15/3
    l6 = log_16_to_20/4
    
    e_l1 = math.sqrt(l1 * (1-l1)/overall_seen_1_to_3)
    e_l2 = math.sqrt(l2 * (1-l2)/overall_seen_4_to_6)
    e_l3 = math.sqrt(l3 * (1-l3)/overall_seen_7_to_9)
    e_l4 = math.sqrt(l4 * (1-l4)/overall_seen_10_to_12)
    e_l5 = math.sqrt(l5 * (1-l5)/overall_seen_13_to_15)
    e_l6 = math.sqrt(l6 * (1-l6)/overall_seen_16_to_20)
    x = np.array(["1-3", "4-6","7-9","10-12","13-15", "16-20"])
    y1 = np.array([m1,m2, m3,m4,m5,m6])
    y2 = np.array([f1,f2, f3,f4,f5,f6])
    y3 = np.array([l1,l2,l3,l4,l5,l6])
    e1 = np.array([e_m1,e_m2, e_m3,e_m4,e_m5,e_m6])
    e2 = np.array([e_f1,e_f2, e_f3, e_f4, e_f5, e_f6])
    e3 = np.array([e_l1,e_l2,e_l3, e_l4, e_l5, e_l6])
    #print("maleprobs: "+str(y1))
    #print("femaleprobs: "+str(y2))
    #print("totalprobs: "+str(y3))
    #print("maleerror: "+str(e1))
    #print("femaleError: "+str(e2))
    #print("totalerror:"+str(e3))
    
    #print("male seen on rank 1-3 "+str(seen_male_candidates_rank_1_to_3))
    #print("female seen on rank 1-3 "+str(seen_female_candidates_rank_1_to_3))
    #print("male seen on rank 4-6 "+str(seen_male_candidates_rank_4_to_6))
    #print("female seen on rank 4-6 "+str(seen_female_candidates_rank_4_to_6))
    #print("male seen on rank 7-9 "+str(seen_male_candidates_rank_7_to_9))
    #print("female seen on rank 7-9 "+str(seen_female_candidates_rank_7_to_9))
    #print("male seen on rank 10-12 "+str(seen_male_candidates_rank_10_to_12))
    #print("female seen on rank 10-12 "+str(seen_female_candidates_rank_10_to_12))
    #print("male seen on rank 13-15 "+str(seen_male_candidates_rank_13_to_15))
    #print("female seen on rank 13-15 "+str(seen_female_candidates_rank_13_to_15))
    #print("male seen on rank 16-20 "+str(seen_male_candidates_rank_16_to_20))
    #print("female seen on rank 16-20 "+str(seen_female_candidates_rank_16_to_20))
    #fig = plt.figure(figsize=(15, 10))
    #plt.errorbar(x, y1, e1, marker='o', markersize = 10, capsize = 10)
    #plt.errorbar(x, y2, e2, marker='x',color = "red", markersize = 10, capsize = 10)
    #plt.errorbar(x, y3,e3, marker='^', markersize = 10, capsize = 10)
    data_m = [y1,e1]
    data_f = [y2,e2]
    
    male_n = [seen_male_candidates_rank_1_to_3,seen_male_candidates_rank_4_to_6,seen_male_candidates_rank_7_to_9,
             seen_male_candidates_rank_10_to_12,seen_male_candidates_rank_13_to_15,seen_male_candidates_rank_16_to_20]
    female_n = [seen_female_candidates_rank_1_to_3,seen_female_candidates_rank_4_to_6,seen_female_candidates_rank_7_to_9,
             seen_female_candidates_rank_10_to_12,seen_female_candidates_rank_13_to_15,seen_female_candidates_rank_16_to_20]
    #plt.yticks(np.arange(0, 1, 0.05))
    #plt.xlabel('Rank'),
    #plt.ylabel('P("Selected" | "Gender")')
    #plt.title(''+tag)
    #plt.show()
    return data_m, male_n, data_f,female_n



def plot_probs_double_ranking(tag, data="all", task="all", rankingTypes=[4,5,6,7]
                             , xp_used="total", xp_cutoff = 12, prefix="test", uGender="all", uIncome="all", uAge="all"):
    
    data = get_probability_distribution_double_rankings_as_array(xp_cutoff = xp_cutoff, task = task, rankingTypes = rankingTypes,
                                                                xp_used=xp_used, prefix = prefix, uGender = uGender, uIncome = uIncome,
                                                                uAge = uAge)
    seen_male_candidates_rank_1_to_3 = 0
    seen_male_candidates_rank_4_to_6 = 0
    seen_male_candidates_rank_7_to_10 = 0
    seen_female_candidates_rank_1_to_3 = 0
    seen_female_candidates_rank_4_to_6 = 0
    seen_female_candidates_rank_7_to_10 = 0

    selected_male_candidates_rank_1_to_3 = 0
    selected_male_candidates_rank_4_to_6 = 0
    selected_male_candidates_rank_7_to_10 = 0
    selected_female_candidates_rank_1_to_3 = 0
    selected_female_candidates_rank_4_to_6 = 0
    selected_female_candidates_rank_7_to_10 = 0
   
    overall_seen_1_to_3 = 0
    overall_seen_4_to_6 = 0
    overall_seen_7_to_10 = 0
    for i in range(0,3):
        selected_male_candidates_rank_1_to_3 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_1_to_3 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_1_to_3 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_1_to_3 += len(data["female"+tag]["seen"][i])
        overall_seen_1_to_3 += len(data["overall"]["seen"][i])
    for i in range(3,6):
        overall_seen_4_to_6 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_4_to_6 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_4_to_6 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_4_to_6 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_4_to_6 += len(data["female"+tag]["seen"][i])
    for i in range(6,10):
        overall_seen_7_to_10 += len(data["overall"]["seen"][i])
        selected_male_candidates_rank_7_to_10 += len(data["male"+tag]["selected"][i])
        selected_female_candidates_rank_7_to_10 += len(data["female"+tag]["selected"][i])
        seen_male_candidates_rank_7_to_10 += len(data["male"+tag]["seen"][i])
        seen_female_candidates_rank_7_to_10 += len(data["female"+tag]["seen"][i])

    m1 = selected_male_candidates_rank_1_to_3/max(1,seen_male_candidates_rank_1_to_3)
    e_m1 = math.sqrt(m1 * (1-m1) / max(1,seen_male_candidates_rank_1_to_3))

    m2 = selected_male_candidates_rank_4_to_6/max(1,seen_male_candidates_rank_4_to_6)
    e_m2 = math.sqrt(m2 * (1-m2) /max(1,seen_male_candidates_rank_4_to_6))

    m3 = selected_male_candidates_rank_7_to_10/ max(1,seen_male_candidates_rank_7_to_10)
    e_m3 = math.sqrt(m3 * (1-m3)/max(1,seen_male_candidates_rank_7_to_10))

    f1 = selected_female_candidates_rank_1_to_3/max(1,seen_female_candidates_rank_1_to_3)
    e_f1 = math.sqrt(f1 * (1-f1)/max(1,seen_female_candidates_rank_1_to_3))

    f2 = selected_female_candidates_rank_4_to_6/max(1,seen_female_candidates_rank_4_to_6)
    e_f2 = math.sqrt(f2 * (1-f2) /max(1,seen_female_candidates_rank_4_to_6))

    f3 = selected_female_candidates_rank_7_to_10/max(1,seen_female_candidates_rank_7_to_10)
    e_f3 = math.sqrt(f3 * (1-f3)/max(1,seen_female_candidates_rank_7_to_10))

   
    x = np.array(["1-3", "4-6", "7-10"])
    y1 = np.array([m1,m2, m3])
    y2 = np.array([f1,f2, f3])
    e1 = np.array([e_m1,e_m2, e_m3])
    e2 = np.array([e_f1,e_f2, e_f3])
    
    print(y1)
    print(y2)
    #print(e1)
    #print(e2)
    print("male seen on rank 1-3 "+str(seen_male_candidates_rank_1_to_3))
    print("female seen on rank 1-3 "+str(seen_female_candidates_rank_1_to_3))
    print("male seen on rank 4-6 "+str(seen_male_candidates_rank_4_to_6))
    print("female seen on rank 4-6 "+str(seen_female_candidates_rank_4_to_6))
    print("male seen on rank 7-10 "+str(seen_male_candidates_rank_7_to_10))
    print("female seen on rank 7-10 "+str(seen_female_candidates_rank_7_to_10))
    fig = plt.figure(figsize=(15, 10))
    plt.errorbar(x, y1, e1, marker='o', markersize = 10, capsize = 10)
    plt.errorbar(x, y2, e2, marker='x',color = "red", markersize = 10, capsize = 10)
    #plt.errorbar(x, y3,e3, marker='^', markersize = 10, capsize = 10)
    data_m = [y1,e1]
    data_f = [y2,e2]
    
    male_n = [seen_male_candidates_rank_1_to_3,seen_male_candidates_rank_4_to_6,seen_male_candidates_rank_7_to_10]
    female_n = [seen_female_candidates_rank_1_to_3,seen_female_candidates_rank_4_to_6,seen_female_candidates_rank_7_to_10]
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xlabel('Rank'),
    plt.ylabel('P("Selected" | "Gender")')
    plt.title(''+tag)
    plt.show()
    return data_m,male_n, data_f, female_n

In [2]:
#"./all_test_data"
#"./jsonFiles_valid_third_test"
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
import pdb
from sklearn.metrics import *
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import itertools
import csv

def build_learning_data_from(click_data):
    learning_data = click_data.loc[:, feature_columns]
    
    #scaler = StandardScaler()
    #learning_data['rank'] = scaler.fit_transform(learning_data[['rank']])
    #learning_data['total_xp'] = scaler.fit_transform(learning_data[['total_xp']])
    #learning_data['furniture_assembly_xp'] = scaler.fit_transform(learning_data[['furniture_assembly_xp']])
    #learning_data['home_repairs_xp'] = scaler.fit_transform(learning_data[['home_repairs_xp']])
    #learning_data['isMale'] = scaler.fit_transform(learning_data[['isMale']])
    #learning_data['isFemale'] = scaler.fit_transform(learning_data[['isFemale']])
    return learning_data

def train_model(model, prediction_function, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    
    y_train_pred = prediction_function(model, X_train)
    print('train precision: ' + str(precision_score(y_train, y_train_pred)))
    print('train recall: ' + str(recall_score(y_train, y_train_pred)))
    print('train accuracy: ' + str(accuracy_score(y_train, y_train_pred)))
    y_test_pred = prediction_function(model, X_test)
    print('test precision: ' + str(precision_score(y_test, y_test_pred)))
    print('test recall: ' + str(recall_score(y_test, y_test_pred)))
    print('test accuracy: ' + str(accuracy_score(y_test, y_test_pred)))
    
    return model
def get_predicted_outcome(model, data):
    return np.argmax(model.predict_proba(data), axis=1).astype(np.float32)
def get_predicted_rank(model, data):
    return model.predict_proba(data)[:, 1]
def get_test_train_data(events_data):
    X = events_data.drop(columns=['action']).values.astype(np.float32)
    #print('overall input shape: ' + str(X.shape))

    y = events_data.loc[:, ['action']].values.astype(np.float32).ravel()
    #print('overall output shape: ' + str(y.shape))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    #print('training input shape: ' + str(X_train.shape))
    #print('training output shape: ' + str(y_train.shape))

    #print('testing input shape: ' + str(X_test.shape))
    #print('testing output shape: ' + str(y_test.shape))
    
    return [X_train, X_test, y_train, y_test]

In [13]:
#"./jsonFiles_valid_third_test"
#"./all_test_data"
data = load_data_as_df("./all_test_data")
#print(len(data["ranking_type"]))
cols = ['id','rank','total_xp','furniture_assembly_xp','home_repairs_xp','isMale','isFemale','isRandom','action']
data = data[(data["ranking_type"].isin([2,3]))]
#print(len(data["ranking_type"]))
print("FURNITURE ASSEMBLY")
rows = []
for i in range(0,len(data["ranking_type"])):
    seen = []
    for cand in data.iloc[i][4]:
        rank = cand["totalRank"]
        total_xp = cand["totalExp"]
        furniture_assembly_xp = cand["tag1"]
        home_repairs_xp = cand["tag2"]
        if cand["gender"] == "m":
            isMale = 1
            isFemale = 0
            const = 0
        else:
            isMale = 0
            isFemale = 1
        cand_id = cand["id"]
        action = 0
        seen.append([cand_id,rank,total_xp,furniture_assembly_xp,home_repairs_xp,isMale,isFemale,const,action])
    selected = data.iloc[i][10]
    for entry_sel in selected:
        for entry in seen:
            if entry[0] == entry_sel["id"]:
                entry[8] = 1
    for l in seen:
        rows.append(l)
df = pd.DataFrame(rows,columns=cols)
df=df.drop(columns=['id'])
#print(df)
#print(df)
df.to_csv('furnitureassembly_ordered.csv', index=False)
X_train, X_test, y_train, y_test = get_test_train_data(df)
model = train_model(LogisticRegression(solver = 'liblinear'), get_predicted_outcome, X_train, y_train, X_test, y_test)
print(pd.DataFrame({"Feature":df.drop(columns=['action']).columns.tolist(),"Coefficients":model.coef_[0]}))

FURNITURE ASSEMBLY
train precision: 0.6521739130434783
train recall: 0.5454545454545454
train accuracy: 0.8828571428571429
test precision: 0.5294117647058824
test recall: 0.45
test accuracy: 0.8733333333333333
                 Feature  Coefficients
0                   rank     -0.011593
1               total_xp      0.175355
2  furniture_assembly_xp      0.339952
3        home_repairs_xp     -0.164598
4                 isMale     -1.476077
5               isFemale     -1.793172
6               isRandom      0.000000


In [8]:
#"./jsonFiles_valid_third_test"
#"./all_test_data"
data = load_data_as_df("./jsonFiles_valid_third_test")
#print(len(data["ranking_type"]))
cols = ['id','rank','total_xp','furniture_assembly_xp','home_repairs_xp','isMale','isFemale','isHome','action']
data = data[(data["ranking_type"].isin([0,1]))]
#print(len(data["ranking_type"]))
print("HOME REPAIRS")
rows = []
for i in range(0,len(data["ranking_type"])):
    seen = []
    for cand in data.iloc[i][5]:
        rank = cand["totalRank"]
        total_xp = cand["totalExp"]
        furniture_assembly_xp = cand["tag1"]
        home_repairs_xp = cand["tag2"]
        const = 1
        if cand["gender"] == "m":
            isMale = 1
            isFemale = 0
        else:
            isMale = 0
            isFemale = 1
        cand_id = cand["id"]
        action = 0
        seen.append([cand_id,rank,total_xp,furniture_assembly_xp,home_repairs_xp,isMale,isFemale,const,action])
    selected = data.iloc[i][11]
    for entry_sel in selected:
        for entry in seen:
            if entry[0] == entry_sel["id"]:
                entry[8] = 1
    for l in seen:
        rows.append(l)
df = pd.DataFrame(rows,columns=cols)
df=df.drop(columns=['id'])
#df=df.drop(columns=['rank'])
#print(df)
df.to_csv('homerepairs.csv', index=False)
X_train, X_test, y_train, y_test = get_test_train_data(df)
model = train_model(LogisticRegression(solver = 'liblinear'), get_predicted_outcome, X_train, y_train, X_test, y_test)
print(pd.DataFrame({"Feature":df.drop(columns=['action']).columns.tolist(),"Coefficients":model.coef_[0]}))
print(model.coef_)

HOME REPAIRS
train precision: 0.0
train recall: 0.0
train accuracy: 0.8441558441558441
test precision: 0.0
test recall: 0.0
test accuracy: 0.8636363636363636
                 Feature  Coefficients
0                   rank     -0.055879
1               total_xp      0.151732
2  furniture_assembly_xp     -0.095746
3        home_repairs_xp      0.247476
4                 isMale     -0.496451
5               isFemale     -1.121294
6                 isHome     -1.617745
[[-0.05587859  0.15173208 -0.09574631  0.24747603 -0.49645125 -1.12129395
  -1.61774521]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
#        "id": 1,
#        "tag1": 7.5,
#        "tag2": 8.3,
#        "totalExp": 15.8,
#        "name": "Candidate 2",
#        "gender": "m",
#        "score": 9.9,
#        "score_rank": 3,
#        "xp_ranking": 5,
#        "totalRank": 2
x = math.exp(-1.692047 + 0.174284*15.8 -0.066743 * 7.5 + 0.241024 * 8.3 -0.635304 - 0.089535 * 2)
p = x/(1+x)
print(p)

0.8515983947104382


In [6]:
#"./jsonFiles_valid_third_test"
#"./all_test_data"
data = load_data_as_df("./jsonFiles_valid_third_test")
#print(len(data["ranking_type"]))
cols = ['id','rank','total_xp','furniture_assembly_xp','home_repairs_xp','isMale','isFemale','action']
data = data[(data["ranking_type"].isin([0,1]))]
#print(len(data["ranking_type"]))
print("FURNITURE DELIVERY")
rows = []
for i in range(0,len(data["ranking_type"])):
    seen = []
    for cand in data.iloc[i][6]:
        rank = cand["totalRank"]
        total_xp = cand["totalExp"]
        furniture_assembly_xp = cand["tag1"]
        home_repairs_xp = cand["tag2"]
        if cand["gender"] == "m":
            isMale = 1
            isFemale = 0
        else:
            isMale = 0
            isFemale = 1
        cand_id = cand["id"]
        action = 0
        seen.append([cand_id,rank,total_xp,furniture_assembly_xp,home_repairs_xp,isMale,isFemale,action])
    selected = data.iloc[i][12]
    for entry_sel in selected:
        for entry in seen:
            if entry[0] == entry_sel["id"]:
                entry[7] = 1
    for l in seen:
        rows.append(l)
df = pd.DataFrame(rows,columns=cols)
df=df.drop(columns=['id'])
df.to_csv('furnituredelivery.csv', index=False)
#print(df)
#max_accuracy = 0
#max_kpis = 0
#for i in range(0,100):
X_train, X_test, y_train, y_test = get_test_train_data(df)
model = train_model(LogisticRegression(solver = 'liblinear'), get_predicted_outcome, X_train, y_train, X_test, y_test)
print(pd.DataFrame({"Feature":df.drop(columns=['action']).columns.tolist(),"Coefficients":model.coef_[0]}))

FURNITURE DELIVERY
train precision: 0.6666666666666666
train recall: 0.18181818181818182
train accuracy: 0.8701298701298701
test precision: 0.8
test recall: 0.36363636363636365
test accuracy: 0.8787878787878788
                 Feature  Coefficients
0                   rank     -0.154085
1               total_xp      0.136800
2  furniture_assembly_xp      0.199084
3        home_repairs_xp     -0.062281
4                 isMale     -0.880932
5               isFemale     -1.244192
