In [1]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
import os, json
import pandas as pd
import numpy as np
import seaborn as sns
import math

def get_ranking_representations_in_order(dict):
    """Return the rankings of one session, indexed by task instead of by presentation order.

    Args:
        dict: Parsed session log. Must contain ``"ranking"`` (a list of event
            dicts, each with a ``"category"`` key) and ``"treatmentOrder"``
            (a list where entry ``k`` is the task index of the ``k``-th ranking
            that appears in the log).

    Returns:
        A list with ``len(dict["treatmentOrder"])`` entries. Slot ``t`` holds the
        ``"ranking"`` payload of the ``rankingRepresentation`` event belonging to
        task ``t``. Slots that no entry of ``treatmentOrder`` points at keep the
        placeholder ``0``.

    Raises:
        KeyError: If a required key is missing.
        IndexError: If the log contains fewer ``rankingRepresentation`` events
            than ``treatmentOrder`` has entries, or if ``treatmentOrder``
            contains an index outside ``range(len(treatmentOrder))``.
    """
    ranking_data = dict["ranking"]
    task_order = dict["treatmentOrder"]
    # Rankings in the order in which they occur in the event log.
    rankings_per_task = [
        event["ranking"]
        for event in ranking_data
        if event["category"] == "rankingRepresentation"
    ]
    # Size the result from the treatment order rather than assuming three tasks.
    sorted_rankings_by_task = [0] * len(task_order)
    for position, task_index in enumerate(task_order):
        sorted_rankings_by_task[task_index] = rankings_per_task[position]
    return sorted_rankings_by_task

def get_task_actions_in_order(dict):
    """Return the logged events of each task, indexed by task instead of by presentation order.

    For every ``rankingRepresentation`` event in ``dict["ranking"]`` the events
    that follow it are collected until the first event whose category is
    ``"serverEvent"`` or ``"timeStamps"`` (that terminating event is not
    included). If no such event follows, everything up to the end of the log
    is collected.

    Args:
        dict: Parsed session log. Must contain ``"ranking"`` (a list of event
            dicts, each with a ``"category"`` key) and ``"treatmentOrder"``
            (a list where entry ``k`` is the task index of the ``k``-th ranking
            that appears in the log).

    Returns:
        A list with ``len(dict["treatmentOrder"])`` entries. Slot ``t`` holds the
        list of event dicts collected for task ``t``. Slots that no entry of
        ``treatmentOrder`` points at keep the placeholder ``0``.

    Raises:
        KeyError: If a required key is missing.
        IndexError: If the log contains fewer ``rankingRepresentation`` events
            than ``treatmentOrder`` has entries, or if ``treatmentOrder``
            contains an index outside ``range(len(treatmentOrder))``.
    """
    ranking_data = dict["ranking"]
    task_order = dict["treatmentOrder"]
    end_categories = ("serverEvent", "timeStamps")

    actions_per_task = []
    for start, event in enumerate(ranking_data):
        if event["category"] != "rankingRepresentation":
            continue
        actions = []
        # The outer scan always resumes at the next event (the old `i = j`
        # assignment never changed that), so each window is collected
        # independently of the others.
        for following in ranking_data[start + 1:]:
            if following["category"] in end_categories:
                break
            actions.append(following)
        actions_per_task.append(actions)

    # Size the result from the treatment order rather than assuming three tasks.
    sorted_actions_by_task = [0] * len(task_order)
    for position, task_index in enumerate(task_order):
        sorted_actions_by_task[task_index] = actions_per_task[position]
    return sorted_actions_by_task
        
def filter_for_choices(actions, ranking):
    """Reduce a task's event log to the candidates that are still selected at its end.

    The events are replayed in order: ``"select"`` adds the event's ``itemID``
    to the selection, while ``"unselect"`` and ``"fourthElementSelected"`` both
    remove it. All other event names are ignored.

    Args:
        actions: List of event dicts. Every event needs an ``"eventName"`` key;
            the three event types above also need ``"itemID"``.
        ranking: List of ranking entries. An entry is either a candidate dict
            with an ``"id"`` key, or a sequence whose first two elements are
            such candidate dicts.

    Returns:
        The matching candidate dicts, ordered by the time of selection.

    Raises:
        ValueError: If an ``"unselect"`` or ``"fourthElementSelected"`` event
            names an item that is not currently selected.
        KeyError: If an event or candidate lacks a required key.
    """
    selected_ids = []
    for action in actions:
        event_name = action["eventName"]
        if event_name == "select":
            selected_ids.append(action["itemID"])
        elif event_name in ("unselect", "fourthElementSelected"):
            selected_ids.remove(action["itemID"])

    choices = []
    for selected_id in selected_ids:
        for cand in ranking:
            try:
                if selected_id == cand["id"]:
                    choices.append(cand)
            except TypeError:
                # `cand` is a sequence, so string indexing failed: look at its
                # first two elements instead. Only TypeError is caught, so a
                # candidate dict without "id" reports that key instead of a
                # confusing KeyError(0).
                if selected_id == cand[0]["id"]:
                    choices.append(cand[0])
                if selected_id == cand[1]["id"]:
                    choices.append(cand[1])
    return choices

def get_survey_answers(dict):
    """Pick the answer records out of a session's survey log.

    Args:
        dict: Parsed session log with a ``"survey"`` list of entry dicts, each
            carrying a ``"category"`` key.

    Returns:
        The entries whose category equals ``"surveyQuestionAnswers"``, in their
        original order.
    """
    return [
        record
        for record in dict["survey"]
        if record["category"] == "surveyQuestionAnswers"
    ]

def load_data_as_df(path):
    """Load every JSON session log in a directory into one DataFrame.

    Each ``*.json`` file in ``path`` becomes one row. Other directory entries
    (sub-directories such as ``.ipynb_checkpoints``, or files with a different
    extension) are skipped. Files are read in sorted name order so the row
    order is reproducible.

    Args:
        path: Directory that contains the session log files.

    Returns:
        A DataFrame with the columns ``uID`` (file name without the 5-character
        ``.json`` suffix), ``ranking_type``, ``briefing``, ``task_order``,
        ``task_{1,2,3}_ranking``, ``task_{1,2,3}_actions``,
        ``task_{1,2,3}_choices`` and ``survey_answers``. The frame is empty
        (but has these columns) when no log file is found.

    Raises:
        KeyError, IndexError: If a log file lacks an expected key or contains
            fewer than three tasks.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    cols = ['uID', 'ranking_type','briefing','task_order','task_1_ranking', 'task_2_ranking', 'task_3_ranking',
               'task_1_actions','task_2_actions','task_3_actions','task_1_choices','task_2_choices',
            'task_3_choices','survey_answers']
    # Rows are collected in a plain list and converted once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    records = []
    # os.listdir returns entries in arbitrary order, hence the sort.
    for file in sorted(os.listdir(path)):
        full_filename = os.path.join(path, file)
        if not (file.lower().endswith(".json") and os.path.isfile(full_filename)):
            continue
        with open(full_filename, 'r') as fi:
            session = json.load(fi)

        task_rankings = get_ranking_representations_in_order(session)
        task_actions = get_task_actions_in_order(session)
        record = {
            'uID': file[:-5],  # strip ".json"
            'ranking_type': int(session["ranking"][0]["itemID"]),
            'briefing': session["briefing"],
            'task_order': session["treatmentOrder"],
            'survey_answers': get_survey_answers(session),
        }
        for task_no in (1, 2, 3):
            ranking = task_rankings[task_no - 1]
            actions = task_actions[task_no - 1]
            record['task_%d_ranking' % task_no] = ranking
            record['task_%d_actions' % task_no] = actions
            record['task_%d_choices' % task_no] = filter_for_choices(actions, ranking)
        records.append(record)
    return pd.DataFrame(records, columns=cols)


In [3]:
# Load the session logs stored under ./test and display the resulting frame.
# The result is only displayed here; it is not bound to a variable.
# Alternative input directories (kept for reference):
#"./jsonFiles_valid_third_test"
#"./all_test_data"
load_data_as_df("./test")

Unnamed: 0,uID,ranking_type,briefing,task_order,task_1_ranking,task_2_ranking,task_3_ranking,task_1_actions,task_2_actions,task_3_actions,task_1_choices,task_2_choices,task_3_choices,survey_answers
0,1601296000871A91588910,0,"[{'category': 'briefingEvent', 'eventName': 'd...","[1, 0, 2]","[{'id': 22, 'name': 'Jessica Davis', 'score': ...","[{'id': 7, 'name': 'Thomas Lewis', 'score': 0....","[{'id': 46, 'name': 'Katherine Moore', 'score'...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 22, 'name': 'Jessica Davis', 'score': ...","[{'id': 7, 'name': 'Thomas Lewis', 'score': 0....","[{'id': 46, 'name': 'Katherine Moore', 'score'...","[{'question': 1, 'answers': ['5 trust-reliable..."
1,1601296041613A16376502,1,"[{'category': 'briefingEvent', 'eventName': 'd...","[0, 2, 1]","[{'id': 33, 'name': 'Jessica Wright', 'score':...","[{'id': 14, 'name': 'Jacob Williams', 'score':...","[{'id': 55, 'name': 'Carol Smith', 'score': 0....","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 33, 'name': 'Jessica Wright', 'score':...","[{'id': 14, 'name': 'Jacob Williams', 'score':...","[{'id': 55, 'name': 'Carol Smith', 'score': 0....","[{'question': 1, 'answers': ['5 trust-reliable..."
2,1601296218695A47105452,2,"[{'category': 'briefingEvent', 'eventName': 'd...","[0, 2, 1]","[{'id': 21, 'name': 'Michael Baker', 'score': ...","[{'id': 1, 'name': 'Nicholas Martin', 'score':...","[{'id': 41, 'name': 'Larry Nelson', 'score': 0...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 21, 'name': 'Michael Baker', 'score': ...","[{'id': 1, 'name': 'Nicholas Martin', 'score':...","[{'id': 41, 'name': 'Larry Nelson', 'score': 0...","[{'question': 1, 'answers': ['5 trust-reliable..."
3,1601296263347A70687493,3,"[{'category': 'briefingEvent', 'eventName': 'd...","[1, 2, 0]","[{'id': 31, 'name': 'Sandra Harris', 'score': ...","[{'id': 11, 'name': 'Kathleen Cook', 'score': ...","[{'id': 51, 'name': 'Jennifer Adams', 'score':...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 31, 'name': 'Sandra Harris', 'score': ...","[{'id': 11, 'name': 'Kathleen Cook', 'score': ...","[{'id': 51, 'name': 'Jennifer Adams', 'score':...","[{'question': 1, 'answers': ['5 trust-reliable..."
4,1601296318814A94869356,4,"[{'category': 'briefingEvent', 'eventName': 'd...","[2, 0, 1]","[{'id': 21, 'name': 'Mark Collins', 'score': 0...","[{'id': 1, 'name': 'Charles Campbell', 'score'...","[{'id': 41, 'name': 'Joseph Taylor', 'score': ...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 21, 'name': 'Mark Collins', 'score': 0...","[{'id': 1, 'name': 'Charles Campbell', 'score'...","[{'id': 41, 'name': 'Joseph Taylor', 'score': ...","[{'question': 1, 'answers': ['5 trust-reliable..."
5,1601296531280A44305344,5,"[{'category': 'briefingEvent', 'eventName': 'd...","[2, 1, 0]","[{'id': 31, 'name': 'Linda Anderson', 'score':...","[{'id': 11, 'name': 'Barbara Wilson', 'score':...","[{'id': 51, 'name': 'Ruth Davis', 'score': 0.7...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 31, 'name': 'Linda Anderson', 'score':...","[{'id': 11, 'name': 'Barbara Wilson', 'score':...","[{'id': 51, 'name': 'Ruth Davis', 'score': 0.7...","[{'question': 1, 'answers': ['5 trust-reliable..."
6,1601296602808A13207918,0,"[{'category': 'briefingEvent', 'eventName': 'd...","[1, 2, 0]","[{'id': 24, 'name': 'Jennifer Cook', 'score': ...","[{'id': 7, 'name': 'Christopher Peterson', 'sc...","[{'id': 45, 'name': 'Matthew Wilson', 'score':...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 24, 'name': 'Jennifer Cook', 'score': ...","[{'id': 7, 'name': 'Christopher Peterson', 'sc...","[{'id': 45, 'name': 'Matthew Wilson', 'score':...","[{'question': 1, 'answers': ['5 trust-reliable..."
7,1601296654471A88569457,1,"[{'category': 'briefingEvent', 'eventName': 'd...","[0, 2, 1]","[{'id': 34, 'name': 'Joshua Evans', 'score': 0...","[{'id': 15, 'name': 'Donna Allen', 'score': 0....","[{'id': 54, 'name': 'Benjamin Phillips', 'scor...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 33, 'name': 'Deborah Wilson', 'score':...","[{'id': 13, 'name': 'Ashley Collins', 'score':...","[{'id': 54, 'name': 'Benjamin Phillips', 'scor...","[{'question': 1, 'answers': [], 'timeOfEvent':..."
8,1601296700384A52902867,2,"[{'category': 'briefingEvent', 'eventName': 'd...","[2, 0, 1]","[{'id': 21, 'name': 'Andrew Hill', 'score': 0....","[{'id': 1, 'name': 'Jeffrey King', 'score': 0....","[{'id': 41, 'name': 'Jonathan Hill', 'score': ...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 21, 'name': 'Andrew Hill', 'score': 0....","[{'id': 1, 'name': 'Jeffrey King', 'score': 0....","[{'id': 41, 'name': 'Jonathan Hill', 'score': ...","[{'question': 1, 'answers': ['4 trust-reliable..."
9,1601296743979A33838449,3,"[{'category': 'briefingEvent', 'eventName': 'd...","[2, 0, 1]","[{'id': 31, 'name': 'Sandra Johnson', 'score':...","[{'id': 11, 'name': 'Rebecca Lewis', 'score': ...","[{'id': 51, 'name': 'Deborah Collins', 'score'...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'category': 'mouseEvent', 'eventName': 'firs...","[{'id': 31, 'name': 'Sandra Johnson', 'score':...","[{'id': 11, 'name': 'Rebecca Lewis', 'score': ...","[{'id': 51, 'name': 'Deborah Collins', 'score'...","[{'question': 1, 'answers': ['1 trust-reliable..."


In [None]:
# Build one row per candidate shown in task 1 (label 1 = the participant ended
# up selecting that candidate), export the table, and fit a logistic regression
# on it.
# NOTE(review): `data` must already hold the frame returned by load_data_as_df;
# this cell does not load it itself - confirm which data directory is intended.
# Optional restriction to ranking types 2 and 3 (disabled in the original):
#data = data[(data["ranking_type"].isin([2,3]))]
print("FURNITURE ASSEMBLY")
cols = ['id','rank','total_xp','furniture_assembly_xp','home_repairs_xp','isMale','isFemale','isRandom','action']
rows = []
for i in range(0, len(data["ranking_type"])):
    participant = data.iloc[i]
    # Columns 4 and 10 of the loader's layout, addressed by name so the cell
    # does not rely on positional integer lookups in a label-indexed row.
    selected_ids = {choice["id"] for choice in participant["task_1_choices"]}
    for cand in participant["task_1_ranking"]:
        isMale = 1 if cand["gender"] == "m" else 0
        isFemale = 1 - isMale
        # Previously only assigned in the male branch, which left it unbound
        # when the first candidate was not male.
        # NOTE(review): it fills the 'isRandom' column with a constant 0 - confirm intent.
        const = 0
        action = 1 if cand["id"] in selected_ids else 0
        rows.append([cand["id"], cand["totalRank"], cand["totalExp"], cand["tag1"], cand["tag2"],
                     isMale, isFemale, const, action])
df = pd.DataFrame(rows,columns=cols)
# The candidate id was only needed to match selections; it is not a feature.
df=df.drop(columns=['id'])
df.to_csv('furnitureassembly_ordered.csv', index=False)
# get_test_train_data, train_model, get_predicted_outcome and LogisticRegression
# are not defined in this part of the notebook and must be available beforehand.
X_train, X_test, y_train, y_test = get_test_train_data(df)
model = train_model(LogisticRegression(solver = 'liblinear'), get_predicted_outcome, X_train, y_train, X_test, y_test)
print(pd.DataFrame({"Feature":df.drop(columns=['action']).columns.tolist(),"Coefficients":model.coef_[0]}))

In [None]:
import matplotlib.pyplot as plt
# Scatter every task-2 candidate's "rank" against its "tag3" value, restricted
# to sessions whose ranking_type is 4 or 5.
data = load_data_as_df('./final data')
x = []
y = []
data = data[(data["ranking_type"].isin([4,5]))]
# Column 5 of the loader's layout, addressed by name so the cell does not rely
# on positional integer lookups in a label-indexed row.
for task_2_ranking in data["task_2_ranking"]:
    for cand in task_2_ranking:
        x.append(cand["rank"])
        y.append(cand["tag3"])
plt.scatter(x, y)
plt.xlabel("rank")
plt.ylabel("tag3")
plt.show()