In [3]:
import json
import os
import pickle
import random
import re
import urllib
import urllib.parse
from collections.abc import Iterable, Iterator
from itertools import product
from itertools import tee
from re import finditer
from typing import List, Dict, Tuple

from tqdm.auto import tqdm

### Utils

def g_path(*argv: str)->str:
    """Join path components into a single path.

    args:
        argv: path components, in order, to be combined
    returns:
        the combined path, exactly as os.path.join builds it"""
    components = argv
    joined = os.path.join(*components)
    return joined

def pairwise(iterable:Iterable)->Iterator:
    """Pair every item with its successor: s -> (s0,s1), (s1,s2), (s2, s3), ...

    Inputs with fewer than two items produce no pairs.
    """
    leading, trailing = tee(iterable, 2)
    # Move the second copy one step ahead; the default keeps an empty input
    # from raising StopIteration here.
    next(trailing, None)
    return zip(leading, trailing)

def camel_case_split(identifier):
    """Split a camelCase / PascalCase identifier into its words.

    A word ends where a lowercase letter is followed by an uppercase one, or
    where an uppercase letter is followed by an uppercase-then-lowercase pair
    (so a run of capitals stays together: "HTTPResponse" -> "HTTP", "Response").

    args:
        identifier: the string to split
    returns:
        list of the words in order; an empty string gives an empty list
    """
    word_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    words = []
    for match in finditer(word_pattern, identifier):
        words.append(match.group(0))
    return words


# Mappings

# Subtopic-selection strategy name -> numeric id (the id is what gets passed to
# get_file_name as `subtopic_strategy`).
subtopic_strategies = {
    "Greedy": 1,
    "Random": 2,
    "Reverse": 3,
    "GreedySmart": 4
}

# Titles that appear in the data files but are left out of this analysis.
EXCLUDED_TITLES = ("Sports", "Norepinephrine", "Research in lithium-ion batteries")

# `topics` maps each topic title to its key in vocab.json.
with open("../CHIIR21-SAL-Scaffolding/data/vocab.json") as vocab_file:
    vocab = json.load(vocab_file)
topics = {x['title']: k for k, x in vocab.items() if x['title'] not in EXCLUDED_TITLES}

with open("../CHIIR21-SAL-Scaffolding/data/topics.json") as topics_file:
    topics_raw = json.load(topics_file)
# topic title -> that topic's `terms` mapping, with the same titles excluded.
subtopic_terms = {x['title']: x['terms'] for x in topics_raw.values() if x['title'] not in EXCLUDED_TITLES}
# Rename the "Meta ethics" entry to its hyphenated spelling (the entry moves to
# the end of the "Ethics" mapping as a result).
subtopic_terms["Ethics"]["Meta-ethics"] = subtopic_terms["Ethics"].pop("Meta ethics")

# URL-quote every subtopic name.
subtopics = {k: [urllib.parse.quote(name) for name in terms.keys()] for k, terms in subtopic_terms.items()}  # Dict[topic_name, List[subtopic]] , Dict[str, List[str]]

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load this from a trusted source.
with open("../data/subtopic_l2_keywords.pkl", 'rb') as keywords_file:
    subtopics_keywords = pickle.load(keywords_file)  # Dict[subtopic, Set[keywords]], Dict[str, Set[str]]

methods = subtopic_strategies.keys()

# Parameter grid for the simulated users.
params = {"lambda": [0.1, 0.4, 0.8],
          "limit": [2.0, 6.0, 10.0],
          "threshold": [0.0, 1.0, 3.0, 5.0]
         }

# One simulated user per (limit, threshold, lambda) combination.
all_users = list(product(params['limit'], params['threshold'], params['lambda']))


In [4]:
# Root of the simulation output tree; a relative path, so it resolves against
# the current working directory.
HOME_PATH = "data/simulation_data"


def get_file_name(subtopic_strategy, limit, sn_threshold, sn_lambda,doc_lambda, topic, extension, run_id):
    """Build the path of one simulation output file under HOME_PATH.

    The four numeric settings are rendered with str() and stripped of their
    decimal point before being embedded (10.0 -> "100", 0.8 -> "08").

    args:
        subtopic_strategy: strategy id embedded as `sub{id}`
        limit: value embedded as `l{limit}` in the `sub...` segment
        sn_threshold: value embedded as `t{threshold}` in the `sn0` segment
        sn_lambda: value embedded as `l{lambda}` in the `sn0` segment
        doc_lambda: value embedded as `l{lambda}` in the `doc0` segment
        topic: topic name embedded in the file name
        extension: file extension, without the leading dot
        run_id: run number embedded as `run_{id}`
    returns:
        the path as a string; the file is not checked for existence
    """
    # Render every numeric setting as text and drop the ".".
    limit, sn_threshold, sn_lambda, doc_lambda = (
        str(setting).replace('.', '')
        for setting in (limit, sn_threshold, sn_lambda, doc_lambda)
    )

    # The same stem appears in the file name twice, on either side of the topic.
    partial_filename = f"run_{run_id}_sub{subtopic_strategy}-l{limit}_q13_ss1_se0_sn0-t{sn_threshold}l{sn_lambda}_doc0-l{doc_lambda}"
    return f"{HOME_PATH}/run_{run_id}_sub{subtopic_strategy}/vars-l{limit}/q13/ss1/se0/sn0/vars-t{sn_threshold}l{sn_lambda}/doc0/vars-l{doc_lambda}/output/{partial_filename}-{topic}-user-{partial_filename}.{extension}"


# Sanity check: build the path for one known configuration and show whether
# that file exists on disk.
a = get_file_name(
    subtopic_strategy=1,
    limit=10.0,
    sn_threshold=5.0,
    sn_lambda=0.8,
    doc_lambda=0.1,
    topic="ethics",
    extension="log",
    run_id=1,
)
os.path.isfile(a)

True

In [74]:
import numpy as np
from collections import defaultdict

# Number of repeated runs read per configuration (run ids 0 .. N_RUNS - 1).
N_RUNS = 10
# doc_lambda value passed to get_file_name for every log.
DOC_LAMBDA = 0.1
# Indices, counted from the end of a log, of the lines whose last token is
# parsed as an integer.
# NOTE(review): presumably the query / snippet / document totals of the log's
# closing summary - confirm against an actual log file.
QUERY_LINE, SNIPPET_LINE, DOC_LINE = -6, -5, -3


def _last_int(line):
    """Return the last whitespace-separated token of `line` as an int."""
    return int(line.strip().split()[-1])


def _read_log_counts(log_file):
    """Read one log file and return `(n_queries, n_snippets, n_docs)`.

    The file is opened once and closed before returning.

    raises:
        IndexError: the file has fewer than six lines, or a parsed line is blank
        ValueError: the last token of a parsed line is not an integer
    """
    with open(log_file) as log:
        lines = log.readlines()
    return (
        _last_int(lines[QUERY_LINE]),
        _last_int(lines[SNIPPET_LINE]),
        _last_int(lines[DOC_LINE]),
    )


# method name -> user id -> list with one mean (over runs) per topic
query_per_method = defaultdict(lambda: defaultdict(list))
docs_per_method = defaultdict(lambda: defaultdict(list))
snippets_per_method = defaultdict(lambda: defaultdict(list))

# user id -> list with one mean (over runs) per (method, topic) pair
query_per_user = defaultdict(list)
docs_per_user = defaultdict(list)
snippets_per_user = defaultdict(list)


for u_id in all_users:
    # A user is identified by its (limit, threshold, lambda) parameter tuple.
    limit, threshold, lam = u_id
    for method in methods:
        method_id = subtopic_strategies[method]
        for topic in topics:
            # Topic titles become lower-case, underscore-separated in file names.
            c_topic = topic.lower().replace(" ", "_")
            n_queries = []
            n_snippets = []
            n_docs = []
            for r in range(N_RUNS):
                log_file = get_file_name(method_id, limit, threshold, lam, DOC_LAMBDA, c_topic, "log", r)
                run_queries, run_snippets, run_docs = _read_log_counts(log_file)
                n_queries.append(run_queries)
                n_snippets.append(run_snippets)
                n_docs.append(run_docs)

            # Average over the runs once and record it in both groupings.
            mean_queries = np.mean(n_queries)
            mean_docs = np.mean(n_docs)
            mean_snippets = np.mean(n_snippets)

            query_per_method[method][u_id].append(mean_queries)
            docs_per_method[method][u_id].append(mean_docs)
            snippets_per_method[method][u_id].append(mean_snippets)

            query_per_user[u_id].append(mean_queries)
            docs_per_user[u_id].append(mean_docs)
            snippets_per_user[u_id].append(mean_snippets)