Creates a simpler, query-log style version of the logs: a list in which each element is a dictionary like this:
```
{
    'pre_test_score': 0,  # Score in the pre-test
    'post_test_score': 8, # score in the post-test
    'RPL': 0.4,  # Realized potential learning (ratio ALG / MLG)
    'ALG': 0.8, # Absolute learning gain
    'topic_id': '4', # ID of the topic
    'topic_title': 'Ethics', # Title of the topic
    'session_duration': 3486, # Time in seconds
    'clicks': [ # List of all user clicks
                {'query_text': 'ethics',   # Text of the query
                'url': 'https://www.scu.edu/ethics/ethics-resources/ethical-decision-making/what-is-ethics/',  #URL clicked
                'position': 2},  # Position of the document in the SERP
                ...  # Further clicks follow
    ]
}
```

## Preparation
- See `Analysis.ipynb` for more details

In [2]:
import base64
import gzip
import json
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm.auto import tqdm


def p_date(x: str) -> datetime:
    """Parse an ISO-format date string into a ``datetime``.

    A value that already is a ``datetime`` instance is handed back
    unchanged, so the helper can safely be applied more than once.
    """
    return x if isinstance(x, datetime) else datetime.fromisoformat(x)


# Load the cleaned interaction logs and query logs (gzip-compressed JSON).
# Opening in text mode lets json read straight from the stream instead of
# decoding the whole byte string by hand.
with gzip.open("../data/all_logs_clean.json.gz", "rt", encoding="utf-8") as inf:
    all_logs = json.load(inf)
with gzip.open("../data/all_queries_clean.json.gz", "rt", encoding="utf-8") as inf:
    all_queries = json.load(inf)

# VKS vocabulary dictionary; read inside a context manager so the file
# handle is closed as soon as the data is loaded.
with open("../data/vocab.json") as inf:
    vocab = json.load(inf)
users = {x["userId"] for x in all_logs}  # Distinct user ids present in the logs

# Index the "meta" payload of every SURVEY_REGISTER_RESULTS event under three
# different keys: the event's sessionId, the event's userId and the userId
# stored inside the payload itself. The three groups are merged in that
# order, so on a key collision the later group overrides the earlier one.
_register_events = [ev for ev in all_logs if ev["event"] == "SURVEY_REGISTER_RESULTS"]
pre_tests = {}
pre_tests.update({ev["sessionId"]: ev["meta"] for ev in _register_events})
pre_tests.update({ev["userId"]: ev["meta"] for ev in _register_events})
pre_tests.update({ev["meta"]["data"]["userId"]: ev["meta"] for ev in _register_events})
# Paths of the Prolific CSV exports to read.
prolific_path = [
    f"../data/{x}"
    for x in [
        "prolific_export_5e4d0dea60c37d14cb329934.csv",
        "prolific_export_5e4e4f7be482212a2429a366.csv",
        "prolific_export_5f46799646aa3002de1afa25.csv",
    ]
]
# Per-participant lookups filled from the exports, all keyed by participant id.
prolific_last_time = dict()  # value of column 4
session_length_prolific = dict()  # value of column 5, parsed as float
prolific_status = dict()  # value of column 2
prolific_users = set()  # every participant id seen in the exports
returned_code = dict()  # value of column 11
found = 0  # not updated in this cell; kept in case other cells read it
for p in prolific_path:
    # Context manager so each export file is closed once it has been read.
    with open(p) as export_file:
        for line in export_file:
            # NOTE(review): naive comma split; a quoted field containing a
            # comma would shift the column indices below.
            fields = line.split(",")
            if fields[0] == "session_id":
                # Header row
                continue
            participant_id = fields[1]
            prolific_users.add(participant_id)
            duration = float(fields[5])
            status = fields[2]
            returned_code[participant_id] = fields[11]
            prolific_status[participant_id] = status
            prolific_last_time[participant_id] = fields[4]
            session_length_prolific[participant_id] = duration

## Metrics

In [3]:
def ALG(user_logs: list[dict], topic: str) -> float:
    """Compute the Absolute Learning Gain of one user.

    Every answer (1-4) is mapped to a knowledge score (1 and 2 -> 0, 3 -> 1,
    4 -> 2). For each term of the topic the gain is the post-test score
    minus the pre-test score, floored at zero. The gains are summed and
    divided by 10.

    Args:
        user_logs: sorted (by time) list with all of one user's logs. The
            first SURVEY_PRE_TEST_RESULTS and the first
            SURVEY_POST_TEST_RESULTS events in list order are used.
        topic: string with user's topic id

    Returns:
        Summed per-term gain divided by 10.

    Raises:
        IndexError: if the logs contain no pre-test or no post-test event.
    """
    terms = get_topic_terms(topic)
    pre_event = [entry for entry in user_logs if entry["event"] == "SURVEY_PRE_TEST_RESULTS"][0]
    post_event = [entry for entry in user_logs if entry["event"] == "SURVEY_POST_TEST_RESULTS"][0]
    answer_to_score = {1: 0, 2: 0, 3: 1, 4: 2}

    def term_gain(term) -> int:
        # Question ids follow the pattern "Q-<topic id>-<term>".
        key = f"Q-{topic}-{term}"
        before = answer_to_score[int(pre_event["meta"]["data"][key])]
        after = answer_to_score[int(post_event["meta"]["data"][key])]
        # Knowledge that was "lost" between the tests does not count as negative gain.
        return max(0, after - before)

    return sum(term_gain(term) for term in terms) / 10


def MLG(user_logs: list[dict], topic: str) -> float:
    """Compute the Maximum possible Learning Gain of one user.

    For each term of the topic the potential is the top knowledge score (2)
    minus the pre-test score (answers 1 and 2 -> 0, 3 -> 1, 4 -> 2). The
    potentials are summed and divided by 10.

    Args:
        user_logs: sorted (by time) list with all of one user's logs. The
            first SURVEY_PRE_TEST_RESULTS event in list order is used.
        topic: string with user's topic id

    Returns:
        Summed per-term potential divided by 10.

    Raises:
        IndexError: if the logs contain no pre-test event.
    """
    terms = get_topic_terms(topic)
    pre_event = [entry for entry in user_logs if entry["event"] == "SURVEY_PRE_TEST_RESULTS"][0]
    answer_to_score = {1: 0, 2: 0, 3: 1, 4: 2}

    def term_potential(term) -> int:
        # Question ids follow the pattern "Q-<topic id>-<term>".
        key = f"Q-{topic}-{term}"
        return 2 - answer_to_score[int(pre_event["meta"]["data"][key])]

    return sum(term_potential(term) for term in terms) / 10


def RPL(user_logs: list, topic: str) -> float:
    """Compute a user's Realized Potential Learning: ALG divided by MLG.

    Args:
        user_logs: List of one user's logs (any order; they are re-sorted here)
        topic: string with user's topic id

    Returns:
        ALG / MLG, or 0.0 when MLG is 0. MLG is 0 when the pre-test score is
        already the maximum for every term; ALG is then 0 as well, so the
        ratio is reported as 0.0 instead of raising ZeroDivisionError.
    """
    # NOTE(review): the logs are sorted newest-first, so ALG/MLG pick the most
    # recent pre/post-test result events here. Callers that pass
    # chronologically sorted logs to ALG directly get the earliest ones
    # instead; confirm which is intended if a user can have several.
    sorted_logs = sorted(user_logs, key=lambda x: x["date"], reverse=True)
    _ALG = ALG(sorted_logs, topic)
    _MLG = MLG(sorted_logs, topic)
    if _MLG == 0:
        # Nothing left to learn: avoid dividing by zero.
        return 0.0
    return _ALG / _MLG


def get_pre_pos_score(user_logs: list[dict], topic: str) -> tuple[float, float]:
    """Return the absolute pre-test and post-test knowledge scores of a user.

    Each answer (1-4) is mapped to a knowledge score (1 and 2 -> 0, 3 -> 1,
    4 -> 2) and the scores are summed over all terms of the topic.

    Args:
        user_logs: list with all of one user's logs. The first
            SURVEY_PRE_TEST_RESULTS and the first SURVEY_POST_TEST_RESULTS
            events in list order are used.
        topic: string with user's topic id

    Returns:
        ``(pre_score, post_score)``

    Raises:
        IndexError: if the logs contain no pre-test or no post-test event.
    """
    terms = get_topic_terms(topic)
    pre_event = [entry for entry in user_logs if entry["event"] == "SURVEY_PRE_TEST_RESULTS"][0]
    post_event = [entry for entry in user_logs if entry["event"] == "SURVEY_POST_TEST_RESULTS"][0]
    answer_to_score = {1: 0, 2: 0, 3: 1, 4: 2}

    pre_total, post_total = 0, 0
    for term in terms:
        # Question ids follow the pattern "Q-<topic id>-<term>".
        key = f"Q-{topic}-{term}"
        pre_total = pre_total + answer_to_score[int(pre_event["meta"]["data"][key])]
        post_total = post_total + answer_to_score[int(post_event["meta"]["data"][key])]
    return pre_total, post_total


def get_all_logs(user: str) -> list[dict]:
    """Return every log entry of ``user`` from ``all_logs``, oldest first.

    Entries are ordered by their ``"date"`` field, parsed with the format
    ``%Y-%m-%d %H:%M:%S``.
    """

    def timestamp(entry: dict) -> datetime:
        return datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S")

    own_logs = list(filter(lambda entry: entry["userId"] == user, all_logs))
    own_logs.sort(key=timestamp)
    return own_logs


def get_topic_title(user_logs: list[dict]) -> str:
    """Return the title of the topic assigned to the user.

    The title is read from the first SURVEY_POST_TEST_RESULTS event in
    ``user_logs``; an IndexError is raised when there is no such event.
    """
    post_tests = list(filter(lambda entry: entry["event"] == "SURVEY_POST_TEST_RESULTS", user_logs))
    first_post_test = post_tests[0]
    topic_info = first_post_test["task"]["data"]["topic"]
    return topic_info["title"]


def get_topic_id(user_logs: list[dict]) -> str:
    """Return the id of the topic the user was assigned to.

    The id is read from the first SEARCHRESULT_VIEW_URL event in
    ``user_logs``. When the logs contain no such event, ``None`` is returned.
    """
    topic_ids = (
        entry["task"]["data"]["topic"]["id"]
        for entry in user_logs
        if entry["event"] == "SEARCHRESULT_VIEW_URL"
    )
    return next(topic_ids, None)


def get_topic_terms(p: str) -> list[str]:
    """Look up the VKS terms of a topic.

    Args:
        p: topic id, used as key into the module-level ``vocab`` mapping.

    Returns:
        The value stored under ``"terms"`` for that topic.
    """
    topic_entry = vocab[p]
    return topic_entry["terms"]


def get_session(user_logs: list[dict]) -> str:
    """Return the type of experiment the user was in.

    The value is read from ``meta["session"]`` of the first
    SEARCHRESULT_VIEW_URL event in ``user_logs``; an IndexError is raised
    when there is no such event.
    """
    view_events = list(filter(lambda entry: entry["event"] == "SEARCHRESULT_VIEW_URL", user_logs))
    first_view = view_events[0]
    return first_view["meta"]["session"]


def get_session_duration(user_logs: list[dict]) -> int:
    """Return how long, in seconds, a user session lasted, from start to end.

    The duration is the difference between the latest and the earliest
    ``"date"`` value in ``user_logs`` (format ``%Y-%m-%d %H:%M:%S``).

    Args:
        user_logs: list with one user's logs, in any order.

    Returns:
        Elapsed whole seconds between the first and the last log entry.

    Raises:
        IndexError: if ``user_logs`` is empty.
    """
    timestamps = sorted(datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S") for entry in user_logs)
    elapsed = timestamps[-1] - timestamps[0]
    # timedelta.seconds only holds the remainder after whole days are removed,
    # so total_seconds() is needed for sessions that span more than 24 hours.
    return int(elapsed.total_seconds())


def rebuild_serp(user_log: list[dict], start_event: int, results_per_page: int = 10) -> dict[str, int]:
    """Rebuild a SERP, with the document at each position, for a given query.

    Only the events that belong to the query are considered: those from
    ``start_event`` up to, but not including, the next SEARCH_QUERY event
    (or the end of the log). Of these, every event whose ``meta`` carries a
    ``"position"`` contributes its URL.

    CAVEAT: If a URL appears twice in the SERP (may happen due to the
    filtering process), only the occurrence with the lowest position counts.

    Side effect: a ``"true_position"`` key is written into each considered
    event dict of ``user_log``.

    Args:
        user_log: list with one user's events, in chronological order.
        start_event: index in ``user_log`` of the query's SEARCH_QUERY event.
        results_per_page: number of results on one SERP page, used to turn
            (page, position-in-page) into an absolute position.

    Returns:
        Dict mapping each URL to its absolute position in the SERP, with
        keys inserted in increasing position order.
    """
    # Find where this query's events stop: the index of the next query, or
    # the end of the log when this is the user's last query.
    end_event = len(user_log)
    for idx in range(start_event + 1, len(user_log)):
        if user_log[idx]["event"] == "SEARCH_QUERY":
            end_event = idx
            break

    query_events = sorted(
        user_log[start_event:end_event],
        key=lambda x: datetime.strptime(x["date"], "%Y-%m-%d %H:%M:%S"),
    )
    query_events = [x for x in query_events if "position" in x["meta"]]

    for event in query_events:
        # Naive absolute position: pages are 1-based, positions are per page.
        event["true_position"] = (event["meta"]["page"] - 1) * results_per_page + event["meta"]["position"]

    # Stable sort: events sharing a position keep their chronological order.
    query_events.sort(key=lambda x: x["true_position"])

    serp = {}
    for event in query_events:
        # Keep the first (lowest) position seen for a URL.
        serp.setdefault(event["meta"]["url"], event["true_position"])
    return serp

## Extract data

In [10]:
# Build one record per user: test scores, learning metrics, topic, session
# duration and the list of clicked results with their SERP position.
dataset = []
for user in tqdm(users, total=len(users)):
    user_log = get_all_logs(user)  # All user events, sorted by time
    # What experiment the user was assigned to. Not used below; the call is
    # kept because it raises for users without a SEARCHRESULT_VIEW_URL event.
    user_session = get_session(user_log)
    topic = get_topic_title(user_log)  # User topic
    topic_id = get_topic_id(user_log)  # User topic id

    realized_learning_gain = RPL(user_log, topic_id)
    absolute_learning_gain = ALG(user_log, topic_id)
    pre_score, post_score = get_pre_pos_score(user_log, topic_id)
    time_elapsed = get_session_duration(user_log)

    user_data = {
        "pre_test_score": pre_score,
        "post_test_score": post_score,
        "RPL": realized_learning_gain,
        "ALG": absolute_learning_gain,
        "topic_id": topic_id,
        "topic_title": topic,
        "session_duration": time_elapsed,
        "clicks": [],
    }
    current_query = None
    # Reset per user so a click that precedes the user's first query can never
    # be resolved against the previous user's SERP.
    URL_rankings = {}
    # Iterate over log
    for e_idx, e in enumerate(user_log):
        meta = e["meta"]
        if e["event"] == "SEARCH_QUERY":
            URL_rankings = rebuild_serp(user_log, e_idx)  # rebuild SERP for this query
            current_query = meta["query"]

        elif e["event"] == "SEARCHRESULT_CLICK_URL":
            # None marks a click whose URL is not part of the rebuilt SERP.
            position = URL_rankings.get(meta["url"])
            user_data["clicks"].append({"query_text": current_query, "url": meta["url"], "position": position})
    dataset.append(user_data)

  0%|          | 0/127 [00:00<?, ?it/s]

In [14]:
# Write the dataset inside a context manager so the file is flushed and closed
# before any later cell reads it back.
with open("../data/logs_with_position.json", "w") as outf:
    json.dump(dataset, outf)

In [15]:
# Reload the dataset from disk; the context manager closes the file handle.
with open("../data/logs_with_position.json") as inf:
    dataset = json.load(inf)

In [16]:
dataset[2]  # Display one record to check the structure of the generated data

{'pre_test_score': 0,
 'post_test_score': 8,
 'RPL': 0.4,
 'ALG': 0.8,
 'topic_id': '4',
 'topic_title': 'Ethics',
 'session_duration': 3486,
 'clicks': [{'query_text': 'ethics',
   'url': 'https://www.scu.edu/ethics/ethics-resources/ethical-decision-making/what-is-ethics/',
   'position': 2},
  {'query_text': 'ethics',
   'url': 'https://www.britannica.com/topic/ethics-philosophy',
   'position': 1},
  {'query_text': 'ethics',
   'url': 'http://www.bbc.co.uk/ethics/introduction/intro_1.shtml',
   'position': 5},
  {'query_text': 'meta ethics',
   'url': 'https://plato.stanford.edu/entries/metaethics/',
   'position': 0},
  {'query_text': 'meta ethics',
   'url': 'http://www.qcc.cuny.edu/socialsciences/ppecorino/ETHICS_TEXT/Chapter_1_Introduction/Normative_Ethics_and_Metaethics.htm',
   'position': 5},
  {'query_text': 'meta ethics',
   'url': 'https://www.differencebetween.com/difference-between-metaethics-and-normative-ethics/',
   'position': 9},
  {'query_text': 'meta ethics',
   '