# Setup

In [1]:
%%capture
!pip install --progress-bar off poetry
!pip install --progress-bar off git+https://github.com/oughtinc/ergo.git@e92c684d45ebebea0d6bb43172b284fc53d067fc

In [2]:
import warnings
# Ignore every FutureWarning.
warnings.filterwarnings(action="ignore", category=FutureWarning)
# Ignore warnings issued from modules whose name matches the pattern "plotnine".
warnings.filterwarnings(action="ignore", module="plotnine")

In [3]:
import os
from datetime import datetime

import pandas as pd

import ergo
from ergo.platforms.metaculus.question import MetaculusQuestion, LinearQuestion, LogQuestion, ContinuousQuestion

In [4]:
# Lift the pandas display limits so frames are rendered with every row and column.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Get questions

Get all *open* questions on the specified subdomains.

NOTES:
1. log date questions are excluded because Ergo currently can't handle them
2. questions that are in multiple subdomains will be included multiple times, once for each subdomain

In [5]:
# Metaculus API subdomains to query; each one is passed as `api_domain` when fetching questions.
subdomains = ["www", "pandemic"]

In [6]:
# Credentials default to the account this notebook has always used, but can be
# overridden with environment variables so that other credentials never have
# to be written into the notebook itself.
metaculus_username = os.environ.get("METACULUS_USERNAME", "oughtpublic")
metaculus_password = os.environ.get("METACULUS_PASSWORD", "123456")

# All open questions, accumulated across every subdomain.
qs = []

for subdomain in subdomains:
    # NOTE: `metaculus` stays bound after the loop (to the client for the last
    # subdomain); a later cell uses it to fetch the exemplar question.
    metaculus = ergo.Metaculus(
        username=metaculus_username,
        password=metaculus_password,
        api_domain=subdomain,
    )
    # pages=99999 is presumably just "large enough to get every page" -- TODO confirm.
    # Extend in place instead of rebuilding the whole list on each iteration.
    qs.extend(metaculus.get_questions(question_status="open", pages=99999, load_detail=False))

In [7]:
# The undetailed question data fetched earlier is missing the probability
# above and below the question bounds, which we want for questions that have
# an open boundary. Fetch the full question data for those questions only.
for q in qs:
    has_open_boundary = getattr(q, "low_open", False) or getattr(q, "high_open", False)
    if has_open_boundary:
        q.refresh_question()

In [8]:
# Id of the question used as the worked example throughout the notebook.
exemplar_q_id = 3530

In [9]:
# NOTE: `metaculus` is whichever client the question-fetching loop bound last,
# i.e. the one created for the final entry of `subdomains`.
exemplar_q = metaculus.get_question(exemplar_q_id)

In [10]:
# Build the base DataFrame from the full list of fetched questions.
# NOTE(review): to_dataframe is called through the exemplar instance but receives
# all of `qs` -- presumably it does not depend on the instance; confirm in Ergo.
qs_df = exemplar_q.to_dataframe(qs)

# Get question metadata

## Get all of the metadata already on the question

Get the field names from the question JSON from Metaculus:

In [11]:
# Key names of the exemplar question's raw data.
metaculus_json_fields = [*exemplar_q.data.keys()]

Get the property names from the MetaculusQuestion and ContinuousQuestion classes:

In [12]:
def properties(some_class):
    """
    List the names of the @property attributes declared directly on a class.

    Only the class's own namespace is inspected, so properties that are
    merely inherited from a base class are not reported. Names are returned
    in the order they appear in the class namespace.
    """
    return [
        attr_name
        for attr_name, attr_value in vars(some_class).items()
        if isinstance(attr_value, property)
    ]

In [13]:
# Names of the properties declared directly on MetaculusQuestion.
question_properties = properties(MetaculusQuestion)

In [14]:
# Names of the properties declared directly on ContinuousQuestion.
continuous_question_properties = properties(ContinuousQuestion)

In [15]:
%%capture
# Every candidate field name: raw JSON keys plus the properties of both classes.
q_fields = metaculus_json_fields + question_properties + continuous_question_properties

# A field is kept only if its value on the exemplar question is exactly one of these types.
simple_types = [bool, int, float, str, datetime]

# This property causes an exception for some reason.
# Didn't seem worth investigating
excluded_fields = {"question_range_width"}

# Filtering (rather than calling list.remove afterwards) drops every occurrence
# of an excluded name, does not fail when the name is absent, and avoids
# evaluating the excluded property on the exemplar at all.
simple_fields = [
    field
    for field in q_fields
    if field not in excluded_fields and type(getattr(exemplar_q, field)) in simple_types
]

# Add one column per simple field; questions lacking the attribute get None.
for field in simple_fields:
    qs_df[field] = [getattr(q, field, None) for q in qs]

## Generate and add more metadata

In [16]:
def get_p_outside(q):
    """
    Return the community probability mass lying outside the question range.

    The value is computed as ``percentiles["low"] + (1 - percentiles["high"])``
    from ``q.latest_community_percentiles``.

    Returns None when the question has no ``latest_community_percentiles``
    attribute, or when that attribute is None or a bare number instead of a
    low/high mapping.
    """
    # Read the attribute once; a missing attribute is treated like None.
    percentiles = getattr(q, "latest_community_percentiles", None)

    # q.latest_community_percentiles is a float for binary questions:
    # https://github.com/oughtinc/ergo/pull/378
    # Also guard against None and ints (and float subclasses), which would
    # otherwise fail when subscripted below.
    if percentiles is None or isinstance(percentiles, (int, float)):
        return None

    return percentiles["low"] + (1 - percentiles["high"])

In [17]:
# Extra columns to derive for every question: column name -> function of the question.
# Entries that only make sense for questions with a range return None when the
# needed attribute is missing.
metadata_columns = {
    # Name of the question's Python class, e.g. "LogQuestion".
    "type": lambda q: type(q).__name__,
    # API subdomain of the client the question was fetched with.
    "subdomain": lambda q: q.metaculus.api_domain,
    # Number of open boundaries (0, 1 or 2).
    "num_boundaries_open": lambda q: int(q.low_open) + int(q.high_open) if hasattr(q, "low_open") else None,
    # Community probability mass outside the question range. This column is
    # selected further down, so it has to be created here.
    "p_outside": lambda q: get_p_outside(q),
    "question_scale_low": lambda q: q.scale.low if hasattr(q, "scale") else None,
    "question_scale_high": lambda q: q.scale.high if hasattr(q, "scale") else None,
}

In [18]:
# Evaluate each metadata function on every question and store the results as a column.
for name, fn in metadata_columns.items():
    qs_df[name] = list(map(fn, qs))

## Select and reorder columns

We have these columns:

In [25]:
# Show the first row whose id matches the exemplar question.
qs_df.loc[qs_df["id"] == exemplar_q_id].head(1)

Unnamed: 0,id,title,question_url,subdomain,type,num_boundaries_open,low_open,high_open,p_outside,question_scale_low,question_scale_high,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
277,3530,How many people will die as a result of the 20...,https://www.metaculus.com/questions/3530,www,LogQuestion,2.0,True,True,0.028,200,100000000,101.0,2020-06-25T18:13:37.440708Z,149,234,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,2021-01-01 00:00:00,Jgalt,2020-06-19T03:27:38.264689Z,True,6.706319,COVID-19 related deaths before 2021:


Select all the ones that might plausibly be useful, and put them in a reasonable order:

In [20]:
# Columns to keep, listed in the order they should appear in the final table.
selected_columns = [
    # identification
    "id", "title", "question_url", "subdomain", "type",
    # question range and boundaries
    "num_boundaries_open", "low_open", "high_open", "p_outside",
    "question_scale_low", "question_scale_high",
    # activity
    "anon_prediction_count", "last_activity_time", "votes", "comment_count",
    # lifecycle timestamps
    "created_time", "publish_time", "close_time", "resolve_time",
    # everything else
    "author_name", "last_read", "has_predictions", "activity", "title_short",
]
qs_df = qs_df[selected_columns]

In [26]:
# Show the first row whose id matches the exemplar question.
qs_df.loc[qs_df["id"] == exemplar_q_id].head(1)

Unnamed: 0,id,title,question_url,subdomain,type,num_boundaries_open,low_open,high_open,p_outside,question_scale_low,question_scale_high,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
277,3530,How many people will die as a result of the 20...,https://www.metaculus.com/questions/3530,www,LogQuestion,2.0,True,True,0.028,200,100000000,101.0,2020-06-25T18:13:37.440708Z,149,234,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,2021-01-01 00:00:00,Jgalt,2020-06-19T03:27:38.264689Z,True,6.706319,COVID-19 related deaths before 2021:


## Explanations of some important fields
- `low_open`: Is the lower boundary of the question open? (only applies to ContinuousQuestions)
- `high_open`: Is the upper boundary of the question open? (only applies to ContinuousQuestions)
- `p_outside`: How much of the total probability mass of the community prediction is outside the question range? (only applies to ContinuousQuestions)
- `anon_prediction_count`: Seems to be a proxy for the number of predictions. See "Data notes" below.
- `last_activity_time`: Seems to be a quick proxy for the time of the last prediction on the question. See "Data notes" below.
- `comment_count`: How many comments have been left on this question?
- `created_time`: When did the author of the question create it? (I think)
- `publish_time`: When was the question published to all Metaculus users?
- `close_time`: After what time are predictions on this question no longer allowed?
- `resolve_time`: When can the question be resolved, i.e. when will the answer be known?

## Data notes:
1. `anon_prediction_count` is the closest thing I could find to a count of number of predictions, but I'm not really sure how it relates to the number of predictions. In my testing:
    1. It seems to always be the same as the length of `prediction_timeseries`
    2. It seems to be correlated with something about the number of predictions shown. E.g.
        1. it's 101 for this question where the community prediction is shown: https://www.metaculus.com/questions/3530/how-many-people-will-die-as-a-result-of-the-2019-novel-coronavirus-covid-19-before-2021/.
        2. While it's 0 for this question where the community prediction is not shown yet: https://www.metaculus.com/questions/4614/when-will-directly-removing-carbon-dioxide-from-the-atmosphere-be-economically-feasible/ 
    3. I couldn't get it to increment. I tried:
        1. making a new prediction with an account that had already predicted on the question
        2. making a prediction with an account that had never predicted on that question before.
2. `last_activity_time` seems like the most obvious easy proxy for when the most recent prediction was made. However, I'm not sure how reliable it is.
    1. It did not update when I made a new prediction from an account that had already predicted on the question previously
    2. It may update when people leave comments or at other times
    3. Alternatively, we could use the last time from the `prediction_timeseries`, but that also doesn't seem to update every time someone makes a prediction
3. To get the datetime of the last posted comment, I think we'd need to retrieve it from a separate API (probably at least 30 minutes of work, maybe more like hours), so I haven't tried.
4. Log date questions are excluded here because Ergo can't handle them yet.

# View data

## Export as csv

(For use when running locally in `ergo/notebooks`)

In [22]:
# qs_df.to_csv("../ergo/contrib/metac_qs_data/metac_qs_data.csv", index=False, float_format='%.20f')

A version of this CSV is uploaded as a [Google Sheet](https://docs.google.com/spreadsheets/d/1Aii_IkUTiJH6t14n2lhwhu4PJJjlTz6X5vdEi5gPGa0/edit#gid=1305569144).

## View all questions

In [23]:
# Display the whole table of questions.
qs_df

Unnamed: 0,id,title,question_url,subdomain,type,num_boundaries_open,low_open,high_open,p_outside,question_scale_low,question_scale_high,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
0,4694,"What will Gross World Product be in 2047, in t...",https://www.metaculus.com/questions/4694,www,LogQuestion,2.0,True,True,,50,100000,0.0,2020-06-22T14:34:24.601294Z,2,0,2020-06-22T14:34:24.601294Z,2020-06-24 22:00:00.000,2030-02-20 22:01:00,2048-01-19 23:00:00,Tamay,,True,0.2507273,What will Gross World Product be in 2047
1,4645,If Conservatives win the next UK general elect...,https://www.metaculus.com/questions/4645,www,LinearQuestion,2.0,True,True,,100,180,0.0,2020-06-14T16:29:52.393444Z,1,0,2020-06-14T16:29:52.393444Z,2020-06-24 22:00:00.000,2027-02-14 00:00:00,2030-12-31 00:00:00,holomanga,,True,0.03633143,If Conservatives win the next UK general
2,4644,"If Labour wins the next UK general election, w...",https://www.metaculus.com/questions/4644,www,LinearQuestion,2.0,True,True,,100,180,0.0,2020-06-14T14:30:14.717192Z,1,0,2020-06-14T14:30:14.717192Z,2020-06-24 22:00:00.000,2027-02-14 00:00:00,2030-12-31 00:00:00,holomanga,,True,0.03633502,If Labour wins the next UK general elect
3,4692,When will The Boring Company tunnel faster tha...,https://www.metaculus.com/questions/4692,www,LinearDateQuestion,1.0,False,True,0.01,2021-06-15,2027-06-15,0.0,2020-06-25T18:17:47.314220Z,7,1,2020-06-22T06:31:58.450487Z,2020-06-24 22:00:00.000,2025-06-22 05:51:00,2027-06-22 05:51:00,krtnu,,True,0.3883329,When will The Boring Company tunnel fast
4,4680,When will the population size of India surpass...,https://www.metaculus.com/questions/4680,www,LinearDateQuestion,1.0,False,True,0.01,2020-12-30,2030-01-01,0.0,2020-06-25T05:25:14.570587Z,5,0,2020-06-19T17:36:49.201914Z,2020-06-23 22:00:00.000,2021-03-01 08:00:00,2030-01-01 08:00:00,Matthew_Barnett,,True,0.007843406,When will the population size of India s
5,4688,What will the real GDP/capita of the USA be in...,https://www.metaculus.com/questions/4688,www,LinearQuestion,2.0,True,True,0.04045,25000,200000,0.0,2020-06-24T00:36:43.311637Z,9,15,2020-06-21T06:03:58.819677Z,2020-06-23 22:00:00.000,2021-11-02 04:00:00,2025-12-31 05:00:00,nagolinc,,True,0.2280317,What will the real GDP/capita of the USA
6,4683,Will the border conflict between India and Chi...,https://www.metaculus.com/questions/4683,www,BinaryQuestion,,,,,,,,2020-06-25T07:13:28.158638Z,4,6,2020-06-19T19:01:28.582070Z,2020-06-23 18:00:00.000,2020-10-31 18:59:05.300000,2021-01-01 07:00:00.700000,,,,0.8119445,Will the border conflict between India a
7,4682,Will at least 500 Indians die in clashes along...,https://www.metaculus.com/questions/4682,www,BinaryQuestion,,,,,,,,2020-06-25T07:13:07.307401Z,4,3,2020-06-19T18:59:47.119304Z,2020-06-23 18:00:00.000,2020-11-01 05:59:01.200000,2021-01-01 07:00:01.700000,,,,0.6911838,Will at least 500 Indians die in clashes
8,4697,[short fuse] Given that the NYT publishes an a...,https://www.metaculus.com/questions/4697,www,BinaryQuestion,,,,,,,,2020-06-25T18:20:37.402310Z,52,49,2020-06-23T11:19:01.061598Z,2020-06-23 03:00:00.000,2020-07-08 03:00:00,2020-07-08 03:00:00,,,,53.02617,[short fuse] Given that the NYT publishe
9,4681,How many paid memberships will Netflix have wo...,https://www.metaculus.com/questions/4681,www,LinearQuestion,2.0,True,True,0.02311,160000000,320000000,1.0,2020-06-25T18:15:02.194179Z,7,8,2020-06-19T18:42:53.046239Z,2020-06-21 22:00:00.000,2020-09-30 22:00:00,2021-09-30 22:00:00,alexrjl,,True,4.251324,How many paid memberships will Netflix h
