# Setup

In [17]:
import os
from datetime import datetime

import pandas as pd

import ergo
from ergo.platforms.metaculus.question import LinearQuestion, LogQuestion, ContinuousQuestion

In [3]:
# Lift pandas' display limits (None = unlimited) so frames are shown in full.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Get questions

Get all *open* questions on the *main* subdomain.

In [4]:
# Credentials are read from the environment (METACULUS_USERNAME / METACULUS_PASSWORD)
# so that personal logins never need to be written into the notebook. When the
# variables are unset, the shared public demo account is used as before.
metaculus = ergo.Metaculus(
    username=os.environ.get("METACULUS_USERNAME", "oughtpublic"),
    password=os.environ.get("METACULUS_PASSWORD", "123456"),
    api_domain="www",
)

In [5]:
# Fetch the questions whose status is "open".
# NOTE(review): pages=99999 is presumably just a very large cap meant to cover every page — confirm.
qs = metaculus.get_questions(question_status="open", pages=99999)

In [18]:
# ID of the question used as the reference ("exemplar") when discovering which
# fields are available and when spot-checking the resulting table.
exemplar_q_id = 3530

In [19]:
# Fetch the exemplar question object by its ID.
exemplar_q = metaculus.get_question(exemplar_q_id)

In [20]:
# Build the base DataFrame for all fetched questions by calling to_dataframe on the exemplar.
# NOTE(review): presumably the result has one row per question in qs — confirm.
qs_df = exemplar_q.to_dataframe(qs)

# Get question metadata

## Get all of the metadata already on the question

Get the field names from the question JSON from Metaculus:

In [21]:
# Key names of the exemplar question's data, collected into a list.
metaculus_json_fields = [*exemplar_q.data.keys()]

Get the property names from the ContinuousQuestion class:

In [22]:
def properties(some_class):
    """Return the names of the properties defined directly on ``some_class``.

    Only the class's own ``__dict__`` is inspected, so properties inherited
    from base classes are not included. Names are returned in definition order.
    """
    property_names = []
    for attr_name, attr_value in vars(some_class).items():
        if isinstance(attr_value, property):
            property_names.append(attr_name)
    return property_names

In [23]:
# Names of the properties defined directly on the ContinuousQuestion class.
continuous_question_properties = properties(ContinuousQuestion)

In [30]:
%%capture
q_fields = metaculus_json_fields + continuous_question_properties

simple_fields = [field for field in q_fields if type(getattr(exemplar_q, field)) in [bool, int, float, str, datetime]]

# This property causes an exception for some reason
simple_fields.remove("question_range_width")

for field in simple_fields:
    qs_df[field] = [getattr(q, field, None) for q in qs]

## Generate and add more metadata

In [31]:
def get_p_outside(q):
    """Return the community probability mass lying outside the question range.

    Computed as the mass below the lower bound (``"low"``) plus the mass above
    the upper bound (``1 - "high"``) of ``q.latest_community_percentiles``.

    Returns ``None`` when the question has no ``latest_community_percentiles``
    attribute, or when that attribute is ``None`` or a float.
    """
    percentiles = getattr(q, "latest_community_percentiles", None)

    # For some reason latest_community_percentiles is a float for some questions.
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64, which cannot be indexed by key either.
    if percentiles is None or isinstance(percentiles, float):
        return None

    return percentiles["low"] + (1 - percentiles["high"])

In [32]:
def _num_boundaries_open(q):
    """Count the open boundaries (0-2) of a question, or None if it has no ``low_open``."""
    if not hasattr(q, "low_open"):
        return None
    return int(q.low_open) + int(q.high_open)


def _question_range_bound(bound):
    """Build an extractor returning ``q.question_range[bound]``, or None without a range."""
    def extract(q):
        if not hasattr(q, "question_range"):
            return None
        return q.question_range[bound]
    return extract


# Maps each derived column name to a function computing its value from a question.
metadata_columns = {
    "type": lambda q: type(q).__name__,
    "num_boundaries_open": _num_boundaries_open,
    "p_outside": get_p_outside,
    "question_range_min": _question_range_bound("min"),
    "question_range_max": _question_range_bound("max"),
    "question_url": lambda q: f"https://www.metaculus.com{q.page_url}"
}

In [33]:
# Add one column per derived-metadata entry, evaluated for every question in order.
for column_name, compute_value in metadata_columns.items():
    qs_df[column_name] = list(map(compute_value, qs))

## Select and reorder columns

At this point, we have these columns:

In [34]:
# Show the exemplar question's row with every column collected so far.
qs_df.loc[qs_df["id"] == exemplar_q_id]

Unnamed: 0,id,title,resolve_time,type,num_boundaries_open,p_outside,question_range_min,question_range_max,question_url,url,page_url,author,status,created_time,publish_time,close_time,can_use_powers,last_activity_time,activity,comment_count,votes,title_short,user_vote,user_community_vis,author_name,anon_prediction_count,last_read,low_open,high_open,has_predictions,plot_title
253,3530,How many people will die as a result of the 20...,2021-01-01 00:00:00,LogQuestion,2.0,0.03308,200,100000000,https://www.metaculus.com/questions/3530/how-m...,https://www.metaculus.com/api2/questions/3530/,/questions/3530/how-many-people-will-die-as-a-...,101465,A,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,True,2020-06-16T00:13:10.915866Z,11.692703,229,146,COVID-19 related deaths before 2021:,0,0.0,Jgalt,101,2020-06-16T02:28:51.275880Z,True,True,True,How many people will die as a result of the 20...


Select all the ones that might plausibly be useful, and put them in a reasonable order:

In [35]:
# Columns to keep, listed in the order they should appear in the table.
selected_columns = [
    # identification
    "id",
    "title",
    "question_url",
    "type",
    # range and boundary information
    "num_boundaries_open",
    "low_open",
    "high_open",
    "p_outside",
    "question_range_min",
    "question_range_max",
    # engagement
    "anon_prediction_count",
    "last_activity_time",
    "votes",
    "comment_count",
    # timeline
    "created_time",
    "publish_time",
    "close_time",
    "resolve_time",
    # miscellaneous
    "author_name",
    "last_read",
    "has_predictions",
    "activity",
    "title_short",
]
qs_df = qs_df[selected_columns]

In [36]:
# Show the exemplar question's row again, now with only the selected columns.
qs_df.loc[qs_df["id"] == exemplar_q_id]

Unnamed: 0,id,title,question_url,type,num_boundaries_open,low_open,high_open,p_outside,question_range_min,question_range_max,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
253,3530,How many people will die as a result of the 20...,https://www.metaculus.com/questions/3530/how-m...,LogQuestion,2.0,True,True,0.03308,200,100000000,101,2020-06-16T00:13:10.915866Z,146,229,2020-01-25T04:09:23.208127Z,2020-01-27,2020-11-01 00:00:00,2021-01-01 00:00:00,Jgalt,2020-06-16T02:28:51.275880Z,True,11.692703,COVID-19 related deaths before 2021:


## Explanations of some important fields
- `low_open`: Is the lower boundary of the question open? (only applies to ContinuousQuestions)
- `high_open`: Is the upper boundary of the question open? (only applies to ContinuousQuestions)
- `p_outside`: How much of the total probability mass of the community prediction is outside the question range?
- `anon_prediction_count`: Seems to be a proxy for the number of predictions. See "Data notes" below.
- `last_activity_time`: Seems to be a quick proxy for the time of the last prediction on the question. See "Data notes" below.
- `comment_count`: How many comments have been left on this question?
- `created_time`: When did the author of the question create it? (I think)
- `publish_time`: When was the question published to all Metaculus users?
- `close_time`: After what time are predictions on this question no longer allowed?
- `resolve_time`: When can the question be resolved, i.e. when will the answer be known?

## Data notes:
1. `anon_prediction_count` is the closest thing I could find to a count of the number of predictions, but I'm not really sure how it relates to the actual number of predictions. In my testing:
    1. It seems to always be the same as the length of `prediction_timeseries`
    2. It seems to be correlated with something about the number of predictions shown. E.g.
        1. it's 101 for this question where the community prediction is shown: https://www.metaculus.com/questions/3530/how-many-people-will-die-as-a-result-of-the-2019-novel-coronavirus-covid-19-before-2021/.
        2. While it's 0 for this question where the community prediction is not shown yet: https://www.metaculus.com/questions/4614/when-will-directly-removing-carbon-dioxide-from-the-atmosphere-be-economically-feasible/ 
    3. I couldn't get it to increment. I tried:
        1. making a new prediction with an account that had already predicted on the question
        2. making a prediction with an account that had never predicted on that question before.
2. `last_activity_time` seems like the most obvious easy proxy for when the most recent prediction was made. However, I'm not sure how reliable it is.
    1. It did not update when I made a new prediction from an account that had already predicted on the question previously
    2. It may update when people leave comments or at other times
    3. Alternatively, we could use the last time from the `prediction_timeseries`, but that also doesn't seem to update every time someone makes a prediction
3. To get the datetime of the last posted comment, I think we'd need to retrieve it from a separate API (prob at least 30 min of work, maybe more like hours), so I haven't tried

# View data

## Export as csv

In [44]:
# Write the table to CSV without the index column.
# float_format='%.20f' writes every float with 20 digits after the decimal point.
qs_df.to_csv("../ergo/contrib/metac_qs_data/metac_qs_data.csv", index=False, float_format='%.20f')

## View all questions

In [43]:
# Display the whole table; the display.max_rows / display.max_columns options
# were set to None earlier, so nothing is truncated.
qs_df

Unnamed: 0,id,title,question_url,type,num_boundaries_open,low_open,high_open,p_outside,question_range_min,question_range_max,anon_prediction_count,last_activity_time,votes,comment_count,created_time,publish_time,close_time,resolve_time,author_name,last_read,has_predictions,activity,title_short
0,4607,Will California Assembly Bill 3155 be chaptere...,https://www.metaculus.com/questions/4607/will-...,BinaryQuestion,,,,,,,0,2020-06-15T21:10:39.306094Z,3,1,2020-06-05T15:34:10.753708Z,2020-06-14 22:00:00.000,2020-08-14 23:00:00,2020-09-29 23:00:00,holomanga,2020-06-16T01:38:47.884640Z,,0.005065626,Will California Assembly Bill 3155 be ch
1,4628,Will one of GiveWell's 2019 top charities be e...,https://www.metaculus.com/questions/4628/will-...,BinaryQuestion,,,,,,,0,2020-06-16T02:21:03.176170Z,8,7,2020-06-08T18:18:49.172425Z,2020-06-14 22:00:00.000,2023-01-01 08:00:00,2031-12-31 08:00:00,mdickens,,,18.48351,Will one of GiveWell's 2019 top charitie
2,4408,Will Apple announce plans to make ARM-based Ma...,https://www.metaculus.com/questions/4408/will-...,BinaryQuestion,,,,,,,1,2020-06-16T00:30:17.135901Z,9,6,2020-05-14T08:11:39.669743Z,2020-06-13 22:00:00.000,2020-06-20 22:00:00,2020-06-26 22:00:00,yhoiseth,,,0.5127414,Will Apple announce plans to make ARM-ba
3,4619,Will the border conflict between India and Chi...,https://www.metaculus.com/questions/4619/will-...,BinaryQuestion,,,,,,,1,2020-06-15T22:14:22.462768Z,8,6,2020-06-07T22:28:32.937354Z,2020-06-13 07:00:00.000,2020-12-02 06:59:04.800000,2021-01-01 17:00:03.500000,beala,,,1.57462,Will the border conflict between India a
4,4586,"If Biden becomes president, will the sentencin...",https://www.metaculus.com/questions/4586/if-bi...,BinaryQuestion,,,,,,,0,2020-06-14T19:56:14.598178Z,10,2,2020-06-04T11:05:54.798027Z,2020-06-12 22:00:00.000,2023-04-26 23:00:00,2025-01-01 00:00:00,holomanga,,,0.1546586,"If Biden becomes president, will the sen"
5,4609,"If President Trump loses the 2020 election, wi...",https://www.metaculus.com/questions/4609/if-pr...,BinaryQuestion,,,,,,,12,2020-06-16T02:00:25.341275Z,14,19,2020-06-05T17:28:38.321223Z,2020-06-12 22:00:00.000,2020-11-03 05:01:00,2021-01-21 05:01:00,AABoyles,2020-06-15T23:28:21.105582Z,,46.77851,If President Trump loses the 2020 electi
6,4615,What annual real return will the S&P 500 reali...,https://www.metaculus.com/questions/4615/what-...,LinearQuestion,2.0,True,True,0.02881,-50,100,1,2020-06-16T02:25:18.550377Z,3,3,2020-06-07T01:34:07.779233Z,2020-06-12 22:00:00.000,2021-12-31 08:00:00,2033-01-01 08:00:00,mdickens,,True,6.15241,What annual real return will the S&P 500
7,4630,Will Metaculus Inc. host a prediction market p...,https://www.metaculus.com/questions/4630/will-...,BinaryQuestion,,,,,,,0,2020-06-14T16:16:17.331062Z,8,12,2020-06-09T11:46:57.515725Z,2020-06-11 22:00:00.000,2021-06-09 11:36:00,2023-06-11 11:36:00,Tamay,2020-06-10T00:16:12.293652Z,,0.3210752,Will Metaculus Inc. host a prediction ma
8,4603,"What will the US Q3 2020 GDP growth rate be, a...",https://www.metaculus.com/questions/4603/what-...,LinearQuestion,2.0,True,True,0.10933,-20,20,0,2020-06-15T16:31:49.386354Z,5,16,2020-06-05T02:37:40.908628Z,2020-06-11 00:00:04.000,2020-10-02 00:00:00,2020-10-30 00:00:00,Jgalt,2020-06-15T04:04:13.961143Z,True,6.28064,What will the US Q3 2020 GDP growth rate
9,4617,Will 2020 be the warmest year on record?,https://www.metaculus.com/questions/4617/will-...,BinaryQuestion,,,,,,,1,2020-06-15T17:08:57.417112Z,9,10,2020-06-07T08:36:40.708609Z,2020-06-09 22:00:00.000,2020-11-30 22:00:00,2021-02-14 22:00:00,kilotaras,,,0.09760413,Will 2020 be the warmest year on record?
