# 0.2 Prepare and filter data

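This notebook filters the raw questions down to those whose question text mentions "data availability" or "data accessibility" (case-insensitive) and saves the result as an interim dataset. Rows with a missing question text are dropped; answers are not filtered here, so rows with an empty answer are still included.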

In [1]:
import os

import pandas as pd
import engarde.decorators as ed

In [2]:
# Project root, expressed relative to the current working directory: one
# level up from wherever the notebook is run.
PROJ_ROOT = os.pardir

In [3]:
# Read the complete set of questions

@ed.is_shape((None, 6))
def load_data():
    """Read the complete set of questions from the raw feather file.

    The file is read from ``<PROJ_ROOT>/data/raw/das.feather``.

    Returns
    -------
    pandas.DataFrame
        The table as returned by ``pd.read_feather``. The ``ed.is_shape``
        decorator is given ``(None, 6)``; presumably it checks that the
        frame has six columns and leaves the row count unconstrained
        (TODO: confirm against the engarde documentation).
    """
    # Pass the components separately so os.path.join inserts the separators
    # itself; joining a single pre-concatenated string is a no-op.
    raw_path = os.path.join(PROJ_ROOT, "data", "raw", "das.feather")

    return pd.read_feather(raw_path)

In [4]:
# Load the raw questions table; load_data() is wrapped in a shape check, so
# the check runs as part of this call.
all_questions = load_data()

In [5]:
# Summarise dtypes and non-null counts per column to see where values are missing.
all_questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1430433 entries, 0 to 1430432
Data columns (total 6 columns):
site_name           1430433 non-null object
documentid          1430433 non-null object
customquestionid    1430433 non-null int64
questiontext        1430288 non-null object
answertext          1152711 non-null object
submissiondate      1414242 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 65.5+ MB


In [6]:
# Keep only the questions whose text refers to "data availability" or
# "data accessibility".
mentions_data_statement = all_questions["questiontext"].str.contains(
    r"data availability|data accessibility",
    case=False,  # match regardless of capitalisation
    na=False,  # a missing question text counts as "no match"
)
data_statements = all_questions.loc[mentions_data_statement]

data_statements.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197899 entries, 2 to 1430426
Data columns (total 6 columns):
site_name           197899 non-null object
documentid          197899 non-null object
customquestionid    197899 non-null int64
questiontext        197899 non-null object
answertext          127714 non-null object
submissiondate      191066 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 10.6+ MB


In [8]:
# Build the output path from separate components so os.path.join inserts the
# separators itself (joining one pre-concatenated string is a no-op).
save_path = os.path.join(PROJ_ROOT, "data", "interim", "data_statements.feather")

# Writing fails if the target folder is missing, so create it on demand.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Filtering left gaps in the index; reset it to a fresh 0..n-1 range
# (dropping the old labels) before writing the feather file.
data_statements.reset_index(drop=True).to_feather(save_path)