# Testing downloading a sample datum in Python

In [1]:
import json
import requests

# Base address of the API; later cells append endpoint names to it.
url = "https://parlacap.ipipan.waw.pl/"

# Ask the `sample` endpoint for a single record. The query string is built by
# requests from `params`, and the timeout keeps the cell from hanging forever
# if the server never answers.
response = requests.get(url + "sample", params={"size": 1}, timeout=120)
if response.status_code != 200:
    raise Exception(f"Got weird response code: {response.status_code}")

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

[{'id': 4278958,
  'parlmint_text_id': None,
  'parlmint_id': None,
  'date': '2022-04-25',
  'parliament': 'IS',
  'vdem_country_id': 168,
  'lang': 'Icelandic',
  'speaker_role': 'Regular',
  'speaker_mp': None,
  'speaker_minister': 'notMinister',
  'speaker_party': '-',
  'speaker_party_name': '-',
  'party_status': '-',
  'party_orientation': '-',
  'partyfacts_id': None,
  'speaker_id': 'SteinunnArnadottir',
  'speaker_name': 'Árnadóttir, Steinunn Þóra',
  'speaker_gender': 'F',
  'speaker_birth': 1977,
  'word_count': 145,
  'cap_category': None,
  'cap_prob': None,
  'sent_logit': 3.266,
  'sent3_category': 'Neutral',
  'sent6_category': 'Neutral Positive',
  'text': 'Frú forseti. Ég held að það sé enginn að neita því að fara yfir og skoða þetta ferli. Hæstv. fjármálaráðherra lagði fram tillögu hér sem hann byggði á tillögu Bankasýslunnar. Ég veit ekki betur en að þar hafi allt verið eftir hinu lögformlega ferli. En ég bind vonir við að þetta sé eitt af því sem verið er að skoð

# Checking the datatypes of available columns:

In [3]:
# Fetch the list of available columns from the `variables` endpoint.
# The timeout prevents the cell from blocking indefinitely on a silent server.
response = requests.get(url + "variables", timeout=120)
if response.status_code != 200:
    raise Exception(f"Got weird response code: {response.status_code}")

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

[{'name': 'id', 'type': 'INTEGER'},
 {'name': 'parlmint_text_id', 'type': 'TEXT'},
 {'name': 'parlmint_id', 'type': 'TEXT'},
 {'name': 'date', 'type': 'TEXT'},
 {'name': 'parliament', 'type': 'TEXT'},
 {'name': 'vdem_country_id', 'type': 'INTEGER'},
 {'name': 'lang', 'type': 'TEXT'},
 {'name': 'speaker_role', 'type': 'TEXT'},
 {'name': 'speaker_mp', 'type': 'TEXT'},
 {'name': 'speaker_minister', 'type': 'TEXT'},
 {'name': 'speaker_party', 'type': 'TEXT'},
 {'name': 'speaker_party_name', 'type': 'TEXT'},
 {'name': 'party_status', 'type': 'TEXT'},
 {'name': 'party_orientation', 'type': 'TEXT'},
 {'name': 'partyfacts_id', 'type': 'TEXT'},
 {'name': 'speaker_id', 'type': 'TEXT'},
 {'name': 'speaker_name', 'type': 'TEXT'},
 {'name': 'speaker_gender', 'type': 'TEXT'},
 {'name': 'speaker_birth', 'type': 'INTEGER'},
 {'name': 'word_count', 'type': 'INTEGER'},
 {'name': 'cap_category', 'type': 'TEXT'},
 {'name': 'cap_prob', 'type': 'REAL'},
 {'name': 'sent_logit', 'type': 'REAL'},
 {'name': 'se

# Figuring out filtering

In [4]:
# Search the `text` field for "spoštovani" and ask for at most one result.
data = {"search": {"query": "spoštovani", "field": "text"}, "limit": 1}

# The timeout prevents the cell from blocking indefinitely on a silent server.
response = requests.post(url + "filter", json=data, timeout=120)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

[{'id': 24495,
  'parlmint_text_id': None,
  'parlmint_id': None,
  'date': '1998-07-07',
  'parliament': 'AT',
  'vdem_country_id': 144,
  'lang': 'German',
  'speaker_role': 'Regular',
  'speaker_mp': None,
  'speaker_minister': 'notMinister',
  'speaker_party': 'LIF',
  'speaker_party_name': 'parliamentary group of Liberal Forum',
  'party_status': '-',
  'party_orientation': 'Centre',
  'partyfacts_id': '605.0',
  'speaker_id': 'PAD_01894',
  'speaker_name': 'Smolle, Karl',
  'speaker_gender': 'M',
  'speaker_birth': 1944,
  'word_count': 1228,
  'cap_category': None,
  'cap_prob': None,
  'sent_logit': 1.795,
  'sent3_category': 'Neutral',
  'sent6_category': 'Neutral Negative',
  'text': 'Das ist sehr freundlich, Herr Präsident! – Spoštovani gospod predsednik! Visoki Dom! Gospod minister! Hohes Haus! Die Uneinigkeit der Regierungsparteien haben wir heute wieder demonstriert bekommen. Die einzige Einigkeit, die hier besteht, ist, daß man die Mittelschulbildung ein bißchen auffrisc

In [5]:
# Request body combining a nested filter, a text search, a column selection
# and paging. Named `filter_request` so the builtin `filter` is not shadowed.
filter_request = {
    "filter": {
        # Top level: any one of the three conditions below may hold.
        "operator": "OR",
        "filters": [
            {
                # Both conditions of this group must hold.
                "operator": "AND",
                "filters": [
                    {"column": "date", "operator": ">=", "value": "2015-11-01"},
                    {"column": "speaker_name", "operator": "LIKE", "value": "Duda"},
                ],
            },
            {"column": "parliament", "operator": "=", "value": "ES"},
            {"column": "parliament", "operator": "=", "value": "PL"},
        ],
    },
    "search": {"query": "panie i panowie", "field": "text"},
    "columns": ["speaker_name", "date", "word_count", "text", "cap_category"],
    "limit": 5,
    "offset": 0,
}

# The timeout prevents the cell from blocking indefinitely on a silent server.
response = requests.post(url + "filter", json=filter_request, timeout=120)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

[{'speaker_name': 'Duda, Andrzej',
  'date': '2015-11-12',
  'word_count': 1464,
  'text': 'Drodzy Rodacy! Panie Marszałku! Szanowni Państwo Marszałkowie poprzedniej i wcześniejszych kadencji Sejmu! Pani Premier! Państwo Premierzy obecni na tej sali! Panie Posłanki, Panowie Posłowie! Wszyscy Dostojni Goście! Ekscelencje! Dziękuję. Przede wszystkim dziękuję i panu marszałkowi, i Wysokiej Izbie za możliwość zabrania głosu w tak niezwykłym, uroczystym dniu, kiedy Sejm VIII kadencji rozpoczyna swoją pracę, powiem więcej: swoją służbę dla Rzeczypospolitej. Ale pozwólcie państwo, że w pierwszej kolejności słowa podziękowania skieruję do wyborców, do tych, którzy poszli do głosowania, do tych, którzy państwa wybrali, mając różne poglądy, głosując na różne ugrupowania. W efekcie doprowadzili do sformowania Sejmu. Dziękuję za to, dlatego że apelowałem przed wyborami jako prezydent Rzeczypospolitej o udział w wyborach, apelowała pani premier, apelowali właściwie wszyscy politycy, dlatego że wszy

# In search of null filtering

Filtering for null values does not seem to work: the request below keeps failing with an nginx "504 Gateway Time-out" error.

In [6]:
url = "https://parlacap.ipipan.waw.pl/"

# Try to select rows whose `cap_category` is null (None is serialised as JSON null).
# Named `null_filter` so the builtin `filter` is not shadowed.
# NOTE(review): if the backend turns this into a SQL `= NULL` comparison it can
# never match any row; a dedicated null operator may be needed. Confirm against
# the API documentation.
null_filter = {
    "filter": {"column": "cap_category", "value": None, "operator": "="},
    "limit": 1,
    "offset": 10,
}

# No client-side timeout is set here, so that the gateway's own response
# (rather than a client-side timeout exception) is what this cell reports.
response = requests.post(url + "filter", json=null_filter)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

Exception: Got weird response code: 504, response text: <html>
<head><title>504 Gateway Time-out</title></head>
<body>
<center><h1>504 Gateway Time-out</h1></center>
<hr><center>nginx</center>
</body>
</html>


# Checking hunches with sampling

Let's check, on a sample of 60 × 500 = 30,000 instances, whether we find any non-null `parlmint_text_id`, `parlmint_id`, `cap_prob`, or `cap_category` values (`sent_logit` is counted as well, for comparison):

In [7]:
import json
import requests

N = 60  # number of sample requests to issue
take_n = 500  # records requested per call
url = "https://parlacap.ipipan.waw.pl/"

# Running totals: records seen so far, and non-null values found per column.
examined = 0
textidcount, idcount, cap_category, cap_prob, sent_logit = 0, 0, 0, 0, 0


def count_non_null(records, column):
    """Return how many of `records` hold a non-None value under `column`."""
    return sum(record[column] is not None for record in records)


for batch in range(N):
    # The timeout prevents the loop from blocking indefinitely on a silent server.
    response = requests.get(url + "sample", params={"size": take_n}, timeout=120)
    if response.status_code != 200:
        raise Exception(
            f"Got weird response code: {response.status_code}, response text: {response.text}"
        )

    payload = response.json()
    textidcount += count_non_null(payload, "parlmint_text_id")
    idcount += count_non_null(payload, "parlmint_id")
    cap_category += count_non_null(payload, "cap_category")
    cap_prob += count_non_null(payload, "cap_prob")
    sent_logit += count_non_null(payload, "sent_logit")
    # Count what was actually returned rather than what was requested, so the
    # tally stays correct if the server ever sends fewer records.
    examined += len(payload)
    # The trailing carriage return makes each progress line overwrite the previous one.
    print(
        f"Examined: {examined} instances, found nonnulls: {textidcount=}, {idcount=}, {cap_prob=}, {cap_category=}, {sent_logit=}",
        end="\r",
    )

Examined: 30000 instances, found nonnulls: textidcount=0, idcount=0, cap_prob=0, cap_category=0, sent_logit=30000

# Testing issues found by MM

Let's test if downloading in CSV format works:

In [8]:
url = "https://parlacap.ipipan.waw.pl/"

# Select rows of the "SI" parliament: 100 rows, starting at offset 10.
# Named `download_request` so the builtin `filter` is not shadowed.
download_request = {
    "filter": {"column": "parliament", "value": "SI", "operator": "="},
    "limit": 100,
    "offset": 10,
}

# Request the selection from the `download` endpoint in CSV format. The timeout
# prevents the cell from blocking indefinitely on a silent server.
response = requests.post(
    url + "download",
    params={"format": "csv"},
    json=download_request,
    timeout=120,
)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Store the raw response bytes unchanged; later cells read this file.
with open("downloaded_data.csv", "wb") as f:
    f.write(response.content)


In [9]:
import pandas as pd

# Parse the previously downloaded CSV with pandas, ignoring its first 7 lines.
# NOTE(review): presumably the export begins with a non-tabular preamble of
# that length; confirm against the API documentation.
csv_path = "downloaded_data.csv"
df = pd.read_csv(csv_path, skiprows=7)

# Show the inferred column types, then the first two rows as the cell result.
print(df.dtypes)
df.head(2)

id                      int64
parlmint_text_id      float64
parlmint_id           float64
date                   object
parliament             object
vdem_country_id         int64
lang                   object
speaker_role           object
speaker_mp            float64
speaker_minister       object
speaker_party          object
speaker_party_name     object
party_status           object
party_orientation      object
partyfacts_id         float64
speaker_id             object
speaker_name           object
speaker_gender         object
speaker_birth         float64
word_count              int64
cap_category          float64
cap_prob              float64
sent_logit            float64
sent3_category         object
sent6_category         object
text                   object
text_en                object
dtype: object


Unnamed: 0,id,parlmint_text_id,parlmint_id,date,parliament,vdem_country_id,lang,speaker_role,speaker_mp,speaker_minister,...,speaker_gender,speaker_birth,word_count,cap_category,cap_prob,sent_logit,sent3_category,sent6_category,text,text_en
0,6502516,,,2000-10-27,SI,202,Slovenian,Regular,,Minister,...,M,1948.0,64,,,1.603,Neutral,Neutral Negative,Spoštovane poslanke in poslanci. Govorim v ime...,Dear Members and Members. I speak on behalf of...
1,6502517,,,2000-10-27,SI,202,Slovenian,Chairperson,,notMinister,...,M,1924.0,40,,,2.934,Neutral,Neutral Positive,Hvala lepa. Želi še kakšna poslanska skupina b...,Thank you very much. Does any other parliament...


In [10]:
import polars as pl

# Parse the same downloaded CSV with polars, again ignoring its first 7 lines.
# Note that `df` now refers to a polars frame instead of the pandas one.
csv_path = "downloaded_data.csv"
df = pl.read_csv(csv_path, skip_rows=7)
df.head(2)

id,parlmint_text_id,parlmint_id,date,parliament,vdem_country_id,lang,speaker_role,speaker_mp,speaker_minister,speaker_party,speaker_party_name,party_status,party_orientation,partyfacts_id,speaker_id,speaker_name,speaker_gender,speaker_birth,word_count,cap_category,cap_prob,sent_logit,sent3_category,sent6_category,text,text_en
i64,str,str,str,str,i64,str,str,str,str,str,str,str,str,f64,str,str,str,i64,i64,str,str,f64,str,str,str,str
6502516,,,"""2000-10-27""","""SI""",202,"""Slovenian""","""Regular""",,"""Minister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""",1618.0,"""PeterleAlojz""","""Peterle, Alojz Lojze""","""M""",1948,64,,,1.603,"""Neutral""","""Neutral Negative""","""Spoštovane poslanke in poslanc…","""Dear Members and Members. I sp…"
6502517,,,"""2000-10-27""","""SI""",202,"""Slovenian""","""Chairperson""",,"""notMinister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""",1618.0,"""BernikJožef""","""Bernik, Jožef""","""M""",1924,40,,,2.934,"""Neutral""","""Neutral Positive""","""Hvala lepa. Želi še kakšna pos…","""Thank you very much. Does any …"


In [11]:
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the column types are printed explicitly.
print(df.dtypes)

# Summary statistics of the speaker names, shown as the cell result.
df["speaker_name"].describe()

statistic,value
str,str
"""count""","""100"""
"""null_count""","""0"""
"""min""","""Anderlič, Anton"""
"""max""","""Rupel, Dimitrij"""


In [12]:
# The address and the request body are the same for every format, so they are
# built once. The names avoid shadowing the builtins `filter` and `format`.
url = "https://parlacap.ipipan.waw.pl/"
download_request = {
    "filter": {"column": "parliament", "value": "SI", "operator": "="},
    "limit": 10,
    "offset": 10,
}

# Download the same selection once per output format and save each response
# under a matching file extension.
for file_format in ["csv", "jsonl", "tsv", "parquet"]:
    # The timeout prevents the loop from blocking indefinitely on a silent server.
    response = requests.post(
        url + "download",
        params={"format": file_format},
        json=download_request,
        timeout=120,
    )
    if response.status_code != 200:
        raise Exception(
            f"Got weird response code: {response.status_code}, response text: {response.text}"
        )
    # Store the raw response bytes unchanged.
    with open(f"downloaded_data.{file_format}", "wb") as f:
        f.write(response.content)


In [13]:
# Load the downloaded parquet file with pandas and show its first two rows.
parquet_frame = pd.read_parquet("downloaded_data.parquet")
parquet_frame.head(2)

Unnamed: 0,id,parlmint_text_id,parlmint_id,date,parliament,vdem_country_id,lang,speaker_role,speaker_mp,speaker_minister,...,speaker_gender,speaker_birth,word_count,cap_category,cap_prob,sent_logit,sent3_category,sent6_category,text,text_en
0,6502516,,,2000-10-27,SI,202,Slovenian,Regular,,Minister,...,M,1948.0,64,,,1.603,Neutral,Neutral Negative,Spoštovane poslanke in poslanci. Govorim v ime...,Dear Members and Members. I speak on behalf of...
1,6502517,,,2000-10-27,SI,202,Slovenian,Chairperson,,notMinister,...,M,1924.0,40,,,2.934,Neutral,Neutral Positive,Hvala lepa. Želi še kakšna poslanska skupina b...,Thank you very much. Does any other parliament...


In [14]:
url = "https://parlacap.ipipan.waw.pl/"

# Same "SI" selection as before, but with an offset of 10 million, to see how
# the API answers a very large offset.
# Named `offset_request` so the builtin `filter` is not shadowed.
offset_request = {
    "filter": {"column": "parliament", "value": "SI", "operator": "="},
    "limit": 100,
    "offset": 10_000_000,
}

# The timeout prevents the cell from blocking indefinitely on a silent server.
response = requests.post(url + "filter", json=offset_request, timeout=120)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
response.json()

Exception: Got weird response code: 404, response text: {"detail":{"code":"NO_RESULTS","message":"No results found","details":null}}