# Testing the download of a sample record in Python

In [1]:
# Fetch one record from the API's `sample` endpoint to see what a row looks like.
import json
import requests

# Base URL of the API; endpoint names are appended to it directly.
url = "https://parlacap.ipipan.waw.pl/"

response = requests.get(url + "sample?size=1")
if response.status_code != 200:
    # Include the response body so a failure can be diagnosed from the message alone.
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and leave it as the last expression so the notebook displays it.
payload = response.json()
payload

[{'id': 5878951,
  'parlmint_text_id': None,
  'parlmint_id': None,
  'date': '2016-02-04',
  'parliament': 'PT',
  'vdem_country_id': 21,
  'lang': 'Portuguese',
  'speaker_role': 'Regular',
  'speaker_mp': None,
  'speaker_minister': 'notMinister',
  'speaker_party': 'GP-BE',
  'speaker_party_name': 'Grupo Parlamentar do Bloco de Esquerda',
  'party_status': 'Opposition',
  'party_orientation': 'Left to far-left',
  'partyfacts_id': '1310.0',
  'speaker_id': 'PedroManuelBastosRodriguesSoares',
  'speaker_name': 'Manuel Bastos Rodrigues Soares, Pedro',
  'speaker_gender': 'M',
  'speaker_birth': 1957,
  'word_count': 174,
  'cap_category': None,
  'cap_prob': None,
  'sent_logit': 1.209,
  'sent3_category': 'Negative',
  'sent6_category': 'Mixed Negative',
  'text': '… e esta tem sido a secular fratura do Douro . A democracia devolveu autonomia à Casa do Douro , conferindo -lhe natureza pública com função representativa de todos os produtores. Por via da igualdade do voto e da inelegi

# Checking the datatypes of available columns:

In [2]:
# Ask the `variables` endpoint for the list of available columns and their types.
response = requests.get(url + "variables")
if response.status_code != 200:
    # Include the response body so a failure can be diagnosed from the message alone.
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and leave it as the last expression so the notebook displays it.
payload = response.json()
payload

[{'name': 'id', 'type': 'INTEGER'},
 {'name': 'parlmint_text_id', 'type': 'TEXT'},
 {'name': 'parlmint_id', 'type': 'TEXT'},
 {'name': 'date', 'type': 'TEXT'},
 {'name': 'parliament', 'type': 'TEXT'},
 {'name': 'vdem_country_id', 'type': 'INTEGER'},
 {'name': 'lang', 'type': 'TEXT'},
 {'name': 'speaker_role', 'type': 'TEXT'},
 {'name': 'speaker_mp', 'type': 'TEXT'},
 {'name': 'speaker_minister', 'type': 'TEXT'},
 {'name': 'speaker_party', 'type': 'TEXT'},
 {'name': 'speaker_party_name', 'type': 'TEXT'},
 {'name': 'party_status', 'type': 'TEXT'},
 {'name': 'party_orientation', 'type': 'TEXT'},
 {'name': 'partyfacts_id', 'type': 'TEXT'},
 {'name': 'speaker_id', 'type': 'TEXT'},
 {'name': 'speaker_name', 'type': 'TEXT'},
 {'name': 'speaker_gender', 'type': 'TEXT'},
 {'name': 'speaker_birth', 'type': 'INTEGER'},
 {'name': 'word_count', 'type': 'INTEGER'},
 {'name': 'cap_category', 'type': 'TEXT'},
 {'name': 'cap_prob', 'type': 'REAL'},
 {'name': 'sent_logit', 'type': 'REAL'},
 {'name': 'se

# Figuring out filtering

In [3]:
# POST a search for "spoštovani" in the `text` field to the `filter` endpoint, with `limit` set to 1.
data = {
    "search": {"query": "spoštovani", "field": "text"},
    "limit": 1,
}
response = requests.post(f"{url}filter", json=data)

# Anything other than HTTP 200 is treated as a failure; the body is echoed for debugging.
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

[{'id': 24495,
  'parlmint_text_id': None,
  'parlmint_id': None,
  'date': '1998-07-07',
  'parliament': 'AT',
  'vdem_country_id': 144,
  'lang': 'German',
  'speaker_role': 'Regular',
  'speaker_mp': None,
  'speaker_minister': 'notMinister',
  'speaker_party': 'LIF',
  'speaker_party_name': 'parliamentary group of Liberal Forum',
  'party_status': '-',
  'party_orientation': 'Centre',
  'partyfacts_id': '605.0',
  'speaker_id': 'PAD_01894',
  'speaker_name': 'Smolle, Karl',
  'speaker_gender': 'M',
  'speaker_birth': 1944,
  'word_count': 1228,
  'cap_category': None,
  'cap_prob': None,
  'sent_logit': 1.795,
  'sent3_category': 'Neutral',
  'sent6_category': 'Neutral Negative',
  'text': 'Das ist sehr freundlich, Herr Präsident! – Spoštovani gospod predsednik! Visoki Dom! Gospod minister! Hohes Haus! Die Uneinigkeit der Regierungsparteien haben wir heute wieder demonstriert bekommen. Die einzige Einigkeit, die hier besteht, ist, daß man die Mittelschulbildung ein bißchen auffrisc

In [4]:
# Request body for the `filter` endpoint, combining a nested column filter with a text search.
# Named `filter_request` rather than `filter` so the Python builtin `filter` is not shadowed
# for the rest of the kernel session.
filter_request = {
    "filter": {
        # Top-level OR over three alternatives.
        "operator": "OR",
        "filters": [
            {
                # Alternative 1: both conditions must hold.
                "operator": "AND",
                "filters": [
                    {"column": "date", "operator": ">=", "value": "2015-11-01"},
                    # NOTE(review): the LIKE value has no % wildcards — confirm whether
                    # the API adds them or whether this only matches the exact string.
                    {"column": "speaker_name", "operator": "LIKE", "value": "Duda"},
                ],
            },
            # Alternatives 2 and 3: match on the parliament code.
            {"column": "parliament", "operator": "=", "value": "ES"},
            {"column": "parliament", "operator": "=", "value": "PL"},
        ],
    },
    "search": {"query": "panie i panowie", "field": "text"},
    # Only these columns are requested in the response.
    "columns": ["speaker_name", "date", "word_count", "text", "cap_category"],
    "limit": 5,
    "offset": 0,
}


response = requests.post(
    url + "filter",
    json=filter_request,
)
if response.status_code != 200:
    # Echo the response body so a failure can be diagnosed from the message alone.
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

[{'speaker_name': 'Duda, Andrzej',
  'date': '2015-11-12',
  'word_count': 1464,
  'text': 'Drodzy Rodacy! Panie Marszałku! Szanowni Państwo Marszałkowie poprzedniej i wcześniejszych kadencji Sejmu! Pani Premier! Państwo Premierzy obecni na tej sali! Panie Posłanki, Panowie Posłowie! Wszyscy Dostojni Goście! Ekscelencje! Dziękuję. Przede wszystkim dziękuję i panu marszałkowi, i Wysokiej Izbie za możliwość zabrania głosu w tak niezwykłym, uroczystym dniu, kiedy Sejm VIII kadencji rozpoczyna swoją pracę, powiem więcej: swoją służbę dla Rzeczypospolitej. Ale pozwólcie państwo, że w pierwszej kolejności słowa podziękowania skieruję do wyborców, do tych, którzy poszli do głosowania, do tych, którzy państwa wybrali, mając różne poglądy, głosując na różne ugrupowania. W efekcie doprowadzili do sformowania Sejmu. Dziękuję za to, dlatego że apelowałem przed wyborami jako prezydent Rzeczypospolitej o udział w wyborach, apelowała pani premier, apelowali właściwie wszyscy politycy, dlatego że wszy

# In search of null filtering

This does not seem to work: I keep getting nginx `504 Gateway Time-out` errors.

In [5]:
# Try to select rows whose `cap_category` is null by sending `None` (JSON null) as the value.
url = "https://parlacap.ipipan.waw.pl/"


# Named `null_filter` rather than `filter` so the Python builtin `filter` is not shadowed.
# NOTE(review): if the backend turns this into SQL `cap_category = NULL`, that comparison
# never matches any row, which could explain a long scan and a gateway timeout. An
# IS NULL-style operator may be needed — confirm against the API documentation.
null_filter = {
    "filter": {"column": "cap_category", "value": None, "operator": "="},
    "limit": 1,
    "offset": 10,
}

response = requests.post(
    url + "filter",
    json=null_filter,
)
if response.status_code != 200:
    # Echo the response body so a failure can be diagnosed from the message alone.
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and display it as the cell result.
payload = response.json()
payload

Exception: Got weird response code: 504, response text: <html>
<head><title>504 Gateway Time-out</title></head>
<body>
<center><h1>504 Gateway Time-out</h1></center>
<hr><center>nginx</center>
</body>
</html>


# Checking hunches with sampling

Let's check, on a sample of 60 × 500 = 30,000 instances, whether any `parlmint_text_id`, `parlmint_id`, `cap_prob`, or `cap_category` values are non-null:

In [6]:
# Pull N samples of `take_n` records each and count, per column of interest,
# how many of the returned records hold a non-null value.
import json
import requests

# Loop-invariant configuration, set once instead of on every iteration.
url = "https://parlacap.ipipan.waw.pl/"
N = 60  # number of sample requests to make
take_n = 500  # records requested per call

# Running totals across all requests.
examined = 0
textidcount, idcount, cap_category, cap_prob, sent_logit = 0, 0, 0, 0, 0

for _ in range(N):
    response = requests.get(url + f"sample?size={take_n}")
    if response.status_code != 200:
        raise Exception(
            f"Got weird response code: {response.status_code}, response text: {response.text}"
        )

    payload = response.json()
    # Summing booleans counts the non-null rows without building a throwaway list.
    textidcount += sum(row["parlmint_text_id"] is not None for row in payload)
    idcount += sum(row["parlmint_id"] is not None for row in payload)
    cap_category += sum(row["cap_category"] is not None for row in payload)
    cap_prob += sum(row["cap_prob"] is not None for row in payload)
    sent_logit += sum(row["sent_logit"] is not None for row in payload)
    # Count the rows actually returned rather than the number requested, so the
    # total stays truthful if the server ever sends back fewer than `take_n`.
    examined += len(payload)
    # `end="\r"` rewrites the same output line on each iteration as a progress readout.
    print(
        f"Examined: {examined} instances, found nonnulls: {textidcount=}, {idcount=}, {cap_prob=}, {cap_category=}, {sent_logit=}",
        end="\r",
    )

Examined: 30000 instances, found nonnulls: textidcount=0, idcount=0, cap_prob=0, cap_category=0, sent_logit=30000