# Figuring out filtering

In [3]:
import requests

# Base URL of the API; later cells reuse this name.
url = "https://parlacap.ipipan.waw.pl/"

# Request body for the /filter endpoint: a NOT group wrapping a single
# IN condition on speaker_gender with the values "M" and "F",
# limited to 5 rows starting at offset 0.
filter = {
    "filter": {
        "operator": "NOT",
        "filters": [
            {"column": "speaker_gender", "operator": "IN", "value": ["M", "F"]},
        ],
    },
    "limit": 5,
    "offset": 0,
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
# With it, requests.exceptions.Timeout is raised after 120 s instead.
response = requests.post(
    url + "filter",
    json=filter,
    timeout=120,
)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and show it as the cell output.
payload = response.json()
payload

[{'id': 358439,
  'parlacap_id': 'ParlaCAP-BE_2014-09-03_00003',
  'parlamint_text_id': 'ParlaMint-BE_2014-09-03-54-commissie-ic004x',
  'parlamint_id': 'ParlaMint-BE_2014-09-03-54-commissie-ic004x.u3',
  'date': '2014-09-03',
  'parliament': 'BE',
  'vdem_country_id': 148,
  'lang': 'Multilingual',
  'speaker_role': 'Guest',
  'speaker_mp': 'notMP',
  'speaker_minister': 'notMinister',
  'speaker_party': None,
  'speaker_party_name': None,
  'party_status': None,
  'party_orientation': None,
  'partyfacts_id': None,
  'speaker_id': 'BensJan',
  'speaker_name': 'Bens, Jan',
  'speaker_gender': 'U',
  'speaker_birth': None,
  'word_count': 1508,
  'cap_category': 'Energy',
  'cap_prob': 0.989,
  'sent_logit': 2.157,
  'sent3_category': 'Neutral',
  'sent6_category': 'Neutral Negative',
  'text': 'Mijnheer de voorzitter, mevrouw de voorzitter, geachte leden, bedankt voor de uitnodiging om wat toelichting te komen geven over de toestand van de kerncentrales in België. Vooraleer ik aan die

# In search of null filtering

Filtering on null values (`"value": None`, as in the cell below) does not seem to work: I keep getting Nginx Gateway Timeout errors.

In [None]:



# Relies on `requests` and `url` already being defined by an earlier cell.

# Request body: a single "=" condition on cap_category whose value is None
# (serialised as JSON null), asking for 1 row at offset 10.
filter = {
    "filter": {"column": "cap_category", "value": None, "operator": "="},
    "limit": 1,
    "offset": 10,
}

# Bound the wait on the client side: without a timeout the call can block
# forever. On expiry requests.exceptions.Timeout is raised.
response = requests.post(
    url + "filter",
    json=filter,
    timeout=120,
)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Decode the JSON body and show it as the cell output.
payload = response.json()
payload

[{'id': 6675284,
  'parlacap_id': 'ParlaCAP-SI_2012-04-21_00011',
  'parlamint_text_id': 'ParlaMint-SI_2012-04-21-SDZ6-Izredna-42',
  'parlamint_id': 'ParlaMint-SI_2012-04-21-SDZ5-Izredna-42.u11',
  'date': '2012-04-21',
  'parliament': 'SI',
  'vdem_country_id': 202,
  'lang': 'Slovenian',
  'speaker_role': 'Chairperson',
  'speaker_mp': 'notMP',
  'speaker_minister': 'notMinister',
  'speaker_party': None,
  'speaker_party_name': None,
  'party_status': None,
  'party_orientation': None,
  'partyfacts_id': None,
  'speaker_id': 'GantarPavel',
  'speaker_name': 'Gantar, Pavel',
  'speaker_gender': 'M',
  'speaker_birth': 1949,
  'word_count': 13,
  'cap_category': None,
  'cap_prob': None,
  'sent_logit': 3.199,
  'sent3_category': 'Neutral',
  'sent6_category': 'Neutral Positive',
  'text': 'Hvala lepa. Stališče Poslanske skupine demokratične stranke upokojencev Slovenije bo predstavil Franc Jurša.',
  'text_en': 'Thank you very much. The position of the Group of the Democratic Party

# Checking hunches with sampling

Let's see on a sample of 60*500 instances (set `N = 60` in the cell below; it is currently `N = 1`) if we find any non-null `parlamint_text_id`, `parlamint_id`, `cap_prob`, `cap_category`, and `sent_logit` values:

In [5]:
# Imports hoisted out of the loop so they run once per cell execution.
import json  # not used in this cell; kept in case other cells rely on it
import requests

url = "https://parlacap.ipipan.waw.pl/"
N = 1  # number of /sample requests to make
take_n = 500  # rows requested per /sample call

# Running totals across all batches.
examined = 0
textidcount, idcount, cap_category, cap_prob, sent_logit = 0, 0, 0, 0, 0

for _ in range(N):
    # Bound the wait; requests.exceptions.Timeout is raised on expiry.
    response = requests.get(url + f"sample?size={take_n}", timeout=120)
    if response.status_code != 200:
        raise Exception(
            f"Got weird response code: {response.status_code}, response text: {response.text}"
        )

    payload = response.json()

    # Count rows where each field is present (not None). Booleans sum as 0/1,
    # so no intermediate lists are built.
    textidcount += sum(row["parlamint_text_id"] is not None for row in payload)
    idcount += sum(row["parlamint_id"] is not None for row in payload)
    cap_category += sum(row["cap_category"] is not None for row in payload)
    cap_prob += sum(row["cap_prob"] is not None for row in payload)
    sent_logit += sum(row["sent_logit"] is not None for row in payload)

    # Count the rows actually returned rather than the number requested, so
    # the totals stay comparable if the server sends back fewer rows.
    examined += len(payload)

    # "\r" rewrites the same output line on every iteration.
    print(
        f"Examined: {examined} instances, found nonnulls: {textidcount=}, {idcount=}, {cap_prob=}, {cap_category=}, {sent_logit=}",
        end="\r",
    )

Examined: 500 instances, found nonnulls: textidcount=500, idcount=500, cap_prob=500, cap_category=500, sent_logit=500

# Testing issues found by MM

Let's test if downloading in CSV format works:

In [None]:
# Relies on `requests` having been imported by an earlier cell.
url = "https://parlacap.ipipan.waw.pl/"

# Request body: rows where parliament = "SI", 100 rows starting at offset 10.
filter = {
    "filter": {"column": "parliament", "value": "SI", "operator": "="},
    "limit": 100,
    "offset": 10,
}

# Bound the wait on the client side; without a timeout the call can block
# forever. On expiry requests.exceptions.Timeout is raised.
response = requests.post(
    url + "download?format=csv",
    json=filter,
    timeout=120,
)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Write the raw response bytes unchanged, so the file on disk is exactly
# what the server sent.
with open("downloaded_data.csv", "wb") as f:
    f.write(response.content)


In [7]:
import pandas as pd

# Load the CSV file from disk with pandas' default parsing options.
df = pd.read_csv("downloaded_data.csv")

# Print the inferred column dtypes, then show the first two rows as the
# cell's rich output.
print(df.dtypes)
df.head(2)

id                      int64
parlacap_id            object
parlamint_text_id      object
parlamint_id           object
date                   object
parliament             object
vdem_country_id         int64
lang                   object
speaker_role           object
speaker_mp             object
speaker_minister       object
speaker_party          object
speaker_party_name     object
party_status           object
party_orientation      object
partyfacts_id         float64
speaker_id             object
speaker_name           object
speaker_gender         object
speaker_birth         float64
word_count              int64
cap_category           object
cap_prob              float64
sent_logit            float64
sent3_category         object
sent6_category         object
text                   object
text_en                object
dtype: object


Unnamed: 0,id,parlacap_id,parlamint_text_id,parlamint_id,date,parliament,vdem_country_id,lang,speaker_role,speaker_mp,...,speaker_gender,speaker_birth,word_count,cap_category,cap_prob,sent_logit,sent3_category,sent6_category,text,text_en
0,6502516,ParlaCAP-SI_2000-10-27_00011,ParlaMint-SI_2000-10-27-SDZ3-Redna-01,ParlaMint-SI_2000-10-27-SDZ3-Redna-01.u11,2000-10-27,SI,202,Slovenian,Regular,MP,...,M,1948.0,64,Government Operations,0.947,1.603,Neutral,Neutral Negative,Spoštovane poslanke in poslanci. Govorim v ime...,Dear Members and Members. I speak on behalf of...
1,6502517,ParlaCAP-SI_2000-10-27_00012,ParlaMint-SI_2000-10-27-SDZ3-Redna-01,ParlaMint-SI_2000-10-27-SDZ3-Redna-01.u12,2000-10-27,SI,202,Slovenian,Chairperson,MP,...,M,1924.0,40,Other,0.998,2.934,Neutral,Neutral Positive,Hvala lepa. Želi še kakšna poslanska skupina b...,Thank you very much. Does any other parliament...


In [None]:
import polars as pl

# Read the same CSV with polars.
# NOTE(review): skip_rows=7 starts reading after the first 7 lines, so with
# polars' default header handling the original header row is presumably
# skipped as well. Confirm this is the intended experiment.
df = pl.read_csv(
    "downloaded_data.csv",
    skip_rows=7,
)

# Show the first two rows as the cell output.
df.head(2)

In [None]:
# Relies on `df` from an earlier cell.
# Only the last expression of a cell is rendered, so a bare `df.dtypes`
# here would be evaluated and silently discarded. Print it explicitly.
print(df.dtypes)

# Summary statistics for the speaker_name column (shown as the cell output).
df["speaker_name"].describe()

In [17]:
# Relies on `requests` having been imported by an earlier cell.
# The URL and request body are the same for every format, so they are
# built once instead of on every loop iteration.
url = "https://parlacap.ipipan.waw.pl/"

# Request body: rows where parliament = "SI", 10 rows starting at offset 10.
filter = {
    "filter": {"column": "parliament", "value": "SI", "operator": "="},
    "limit": 10,
    "offset": 10,
}

# `fmt` rather than `format`, so the builtin format() is not shadowed.
for fmt in ["csv", "jsonl", "tsv", "parquet"]:
    # Bound the wait; requests.exceptions.Timeout is raised on expiry.
    response = requests.post(
        url + f"download?format={fmt}",
        json=filter,
        timeout=120,
    )
    if response.status_code != 200:
        raise Exception(
            f"Got weird response code: {response.status_code}, response text: {response.text}"
        )

    # Write the raw bytes so binary formats (e.g. parquet) are not corrupted.
    with open(f"downloaded_data.{fmt}", "wb") as f:
        f.write(response.content)

In [20]:
import pandas as pd

# Load the parquet file from disk and keep only its first two rows for display.
parquet_preview = pd.read_parquet("downloaded_data.parquet").head(2)
parquet_preview

Unnamed: 0,id,parlmint_text_id,parlmint_id,date,parliament,vdem_country_id,lang,speaker_role,speaker_mp,speaker_minister,...,speaker_gender,speaker_birth,word_count,cap_category,cap_prob,sent_logit,sent3_category,sent6_category,text,text_en
0,6502516,,,2000-10-27,SI,202,Slovenian,Regular,,Minister,...,M,1948.0,64,,,1.603,Neutral,Neutral Negative,Spoštovane poslanke in poslanci. Govorim v ime...,Dear Members and Members. I speak on behalf of...
1,6502517,,,2000-10-27,SI,202,Slovenian,Chairperson,,notMinister,...,M,1924.0,40,,,2.934,Neutral,Neutral Positive,Hvala lepa. Želi še kakšna poslanska skupina b...,Thank you very much. Does any other parliament...


In [None]:
# Relies on `requests` having been imported by an earlier cell.
url = "https://parlacap.ipipan.waw.pl/"

# Request body: rows where parliament = "SI", 100 rows at offset 10,000,000.
# NOTE(review): presumably this probes how the endpoint behaves for an offset
# past the end of the matching rows. Confirm.
filter = {
    "filter": {"column": "parliament", "value": "SI", "operator": "="},
    "limit": 100,
    "offset": 10_000_000,
}

# Bound the wait on the client side; without a timeout the call can block
# forever. On expiry requests.exceptions.Timeout is raised.
response = requests.post(
    url + "filter",
    json=filter,
    timeout=120,
)
if response.status_code != 200:
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

# Show the decoded JSON body as the cell output.
response.json()

# What do LIKE and IN do?

In [None]:
# Relies on `requests` and `url` already being defined by an earlier cell.

# Request body: an AND group of two IN conditions (speaker_party in
# {"NSi", "SDS"}, parliament in {"SI"}), restricted to the listed columns,
# ordered by id, 100 rows starting at offset 0.
filter = {
    "filter": {
        "operator": "AND",
        "filters": [
            {"column": "speaker_party", "operator": "IN", "value": ["NSi", "SDS"]},
            {"column": "parliament", "operator": "IN", "value": ["SI"]},
        ],
    },
    "columns": [
        "id",
        "speaker_role",
        "speaker_mp",
        "speaker_party",
        "speaker_party_name",
        "party_status",
        "speaker_name",
        "speaker_birth",
        "text",
    ],
    "order_by": ["id"],
    "limit": 100,
    "offset": 0,
}

# Bound the wait on the client side; without a timeout the call can block
# forever. On expiry requests.exceptions.Timeout is raised.
response = requests.post(
    url + "filter",
    json=filter,
    timeout=120,
)
if response.status_code != 200:
    # Use the decoded text (not the bytes repr of .content) so the message
    # is readable and matches the other cells' error format.
    raise Exception(
        f"Got weird response code: {response.status_code}, response text: {response.text}"
    )

payload = response.json()

# Distinct speaker_party values among the returned rows, to check which
# values the IN condition let through.
{row["speaker_party"] for row in payload}

In [1]:
import polars as pl

# Workbook read by this cell; no visible cell creates it, so it must already
# exist in the working directory.
metadata_path = "downloaded_data_metadata.xlsx"

# Load the workbook with polars and show the frame as the cell output.
df = pl.read_excel(metadata_path)
df

id,parlacap_id,parlamint_text_id,parlamint_id,date,parliament,vdem_country_id,lang,speaker_role,speaker_mp,speaker_minister,speaker_party,speaker_party_name,party_status,party_orientation,partyfacts_id,speaker_id,speaker_name,speaker_gender,speaker_birth,word_count,cap_category,cap_prob,sent_logit,sent3_category,sent6_category,text,text_en
i64,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,f64,f64,str,str,str,str
6502516,"""ParlaCAP-SI_2000-10-27_00011""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Regular""","""MP""","""Minister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""","""1618.0""","""PeterleAlojz""","""Peterle, Alojz Lojze""","""M""",1948.0,64,"""Government Operations""",0.947,1.603,"""Neutral""","""Neutral Negative""","""Spoštovane poslanke in poslanc…","""Dear Members and Members. I sp…"
6502517,"""ParlaCAP-SI_2000-10-27_00012""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Chairperson""","""MP""","""notMinister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""","""1618.0""","""BernikJožef""","""Bernik, Jožef""","""M""",1924.0,40,"""Other""",0.998,2.934,"""Neutral""","""Neutral Positive""","""Hvala lepa. Želi še kakšna pos…","""Thank you very much. Does any …"
6502518,"""ParlaCAP-SI_2000-10-27_00013""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Regular""","""MP""","""notMinister""","""LDS""","""Liberal Democracy of Slovenia""","""Coalition""","""Centre to centre-left""","""975.0""","""AnderličAnton""","""Anderlič, Anton""","""M""",1956.0,43,"""Other""",0.997,2.82,"""Neutral""","""Neutral Positive""","""Glede na predhodno obrazložite…","""In view of the previous explan…"
6502519,"""ParlaCAP-SI_2000-10-27_00014""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Chairperson""","""MP""","""notMinister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""","""1618.0""","""BernikJožef""","""Bernik, Jožef""","""M""",1924.0,242,"""Government Operations""",0.728,2.823,"""Neutral""","""Neutral Positive""","""Na glasovanje dajem predlog za…","""I am putting to the vote a pro…"
6502520,"""ParlaCAP-SI_2000-10-27_00015""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Regular""","""MP""","""notMinister""","""SLS+SKD""","""Slovenian People's Party and S…","""Coalition""","""Centre-right""","""764.0""","""PodobnikJanez""","""Podobnik, Janez""","""M""",1959.0,162,"""Government Operations""",0.982,2.843,"""Neutral""","""Neutral Positive""","""Spoštovani dr. Bernik, kolegic…","""Dear Dr. Bernick, colleagues a…"
6502521,"""ParlaCAP-SI_2000-10-27_00016""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Chairperson""","""MP""","""notMinister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""","""1618.0""","""BernikJožef""","""Bernik, Jožef""","""M""",1924.0,118,"""Other""",0.996,3.065,"""Neutral""","""Neutral Positive""","""Hvala lepa, gospod Podobnik. Ž…","""Thank you very much, Mr. Podob…"
6502522,"""ParlaCAP-SI_2000-10-27_00017""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Chairperson""","""MP""","""notMinister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""","""1618.0""","""BernikJožef""","""Bernik, Jožef""","""M""",1924.0,192,"""Government Operations""",0.975,2.644,"""Neutral""","""Neutral Positive""","""Spoštovani poslanci, malo potr…","""Ladies and gentlemen, I ask fo…"
6502523,"""ParlaCAP-SI_2000-10-27_00018""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Regular""","""MP""","""notMinister""","""DeSUS""","""Democratic Party of Pensioners…","""Coalition""","""Centre to centre-left""","""467.0""","""PohorecValentin""","""Pohorec, Valentin""","""M""",1941.0,379,"""Government Operations""",0.989,2.844,"""Neutral""","""Neutral Positive""","""Spoštovani gospod predsedujoči…","""Mr President, honourable Membe…"
6502524,"""ParlaCAP-SI_2000-10-27_00019""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Chairperson""","""MP""","""notMinister""","""NSi""","""New Slovenia – Christian Democ…","""Opposition""","""Centre-right""","""1618.0""","""BernikJožef""","""Bernik, Jožef""","""M""",1924.0,196,"""Other""",0.733,3.17,"""Neutral""","""Neutral Positive""","""Hvala gospodu predsedniku, Val…","""Thank you, Mr. President, Vale…"
6502525,"""ParlaCAP-SI_2000-10-27_00020""","""ParlaMint-SI_2000-10-27-SDZ3-R…","""ParlaMint-SI_2000-10-27-SDZ3-R…","""2000-10-27""","""SI""",202,"""Slovenian""","""Regular""","""notMP""","""notMinister""",,,,,,"""FrankarGašper""","""Frankar, Gašper""","""M""",,16,"""Other""",0.997,4.23,"""Positive""","""Mixed Positive""","""Gospod predsedujoči, hvala lep…","""Mr. Chairman, thank you very m…"
