In [1]:
from typing import Union, List
from pathlib import Path

import pandas as pd

# Input locations. Every input file lives in `directory`; the per-term text
# and metadata files are named after `xmlid`.
directory = ".."
xmlid = "5"

text_path = Path(directory, xmlid + "_text.txt")
meta_path = Path(directory, xmlid + "_meta.tsv")
# NOTE(review): the space after "final_" is reproduced as-is; presumably it
# matches the workbook's name on disk — confirm.
mp_path = Path(directory, "Croatia_MPs_final_ 20220917.xlsx")
parties_path = Path(directory, "Croatia_parties_final_20220917.xlsx")


def parse_meta_file(file: Union[str, Path]) -> pd.DataFrame:
    """Load a tab-separated metadata file into a DataFrame.

    Args:
        file: Location of the file, given either as a string or as a ``Path``.

    Returns:
        The table produced by ``pd.read_csv`` using a tab as the separator.

    Raises:
        AssertionError: If ``file`` is a ``Path`` that does not exist.
            String paths are not pre-checked and are handed to pandas as-is.
    """
    # Plain strings go straight to pandas without an existence check.
    if not isinstance(file, Path):
        return pd.read_csv(file, sep="\t")

    assert file.exists(), "The path does not exist!"
    return pd.read_csv(str(file), sep="\t")


def parse_text_file(file: Union[str, Path]) -> pd.DataFrame:
    """Read an "ID text..." file into a two-column DataFrame.

    Each non-blank line is split on whitespace: the first token becomes the
    ``ID`` and the remaining tokens, re-joined with single spaces, become the
    ``Text`` (so runs of whitespace inside the text are collapsed). A line
    that holds only an ID yields an empty ``Text``.

    Args:
        file: Location of the file, given either as a string or as a ``Path``.

    Returns:
        A DataFrame with string columns ``ID`` and ``Text``, one row per
        non-blank input line, in file order.

    Raises:
        AssertionError: If ``file`` is a ``Path`` that does not exist.
        FileNotFoundError: If ``file`` is a string naming a missing file
            (raised by ``open``).
    """
    if isinstance(file, Path):
        assert file.exists(), "The path does not exist!"
        file = str(file)

    ids: List[str] = []
    texts: List[str] = []
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and can mangle or reject non-ASCII characters.
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # carry no ID; skip them instead of failing on tokens[0].
            if not tokens:
                continue
            ids.append(tokens[0])
            texts.append(" ".join(tokens[1:]))

    return pd.DataFrame(data={
        "ID": ids,
        "Text": texts
    })
# Reference tables read from the two Excel workbooks.
mpdf = pd.read_excel(str(mp_path))
partiesdf = pd.read_excel(str(parties_path))

# Speech texts and their metadata, both keyed by "ID".
textdf = parse_text_file(text_path)
metadf = parse_meta_file(meta_path)

# Inner-join text with metadata on "ID", then add the term number as the
# integer column "term2" (used later as a merge key).
metatextdf = pd.merge(textdf, metadf, on="ID").assign(term2=int(xmlid))


In [2]:
# Notebook display settings: cap rendered cell values at 30 characters and
# place no limit on the number of columns shown.
pd.options.display.max_colwidth = 30
pd.options.display.max_columns = None


In [4]:
# Left-join the speeches with the MP table (term + MP code; the code column
# is spelled "Codemp" on the left and "codemp" on the right), then left-join
# the result with the parties table on term + party.
alldatamerged = (
    metatextdf
    .merge(mpdf, how="left", left_on=["term2", "Codemp"], right_on=["term2", "codemp"])
    .merge(partiesdf, how="left", on=["term2", "party"])
)

print(alldatamerged.shape)
alldatamerged.sample(3)

(96743, 53)


Unnamed: 0,ID,Text,Title,From,To,House,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Speaker_role,Speaker_type,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth,Codemp,Codeparty,term2,codemp,order_id,term1_x,term_id,type_of_list,fullname,firstname,lastname,party,date_of_birth,year_of_birth,gender,place_of_birth,field_of_study,education_y,constituency,bp_lat,bp_lon,codeparty,term1_y,full_name,established,chairman,ideology_LR,party_family,election_result,no_seats,coalition,coalition_composition,ruling
79638,ParlaMint-HR_T5.S23.u79639,"Potpredsjednik Doma, pošto...",Minutes of the National As...,2006-11-15,2006-12-15,,5,23,47,,Konačni prijedlog zakona o...,Reference,Chairperson,MP,HDZ,Hrvatska demokratska zajed...,Coalition,"Šeks, Vladimir",M,1943.0,M638,P3,5,M638,322.0,2003-2007,130.0,normal,"Šeks, Vladimir",Vladimir,Šeks,HDZ,19430101,1943.0,0.0,Osijek,5,16,4,45.554962,18.695514,P3,2003-2007,Hrvatska demokratska zajed...,1989.0,Ivo Sanader,4.0,1,33.91,66.0,0.0,-,1.0
38521,ParlaMint-HR_T5.S14.u38522,"Molim, tko je za taj amand...",Minutes of the National As...,2005-05-18,2005-06-10,,5,14,39,,Konačni prijedlog zakona o...,Reference,Chairperson,MP,HDZ,Hrvatska demokratska zajed...,Coalition,"Šeks, Vladimir",M,1943.0,M638,P3,5,M638,322.0,2003-2007,130.0,normal,"Šeks, Vladimir",Vladimir,Šeks,HDZ,19430101,1943.0,0.0,Osijek,5,16,4,45.554962,18.695514,P3,2003-2007,Hrvatska demokratska zajed...,1989.0,Ivo Sanader,4.0,1,33.91,66.0,0.0,-,1.0
80719,ParlaMint-HR_T5.S23.u80720,Zahvaljujem. Riječ ima uva...,Minutes of the National As...,2006-11-15,2006-12-15,,5,23,69; 70; 71; 72; 73,,Prijedlog odluke Hrvatskog...,Reference,Chairperson,MP,HDZ,Hrvatska demokratska zajed...,Coalition,"Milinović, Darko",M,1963.0,M115,P3,5,M115,282.0,2003-2007,90.0,normal,"Milinović, Darko",Darko,Milinović,HDZ,19630425,1963.0,0.0,Gospić,6,18,9,44.546934,15.375049,P3,2003-2007,Hrvatska demokratska zajed...,1989.0,Ivo Sanader,4.0,1,33.91,66.0,0.0,-,1.0


In [5]:
import classla

# Build a classla pipeline for "hr" that runs only the "tokenize" processor.
# If construction raises FileNotFoundError, download the "hr" resources and
# build the pipeline a second time.
# NOTE(review): presumably FileNotFoundError means the models have not been
# downloaded yet — confirm against the installed classla version.
try:
    pipeline = classla.Pipeline("hr", processors="tokenize")
except FileNotFoundError:
    classla.download('hr')
    # A failure of this second attempt is not caught and propagates.
    pipeline = classla.Pipeline("hr", processors="tokenize")

def split_sentences(s: str) -> List[str]:
    """Segment a text into sentences using the module-level ``pipeline``.

    Args:
        s: The text to segment.

    Returns:
        The ``text`` attribute of every entry in the processed document's
        ``sentences``, in order.
    """
    # `pipeline` is only read here, so no `global` declaration is required.
    document = pipeline.process(s)
    sentence_texts = []
    for sentence in document.sentences:
        sentence_texts.append(sentence.text)
    return sentence_texts
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects, then segment every
# speech text into sentences while reporting progress.
tqdm.pandas()
alldatamerged["sentences"] = alldatamerged["Text"].progress_apply(split_sentences)

2022-09-29 13:10:39 INFO: Loading these models for language: hr (Croatian):
| Processor | Package  |
------------------------
| tokenize  | standard |

2022-09-29 13:10:39 INFO: Use device: cpu
2022-09-29 13:10:39 INFO: Loading: tokenize
2022-09-29 13:10:39 INFO: Done loading processors!
100%|██████████| 96743/96743 [07:05<00:00, 227.33it/s] 


In [6]:
# Inspect the column labels of the merged frame.
alldatamerged.columns

Index(['ID', 'Text', 'Title', 'From', 'To', 'House', 'Term', 'Session',
       'Meeting', 'Sitting', 'Agenda', 'Subcorpus', 'Speaker_role',
       'Speaker_type', 'Speaker_party', 'Speaker_party_name', 'Party_status',
       'Speaker_name', 'Speaker_gender', 'Speaker_birth', 'Codemp',
       'Codeparty', 'term2', 'codemp', 'order_id', 'term1_x', 'term_id',
       'type_of_list', 'fullname', 'firstname', 'lastname', 'party',
       'date_of_birth', 'year_of_birth', 'gender', 'place_of_birth',
       'field_of_study', 'education_y', 'constituency', 'bp_lat', 'bp_lon',
       'codeparty', 'term1_y', 'full_name', 'established', 'chairman',
       'ideology_LR', 'party_family', 'election_result', 'no_seats',
       'coalition', 'coalition_composition', 'ruling', 'sentences'],
      dtype='object')

In [7]:
# Columns to store as pandas categoricals. The list is written out explicitly
# and leaves out the "Text" and "sentences" columns.
to_categ = [
    # Speech / session metadata.
    "ID", "Title", "From", "To", "House", "Term", "Session", "Meeting", "Sitting",
    "Agenda", "Subcorpus",
    # Speaker attributes from the metadata file.
    "Speaker_role", "Speaker_type", "Speaker_party", "Speaker_party_name",
    "Party_status", "Speaker_name", "Speaker_gender", "Speaker_birth",
    "Codemp", "Codeparty",
    # Term number added as a merge key.
    "term2",
    # Presumably contributed by the MP table — verify against the workbook.
    "codemp", "order_id", "term1_x", "term_id", "type_of_list", "fullname",
    "firstname", "lastname", "party", "date_of_birth", "year_of_birth",
    "gender", "place_of_birth", "field_of_study", "education_y",
    "constituency", "bp_lat", "bp_lon",
    # Presumably contributed by the parties table — verify against the workbook.
    "codeparty", "term1_y", "full_name", "established", "chairman",
    "ideology_LR", "party_family", "election_result", "no_seats",
    "coalition", "coalition_composition", "ruling",
]

# Replace each listed column, one at a time, with its categorical form.
for c in to_categ:
    alldatamerged[c] = pd.Categorical(alldatamerged[c])


In [11]:
# Serialize the merged frame to a pickle file named "01_merged_data"; the
# relative name is resolved against the current working directory.
alldatamerged.to_pickle("01_merged_data")


In [17]:
# Show the `sentences` value stored under index label 0.
alldatamerged.sentences[0]

['Cijenjene gospođe i gospodo.',
 'Pripala mi je čast da sukladno Poslovniku Hrvatskog sabora otvorim prvu Konstituirajuću sjednicu 5. saziva Hrvatskog sabora i privremeno joj predsjedavam do izbora predsjednika.',
 'Sve vas srdačno pozdravljam a izabranim zastupnicima čestitam na izboru za zastupnike u Hrvatski sabor.',
 'Posebno pozdravljam predsjednika Republike Hrvatske gospodina Stjepana Mesića (Pljesak) Predsjednika Vlade Republike Hrvatske Ivicu Račana i nazočne potpredsjednike i članove Vlade.(Pljesak) Mandatara za sastav Vlade gospodina Ivu Sanadera.(Pljesak) Dosadašnje potpredsjednike Hrvatskog sabora i prvoga predsjednika Hrvatskog sabora gospodina Žarka Domjana. (Pljesak) Pozdravljam nazočne čelnike stranaka nazočne u Hrvatskom saboru koji nisu zastupnici.',
 'Pozdravljam predsjednika Ustavnog suda Republike Hrvatske gospodina Petra Klarića. (Pljesak) Predsjednika Vrhovnog suda Republike Hrvatske gospodina Ivicu CRnića i nazočne predstavnike sudbene vlasti. (Pljesak).',
 'P