In [1]:
from typing import Optional

import pandas as pd

def process(text: str):
    """Lowercase a text and strip punctuation from its tokens.

    The text is split on whitespace. A token matched by the ``parse``
    pattern ``"{hit:d}."`` (a number with a dot afterward) is only
    lowercased, so its dot survives. Every other token has all characters
    listed in ``string.punctuation`` removed and is then lowercased.
    The resulting tokens are joined with a single space.
    """
    from parse import compile as compile_pattern
    from string import punctuation

    number_dot = compile_pattern("{hit:d}.")
    # Deletion table for every character in string.punctuation.
    strip_table = str.maketrans("", "", punctuation)

    def clean(token):
        # Number followed by a dot: keep the dot, only lowercase.
        if number_dot.parse(token):
            return token.lower()
        return token.translate(strip_table).lower()

    return " ".join(clean(token) for token in text.split())



# Load the segment metadata: a JSON-lines file with one record per line.
df = pd.read_json("ParlaSpeech-HR.v1.0.jsonl", lines=True, orient="records")
# Segment length is end - start (presumably seconds - TODO confirm); keep
# only segments with a length of at least 8.
df["audio_length"] = df["end"] - df["start"]
df = df.loc[df["audio_length"].ge(8)]
# Expand every speaker_info entry into separate columns appended on the right.
df = pd.concat([df, df["speaker_info"].apply(pd.Series)], axis="columns")
df.head(3)

Unnamed: 0,path,orig_file,start,end,words,word_start_times,norm_words,norm_words_start_times,utterance_id_start,utterance_id_end,...,norm_words_edited,audio_length,Speaker_role,Speaker_type,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth
0,rFVDr4ghXlQ_10620.07-10632.05.wav,"20 2 2020 - 1. dio, 16. sjednica, 9. saziv [rF...",10620.07,10632.05,"[interpretacija,, inzistiranje, na, tim, dezin...","[0, 1.0, 1.83, 1.96, 2.22, 3.79, 4.01, 4.43, 5...","[interpretacija, inzistiranje, na, tim, dezinf...","[0, 1.0, 1.83, 1.96, 2.22, 3.79, 4.01, 4.43, 5...",ParlaMint-HR_S16.u4568,ParlaMint-HR_S16.u4568,...,,11.98,Regular,MP,HDZ,Klub Hrvatske demokratske zajednice,Coalition,"Tuđman, Miroslav",M,1946
1,Ki_SnDM_EkQ_2917.58-2937.5.wav,"18 10 2018 - 9. sjednica, 9. saziv [Ki_SnDM_Ek...",2917.58,2937.5,"[izraelska, tvrtka, prodaje, avione, Hrvatskoj...","[0, 0.65, 1.04, 1.54, 1.92, 2.51, 3.11, 3.5700...","[izraelska, tvrtka, prodaje, avione, hrvatskoj...","[0, 0.65, 1.04, 1.54, 1.92, 2.51, 3.11, 3.5700...",ParlaMint-HR_S09.u4267,ParlaMint-HR_S09.u4267,...,,19.92,Regular,MP,"Živi zid, SNAGA",Klub Živog zida i SNAGA-e,Opposition,"Bunjac, Branimir",M,1972
2,XguZsDKdRh4_13797.59-13811.74.wav,"20 1 2017 - 3. sjednica, 9. saziv [XguZsDKdRh4...",13797.59,13811.74,"[jučer, trećem,, sutra, ili, prekosutra, četvr...","[0, 0.43, 1.0, 1.4, 1.52, 2.01, 2.55, 3.12, 3....","[jučer, trećem, sutra, ili, prekosutra, četvrt...","[0, 0.43, 1.0, 1.4, 1.52, 2.01, 2.55, 3.12, 3....",ParlaMint-HR_S03.u9702,ParlaMint-HR_S03.u9702,...,,14.15,Regular,MP,HDZ,Klub Hrvatske demokratske zajednice,Coalition,"Bačić, Branko",M,1959


In [2]:

from datetime import datetime
def get_date_from_filename(s: str) -> Optional[datetime]:
    """Return the recording date found in ``s``, or ``None`` if there is none.

    ``s`` is searched with the ``parse`` pattern
    ``"{day:d} {month:d} {year:d} {rest}"``: three space-separated integers
    followed by further text, read as day, month and year.

    Args:
        s: Text to search, e.g. ``"20 2 2020 - 1. dio, 16. sjednica"``.

    Returns:
        A ``datetime`` (time part left at its default) for the matched date,
        or ``None`` when the pattern does not occur in ``s`` or the matched
        numbers do not form a valid calendar date.
    """
    from parse import compile

    pattern = "{day:d} {month:d} {year:d} {rest}"
    results = compile(pattern).search(s)
    if not results:
        return None

    try:
        return datetime(results["year"], results["month"], results["day"])
    except (ValueError, OverflowError):
        # The numbers matched but are not a real date (e.g. month 13 or
        # 31 February). Treat it like "no date found" instead of aborting
        # the caller's whole .apply().
        return None

# Derive each segment's recording date from its original file name.
df["recording_datetime"] = df["orig_file"].apply(get_date_from_filename)


In [3]:
def get_speaker_age_at_recording(row):
    """Return the speaker's age in the year of the recording.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``"recording_datetime"`` (an object with a ``.year`` attribute)
            and ``"Speaker_birth"`` (birth year, convertible with ``int``).

    Returns:
        Recording year minus birth year, or ``None`` when the birth year
        cannot be converted to an integer or the recording date is missing.
    """
    try:
        return row["recording_datetime"].year - int(row["Speaker_birth"])
    except (ValueError, TypeError, AttributeError):
        # ValueError: non-numeric birth year (e.g. "-") or NaN.
        # TypeError: birth year is None.
        # AttributeError: recording date is None and has no .year.
        return None

# Row-wise computation: recording year minus the speaker's birth year.
df["Speaker_age_at_recording"] = df.apply(get_speaker_age_at_recording, axis="columns")

In [4]:
# Keep only the segments for which a speaker age is available.
df = df[df["Speaker_age_at_recording"].notna()]

In [5]:
# Per-speaker summary: number of segments, median age at recording, and the
# gender found on the speaker's first row; most frequent speakers first.
gb = (
    df.groupby("Speaker_name")
    .agg(
        {
            "path": "count",
            "Speaker_age_at_recording": "median",
            "Speaker_gender": lambda genders: genders.iloc[0],
        }
    )
    .rename(columns={"path": "Count"})
    .sort_values(by="Count", ascending=False)
)
# Only speakers with more than 200 and fewer than 3000 segments are eligible.
gb = gb[gb["Count"].gt(200) & gb["Count"].lt(3000)]

C_is_female = gb["Speaker_gender"].eq("F")
C_is_male = gb["Speaker_gender"].eq("M")

# Per gender: the first 25 speakers (by segment count) go to the training set ...
females_train = gb[C_is_female].index[:25].tolist()
males_train = gb[C_is_male].index[:25].tolist()

# ... and the following 5 speakers are held out for the test set.
females_test = gb[C_is_female].index[25:30].tolist()
males_test = gb[C_is_male].index[25:30].tolist()

In [13]:
# Training split: the first 20 segments of every training speaker.
train = pd.concat(
    [df.loc[df["Speaker_name"] == name].iloc[:20] for name in females_train + males_train]
)
# Test split: the first 200 segments of every held-out speaker.
test = pd.concat(
    [df.loc[df["Speaker_name"] == name].iloc[:200] for name in females_test + males_test]
)

In [14]:
# Attach to every row the median age of its speaker, computed within the
# split the row belongs to. A grouped transform does this in a single pass;
# looping over the Speaker_name column would visit each speaker once per row
# and recompute the same median every time.
for split in (train, test):
    split["Speaker_median_age"] = (
        split.groupby("Speaker_name")["Speaker_age_at_recording"].transform("median")
    )

In [15]:
# Columns that are exported; everything else is dropped from both splits.
cols_to_keep = ["path", "Speaker_median_age", "Speaker_gender", "Speaker_name"]

# .copy() makes the subsets independent frames, so assigning to their columns
# afterwards does not raise pandas' SettingWithCopyWarning.
train = train[cols_to_keep].copy()
test = test[cols_to_keep].copy()

In [16]:
# Preview the first five rows of the training split.
train.head(5)

Unnamed: 0,path,Speaker_median_age,Speaker_gender,Speaker_name
276,on9WnAhubWg_12584.33-12599.73.wav,40.5,F,"Petrijevčanin Vuksanović, Irena"
298,IF1blZ6YSMA_15104.48-15124.11.wav,40.5,F,"Petrijevčanin Vuksanović, Irena"
397,IF1blZ6YSMA_15143.3-15163.11.wav,40.5,F,"Petrijevčanin Vuksanović, Irena"
501,wRedcoBr6Qk_10795.62-10814.97.wav,40.5,F,"Petrijevčanin Vuksanović, Irena"
503,9UCrtnKBgqs_15502.17-15521.49.wav,40.5,F,"Petrijevčanin Vuksanović, Irena"


In [17]:
# Number of segments per speaker in the training split.
train.groupby("Speaker_name")[["path"]].count()

Unnamed: 0_level_0,path
Speaker_name,Unnamed: 1_level_1
"Alfirev, Marija",20
"Babić, Ante",20
"Balić, Marijana",20
"Bauk, Arsen",20
"Bedeković, Vesna",20
"Bernardić, Davor",20
"Beus Richembergh, Goran",20
"Culej, Stevo",20
"Dobrović, Slaven",20
"Glavak, Sunčana",20


In [18]:
# Number of segments per speaker in the test split.
test.groupby("Speaker_name")[["path"]].count()

Unnamed: 0_level_0,path
Speaker_name,Unnamed: 1_level_1
"Demetlika, Tulio",200
"Glavašević, Bojan",200
"Horvat, Darko",200
"Vešligaj, Marko",200
"Vučetić, Marko",200


In [19]:
# Speakers present in both splits; an empty set means no speaker overlap.
set(train["Speaker_name"]) & set(test["Speaker_name"])

set()

In [20]:
# Write the file names of all selected segments (train first, then test),
# one name per line.
files_to_move = [*train.path.tolist(), *test.path.tolist()]
with open("005_files_to_move.txt", "w") as f:
    f.writelines(file + "\n" for file in files_to_move)

In [21]:
# Prefix that turns a bare segment file name into the path written to the CSVs.
SEGMENT_PATH_PREFIX = "/home/peterr/macocu/task11/data_age/seg."

for split in (train, test):
    split["path"] = split["path"].apply(lambda name: SEGMENT_PATH_PREFIX + name)

# Export both splits without the index column.
train.to_csv("005_train.csv", index=False)
test.to_csv("005_test.csv", index=False)