# TXT to texts converter
**Task: Convert long document full of input articles to single files as .txt**

In [190]:
import os
import sys
import uuid
import re
import json

def new_uuid():
    return 'uuid:' + str(uuid.uuid4())

src_file = 'texts_pdf_to_txt.txt'
output_dir = 'texts'
if not os.path.isfile(src_file):
    raise Exception('Source file does not exist')
if not os.path.exists(output_dir):
   os.makedirs(output_dir, exist_ok=True)
else:
    # remove all files in the output directory
    for f in os.listdir(output_dir):
        os.remove(os.path.join(output_dir, f))

Read file + clean lines

In [191]:
with open(src_file, 'r') as f:
    lines = f.readlines()
    lines = [l.strip() for l in lines]

print(f'Loaded {len(lines)} lines from {src_file}')

Loaded 4295 lines from texts_pdf_to_txt.txt


Get article headers

In [192]:
article_header_lines = lines[:230]
article_headers = []
for line in article_header_lines:
    if not line or not re.match(r'^1\.\d+', line):
        continue

    line = re.sub(r'\d+$', '', line.strip()).strip()
    article_headers.append(line)

print(f'Found {len(article_headers)} article headers')

for l in article_headers[:5]:
    print(l)
print('...')
for l in article_headers[-5:]:
    print(l)


Found 101 article headers
1.1. Zelení chtějí hlavu Čunka
1.2. Kamery viděly do soukromí. Soud to potrestal
1.3. Test MF DNES: Přijímačky jsou těžké i pro dospělé
1.4. Sledují každý náš krok
1.5. Vedení ODS: Zvolte Klause opět prezidentem
...
1.97. Omluva
1.98. Čunek: Lhůta pro výpověď z bytu déle než dva roky
1.99. Česká spořitelna prodlužuje provoz na pobočkách
1.100. Vyšší mzdy ve Škodě ohrozí její rozvoj
1.101. Unimex Group loni vydělal téměř miliardu


Clean article lines

In [193]:
article_lines = lines[1012:]

def clean_lines_by_start(lines, start):
    cleaned = []
    for line in lines:
        if not line.startswith(start):
            cleaned.append(line)
    return cleaned

line_count = len(article_lines)

for start in ['Foto', '© 2022', 'Zpět']:
    article_lines = clean_lines_by_start(article_lines, start)

print(f'Remaining {len(article_lines)}/{line_count} article lines after cleaning')

Remaining 3012/3283 article lines after cleaning


Get articles

In [194]:
texts = '\n'.join(lines[1012:]).strip()
print(texts[:100])

# split texts into articles by headers
articles = []
article = []
for line in article_lines:
    # if not line:
    #     continue

    if re.match(r'^1\.\d+', line) and article and line in article_headers:
        if article:
            articles.append(article)
            article = []
    article.append(line)

if article:
    articles.append(article)

articles = [{'header': str(a[0]).strip().replace(' ', '_').replace('/', '_'), 
             'text': '\n'.join(a).strip()} 
             for a in articles]

articles = [a for a in articles if a['text']]

print(f'Found {len(articles)} articles')
print(articles[0])

1.1. Zelení chtějí hlavu Čunka
TISK , Datum: 02.04.2007 , Zdroj: Mladá fronta DNES , Strana: 1 , Aut
Found 101 articles
{'header': '1.1._Zelení_chtějí_hlavu_Čunka', 'text': '1.1. Zelení chtějí hlavu Čunka\nTISK , Datum: 02.04.2007 , Zdroj: Mladá fronta DNES , Strana: 1 , Autor: PAVLA KUBÁLKOVÁ , Vytištěno: 171 085 , Prodáno: 144 292 , Infotype:\nNepojmenováno , Datum importu: 01.01.0001 00:00 , Čtenost: 612 533 , Rubrika: Titulní  strana , Země: Česko , GRP: 6,81\nVicepremiér svými výroky o Romech rozčilil ODS, zelené, opozici i některé své lidovce a byl označen za xenofoba\n\nPraha - Poslancům a vedení Strany zelených už došla trpělivost s Jiřím Čunkem. Chtějí, aby odešel z vlády. Poslední kapkou\nbyl pro ně páteční výrok vicepremiéra Čunka o tom, že kdo chce být dotován od státu, musí se nejdřív „opálit“. Narážel tím na\nzneužívání sociálních dávek a Romy.\nVčera odpoledne proto předsednictvo Strany zelených rozhodlo, že je nutné o Čunkově dalším setrvání v pozici vicepremiéra a\nmin

Export articles to .txt files in `output_dir`

In [195]:
for article in articles:
    if not article['text'] or not article['header']:
        print(f'Skipping article without text or header')
        print(f'header: {article["header"]}')
        print(f'text: {article["text"]}')
        continue

    file_name = article['header'] + '.txt'
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'w') as f:
        f.write(article['text'])

print(f'Exported {len(os.listdir(output_dir))} articles to {output_dir}')

Exported 101 articles to texts


Export list of article headers

In [196]:
# export to os.path.dirname(src_file) + '/articles.txt'
article_list_file = os.path.join(os.path.dirname(src_file), 'article_headers.txt')
with open(article_list_file, 'w') as f:
    for article in articles:
        if article['header']:
            f.write(f"{article['header']}\n")

print(f'Exported article list to {article_list_file}')

Exported article list to article_headers.txt


Export articles in JSON

In [197]:
article_json_file = os.path.join(os.path.dirname(src_file), 'articles.json')
with open(article_json_file, 'w') as f:
    json.dump(articles, f, indent=2, ensure_ascii=False)

# XLS to csv sheets

In [200]:
import os
import pandas as pd

# Change the working directory to the data directory
# os.chdir('../../data/promeny_zpravodajstvi')

excel_file_path = 'ratings.xls'
out_dir = 'ratings_orig_csv'
if not os.path.exists(out_dir):
    os.makedirs(out_dir, exist_ok=True)
else:
    # remove all files in the output directory
    for f in os.listdir(out_dir):
        os.remove(os.path.join(out_dir, f))


def export_sheet_to_csv(file_path, sheet_name):
    """
    Export a single sheet from an Excel file to a CSV file.
    
    :param file_path: Path to the Excel file.
    :param sheet_name: Name of the sheet to export.
    """
    # Read the specific sheet from the Excel file
    df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    print(sheet_name, df.shape)
    
    # Define the output CSV file name based on the sheet name
    out_csv_file_name = os.path.join(out_dir, f"{sheet_name}.csv")

    # Save the DataFrame to a CSV file
    df.to_csv(out_csv_file_name, index=False, header=False)


# Get the list of sheet names
sheet_names = pd.ExcelFile(excel_file_path).sheet_names

# Iterate over each sheet and export it to a CSV file
for sheet_name in sheet_names:
    export_sheet_to_csv(excel_file_path, sheet_name)

print(f'Exported {len(os.listdir(out_dir))} CSV files to {out_dir}')

kriterium (101, 8)
odstavce (100, 8)
hard_news (100, 8)
politicka_zprava (100, 8)
soukromy_ramec (93, 9)
strategicke (93, 9)
vecne (93, 9)
funkce (93, 9)
vyvazenost (93, 9)
protichudna_hlediska (93, 9)
odborny_zdroj (93, 9)
citace_a_parafraze (93, 8)
pyramida (93, 9)
fakta_a_nazory (93, 9)
negativni_vyzneni (93, 9)
konflikt (93, 9)
nekompetence (93, 9)
skandal (93, 9)
dopad (100, 9)
obtezujici_okolnosti (100, 8)
problemy (100, 9)
rady (100, 9)
personalizace (100, 8)
soukromy_zivot (100, 8)
senzacnost (100, 8)
emoce (100, 8)
morbidnost (100, 8)
reakce (100, 9)
pozadavky (100, 8)
validace (100, 8)
lokalni_dopad (100, 8)
vzdelavani (100, 8)
kontext (100, 8)
dotazy (100, 8)
informace (100, 8)
podpora (100, 8)
Exported 36 CSV files to ratings_orig_csv


# Ratings csv to raters

In [221]:
import os
import sys
import uuid
import re
import pandas as pd
import os

src_dir = 'ratings_orig_csv'
output_dir = 'ratings_raters'
if not os.path.isfile(src_file):
    raise Exception('Source file does not exist')
if not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)
else:
    # remove all files in the output directory
    for f in os.listdir(output_dir):
        os.remove(os.path.join(output_dir, f))

Load file_names to add to ratings

In [222]:
article_headers_file = 'article_headers.txt'

with open(article_headers_file, 'r') as f:
    article_headers = f.readlines()
    article_headers = [l.strip() for l in article_headers]

print(f'Loaded {len(lines)} article headers from {article_headers_file}, \nsuch as: {article_headers[:5]}')

Loaded 4295 article headers from article_headers.txt, 
such as: ['1.1._Zelení_chtějí_hlavu_Čunka', '1.2._Kamery_viděly_do_soukromí._Soud_to_potrestal', '1.3._Test_MF_DNES:_Přijímačky_jsou_těžké_i_pro_dospělé', '1.4._Sledují_každý_náš_krok', '1.5._Vedení_ODS:_Zvolte_Klause_opět_prezidentem']


Load csv files and check them

In [223]:
# load csv files
csv_files = [f for f in os.listdir(src_dir) if f.endswith('.csv')]
print(f'Found {len(csv_files)} csv files')

dfs = {}

for file in csv_files:
    src_file = os.path.join(src_dir, file)
    df = pd.read_csv(src_file, header=None)
    print(f'Loaded {len(df)} rows from {src_file}')

    df = df.iloc[:, :3]
    df['file_name'] = article_headers[:len(df)]

    df.rename(columns={0: 'rater_0', 1: 'rater_1', 2: 'rater_2'}, inplace=True)
    df.set_index('file_name', inplace=True)
    df = df[['rater_0', 'rater_1', 'rater_2']]

    # drop rows that have NaN values in all rater_0, rater_1, rater_2 columns
    df = df.dropna(subset=['rater_0', 'rater_1', 'rater_2'], how='all')
    
    df = df.fillna(-1)  # replace NaN with -1
    # type all columns to str
    for col in ['rater_0', 'rater_1', 'rater_2']:
        df[col] = df[col].astype(int)
    df.replace(-1, 'N/A', inplace=True)

    print(df)

    # output_file = os.path.join(output_dir, file)
    # df.to_csv(output_file, index=False)
    dfs[file] = df

# TODO for each csv file, add text file names, drop empty rows and map number ratings to text equivalents (except "dostavce" column)
# columns starting from one: funkce, vyvazenost

Found 36 csv files
Loaded 93 rows from ratings_orig_csv/odborny_zdroj.csv
                                                   rater_0 rater_1 rater_2
file_name                                                                 
1.1._Zelení_chtějí_hlavu_Čunka                           0       0       0
1.5._Vedení_ODS:_Zvolte_Klause_opět_prezidentem          0       0       0
1.6._Květinko,_hýčkal_Zeman_Volfovou                     0       0       0
1.15._Tunel_v_Praze_pod_Letnou_se_může_začít_st...     N/A       0       0
1.24._Schwarzenberg_k_odsunu:_bylo_to_vyhnání            0       0       1
1.25._Zelení_už_mají_Čunka_dost                          0       0       0
1.26._Zeman_byl_u_zrodu_nové_strany_Jany_Volfové         0       0       0
1.28._ODS_podpořila_Klause_na_Hrad                       0       0       0
1.34._Policejní_prezident_by_mohl_mít_časově_om...       0       0       0
1.35._ČSSD_viní_Langra_z_ovlivňování_kauzy_Kubice        0       0       0
1.36._Transparency_Int._pr

Load prompt_defintions and replace numbers with answers

In [224]:
promp_definitions_file = 'prompt_definitions.json'
with open(promp_definitions_file, 'r') as f:
    prompt_definitions = json.load(f)

prompt_options = {}
for prompt in prompt_definitions["prompts"].keys():
    prompt_options[prompt] = {i: option['name'] for i, option in enumerate(prompt_definitions['prompts'][prompt]['options'])}

print(f'Loaded prompt options for {len(prompt_options)} prompts')
# for prompt in sorted(prompt_options.keys()):
#     print(prompt)
print(f'prompts: {list(prompt_options.keys())}')

# replace all values in columns rater_0, rater_1, rater_2 with text values from prompt_options[file]
for file, df in dfs.items():
    file = file.replace('.csv', '')
    print(f'{file}')
    if file == 'odstavce':
        continue

    df.replace(prompt_options[file], inplace=True)

# add prompt names to the columns
for file, df in dfs.items():
    file = file.replace('.csv', '')
    df['prompt'] = file
    # df.set_index('prompt', append=True, inplace=True)

# merge all dataframes into one
df = pd.concat(dfs.values())
df.set_index('prompt', append=True, inplace=True)
df.sort_index(inplace=True)

# export individual raters with both index columns
for rater in ['rater_0', 'rater_1', 'rater_2']:
    output_file = os.path.join(output_dir, f'{rater}.csv')
    rater_data = df[[rater]]
    rater_data = rater_data[rater_data[rater] != 'N/A']  # remove rows with N/A
    rater_data.to_csv(output_file, header=False)

Loaded prompt options for 36 prompts
prompts: ['kriterium', 'odstavce', 'hard_news', 'politicka_zprava', 'soukromy_ramec', 'strategicke', 'vecne', 'funkce', 'vyvazenost', 'protichudna_hlediska', 'odborny_zdroj', 'citace_a_parafraze', 'pyramida', 'fakta_a_nazory', 'negativni_vyzneni', 'konflikt', 'nekompetence', 'skandal', 'dopad', 'obtezujici_okolnosti', 'problemy', 'rady', 'personalizace', 'soukromy_zivot', 'senzacnost', 'emoce', 'morbidnost', 'reakce', 'pozadavky', 'validace', 'lokalni_dopad', 'vzdelavani', 'kontext', 'dotazy', 'informace', 'podpora']
odborny_zdroj
citace_a_parafraze
odstavce
vecne
podpora
strategicke
skandal
pozadavky
pyramida
obtezujici_okolnosti
politicka_zprava
emoce
kriterium
fakta_a_nazory
personalizace
morbidnost
lokalni_dopad
negativni_vyzneni
problemy
nekompetence
konflikt
validace
kontext
dotazy
soukromy_zivot
funkce
reakce
senzacnost
protichudna_hlediska
vzdelavani
rady
soukromy_ramec
dopad
informace
hard_news
vyvazenost
