In [1]:
from google.cloud import storage
from striprtf.striprtf import rtf_to_text

import pandas as pd

In [2]:
RESULT_FORMAT = "txt"

In [3]:
def download_rtf(
    bucket_name: str, 
    source_blob_name: str,
    destination_file_name: str,
):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

In [4]:
def upload_txt_contents(
    bucket_name: str, 
    source_filepath: str, 
    destination_blob_name: str,
):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_filepath)

In [5]:
def convert_rtf_to_text(file_path):
    with open(file_path, 'r') as file:
        rtf_content = file.read()
    text = rtf_to_text(rtf_content)
    return text

In [6]:
def write_converted_file_to_tmp(
    contents: str,
    filepath: str,
):
    with open(filepath, "w", encoding="utf-8") as txt_file:
        txt_file.write(contents)

In [7]:
document_data = pd.read_csv("court_data/court_data_2022/documents.csv", sep="	")
document_data.head(10)

Unnamed: 0,doc_id,court_code,judgment_code,justice_kind,category_code,cause_num,adjudication_date,receipt_date,judge,doc_url,status,date_publ
0,102383845,503,5,2,40467.0,219/14855/21,2021-12-30 00:00:00+02,2022-01-02 00:00:00+02,Медінцева Н. М.,,0,2022-01-02 00:00:00+02
1,102391042,4856,5,4,40105.0,560/6776/21,2021-12-29 00:00:00+02,2022-01-02 00:00:00+02,Матохнюк Д.Б.,http://od.reyestr.court.gov.ua/files/51/0c9097...,1,2022-01-04 00:00:00+02
2,102390645,4824,5,1,,824/245/21,2021-12-22 00:00:00+02,2022-01-02 00:00:00+02,Нежура Вадим Анатолійович,http://od.reyestr.court.gov.ua/files/51/a53df6...,1,2022-01-04 00:00:00+02
3,102391822,2670,5,4,40114.0,640/4558/19,2021-11-02 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/904bf7...,1,2022-01-04 00:00:00+02
4,102391796,2670,3,4,40110.0,640/10043/21,2021-12-07 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/4c9376...,1,2022-01-04 00:00:00+02
5,102391789,2670,3,4,40110.0,640/18672/21,2021-12-01 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/f1fc47...,1,2022-01-04 00:00:00+02
6,102391769,2670,3,4,40088.0,640/3619/19,2021-11-10 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/98d4ee...,1,2022-01-04 00:00:00+02
7,102391764,2670,3,4,40114.0,640/4230/19,2021-11-05 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/dbaa6e...,1,2022-01-04 00:00:00+02
8,102391757,2670,3,4,40077.0,640/1726/20,2021-11-02 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/145f06...,1,2022-01-04 00:00:00+02
9,102388442,1570,3,4,40074.0,420/20761/21,2021-12-30 00:00:00+02,2022-01-02 00:00:00+02,Потоцька Н.В.,http://od.reyestr.court.gov.ua/files/51/5514da...,1,2022-01-04 00:00:00+02


### Converting from rtf to txt

In [8]:
BUCKET_NAME = "court_data_bert"
RTF_BLOB_FOLDER = "data_1m_rtf"
TXT_BLOB_FODLER = "data_1m_txt"
TMP_FOLDER = "tmp"
RESULT_PREFIX = "txt_"

In [9]:
for index, row in document_data[0:].notna().iterrows():
    if pd.isna(document_data.at[index, "doc_url"]):
        continue
    
    doc_filename = document_data["doc_url"][index].split("/")[-1]
    processing_filename = str(document_data["doc_id"][index]) + "_" + doc_filename
    processing_data_path = TMP_FOLDER + "/" + processing_filename 
    # print("processing file -> ", processing_filename)
    
    download_rtf(
        BUCKET_NAME,
        RTF_BLOB_FOLDER + "/" + processing_filename,
        processing_data_path,
    )
    
    try:
        txt_contents = convert_rtf_to_text(processing_data_path)
    except:
        print("Failing parsing: " + processing_data_path)
    
    parsed_result_filepath = RESULT_PREFIX + processing_data_path + ".txt"
    write_converted_file_to_tmp(txt_contents, parsed_result_filepath)
    
    upload_txt_contents(
        BUCKET_NAME,
        parsed_result_filepath,
        TXT_BLOB_FODLER + "/" + processing_filename + ".txt"
    )

Failing parsing: tmp/102383551_b930cf83a3c1c58e79ed0caac68c34ce.rtf


NotFound: 404 GET https://storage.googleapis.com/download/storage/v1/b/court_data_bert/o/data_1m_rtf%2F102397442_e5d27776510362a0262ff3befa5727ec.rtf?alt=media: No such object: court_data_bert/data_1m_rtf/102397442_e5d27776510362a0262ff3befa5727ec.rtf: ('Request failed with status code', 404, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.PARTIAL_CONTENT: 206>)