In [3]:
import tensorflow as tf

2024-06-24 00:11:07.250809: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import csv
import pickle
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences

import transformers
from transformers import BertTokenizer, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import BertForTokenClassification, AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
from google.cloud import storage

In [6]:
torch.__version__

'2.3.1+cu121'

In [7]:
transformers.__version__

'4.41.2'

In [8]:
# The registry export is tab-separated. The delimiter is written as the "\t"
# escape rather than a literal tab so it stays visible in the source and
# survives editors that convert tabs to spaces.
document_data = pd.read_csv("court_data/court_data_2022/documents.csv", sep="\t")
document_data.head(10)

Unnamed: 0,doc_id,court_code,judgment_code,justice_kind,category_code,cause_num,adjudication_date,receipt_date,judge,doc_url,status,date_publ
0,102383845,503,5,2,40467.0,219/14855/21,2021-12-30 00:00:00+02,2022-01-02 00:00:00+02,Медінцева Н. М.,,0,2022-01-02 00:00:00+02
1,102391042,4856,5,4,40105.0,560/6776/21,2021-12-29 00:00:00+02,2022-01-02 00:00:00+02,Матохнюк Д.Б.,http://od.reyestr.court.gov.ua/files/51/0c9097...,1,2022-01-04 00:00:00+02
2,102390645,4824,5,1,,824/245/21,2021-12-22 00:00:00+02,2022-01-02 00:00:00+02,Нежура Вадим Анатолійович,http://od.reyestr.court.gov.ua/files/51/a53df6...,1,2022-01-04 00:00:00+02
3,102391822,2670,5,4,40114.0,640/4558/19,2021-11-02 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/904bf7...,1,2022-01-04 00:00:00+02
4,102391796,2670,3,4,40110.0,640/10043/21,2021-12-07 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/4c9376...,1,2022-01-04 00:00:00+02
5,102391789,2670,3,4,40110.0,640/18672/21,2021-12-01 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/f1fc47...,1,2022-01-04 00:00:00+02
6,102391769,2670,3,4,40088.0,640/3619/19,2021-11-10 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/98d4ee...,1,2022-01-04 00:00:00+02
7,102391764,2670,3,4,40114.0,640/4230/19,2021-11-05 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/dbaa6e...,1,2022-01-04 00:00:00+02
8,102391757,2670,3,4,40077.0,640/1726/20,2021-11-02 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/145f06...,1,2022-01-04 00:00:00+02
9,102388442,1570,3,4,40074.0,420/20761/21,2021-12-30 00:00:00+02,2022-01-02 00:00:00+02,Потоцька Н.В.,http://od.reyestr.court.gov.ua/files/51/5514da...,1,2022-01-04 00:00:00+02


In [9]:
document_data.shape

(5822649, 12)

In [10]:
import requests

In [11]:
document_data["doc_url"].to_numpy()

array([nan,
       'http://od.reyestr.court.gov.ua/files/51/0c9097fb28ccabfcd4db8d140fd667b7.rtf',
       'http://od.reyestr.court.gov.ua/files/51/a53df6d681ad7a5921e67e15a6bce992.rtf',
       ..., nan, nan, nan], dtype=object)

In [12]:
document_data["doc_url"][1]

'http://od.reyestr.court.gov.ua/files/51/0c9097fb28ccabfcd4db8d140fd667b7.rtf'

In [13]:
requests.get(document_data["doc_url"][1])

<Response [200]>

### Uploading data to Cloud Storage

In [14]:
# Object-name prefix prepended to every file uploaded by download_court_file.
SUBFOLDER = "data_1m_rtf/"

In [15]:
# Cloud Storage handle used by the upload helper (download_court_file writes into `bucket`).
# NOTE(review): Client() is created without explicit project/credentials —
# presumably it relies on the environment's default credentials; confirm.
storage_client = storage.Client()
bucket = storage_client.bucket("court_data_bert")

In [16]:
def download_court_file(url: str, doc_id: str, timeout: float = 60.0):
    """Download one court document over HTTP and upload it to the storage bucket.

    The object is written to the module-level ``bucket`` under the name
    ``SUBFOLDER + "<doc_id>_<last path segment of url>"``.

    Args:
        url: Address of the document to fetch.
        doc_id: Document identifier. It is passed through ``str()``, so
            integer ids work as well as strings.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send data before giving up. Optional; existing two-argument
            calls keep working.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an error status (``raise_for_status``).
    """
    rtf_court_raw_filename = url.split("/")[-1]
    result_filename = SUBFOLDER + str(doc_id) + "_" + rtf_court_raw_filename

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze a bulk download that calls this once per document.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Create the blob handle only once there is content to store.
    blob = bucket.blob(result_filename)
    blob.upload_from_string(response.content, content_type=response.headers.get('content-type'))

#### Uploading a batch of instances (rows 50000–55000)

In [17]:
document_data

Unnamed: 0,doc_id,court_code,judgment_code,justice_kind,category_code,cause_num,adjudication_date,receipt_date,judge,doc_url,status,date_publ
0,102383845,503,5,2,40467.0,219/14855/21,2021-12-30 00:00:00+02,2022-01-02 00:00:00+02,Медінцева Н. М.,,0,2022-01-02 00:00:00+02
1,102391042,4856,5,4,40105.0,560/6776/21,2021-12-29 00:00:00+02,2022-01-02 00:00:00+02,Матохнюк Д.Б.,http://od.reyestr.court.gov.ua/files/51/0c9097...,1,2022-01-04 00:00:00+02
2,102390645,4824,5,1,,824/245/21,2021-12-22 00:00:00+02,2022-01-02 00:00:00+02,Нежура Вадим Анатолійович,http://od.reyestr.court.gov.ua/files/51/a53df6...,1,2022-01-04 00:00:00+02
3,102391822,2670,5,4,40114.0,640/4558/19,2021-11-02 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/904bf7...,1,2022-01-04 00:00:00+02
4,102391796,2670,3,4,40110.0,640/10043/21,2021-12-07 00:00:00+02,2022-01-02 00:00:00+02,Клименчук Н.М.,http://od.reyestr.court.gov.ua/files/51/4c9376...,1,2022-01-04 00:00:00+02
...,...,...,...,...,...,...,...,...,...,...,...,...
5822644,108198825,444,5,2,40469.0,216/1387/22,2022-12-26 00:00:00+02,2022-12-31 00:00:00+02,КУЗНЕЦОВ Р. О.,,0,2024-04-22 00:00:00+03
5822645,108205032,1304,5,2,40469.0,461/7106/22,2022-12-29 00:00:00+02,2022-12-31 00:00:00+02,Стрельбицький В. В.,,0,2024-04-22 00:00:00+03
5822646,108199852,2609,5,2,40469.0,760/18063/22,2022-12-08 00:00:00+02,2022-12-31 00:00:00+02,Криворот О. О.,,0,2024-04-22 00:00:00+03
5822647,108199868,2609,5,2,40469.0,760/17205/22,2022-12-12 00:00:00+02,2022-12-31 00:00:00+02,Криворот О. О.,,0,2024-04-22 00:00:00+03


In [20]:
# Running count of download attempts. The download loop cell increments it
# but never resets it, so re-run this cell to start counting from zero again.
counter = 0

In [23]:
# Positional bounds of the rows handled by this run; edit to process another batch.
BATCH_START, BATCH_END = 50000, 55000
# Print a progress line every this many download attempts.
PROGRESS_EVERY = 1000

# Slice only the two columns that are needed. The previous version built a
# notna() mask over the entire frame just to obtain row labels it then
# used to index back into document_data.
batch = document_data.iloc[BATCH_START:BATCH_END][["doc_url", "doc_id"]]

for doc_url, doc_id in batch.itertuples(index=False, name=None):
    # Rows with a missing doc_url have nothing to fetch.
    if pd.isna(doc_url):
        continue

    # `counter` is initialised in its own cell and is not reset here, so it
    # keeps accumulating when this cell is re-run for another batch.
    # The check sits after the NaN skip so the same progress line is not
    # repeated for every URL-less row while counter is on a multiple.
    if counter % PROGRESS_EVERY == 0:
        print(f"Processed -> {counter}")

    try:
        download_court_file(doc_url, doc_id)
    except Exception as exc:
        # Best-effort batch: report the failure and carry on. Catching
        # Exception rather than using a bare `except` keeps
        # KeyboardInterrupt (kernel interrupt) able to stop the loop.
        print("Unable to download court data -> ", doc_url, doc_id, repr(exc))

    # Counts attempts, successful or not.
    counter = counter + 1

Processed -> 10000
Unable to download court data ->  http://od.reyestr.court.gov.ua/files/51/5cf907d20721aeeb220e7d0d1165f61a.rtf 102437528
Processed -> 11000
Unable to download court data ->  http://od.reyestr.court.gov.ua/files/51/eec9fdf8ebe13e19faf406648d0b519c.rtf 102437523
Unable to download court data ->  http://od.reyestr.court.gov.ua/files/51/d29987be9ffd5a39ec02e1a90b8fe4f2.rtf 102437520
Unable to download court data ->  http://od.reyestr.court.gov.ua/files/51/8e5922d5c63d3d22741e983300f15712.rtf 102437518
Processed -> 12000
Processed -> 13000
Unable to download court data ->  http://od.reyestr.court.gov.ua/files/51/2575037125b96fbc41a23418de678f78.rtf 102437519
Processed -> 14000
