# Import Library

In [1]:
import pandas as pd
import numpy as np
import re
from pymongo import MongoClient
import sys
import os
import ast
from tabulate import tabulate
import torch

code_path = os.path.abspath('../')
sys.path.append(code_path)

from credentials import connection_string

In [2]:
# Check if CUDA is available
if torch.cuda.is_available():
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No CUDA devices available.")

Number of CUDA devices: 1
Device 0: NVIDIA GeForce RTX 4060 Laptop GPU


# Import Data

## Connect to MongoDB Cluster

In [2]:
database_name = "Singaraja"
collection_name = f"{database_name}_Raw"

In [4]:
client = MongoClient(connection_string)
db = client[database_name]
collection = db[collection_name]

## Get All Documents from Selected Collection

In [5]:
all_documents = collection.find()

## Convert to DataFrame

In [6]:
df = pd.DataFrame(list(all_documents))

# drop _id
if '_id' in df.columns:
    df.drop(columns=['_id'], inplace=True)

In [7]:
df

Unnamed: 0,status_perkara,nomor_perkara,klasifikasi_perkara,tanggal_pendaftaran,lama_proses,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,barang_bukti,dakwaan
0,Minutasi,118/Pid.Sus/2024/PN Sgr,Narkotika,"Senin, 05 Agu. 2024",44 Hari,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']","['I Made Bagiarta~Hakim Ketua~Tidak', 'Ni Made...",3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,MENGADILI:\nMenyatakan Terdakwa I NYOMAN SUART...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
1,Minutasi,117/Pid.Sus/2024/PN Sgr,Narkotika,"Kamis, 01 Agu. 2024",49 Hari,"['MADE SUDAMA ALIAS KARTOLO', 'PUTU WISNU ALIA...","['I MADE SUTAPA,S.H.']","['Made Hermayanti Muliartha~Hakim Ketua~Ya', '...",3,['PUTU WISNU ALIAS WISNU~Subsider Penjara (10 ...,MENGADILI:\nMenyatakan Terdakwa I Made Sudama ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
2,Minutasi,112/Pid.Sus/2024/PN Sgr,Narkotika,"Selasa, 30 Jul. 2024",44 Hari,"['I KOMANG TEKEN Alias TEKEN', 'KADEK ALBET SA...","['Kadek Adi Pramarta, S.H.']","['I Made Bagiarta~Hakim Ketua~Tidak', 'I Gusti...",4,['KADEK ALBET SANJAYA Alias ALBET~Pidana Penja...,M E N G A D I L I :\nMenyatakan terda...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...
3,Minutasi,101/Pid.B/2024/PN Sgr,Pencurian,"Kamis, 11 Jul. 2024",56 Hari,['ANDRIANI'],"['Nyoman Arif Budiman, S.H., M.H.']","['I Gusti Made Juliartawan~Hakim Ketua~Ya', 'N...",3,['ANDRIANI~Pidana Penjara Waktu Tertentu (1 Ta...,"MENGADILI:\nMenyatakan Terdakwa ANDRIANI, tela...",“Demi Keadilan dan Kebenaran\nBerdasarkan Ketu...
4,Minutasi,98/Pid.Sus/2024/PN Sgr,Narkotika,"Rabu, 10 Jul. 2024",64 Hari,['I NYOMAN IWAN MAHADI alias MANG IWAN'],"['I GUSTI NGURAH ARYA DIATMIKA,S.H.']","['I Made Bagiarta~Hakim Ketua~Ya', 'Made Herma...",3,['I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana ...,MENGADILI:\nMenyatakan Terdakwa I Nyoman Iwan ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
...,...,...,...,...,...,...,...,...,...,...,...,...
1265,Minutasi,237/Pid.B/2017/PN Sgr,Penggelapan,"Rabu, 20 Des. 2017",56 Hari,[Ayu Putu Erlyandani],"[IMAM EKA SETYAWAN, SH.]",[Ida Ayu Sri Adriyanthi Astuti Widja~Hakim Ket...,4,[Ayu Putu Erlyandani~Pidana Penjara Waktu Tert...,M E N G A D I L I:\nMenyatakan Terdakwa AYU PU...,Kesatu : Â \nPrimair.\n-------- Bahwa terdakwa...
1266,Minutasi,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"Rabu, 20 Des. 2017",97 Hari,[Ketut Mahardika Alias Kelet],"[I Nyoman Sulitra, SH.,MH.]",[Ida Ayu Sri Adriyanthi Astuti Widja~Hakim Ket...,10,[Ketut Mahardika Alias Kelet~Pidana Penjara Wa...,MENGADILI:\nÂ \nMenyatakan Terdakwa Ketut Maha...,Kesatu\n----- Bahwa terdakwa Â KETUT MAHARDIKA...
1267,Minutasi,227/Pid.B/2017/PN Sgr,Penggelapan,"Senin, 11 Des. 2017",52 Hari,[Iwan Hermato Alias Iwan],"[I GEDE PUTU ASTAWA, SH.]","[Ida Bagus Bama Dewa. P~Hakim Ketua~Ya, I Made...",3,[Iwan Hermato Alias Iwan~Pidana Kurungan (1 Ta...,MENGADILI\nMenyatakan Terdakwa Iwan Hermanto A...,Pertama : Pasal 372 Kitab Undang-undang Hukum ...
1268,Minutasi,226/Pid.Sus/2017/PN Sgr,Narkotika,"Senin, 11 Des. 2017",31 Hari,[Komang Irwin Pranata Alias Erwin],"[I GEDE PUTU ASTAWA, SH.]","[Sudar~Hakim Ketua~Ya, Anak Agung Ngurah Budhi...",2,[Komang Irwin Pranata Alias Erwin~Pidana Penja...,Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ...,Pertama : Pasal 112 ayat (1) UU RI No. 35 tahu...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1270 entries, 0 to 1269
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   status_perkara       1270 non-null   object
 1   nomor_perkara        1270 non-null   object
 2   klasifikasi_perkara  1270 non-null   object
 3   tanggal_pendaftaran  1270 non-null   object
 4   lama_proses          1270 non-null   object
 5   terdakwa             1270 non-null   object
 6   penuntut_umum        1270 non-null   object
 7   hakim                1270 non-null   object
 8   jumlah_saksi         1270 non-null   int64 
 9   putusan_hukuman      1270 non-null   object
 10  barang_bukti         1270 non-null   object
 11  dakwaan              1267 non-null   object
dtypes: int64(1), object(11)
memory usage: 119.2+ KB


In [9]:

print(df[df['nomor_perkara'] == '86/Pid.B/2019/PN Sgr']['barang_bukti'])


727    M E N G A D I L I :\nMenyatakan Terdakwa NYOMA...
Name: barang_bukti, dtype: object


# Data Understanding

In [10]:
df.head(5)

Unnamed: 0,status_perkara,nomor_perkara,klasifikasi_perkara,tanggal_pendaftaran,lama_proses,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,barang_bukti,dakwaan
0,Minutasi,118/Pid.Sus/2024/PN Sgr,Narkotika,"Senin, 05 Agu. 2024",44 Hari,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']","['I Made Bagiarta~Hakim Ketua~Tidak', 'Ni Made...",3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,MENGADILI:\nMenyatakan Terdakwa I NYOMAN SUART...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
1,Minutasi,117/Pid.Sus/2024/PN Sgr,Narkotika,"Kamis, 01 Agu. 2024",49 Hari,"['MADE SUDAMA ALIAS KARTOLO', 'PUTU WISNU ALIA...","['I MADE SUTAPA,S.H.']","['Made Hermayanti Muliartha~Hakim Ketua~Ya', '...",3,['PUTU WISNU ALIAS WISNU~Subsider Penjara (10 ...,MENGADILI:\nMenyatakan Terdakwa I Made Sudama ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
2,Minutasi,112/Pid.Sus/2024/PN Sgr,Narkotika,"Selasa, 30 Jul. 2024",44 Hari,"['I KOMANG TEKEN Alias TEKEN', 'KADEK ALBET SA...","['Kadek Adi Pramarta, S.H.']","['I Made Bagiarta~Hakim Ketua~Tidak', 'I Gusti...",4,['KADEK ALBET SANJAYA Alias ALBET~Pidana Penja...,M E N G A D I L I :\nMenyatakan terda...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...
3,Minutasi,101/Pid.B/2024/PN Sgr,Pencurian,"Kamis, 11 Jul. 2024",56 Hari,['ANDRIANI'],"['Nyoman Arif Budiman, S.H., M.H.']","['I Gusti Made Juliartawan~Hakim Ketua~Ya', 'N...",3,['ANDRIANI~Pidana Penjara Waktu Tertentu (1 Ta...,"MENGADILI:\nMenyatakan Terdakwa ANDRIANI, tela...",“Demi Keadilan dan Kebenaran\nBerdasarkan Ketu...
4,Minutasi,98/Pid.Sus/2024/PN Sgr,Narkotika,"Rabu, 10 Jul. 2024",64 Hari,['I NYOMAN IWAN MAHADI alias MANG IWAN'],"['I GUSTI NGURAH ARYA DIATMIKA,S.H.']","['I Made Bagiarta~Hakim Ketua~Ya', 'Made Herma...",3,['I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana ...,MENGADILI:\nMenyatakan Terdakwa I Nyoman Iwan ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...


**Dataset Features**
| Fitur | Deskripsi | Tipe Data |
|-----------------------|-----------------------------------------------------------------------|:------------:|
| `status_perkara` | Status dari perkara.| `str` |
| `nomor_perkara` | Nomor identifikasi unik untuk perkara.| `str` |
| `klasifikasi_perkara` | Kategori atau jenis perkara.| `str` |
| `tanggal_pendaftaran` | Tanggal ketika perkara didaftarkan di pengadilan.| `str` |
| `lama_proses` | Durasi waktu yang dibutuhkan untuk memproses perkara.| `str` |
| `terdakwa` | Nama individu yang didakwa dalam perkara tersebut.| `list[str]` |
| `penuntut_umum` | Nama jaksa atau penuntut yang menangani perkara.| `list[str]`  |
| `hakim` | Nama hakim yang memimpin sidang perkara.| `list[str]` |
| `jumlah_saksi` | Total jumlah saksi yang dihadirkan dalam sidang.| `int` |
| `putusan_hukuman` | Putusan atau keputusan yang dijatuhkan oleh hakim.| `list[str]` |
| `barang_bukti` | Amar Putusan yang berisi tentang barang bukti perkara.| `str` |
| `dakwaan`| Uraian mengenai tuduhan atau dakwaan yang diajukan terhadap terdakwa.| `str` |







# Data Preprocessing

## Data Cleaning

### Data Cleaning: Null Values

In [11]:
# Check for null values in the entire DataFrame
null_counts = df.isnull().sum()

print("Number of null values in each column:")
print(null_counts)

Number of null values in each column:
status_perkara         0
nomor_perkara          0
klasifikasi_perkara    0
tanggal_pendaftaran    0
lama_proses            0
terdakwa               0
penuntut_umum          0
hakim                  0
jumlah_saksi           0
putusan_hukuman        0
barang_bukti           0
dakwaan                3
dtype: int64


In [12]:
# Print rows with any null values
null_rows = df[df.isnull().any(axis=1)]

print("Rows with null values:")
print(null_rows)


Rows with null values:
    status_perkara            nomor_perkara klasifikasi_perkara  \
88        Minutasi    151/Pid.B/2023/PN Sgr        Penganiayaan   
123       Minutasi    101/Pid.B/2023/PN Sgr         Penggelapan   
296       Minutasi  116/Pid.Sus/2023/PN Sgr           Narkotika   

      tanggal_pendaftaran lama_proses                                terdakwa  \
88    Jumat, 08 Des. 2023     70 Hari               ['NENGAH IRPAN RUSMAWAN']   
123   Senin, 02 Okt. 2023     59 Hari     ['GEDE SUMARDANA PUTRA alias DESU']   
296  Selasa, 17 Okt. 2023     58 Hari  ['K. DEKY CHRISTINA ARTHA Alias DEKY']   

                                 penuntut_umum  \
88                   ['GEDE PUTU ASTAWA,S.H.']   
123                  ['MADE JUNI ARTINI.S.H.']   
296  ['I MADE HERI PERMANA PUTRA, S.H., M.H.']   

                                                 hakim  jumlah_saksi  \
88   ['Heriyanti~Hakim Ketua~Ya', 'Ni Made Kushanda...             6   
123  ['Ni Made Kushandari~Hakim Ketua~

In [13]:
df_copy = df.dropna()
# Check for null values in the entire DataFrame
null_counts = df_copy.isnull().sum()

print("Number of null values in each column:")
print(null_counts)

Number of null values in each column:
status_perkara         0
nomor_perkara          0
klasifikasi_perkara    0
tanggal_pendaftaran    0
lama_proses            0
terdakwa               0
penuntut_umum          0
hakim                  0
jumlah_saksi           0
putusan_hukuman        0
barang_bukti           0
dakwaan                0
dtype: int64


### Data Cleaning: Data Disamarkan

In [14]:
count_disamarkan = df_copy[df_copy["dakwaan"].str.lower().isin(["disamarkan"])]

# Print the count
print(f"Jumlah data 'Disamarkan': {count_disamarkan.shape[0]}")


Jumlah data 'Disamarkan': 100


In [15]:
print(count_disamarkan)


     status_perkara            nomor_perkara           klasifikasi_perkara  \
40         Minutasi   73/Pid.Sus/2024/PN Sgr             Perlindungan Anak   
54         Minutasi   62/Pid.Sus/2024/PN Sgr             Perlindungan Anak   
67         Minutasi   27/Pid.Sus/2024/PN Sgr             Perlindungan Anak   
71         Minutasi   37/Pid.Sus/2024/PN Sgr             Perlindungan Anak   
86         Minutasi  154/Pid.Sus/2023/PN Sgr             Perlindungan Anak   
...             ...                      ...                           ...   
1085       Minutasi  159/Pid.Sus/2018/PN Sgr             Perlindungan Anak   
1159       Minutasi   18/Pid.Sus/2018/PN Sgr  Kekerasan Dalam Rumah Tangga   
1192       Minutasi    1/Pid.Sus/2018/PN Sgr             Perlindungan Anak   
1210       Minutasi   64/Pid.Sus/2018/PN Sgr             Perlindungan Anak   
1227       Minutasi  241/Pid.Sus/2017/PN Sgr  Kekerasan Dalam Rumah Tangga   

       tanggal_pendaftaran lama_proses        terdakwa  \
40   

In [16]:
df_copy = df_copy[~df_copy["dakwaan"].str.lower().isin(["disamarkan"])]
df_copy.reset_index(drop=True, inplace=True)

In [17]:
count_disamarkan = df_copy[df_copy["dakwaan"].str.lower().isin(["disamarkan"])]

# Print the count
print(f"Jumlah data 'Disamarkan': {count_disamarkan.shape[0]}")

Jumlah data 'Disamarkan': 0


### Data Cleaning: Status Perkara

In [18]:
unique_status_perkara_values = df_copy['status_perkara'].unique()
num_unique_status_perkara = len(unique_status_perkara_values)

print(f"Number of unique values in 'status_perkara': {num_unique_status_perkara}")
print("Unique values:", unique_status_perkara_values)


Number of unique values in 'status_perkara': 1
Unique values: ['Minutasi']


### Data Cleaning: Nomor Perkara

In [19]:
unique_nomor_perkara_values = df_copy['nomor_perkara'].unique()
num_unique_nomor_perkara = len(unique_nomor_perkara_values)

print(f"Jumlah Data: {len(df_copy)}")
print(f"Jumlah Nilai Unique pada Kolom 'nomor_perkara': {num_unique_nomor_perkara}")

Jumlah Data: 1167
Jumlah Nilai Unique pada Kolom 'nomor_perkara': 1167


### Data Cleaning: Klasifikasi Perkara

In [20]:
unique_klasifikasi_perkara_values = df_copy['klasifikasi_perkara'].unique()
num_unique_klasifikasi_perkara = len(unique_klasifikasi_perkara_values)
unique_klasifikasi_counts = df_copy['klasifikasi_perkara'].value_counts()

print(f"Number of unique values in 'status_perkara': {num_unique_klasifikasi_perkara}")
print(unique_klasifikasi_counts)

Number of unique values in 'status_perkara': 36
klasifikasi_perkara
Pencurian                                                                                               332
Narkotika                                                                                               323
Penganiayaan                                                                                            104
Penggelapan                                                                                              88
Lalu Lintas                                                                                              76
Kejahatan Perjudian                                                                                      63
Lain-Lain                                                                                                55
Penipuan                                                                                                 28
Pembunuhan                                                          

In [21]:
df_copy = df_copy[~df_copy["klasifikasi_perkara"].isin(["Lalu Lintas"])]
df_copy.reset_index(drop=True, inplace=True)

In [22]:
unique_klasifikasi_perkara_values = df_copy['klasifikasi_perkara'].unique()
num_unique_klasifikasi_perkara = len(unique_klasifikasi_perkara_values)
unique_klasifikasi_counts = df_copy['klasifikasi_perkara'].value_counts()

print(f"Number of unique values in 'status_perkara': {num_unique_klasifikasi_perkara}")
print(unique_klasifikasi_counts)

Number of unique values in 'status_perkara': 35
klasifikasi_perkara
Pencurian                                                                                               332
Narkotika                                                                                               323
Penganiayaan                                                                                            104
Penggelapan                                                                                              88
Kejahatan Perjudian                                                                                      63
Lain-Lain                                                                                                55
Penipuan                                                                                                 28
Pembunuhan                                                                                               11
Tindak Pidana Senjata Api atau Benda Tajam                          

### Data Cleaning: Hakim

In [23]:
# Count unique data types in the column
unique_types = df_copy['hakim'].apply(type).unique()

# Display the unique types
print(f"Unique data types in the column: {unique_types}")

Unique data types in the column: [<class 'str'> <class 'list'>]


In [24]:
count_str_hakim = df_copy[df_copy["hakim"].apply(lambda x: isinstance(x, str))].shape[0]
print(f"Jumlah data dengan 'hakim' berupa string: {count_str_hakim}")


Jumlah data dengan 'hakim' berupa string: 157


In [25]:
for index, row in df_copy.iterrows():
    value = row["hakim"]
    
    # If the value is a string representation of a list, convert it back to a list
    if isinstance(value, str):
        try:
            print(f"Type: {type(value)}, Row {index} - Value: {value}")
            value = ast.literal_eval(value)  # Convert the string back to a list
            df_copy.at[index, "hakim"] = value  # Update the value in the DataFrame
            print(f"Type: {type(value)}, Row {index} - Value: {value}\n")
            
        except (ValueError, SyntaxError):
            print(f"Error in Row {index}: Not a valid list format.")
    

Type: <class 'str'>, Row 0 - Value: ['I Made Bagiarta~Hakim Ketua~Tidak', 'Ni Made Kushandari~Hakim Anggota~Tidak', 'Made Astina Dwipayana~Hakim Anggota~Tidak', 'I Made Bagiarta~Hakim Ketua~Ya', 'Ni Made Kushandari~Hakim Anggota~Ya', 'Ni Putu Asih Yudiastri~Hakim Anggota~Ya']
Type: <class 'list'>, Row 0 - Value: ['I Made Bagiarta~Hakim Ketua~Tidak', 'Ni Made Kushandari~Hakim Anggota~Tidak', 'Made Astina Dwipayana~Hakim Anggota~Tidak', 'I Made Bagiarta~Hakim Ketua~Ya', 'Ni Made Kushandari~Hakim Anggota~Ya', 'Ni Putu Asih Yudiastri~Hakim Anggota~Ya']

Type: <class 'str'>, Row 1 - Value: ['Made Hermayanti Muliartha~Hakim Ketua~Ya', 'Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya', 'Wayan Eka Satria Utama~Hakim Anggota~Ya']
Type: <class 'list'>, Row 1 - Value: ['Made Hermayanti Muliartha~Hakim Ketua~Ya', 'Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya', 'Wayan Eka Satria Utama~Hakim Anggota~Ya']

Type: <class 'str'>, Row 2 - Value: ['I Made Bagiarta~Hakim Ketua~Tidak', 'I Gusti Made Juliartawa

In [26]:
# Count unique data types in the column
unique_types = df_copy['hakim'].apply(type).unique()

# Display the unique types
print(f"Unique data types in the column: {unique_types}")

Unique data types in the column: [<class 'list'>]


In [27]:
unique_types = df_copy['hakim'].apply(len).unique()
print(unique_types)

[6 3 9]


In [28]:
for index, row in df_copy.iterrows():
    if len(row["hakim"]) == 6: 
        print(f"{index} -> {row['hakim']}")

0 -> ['I Made Bagiarta~Hakim Ketua~Tidak', 'Ni Made Kushandari~Hakim Anggota~Tidak', 'Made Astina Dwipayana~Hakim Anggota~Tidak', 'I Made Bagiarta~Hakim Ketua~Ya', 'Ni Made Kushandari~Hakim Anggota~Ya', 'Ni Putu Asih Yudiastri~Hakim Anggota~Ya']
2 -> ['I Made Bagiarta~Hakim Ketua~Tidak', 'I Gusti Made Juliartawan~Hakim Anggota~Tidak', 'Ni Putu Asih Yudiastri~Hakim Anggota~Tidak', 'I Made Bagiarta~Hakim Ketua~Ya', 'I Gusti Made Juliartawan~Hakim Anggota~Ya', 'I Gusti Ayu Kade Ari Wulandari~Hakim Anggota~Ya']
12 -> ['I Gusti Made Juliartawan~Hakim Ketua~Tidak', 'Ni Made Kushandari~Hakim Anggota~Tidak', 'Made Astina Dwipayana~Hakim Anggota~Tidak', 'I Gusti Made Juliartawan~Hakim Ketua~Ya', 'Ni Made Kushandari~Hakim Anggota~Ya', 'Ni Putu Asih Yudiastri~Hakim Anggota~Ya']
13 -> ['I Gusti Made Juliartawan~Hakim Ketua~Tidak', 'Ni Made Kushandari~Hakim Anggota~Tidak', 'Made Astina Dwipayana~Hakim Anggota~Tidak', 'I Gusti Made Juliartawan~Hakim Ketua~Ya', 'Ni Made Kushandari~Hakim Anggota~Ya', 

In [29]:
for index, row in df_copy.iterrows():
    if len(row["hakim"]) == 9: 
        print(f"{index} -> {row['hakim']}")

479 -> ['I Gede Yuliartha~Hakim Ketua~Tidak', 'A.A. Ayu Merta Dewi~Hakim Anggota~Tidak', 'Anak Agung Ngurah Budhi Dharmawan~Hakim Anggota~Tidak', 'I Gede Yuliartha~Hakim Ketua~Tidak', 'Anak Agung Ngurah Budhi Dharmawan~Hakim Anggota~Tidak', 'Wayan Eka Satria Utama~Hakim Anggota~Tidak', 'I Gede Yuliartha~Hakim Ketua~Ya', 'Anak Agung Ngurah Budhi Dharmawan~Hakim Anggota~Ya', 'I Nyoman Dipa Rudiana~Hakim Anggota~Ya']


In [30]:
for index, row in df_copy.iterrows():
    i = len(row["hakim"])
    for j in range(i):
        print(row["hakim"][j])
    print("")

I Made Bagiarta~Hakim Ketua~Tidak
Ni Made Kushandari~Hakim Anggota~Tidak
Made Astina Dwipayana~Hakim Anggota~Tidak
I Made Bagiarta~Hakim Ketua~Ya
Ni Made Kushandari~Hakim Anggota~Ya
Ni Putu Asih Yudiastri~Hakim Anggota~Ya

Made Hermayanti Muliartha~Hakim Ketua~Ya
Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya
Wayan Eka Satria Utama~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Tidak
I Gusti Made Juliartawan~Hakim Anggota~Tidak
Ni Putu Asih Yudiastri~Hakim Anggota~Tidak
I Made Bagiarta~Hakim Ketua~Ya
I Gusti Made Juliartawan~Hakim Anggota~Ya
I Gusti Ayu Kade Ari Wulandari~Hakim Anggota~Ya

I Gusti Made Juliartawan~Hakim Ketua~Ya
Ni Made Kushandari~Hakim Anggota~Ya
I Gusti Ayu Kade Ari Wulandari~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Ya
Made Hermayanti Muliartha~Hakim Anggota~Ya
Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Ya
Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya
Pulung Yustisia Dewi~Hakim Anggota~Ya

I Gusti Made Juliartawan~Hakim Ketua~Ya
I Gust

In [31]:
df_copy.loc[:, 'hakim'] = df_copy['hakim'].apply(lambda x: [entry for entry in x if '~Tidak' not in entry])

In [32]:
for index, row in df_copy.iterrows():
    i = len(row["hakim"])
    for j in range(i):
        print(row["hakim"][j])
    print("")

I Made Bagiarta~Hakim Ketua~Ya
Ni Made Kushandari~Hakim Anggota~Ya
Ni Putu Asih Yudiastri~Hakim Anggota~Ya

Made Hermayanti Muliartha~Hakim Ketua~Ya
Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya
Wayan Eka Satria Utama~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Ya
I Gusti Made Juliartawan~Hakim Anggota~Ya
I Gusti Ayu Kade Ari Wulandari~Hakim Anggota~Ya

I Gusti Made Juliartawan~Hakim Ketua~Ya
Ni Made Kushandari~Hakim Anggota~Ya
I Gusti Ayu Kade Ari Wulandari~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Ya
Made Hermayanti Muliartha~Hakim Anggota~Ya
Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Ya
Anak Agung Ayu Sri Sudanthi~Hakim Anggota~Ya
Pulung Yustisia Dewi~Hakim Anggota~Ya

I Gusti Made Juliartawan~Hakim Ketua~Ya
I Gusti Ayu Kade Ari Wulandari~Hakim Anggota~Ya
Made Astina Dwipayana~Hakim Anggota~Ya

I Gusti Ayu Kade Ari Wulandari~Hakim Ketua~Ya
Made Astina Dwipayana~Hakim Anggota~Ya
Ni Putu Asih Yudiastri~Hakim Anggota~Ya

I Made Bagiarta~Hakim Ketua~Y

In [33]:
unique_types = df_copy['hakim'].apply(len).unique()
print(unique_types)

[3]


In [34]:
unique_types = df_copy['hakim'].apply(type).unique()
print(unique_types)

[<class 'list'>]


In [35]:
df_copy.loc[:, 'hakim'] = df_copy['hakim'].apply(lambda x: [value for value in x if '~Hakim Anggota' not in value])

In [36]:
for index, row in df_copy.iterrows():
    print(row['hakim'])

['I Made Bagiarta~Hakim Ketua~Ya']
['Made Hermayanti Muliartha~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['I Gusti Ayu Kade Ari Wulandari~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['Made Hermayanti Muliartha~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['I Gusti Ayu Kade Ari Wulandari~Hakim Ketua~Ya']
['Made Hermayanti Muliartha~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim Ketua~Ya']
['Heriyanti~Hakim Ketua~Ya']
['I Made Bagiarta~Hakim Ketua~Ya']
['I Gusti Made Juliartawan~Hakim K

In [37]:
unique_types = df_copy['hakim'].apply(len).unique()
print(unique_types)

[1]


In [38]:
df_copy.loc[:, 'hakim'] = df_copy['hakim'].apply(
    lambda x: [value.split('~Hakim')[0].strip() for value in x]
)

In [39]:
for index, row in df_copy.iterrows():
    print(row['hakim'])

['I Made Bagiarta']
['Made Hermayanti Muliartha']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Made Bagiarta']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Gusti Ayu Kade Ari Wulandari']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['Made Hermayanti Muliartha']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Gusti Made Juliartawan']
['I Gusti Ayu Kade Ari Wulandari']
['Made Hermayanti Muliartha']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Made Bagiarta']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Gusti Made Juliartawan']
['Heriyanti']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['Made Hermayanti Muliartha']
['I Gusti Made Juliartawan']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Made Bagiarta']
['I Gusti Made Juliartawan']
['I Made Bagiarta']
['I Made Bagiarta']
['Made Hermayanti Muliartha']
['I Gusti Made Juliartawan']
['Made Hermayanti Muliartha']
['I Gusti Made Juliartawan']
['I Gusti Made Juliartawan']
['Heriyanti']
['I Gus

In [40]:
df_copy['hakim'] = df_copy['hakim'].apply(lambda x: str(x[0]) if isinstance(x, list) and len(x) > 0 else str(x))

In [41]:
unique_types = df_copy['hakim'].apply(type).unique()
print(unique_types)

[<class 'str'>]


In [42]:
for index, row in df_copy.iterrows():
    print(row['hakim'])

I Made Bagiarta
Made Hermayanti Muliartha
I Made Bagiarta
I Gusti Made Juliartawan
I Made Bagiarta
I Made Bagiarta
I Gusti Made Juliartawan
I Gusti Ayu Kade Ari Wulandari
I Made Bagiarta
I Gusti Made Juliartawan
Made Hermayanti Muliartha
I Made Bagiarta
I Gusti Made Juliartawan
I Gusti Made Juliartawan
I Gusti Ayu Kade Ari Wulandari
Made Hermayanti Muliartha
I Made Bagiarta
I Gusti Made Juliartawan
I Made Bagiarta
I Made Bagiarta
I Gusti Made Juliartawan
I Gusti Made Juliartawan
Heriyanti
I Made Bagiarta
I Gusti Made Juliartawan
Made Hermayanti Muliartha
I Gusti Made Juliartawan
I Made Bagiarta
I Gusti Made Juliartawan
I Made Bagiarta
I Gusti Made Juliartawan
I Made Bagiarta
I Made Bagiarta
Made Hermayanti Muliartha
I Gusti Made Juliartawan
Made Hermayanti Muliartha
I Gusti Made Juliartawan
I Gusti Made Juliartawan
Heriyanti
I Gusti Made Juliartawan
Ni Made Kushandari
I Made Bagiarta
I Made Bagiarta
I Gusti Made Juliartawan
I Made Bagiarta
I Gusti Made Juliartawan
I Made Bagiarta
I Mad

### Data Cleaning: Barang Bukti

In [43]:
text = df_copy.iloc[-1]['barang_bukti']
print(text)

M E N G A D I L I
Â 
Menyatakan Terdakwa Jumat Ariyanto dengan identitas sebagaimana tersebut diatas telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana â€œdengan sengaja mengangkut hasil hutan kayu yangÂ  tidak dilengkapi bersama dengan surat keterangan sahnya hasil hutanâ€;
Menjatuhkan pidana kepada Terdakwa oleh karena itu dengan pidana penjara selamaÂ  1 (satu) Â tahun dan denda sebesar Rp.500.000.000 (lima ratus juta rupiah) dengan ketentuan apabila denda tersebut tidak dibayar, maka akan diganti dengan pidana penjara pengganti selama 3 (tiga) bulan;
Â Menetapkan lamanya Terdakwa berada dalam penahanan dikurangkan seluruhnya dari pidana yang dijatuhkan ;
Menetapkan Terdakwa tetap berada dalam tahanan;
Menetapkan barang bukti berupa:
1 (satu) 1 (satu) batang kayu balang-balang bentuk bulan ukuran panjang 3 meter berdiameter 18 cm dengan volume 0,073 M3 ;
1 (satu) batang kayu balang-bala

In [44]:
cleaned_text = re.sub(r'[^a-zA-Z0-9., /\\()\"\'\n-]+', '', text)

In [45]:
print(df[df['nomor_perkara'] == '86/Pid.B/2019/PN Sgr']['barang_bukti'])


727    M E N G A D I L I :\nMenyatakan Terdakwa NYOMA...
Name: barang_bukti, dtype: object


In [46]:
print(cleaned_text)

M E N G A D I L I
 
Menyatakan Terdakwa Jumat Ariyanto dengan identitas sebagaimana tersebut diatas telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana dengan sengaja mengangkut hasil hutan kayu yang  tidak dilengkapi bersama dengan surat keterangan sahnya hasil hutan
Menjatuhkan pidana kepada Terdakwa oleh karena itu dengan pidana penjara selama  1 (satu)  tahun dan denda sebesar Rp.500.000.000 (lima ratus juta rupiah) dengan ketentuan apabila denda tersebut tidak dibayar, maka akan diganti dengan pidana penjara pengganti selama 3 (tiga) bulan
 Menetapkan lamanya Terdakwa berada dalam penahanan dikurangkan seluruhnya dari pidana yang dijatuhkan 
Menetapkan Terdakwa tetap berada dalam tahanan
Menetapkan barang bukti berupa
1 (satu) 1 (satu) batang kayu balang-balang bentuk bulan ukuran panjang 3 meter berdiameter 18 cm dengan volume 0,073 M3 
1 (satu) batang kayu balang-balang bentuk balok u

In [47]:
cleaned_text = re.sub(r' +', ' ', cleaned_text)

In [48]:
print(cleaned_text)

M E N G A D I L I
 
Menyatakan Terdakwa Jumat Ariyanto dengan identitas sebagaimana tersebut diatas telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana dengan sengaja mengangkut hasil hutan kayu yang tidak dilengkapi bersama dengan surat keterangan sahnya hasil hutan
Menjatuhkan pidana kepada Terdakwa oleh karena itu dengan pidana penjara selama 1 (satu) tahun dan denda sebesar Rp.500.000.000 (lima ratus juta rupiah) dengan ketentuan apabila denda tersebut tidak dibayar, maka akan diganti dengan pidana penjara pengganti selama 3 (tiga) bulan
 Menetapkan lamanya Terdakwa berada dalam penahanan dikurangkan seluruhnya dari pidana yang dijatuhkan 
Menetapkan Terdakwa tetap berada dalam tahanan
Menetapkan barang bukti berupa
1 (satu) 1 (satu) batang kayu balang-balang bentuk bulan ukuran panjang 3 meter berdiameter 18 cm dengan volume 0,073 M3 
1 (satu) batang kayu balang-balang bentuk balok ukur

In [49]:
pattern = r"Menetapkan barang bukti berupa\s*(.*?)(?=\n\s*[A-Z]|$)"
# pattern = r"Menetapkan barang bukti berupa\s*(.*?)(?=\n[A-Z])"
match = re.search(pattern, cleaned_text, re.DOTALL)

if match:
    print(match.group().strip())
else:
    print("Tidak ada kecocokan ditemukan.")

Menetapkan barang bukti berupa
1 (satu) 1 (satu) batang kayu balang-balang bentuk bulan ukuran panjang 3 meter berdiameter 18 cm dengan volume 0,073 M3 
1 (satu) batang kayu balang-balang bentuk balok ukuran panjang 3 meter lebar 21 cm, tinggi 14 cm dengan volume 0,0882 M3


In [50]:
text = df_copy.iloc[3]['barang_bukti']
print(text)

MENGADILI:
Menyatakan Terdakwa ANDRIANI, telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana “Pencurian” sebagaimana dalam dakwaan tunggal Penuntut Umum;
Menjatuhkan pidana kepada Terdakwa, oleh karena itu dengan pidana penjara selama 1 ( satu ) tahun dan 2 ( dua ) bulan;
Menetapkan masa penangkapan dan penahanan yang telah dijalani oleh Terdakwa dikurangkan seluruhnya dari pidana penjara yang dijatuhkan;
Menetapkan Terdakwa tetap ditahan ;
Menetapkan barang bukti berupa:
1 (satu) buah gelang emas (batu ungu) berat 5 gram;
1 (satu) buah gelang emas bangle berat 3,5 gram;
1 (satu) buah gelang rantai emas berat 9,8 gram;
1 (satu) buah cincin emas mata Batu merah berat 2,5 gram;
2 (dua) pasang anting-anting emas;
1 (satu) buah dompet warna merah muda dengan motif  kupu-kupu;
1 (satu) lembar nota dari toko perhiasan emas “Dewata Ayu” tanggal 10 Oktober 2022 dengan nota No. KP 185.
1 (satu) kalung rantai ditaksir perhiasan emas 9 karat berat 12,2 / 12.0 Gram;
1 (satu) 

In [51]:
patterns = [
    r"barang\s*bukti\s*berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"Barang\s*bukti\s*berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"Barang\s*Bukti\s*berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"Barang\s*Bukti\s*Berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"barang\s*Bukti\s*Berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"barang\s*bukti\s*Berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"barang\s*bukti,\s*berupa\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"barang\s*bukti\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)",
    r"barang-bukti\s*:?\s*(.*?)(?=\n\s*[A-Z]|$)"
]

# List to store indices of rows to drop
rows_to_drop = []

# Iterate over rows
for index, row in df_copy.iterrows():
    text_barang_bukti = row['barang_bukti']
    
    # Clean the 'barang_bukti' text
    cleaned_barang_bukti = re.sub(r'[^a-zA-Z0-9., /\\()\"\'\n-]+', '', text_barang_bukti)
    
    match_found = False

    # Iterate over the patterns and try to find a match
    for pattern in patterns:
        match = re.search(pattern, cleaned_barang_bukti, re.DOTALL)

        if match:
            df_copy.loc[index, 'cleaned_barang_bukti'] = match.group(1).strip().replace('\n', ' ')  # Store the match
            match_found = True
            break

    
    if not match_found:
        # Print info for unmatched rows
        print(f"{index}. Tidak ada kecocokan ditemukan. {row['nomor_perkara']}")
        rows_to_drop.append(index)  # Collect row index to drop
        print(cleaned_barang_bukti)

# Drop rows after the loop completes
df_copy.drop(rows_to_drop, inplace=True)
print(f"rows dropped: {rows_to_drop}")
print(f"rows dropped: {len(rows_to_drop)}")

39. Tidak ada kecocokan ditemukan. 68/Pid.B/2024/PN Sgr
MENGADILI
Menyatakan Terdakwa I Nyoman Eryanto Alias Munuk telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana penganiayaan sebagaimana dalam dakwaan tunggal
Menjatuhkan pidana terhadap Terdakwa dengan pidana penjara selama 4  (empat) bulan       
Menetapkan masa penangkapan dan penahanan yang telah dijalani Terdakwa dikurangkan seluruhnya dari pidana yang dijatuhkan
Menetapkan agar Terdakwa tetap ditahan
Membebankan kepada Terdakwa untuk membayar biaya perkara sebesar Rp. 5.000. (lima ribu rupiah)
99. Tidak ada kecocokan ditemukan. 149/Pid.B/2023/PN Sgr
MENGADILI
Menyatakan Terdakwa PUTU HERRY SUARTHANA Alias HERIK sebagaimana identitas tersebut di atas, telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana penganiayaan sebagaimana dalam dakwaan tunggal  Penuntut Umum 
Menjatuhkan pidana terhadap Terdakwa oleh karena itu dengan pidana penjara selama 1 ( satu ) tahun
Menetapkan lamanya mas

In [52]:
df_copy.iloc[0]['cleaned_barang_bukti']

'38 (tiga puluh delapan) potongan pipet plastik bening yang masing-masing di dalamnya berisi butiran kristal bening diduga narkotika jenis shabu dengan berat'

In [53]:
print(df_copy[df_copy['nomor_perkara'] == '86/Pid.B/2019/PN Sgr']['cleaned_barang_bukti'])


631    
Name: cleaned_barang_bukti, dtype: object


In [54]:
df_copy

Unnamed: 0,status_perkara,nomor_perkara,klasifikasi_perkara,tanggal_pendaftaran,lama_proses,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,barang_bukti,dakwaan,cleaned_barang_bukti
0,Minutasi,118/Pid.Sus/2024/PN Sgr,Narkotika,"Senin, 05 Agu. 2024",44 Hari,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,MENGADILI:\nMenyatakan Terdakwa I NYOMAN SUART...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,38 (tiga puluh delapan) potongan pipet plastik...
1,Minutasi,117/Pid.Sus/2024/PN Sgr,Narkotika,"Kamis, 01 Agu. 2024",49 Hari,"['MADE SUDAMA ALIAS KARTOLO', 'PUTU WISNU ALIA...","['I MADE SUTAPA,S.H.']",Made Hermayanti Muliartha,3,['PUTU WISNU ALIAS WISNU~Subsider Penjara (10 ...,MENGADILI:\nMenyatakan Terdakwa I Made Sudama ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,1 (satu) buah plastik klip bening berisi butir...
2,Minutasi,112/Pid.Sus/2024/PN Sgr,Narkotika,"Selasa, 30 Jul. 2024",44 Hari,"['I KOMANG TEKEN Alias TEKEN', 'KADEK ALBET SA...","['Kadek Adi Pramarta, S.H.']",I Made Bagiarta,4,['KADEK ALBET SANJAYA Alias ALBET~Pidana Penja...,M E N G A D I L I :\nMenyatakan terda...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,1. 1 (satu) buah buah pipet kaca berisi residu...
3,Minutasi,101/Pid.B/2024/PN Sgr,Pencurian,"Kamis, 11 Jul. 2024",56 Hari,['ANDRIANI'],"['Nyoman Arif Budiman, S.H., M.H.']",I Gusti Made Juliartawan,3,['ANDRIANI~Pidana Penjara Waktu Tertentu (1 Ta...,"MENGADILI:\nMenyatakan Terdakwa ANDRIANI, tela...",“Demi Keadilan dan Kebenaran\nBerdasarkan Ketu...,1 (satu) buah gelang emas (batu ungu) berat 5 ...
4,Minutasi,98/Pid.Sus/2024/PN Sgr,Narkotika,"Rabu, 10 Jul. 2024",64 Hari,['I NYOMAN IWAN MAHADI alias MANG IWAN'],"['I GUSTI NGURAH ARYA DIATMIKA,S.H.']",I Made Bagiarta,3,['I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana ...,MENGADILI:\nMenyatakan Terdakwa I Nyoman Iwan ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,1 (satu) buah bong alat hisap sabu 1 (satu) pi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1086,Minutasi,237/Pid.B/2017/PN Sgr,Penggelapan,"Rabu, 20 Des. 2017",56 Hari,[Ayu Putu Erlyandani],"[IMAM EKA SETYAWAN, SH.]",Ni Made Dewi Sukrani,4,[Ayu Putu Erlyandani~Pidana Penjara Waktu Tert...,M E N G A D I L I:\nMenyatakan Terdakwa AYU PU...,Kesatu : Â \nPrimair.\n-------- Bahwa terdakwa...,1 (satu) buah Buku Simpanan atau Tabungan Kopr...
1087,Minutasi,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"Rabu, 20 Des. 2017",97 Hari,[Ketut Mahardika Alias Kelet],"[I Nyoman Sulitra, SH.,MH.]",Ni Made Dewi Sukrani,10,[Ketut Mahardika Alias Kelet~Pidana Penjara Wa...,MENGADILI:\nÂ \nMenyatakan Terdakwa Ketut Maha...,Kesatu\n----- Bahwa terdakwa Â KETUT MAHARDIKA...,"1 (satu) buah pedang dengan panjang 55 cm, den..."
1088,Minutasi,227/Pid.B/2017/PN Sgr,Penggelapan,"Senin, 11 Des. 2017",52 Hari,[Iwan Hermato Alias Iwan],"[I GEDE PUTU ASTAWA, SH.]",Ida Bagus Bama Dewa. P,3,[Iwan Hermato Alias Iwan~Pidana Kurungan (1 Ta...,MENGADILI\nMenyatakan Terdakwa Iwan Hermanto A...,Pertama : Pasal 372 Kitab Undang-undang Hukum ...,1(satu) buah tas Kalep warna merah merk Son Al...
1089,Minutasi,226/Pid.Sus/2017/PN Sgr,Narkotika,"Senin, 11 Des. 2017",31 Hari,[Komang Irwin Pranata Alias Erwin],"[I GEDE PUTU ASTAWA, SH.]",Sudar,2,[Komang Irwin Pranata Alias Erwin~Pidana Penja...,Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ...,Pertama : Pasal 112 ayat (1) UU RI No. 35 tahu...,1(satu) bungkus rokok merk Sampoerna Mild Men...


In [55]:
null_counts = df_copy.isnull().sum()

print("Number of null values in each column:")
print(null_counts)

Number of null values in each column:
status_perkara          0
nomor_perkara           0
klasifikasi_perkara     0
tanggal_pendaftaran     0
lama_proses             0
terdakwa                0
penuntut_umum           0
hakim                   0
jumlah_saksi            0
putusan_hukuman         0
barang_bukti            0
dakwaan                 0
cleaned_barang_bukti    0
dtype: int64


### Data Cleaning: Dakwaan

In [56]:
dakwaan = df_copy.iloc[1]['dakwaan']
print(dakwaan)

KEJAKSAAN REPUBLIK INDONESIA
KEJAKSAAN TINGGI BALI
KEJAKSAAN NEGERI BULELENG
Jl. Dewi Sartika No. 23, Kaliuntu, Kec. Buleleng, Kab. Buleleng, Bali
Telp. (0362) 22580. www.kejari-buleleng.go.id
“Demi Keadilan Dan Kebenaran
Berdasarkan Ketuhanan Yang Maha Esa”
P-29
          SURAT DAKWAAN
NOMOR : REG. PERKARA PDM-47/Enz.2/BLL/07/2024
  IDENTITAS PARA TERDAKWA
Terdakwa I
Nama lengkap                :    MADE SUDAMA Alias KARTOLO
Nomor Identitas             :    5108073112800028
Tempat lahir                  :    Sangsit
Umur/tanggal lahir         :    43 Tahun / 31 Desember 1980
Jenis kelamin                 :    Laki-laki
Kewarganegaraan          :    Indonesia
Tempat tinggal               :    Banjar Dinas Beji, Desa Sangsit, Kecamatan Sawan, Kabupaten Buleleng
Agama                           :    Hindu
Pekerjaan                       :    Buruh Harian Lepas
Pendidikan                     :    SMA (Tamat)
  Terdakwa II
Nama lengkap                :    PUTU WISNU Alias WISNU
Nomor Identi

In [57]:
count_short_texts = sum(1 for text in df_copy['dakwaan'] if len(text) < 500)
print(f"Jumlah teks dengan panjang di bawah 500 karakter: {count_short_texts}")

Jumlah teks dengan panjang di bawah 500 karakter: 15


In [58]:
# cleaning_dakwaan_unwanted_char = re.findall(r'[a-zA-Z0-9., /\\()\"\'\n]+', dakwaan)

# cleaned_dakwaan = ''.join(cleaning_dakwaan_unwanted_char)

# print(cleaned_dakwaan)

cleaning_dakwaan_unwanted_char = re.findall(r'[a-zA-Z0-9., /\\()\"\'\n-]+', dakwaan)

cleaned_dakwaan = ''.join(cleaning_dakwaan_unwanted_char)
cleaned_dakwaan = re.sub(r'-{2,}', '-', cleaned_dakwaan)

print(cleaned_dakwaan)


KEJAKSAAN REPUBLIK INDONESIA
KEJAKSAAN TINGGI BALI
KEJAKSAAN NEGERI BULELENG
Jl. Dewi Sartika No. 23, Kaliuntu, Kec. Buleleng, Kab. Buleleng, Bali
Telp. (0362) 22580. www.kejari-buleleng.go.id
Demi Keadilan Dan Kebenaran
Berdasarkan Ketuhanan Yang Maha Esa
P-29
          SURAT DAKWAAN
NOMOR  REG. PERKARA PDM-47/Enz.2/BLL/07/2024
  IDENTITAS PARA TERDAKWA
Terdakwa I
Nama lengkap                    MADE SUDAMA Alias KARTOLO
Nomor Identitas                 5108073112800028
Tempat lahir                      Sangsit
Umur/tanggal lahir             43 Tahun / 31 Desember 1980
Jenis kelamin                     Laki-laki
Kewarganegaraan              Indonesia
Tempat tinggal                   Banjar Dinas Beji, Desa Sangsit, Kecamatan Sawan, Kabupaten Buleleng
Agama                               Hindu
Pekerjaan                           Buruh Harian Lepas
Pendidikan                         SMA (Tamat)
  Terdakwa II
Nama lengkap                    PUTU WISNU Alias WISNU
Nomor Identitas           

In [59]:
def clean_dakwaan(text):
    # Extract unwanted characters
    cleaning_dakwaan_unwanted_char = re.findall(r'[a-zA-Z0-9., /\\()\"\'\n-]+', text)
    cleaned_dakwaan = ''.join(cleaning_dakwaan_unwanted_char)
    
    # Replace multiple dashes with a single dash
    cleaned_dakwaan = re.sub(r'-{2,}', '-', cleaned_dakwaan)
    
    return cleaned_dakwaan

In [60]:
# Apply the cleaning function row-wise
df_copy['cleaned_dakwaan'] = df_copy['dakwaan'].apply(clean_dakwaan)

In [61]:
df_copy

Unnamed: 0,status_perkara,nomor_perkara,klasifikasi_perkara,tanggal_pendaftaran,lama_proses,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,barang_bukti,dakwaan,cleaned_barang_bukti,cleaned_dakwaan
0,Minutasi,118/Pid.Sus/2024/PN Sgr,Narkotika,"Senin, 05 Agu. 2024",44 Hari,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,MENGADILI:\nMenyatakan Terdakwa I NYOMAN SUART...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,38 (tiga puluh delapan) potongan pipet plastik...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
1,Minutasi,117/Pid.Sus/2024/PN Sgr,Narkotika,"Kamis, 01 Agu. 2024",49 Hari,"['MADE SUDAMA ALIAS KARTOLO', 'PUTU WISNU ALIA...","['I MADE SUTAPA,S.H.']",Made Hermayanti Muliartha,3,['PUTU WISNU ALIAS WISNU~Subsider Penjara (10 ...,MENGADILI:\nMenyatakan Terdakwa I Made Sudama ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,1 (satu) buah plastik klip bening berisi butir...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
2,Minutasi,112/Pid.Sus/2024/PN Sgr,Narkotika,"Selasa, 30 Jul. 2024",44 Hari,"['I KOMANG TEKEN Alias TEKEN', 'KADEK ALBET SA...","['Kadek Adi Pramarta, S.H.']",I Made Bagiarta,4,['KADEK ALBET SANJAYA Alias ALBET~Pidana Penja...,M E N G A D I L I :\nMenyatakan terda...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...
3,Minutasi,101/Pid.B/2024/PN Sgr,Pencurian,"Kamis, 11 Jul. 2024",56 Hari,['ANDRIANI'],"['Nyoman Arif Budiman, S.H., M.H.']",I Gusti Made Juliartawan,3,['ANDRIANI~Pidana Penjara Waktu Tertentu (1 Ta...,"MENGADILI:\nMenyatakan Terdakwa ANDRIANI, tela...",“Demi Keadilan dan Kebenaran\nBerdasarkan Ketu...,1 (satu) buah gelang emas (batu ungu) berat 5 ...,Demi Keadilan dan Kebenaran\nBerdasarkan Ketuh...
4,Minutasi,98/Pid.Sus/2024/PN Sgr,Narkotika,"Rabu, 10 Jul. 2024",64 Hari,['I NYOMAN IWAN MAHADI alias MANG IWAN'],"['I GUSTI NGURAH ARYA DIATMIKA,S.H.']",I Made Bagiarta,3,['I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana ...,MENGADILI:\nMenyatakan Terdakwa I Nyoman Iwan ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,1 (satu) buah bong alat hisap sabu 1 (satu) pi...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1086,Minutasi,237/Pid.B/2017/PN Sgr,Penggelapan,"Rabu, 20 Des. 2017",56 Hari,[Ayu Putu Erlyandani],"[IMAM EKA SETYAWAN, SH.]",Ni Made Dewi Sukrani,4,[Ayu Putu Erlyandani~Pidana Penjara Waktu Tert...,M E N G A D I L I:\nMenyatakan Terdakwa AYU PU...,Kesatu : Â \nPrimair.\n-------- Bahwa terdakwa...,1 (satu) buah Buku Simpanan atau Tabungan Kopr...,Kesatu \nPrimair.\n- Bahwa terdakwa AYU PUTU...
1087,Minutasi,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"Rabu, 20 Des. 2017",97 Hari,[Ketut Mahardika Alias Kelet],"[I Nyoman Sulitra, SH.,MH.]",Ni Made Dewi Sukrani,10,[Ketut Mahardika Alias Kelet~Pidana Penjara Wa...,MENGADILI:\nÂ \nMenyatakan Terdakwa Ketut Maha...,Kesatu\n----- Bahwa terdakwa Â KETUT MAHARDIKA...,"1 (satu) buah pedang dengan panjang 55 cm, den...",Kesatu\n- Bahwa terdakwa KETUT MAHARDIKA Als ...
1088,Minutasi,227/Pid.B/2017/PN Sgr,Penggelapan,"Senin, 11 Des. 2017",52 Hari,[Iwan Hermato Alias Iwan],"[I GEDE PUTU ASTAWA, SH.]",Ida Bagus Bama Dewa. P,3,[Iwan Hermato Alias Iwan~Pidana Kurungan (1 Ta...,MENGADILI\nMenyatakan Terdakwa Iwan Hermanto A...,Pertama : Pasal 372 Kitab Undang-undang Hukum ...,1(satu) buah tas Kalep warna merah merk Son Al...,Pertama Pasal 372 Kitab Undang-undang Hukum P...
1089,Minutasi,226/Pid.Sus/2017/PN Sgr,Narkotika,"Senin, 11 Des. 2017",31 Hari,[Komang Irwin Pranata Alias Erwin],"[I GEDE PUTU ASTAWA, SH.]",Sudar,2,[Komang Irwin Pranata Alias Erwin~Pidana Penja...,Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ...,Pertama : Pasal 112 ayat (1) UU RI No. 35 tahu...,1(satu) bungkus rokok merk Sampoerna Mild Men...,Pertama Pasal 112 ayat (1) UU RI No. 35 tahun...


In [62]:
print(df_copy.iloc[0]["cleaned_dakwaan"])

KEJAKSAAN REPUBLIK INDONESIA
KEJAKSAAN TINGGI BALI
KEJAKSAAN NEGERI BULELENG
JL. Dewi Sartika Selatan No. 23 Singaraja  Bali 81116
Telp. (0362) 22580 www.kejari-buleleng.go.id
"Demi Keadilan dan Kebenaran
Berdasarkan Ketuhanan Yang Maha Esa"
P-29
        SURAT DAKWAAN
NOMOR REG PERK  PDM   44 /Enz.2/BLL/07/2024
  A.     IDENTITAS TERDAKWA  
Nama Lengkap                 I NYOMAN SUARTA
Nomor Identitas                 5103051109720003
Tempat lahir                      Wanagiri
Umur/Tgl lahir                   51 Tahun / 11 September 1972
Jenis Kelamin                    Laki-laki      
Kebangsaan                      Indonesia
Tempat tinggal                   Banjar Dinas Yeh Ketipat, Desa Wanagiri, Kecamatan Sukasada, Kabupaten Buleleng
Agama                               Hindu
Pekerjaan                          Wiraswasta
Pendidikan                        S1
  B.     STATUS PENANGKAPAN DAN PENAHANAN    
1      Penangkapan                
Penangkapan                              Tanggal

In [63]:
pattern = r"(?i)dakwaan(?!\nnomor)[\s\S]*?(?=(?:dakwaan|$))"

for index, row in df_copy.iterrows():
    text_dakwaan = row['cleaned_dakwaan']
    matches = re.findall(pattern, text_dakwaan, re.IGNORECASE)

    if matches: 
        last_match = matches[-1]
        df_copy.loc[index, "cleaned_dakwaan"] = last_match.strip()
    else:
        df_copy.loc[index, "cleaned_dakwaan"] = text_dakwaan
        print(f"raw dakwaan copied at {index}")


raw dakwaan copied at 2
raw dakwaan copied at 22
raw dakwaan copied at 74
raw dakwaan copied at 80
raw dakwaan copied at 81
raw dakwaan copied at 84
raw dakwaan copied at 85
raw dakwaan copied at 86
raw dakwaan copied at 87
raw dakwaan copied at 96
raw dakwaan copied at 97
raw dakwaan copied at 98
raw dakwaan copied at 100
raw dakwaan copied at 101
raw dakwaan copied at 102
raw dakwaan copied at 104
raw dakwaan copied at 105
raw dakwaan copied at 106
raw dakwaan copied at 107
raw dakwaan copied at 108
raw dakwaan copied at 110
raw dakwaan copied at 111
raw dakwaan copied at 112
raw dakwaan copied at 113
raw dakwaan copied at 114
raw dakwaan copied at 115
raw dakwaan copied at 116
raw dakwaan copied at 117
raw dakwaan copied at 118
raw dakwaan copied at 119
raw dakwaan copied at 120
raw dakwaan copied at 121
raw dakwaan copied at 122
raw dakwaan copied at 123
raw dakwaan copied at 124
raw dakwaan copied at 125
raw dakwaan copied at 126
raw dakwaan copied at 127
raw dakwaan copied at 129

In [64]:
df_copy

Unnamed: 0,status_perkara,nomor_perkara,klasifikasi_perkara,tanggal_pendaftaran,lama_proses,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,barang_bukti,dakwaan,cleaned_barang_bukti,cleaned_dakwaan
0,Minutasi,118/Pid.Sus/2024/PN Sgr,Narkotika,"Senin, 05 Agu. 2024",44 Hari,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,MENGADILI:\nMenyatakan Terdakwa I NYOMAN SUART...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...
1,Minutasi,117/Pid.Sus/2024/PN Sgr,Narkotika,"Kamis, 01 Agu. 2024",49 Hari,"['MADE SUDAMA ALIAS KARTOLO', 'PUTU WISNU ALIA...","['I MADE SUTAPA,S.H.']",Made Hermayanti Muliartha,3,['PUTU WISNU ALIAS WISNU~Subsider Penjara (10 ...,MENGADILI:\nMenyatakan Terdakwa I Made Sudama ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...
2,Minutasi,112/Pid.Sus/2024/PN Sgr,Narkotika,"Selasa, 30 Jul. 2024",44 Hari,"['I KOMANG TEKEN Alias TEKEN', 'KADEK ALBET SA...","['Kadek Adi Pramarta, S.H.']",I Made Bagiarta,4,['KADEK ALBET SANJAYA Alias ALBET~Pidana Penja...,M E N G A D I L I :\nMenyatakan terda...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...
3,Minutasi,101/Pid.B/2024/PN Sgr,Pencurian,"Kamis, 11 Jul. 2024",56 Hari,['ANDRIANI'],"['Nyoman Arif Budiman, S.H., M.H.']",I Gusti Made Juliartawan,3,['ANDRIANI~Pidana Penjara Waktu Tertentu (1 Ta...,"MENGADILI:\nMenyatakan Terdakwa ANDRIANI, tela...",“Demi Keadilan dan Kebenaran\nBerdasarkan Ketu...,1 (satu) buah gelang emas (batu ungu) berat 5 ...,DAKWAAN \nBahwa ia TERDAKWA ANDRIANI selanjutn...
4,Minutasi,98/Pid.Sus/2024/PN Sgr,Narkotika,"Rabu, 10 Jul. 2024",64 Hari,['I NYOMAN IWAN MAHADI alias MANG IWAN'],"['I GUSTI NGURAH ARYA DIATMIKA,S.H.']",I Made Bagiarta,3,['I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana ...,MENGADILI:\nMenyatakan Terdakwa I Nyoman Iwan ...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,1 (satu) buah bong alat hisap sabu 1 (satu) pi...,DAKWAAN \nKESATU\n-Bahwa Terdakwa I Nyoman Iw...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1086,Minutasi,237/Pid.B/2017/PN Sgr,Penggelapan,"Rabu, 20 Des. 2017",56 Hari,[Ayu Putu Erlyandani],"[IMAM EKA SETYAWAN, SH.]",Ni Made Dewi Sukrani,4,[Ayu Putu Erlyandani~Pidana Penjara Waktu Tert...,M E N G A D I L I:\nMenyatakan Terdakwa AYU PU...,Kesatu : Â \nPrimair.\n-------- Bahwa terdakwa...,1 (satu) buah Buku Simpanan atau Tabungan Kopr...,Kesatu \nPrimair.\n- Bahwa terdakwa AYU PUTU...
1087,Minutasi,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"Rabu, 20 Des. 2017",97 Hari,[Ketut Mahardika Alias Kelet],"[I Nyoman Sulitra, SH.,MH.]",Ni Made Dewi Sukrani,10,[Ketut Mahardika Alias Kelet~Pidana Penjara Wa...,MENGADILI:\nÂ \nMenyatakan Terdakwa Ketut Maha...,Kesatu\n----- Bahwa terdakwa Â KETUT MAHARDIKA...,"1 (satu) buah pedang dengan panjang 55 cm, den...",Kesatu\n- Bahwa terdakwa KETUT MAHARDIKA Als ...
1088,Minutasi,227/Pid.B/2017/PN Sgr,Penggelapan,"Senin, 11 Des. 2017",52 Hari,[Iwan Hermato Alias Iwan],"[I GEDE PUTU ASTAWA, SH.]",Ida Bagus Bama Dewa. P,3,[Iwan Hermato Alias Iwan~Pidana Kurungan (1 Ta...,MENGADILI\nMenyatakan Terdakwa Iwan Hermanto A...,Pertama : Pasal 372 Kitab Undang-undang Hukum ...,1(satu) buah tas Kalep warna merah merk Son Al...,Pertama Pasal 372 Kitab Undang-undang Hukum P...
1089,Minutasi,226/Pid.Sus/2017/PN Sgr,Narkotika,"Senin, 11 Des. 2017",31 Hari,[Komang Irwin Pranata Alias Erwin],"[I GEDE PUTU ASTAWA, SH.]",Sudar,2,[Komang Irwin Pranata Alias Erwin~Pidana Penja...,Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ...,Pertama : Pasal 112 ayat (1) UU RI No. 35 tahu...,1(satu) bungkus rokok merk Sampoerna Mild Men...,Pertama Pasal 112 ayat (1) UU RI No. 35 tahun...


In [65]:
print(df_copy.iloc[132]['cleaned_dakwaan'])

KESATU
- Bahwa ia Terdakwa KADEK FAJAR SUKAMADI Alias FAJAR pada hari Kamis tanggal 08 Juni 2023 sekira pukul 13.00 WITA atau setidak-tidaknya pada suatu waktu dalam bulan Juni 2023 atau setidaktidaknya pada suatu waktu dalam Tahun 2023 bertempat di areal terminal Penarukan yang beralamat di Kelurahan Penarukan, Kecamatan Buleleng, Kabupaten Buleleng atau setidak-tidaknya pada suatu tempat yang masih termasuk dalam daerah hukum Pengadilan Negeri Singaraja yang berwenang memeriksa dan mengadili perkara ini, Dengan tanpa hak atau melawan hukum memiliki, menyimpan, menguasai, atau menyediakan Narkotika Golongan I bukan tanaman,
Perbuatan Terdakwa tersebut diatas sebagaimana diatur dan diancam pidana dalam Pasal 112 Ayat (1) UU. RI. Nomor 35 Tahun 2009 tentang Narkotika
atau 
  KEDUA
- Bahwa ia Terdakwa KADEK FAJAR SUKAMADI Alias FAJAR pada hari Kamis tanggal 08 Juni 2023 sekira pukul 13.00 WITA atau setidak-tidaknya pada suatu waktu dalam bulan Juni 2023 atau setidaktidaknya pada suatu wa

In [66]:

pattern = r"^(.*?)\s+Singaraja,\s+\d{1,2}\s+\w+\s+\d{4}"

for index, row in df_copy.iterrows():
    text_dakwaan = row['cleaned_dakwaan']
    matches = re.match(pattern, text_dakwaan, re.DOTALL)

    if matches: 
        df_copy.loc[index, "cleaned_dakwaan"] = matches.group(1).strip()
    else:
        df_copy.loc[index, "cleaned_dakwaan"] = text_dakwaan
        print(f"raw dakwaan copied at {index}")


raw dakwaan copied at 2
raw dakwaan copied at 22
raw dakwaan copied at 38
raw dakwaan copied at 58
raw dakwaan copied at 80
raw dakwaan copied at 81
raw dakwaan copied at 82
raw dakwaan copied at 83
raw dakwaan copied at 84
raw dakwaan copied at 85
raw dakwaan copied at 86
raw dakwaan copied at 87
raw dakwaan copied at 89
raw dakwaan copied at 94
raw dakwaan copied at 96
raw dakwaan copied at 97
raw dakwaan copied at 98
raw dakwaan copied at 100
raw dakwaan copied at 101
raw dakwaan copied at 102
raw dakwaan copied at 104
raw dakwaan copied at 105
raw dakwaan copied at 106
raw dakwaan copied at 107
raw dakwaan copied at 108
raw dakwaan copied at 110
raw dakwaan copied at 111
raw dakwaan copied at 112
raw dakwaan copied at 113
raw dakwaan copied at 114
raw dakwaan copied at 115
raw dakwaan copied at 116
raw dakwaan copied at 117
raw dakwaan copied at 118
raw dakwaan copied at 119
raw dakwaan copied at 120
raw dakwaan copied at 121
raw dakwaan copied at 122
raw dakwaan copied at 123
raw 

In [67]:
print(df_copy.iloc[108]['cleaned_dakwaan'])

- Bahwa ia Terdakwa MADE SUADNYANA bersama-sama dengan Saksi GEDE SUARSANA selaku Sekretaris Kelompok Nelayan Segara Timbul (dalam penuntutan terpisah) dan MADE DWI DARMAWAN selaku Ketua Kelompok Nelayan Sari Segara (dalam penuntutan terpisah), pada hari Selasa Tanggal 02 Mei 2023 sekira pukul 22.30 WITA atau setidak-tidaknya pada suatu waktu dalam bulan Mei 2023 atau setidak-tidaknya pada suatu waktu dalam Tahun 2023 bertempat di Toko Takoyah, Jl. Raya Singgaraja Amlapura, Banjar Dinas Kawanan, Desa Tejakula, Kecamatan Tejakula, Kabupaten Buleleng atau setidak-tidaknya pada suatu tempat yang masih termasuk dalam daerah hukum Pengadilan Negeri Singaraja yang berwenang memeriksa dan mengadili perkara ini, mereka yang melakukan, yang menyuruh melakukan dan yang turut serta melakukan perbuatan menyalahgunakan Pengangkutan dan/atau Niaga Bahan Bakar Minyak, bahan bakar gas, dan/atau liquefied petroleum gas yang disubsidi dan/atau penyediaan dan pendistribusiannya diberikan penugasan Pemeri

In [68]:
df_copy.to_csv(f'../Data/STAGE 1 PREPROCESSING/STAGE_1_PREPROCESSING_{database_name}.csv', index=False)

## Data Transformation

In [69]:
df_clean = pd.read_csv(f'../Data/STAGE 1 PREPROCESSING/STAGE_1_PREPROCESSING_{database_name}.csv')

In [70]:
# Print the 'cleaned_barang_bukti' column for rows where 'nomor_perkara' is '86/Pid.B/2019/PN Sgr'
print(df_copy[df_copy['nomor_perkara'] == '86/Pid.B/2019/PN Sgr']['cleaned_barang_bukti'])


631    
Name: cleaned_barang_bukti, dtype: object


### Data Transformation: Copy to New DataFrame

In [71]:
df_clean.head(1)

Unnamed: 0,status_perkara,nomor_perkara,klasifikasi_perkara,tanggal_pendaftaran,lama_proses,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,barang_bukti,dakwaan,cleaned_barang_bukti,cleaned_dakwaan
0,Minutasi,118/Pid.Sus/2024/PN Sgr,Narkotika,"Senin, 05 Agu. 2024",44 Hari,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,MENGADILI:\nMenyatakan Terdakwa I NYOMAN SUART...,KEJAKSAAN REPUBLIK INDONESIA\nKEJAKSAAN TINGGI...,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...


In [72]:
selected_columns = ['nomor_perkara', 'klasifikasi_perkara', 'terdakwa', 'penuntut_umum', 'hakim', 'jumlah_saksi', 'putusan_hukuman', 'cleaned_barang_bukti', 'cleaned_dakwaan']
df_clean = df_clean[selected_columns].copy()

df_clean.head(1)

Unnamed: 0,nomor_perkara,klasifikasi_perkara,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,cleaned_barang_bukti,cleaned_dakwaan
0,118/Pid.Sus/2024/PN Sgr,Narkotika,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...


### Data Transformation: Extract Multiple Terdakwa & Putusan Hukuman to each Row 

In [73]:
print(f"Jumlah Data: {len(df_clean)}")

count_str_terdakwa = df_clean[df_clean["terdakwa"].apply(lambda x: isinstance(x, str))].shape[0]
print(f"Jumlah data dengan 'terdakwa' berupa string: {count_str_terdakwa}")

count_str_putusan_hukuman = df_clean[df_clean["putusan_hukuman"].apply(lambda x: isinstance(x, str))].shape[0]
print(f"Jumlah data dengan 'putusan_hukuman' berupa string: {count_str_terdakwa}")

Jumlah Data: 1031
Jumlah data dengan 'terdakwa' berupa string: 1031
Jumlah data dengan 'putusan_hukuman' berupa string: 1031


In [74]:
def clean_and_convert_to_list(terdakwa):
    # Ensure the input is treated as a string
    if isinstance(terdakwa, list):
        # If it's already a list, return it as is
        return terdakwa
    if isinstance(terdakwa, str):
        # Clean and convert only if it's a string
        cleaned_str = re.sub(r'\s+', ' ', terdakwa.strip())  # Clean whitespace
        try:
            # Use literal_eval to convert if it's a proper list
            result_list = ast.literal_eval(cleaned_str)
        except (ValueError, SyntaxError):
            # If there's an error, just return as a single-element list
            result_list = [cleaned_str]

        # Ensure the result is a list
        if isinstance(result_list, str):
            return [result_list]
        return result_list
    return []


# Apply the cleaning and conversion function
df_clean['transformed_terdakwa'] = df_clean['terdakwa'].apply(clean_and_convert_to_list)
df_clean['transformed_putusan_hukuman'] = df_clean['putusan_hukuman'].apply(clean_and_convert_to_list)

In [75]:
print(df_clean['transformed_terdakwa'].apply(type).unique())
print(df_clean['transformed_putusan_hukuman'].apply(type).unique())
print(f"Jumlah Data: {len(df_clean)}")

# print(type(df_clean.iloc[1]['terdakwa']))
count_str_terdakwa = df_clean[df_clean["transformed_terdakwa"].apply(lambda x: isinstance(x, list))].shape[0]
print(f"Jumlah data dengan 'transformed_terdakwa' berupa list: {count_str_terdakwa}")

count_str_putusan_hukuman = df_clean[df_clean["transformed_putusan_hukuman"].apply(lambda x: isinstance(x, list))].shape[0]
print(f"Jumlah data dengan 'transformed_putusan_hukuman' berupa list: {count_str_terdakwa}")


[<class 'list'>]
[<class 'list'>]
Jumlah Data: 1031
Jumlah data dengan 'transformed_terdakwa' berupa list: 1031
Jumlah data dengan 'transformed_putusan_hukuman' berupa list: 1031


In [76]:
df_clean['transformed_terdakwa'].apply(len).unique()

array([1, 2, 4, 3, 5])

In [77]:
print(df_clean.loc[18, "putusan_hukuman"])
print(df_clean.loc[18, "terdakwa"])


['MOCH HASAN BASRI alias BAS~Pidana Denda Rp.5.000.000,00\nPidana Penjara Waktu Tertentu (1 Tahun 6 Bulan )\nSubsider Kurungan (3 Bulan )', 'KETUT SUMANTRA alias LOTOT~Subsider Kurungan (3 Bulan )\nPidana Denda Rp.5.000.000,00\nPidana Penjara Waktu Tertentu (1 Tahun 6 Bulan )', 'MOCH HASAN BASRI alias BAS~Pidana Denda Rp.5.000.000,00\nPidana Penjara Waktu Tertentu (1 Tahun 6 Bulan )\nSubsider Kurungan (3 Bulan )']
['KETUT SUMANTRA alias LOTOT', 'MOCH HASAN BASRI alias BAS']


In [78]:
df_clean

Unnamed: 0,nomor_perkara,klasifikasi_perkara,terdakwa,penuntut_umum,hakim,jumlah_saksi,putusan_hukuman,cleaned_barang_bukti,cleaned_dakwaan,transformed_terdakwa,transformed_putusan_hukuman
0,118/Pid.Sus/2024/PN Sgr,Narkotika,['I NYOMAN SUARTA'],"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,['I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\...,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...,[I NYOMAN SUARTA],[I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\n...
1,117/Pid.Sus/2024/PN Sgr,Narkotika,"['MADE SUDAMA ALIAS KARTOLO', 'PUTU WISNU ALIA...","['I MADE SUTAPA,S.H.']",Made Hermayanti Muliartha,3,['PUTU WISNU ALIAS WISNU~Subsider Penjara (10 ...,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,"[MADE SUDAMA ALIAS KARTOLO, PUTU WISNU ALIAS W...",[PUTU WISNU ALIAS WISNU~Subsider Penjara (10 B...
2,112/Pid.Sus/2024/PN Sgr,Narkotika,"['I KOMANG TEKEN Alias TEKEN', 'KADEK ALBET SA...","['Kadek Adi Pramarta, S.H.']",I Made Bagiarta,4,['KADEK ALBET SANJAYA Alias ALBET~Pidana Penja...,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,"[I KOMANG TEKEN Alias TEKEN, KADEK ALBET SANJA...",[KADEK ALBET SANJAYA Alias ALBET~Pidana Penjar...
3,101/Pid.B/2024/PN Sgr,Pencurian,['ANDRIANI'],"['Nyoman Arif Budiman, S.H., M.H.']",I Gusti Made Juliartawan,3,['ANDRIANI~Pidana Penjara Waktu Tertentu (1 Ta...,1 (satu) buah gelang emas (batu ungu) berat 5 ...,DAKWAAN \nBahwa ia TERDAKWA ANDRIANI selanjutn...,[ANDRIANI],[ANDRIANI~Pidana Penjara Waktu Tertentu (1 Tah...
4,98/Pid.Sus/2024/PN Sgr,Narkotika,['I NYOMAN IWAN MAHADI alias MANG IWAN'],"['I GUSTI NGURAH ARYA DIATMIKA,S.H.']",I Made Bagiarta,3,['I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana ...,1 (satu) buah bong alat hisap sabu 1 (satu) pi...,DAKWAAN \nKESATU\n-Bahwa Terdakwa I Nyoman Iw...,[I NYOMAN IWAN MAHADI alias MANG IWAN],[I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana P...
...,...,...,...,...,...,...,...,...,...,...,...
1026,237/Pid.B/2017/PN Sgr,Penggelapan,['Ayu Putu Erlyandani'],"['IMAM EKA SETYAWAN, SH.']",Ni Made Dewi Sukrani,4,['Ayu Putu Erlyandani~Pidana Penjara Waktu Ter...,1 (satu) buah Buku Simpanan atau Tabungan Kopr...,Kesatu \nPrimair.\n- Bahwa terdakwa AYU PUTU...,[Ayu Putu Erlyandani],[Ayu Putu Erlyandani~Pidana Penjara Waktu Tert...
1027,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,['Ketut Mahardika Alias Kelet'],"['I Nyoman Sulitra, SH.,MH.']",Ni Made Dewi Sukrani,10,['Ketut Mahardika Alias Kelet~Pidana Penjara W...,"1 (satu) buah pedang dengan panjang 55 cm, den...",Kesatu\n- Bahwa terdakwa KETUT MAHARDIKA Als ...,[Ketut Mahardika Alias Kelet],[Ketut Mahardika Alias Kelet~Pidana Penjara Wa...
1028,227/Pid.B/2017/PN Sgr,Penggelapan,['Iwan Hermato Alias Iwan'],"['I GEDE PUTU ASTAWA, SH.']",Ida Bagus Bama Dewa. P,3,['Iwan Hermato Alias Iwan~Pidana Kurungan (1 T...,1(satu) buah tas Kalep warna merah merk Son Al...,Pertama Pasal 372 Kitab Undang-undang Hukum P...,[Iwan Hermato Alias Iwan],[Iwan Hermato Alias Iwan~Pidana Kurungan (1 Ta...
1029,226/Pid.Sus/2017/PN Sgr,Narkotika,['Komang Irwin Pranata Alias Erwin'],"['I GEDE PUTU ASTAWA, SH.']",Sudar,2,['Komang Irwin Pranata Alias Erwin~Pidana Penj...,1(satu) bungkus rokok merk Sampoerna Mild Men...,Pertama Pasal 112 ayat (1) UU RI No. 35 tahun...,[Komang Irwin Pranata Alias Erwin],[Komang Irwin Pranata Alias Erwin~Pidana Penja...


In [79]:
print(len(df_clean))
print(df_clean['transformed_terdakwa'].apply(lambda x: isinstance(x, list)).sum())  # Should return number of rows with lists
print(df_clean['transformed_putusan_hukuman'].apply(lambda x: isinstance(x, list)).sum())  # Same for the other column


1031
1031
1031


In [80]:
# Check number of elements in each list
df_clean['terdakwa_len'] = df_clean['transformed_terdakwa'].apply(len)
df_clean['putusan_len'] = df_clean['transformed_putusan_hukuman'].apply(len)

# Display rows where the lengths are not equal
mismatched_rows = df_clean[df_clean['terdakwa_len'] != df_clean['putusan_len']]
print(mismatched_rows[['terdakwa_len', 'putusan_len']].head())


     terdakwa_len  putusan_len
18              2            3
388             2            1


In [81]:
print(mismatched_rows)

                 nomor_perkara          klasifikasi_perkara  \
18   93/Pid.Sus-LH/2024/PN Sgr  Konservasi Sumber Daya Alam   
388      147/Pid.B/2021/PN Sgr                    Pencurian   

                                              terdakwa  \
18   ['KETUT SUMANTRA alias LOTOT', 'MOCH HASAN BAS...   
388  ['IDA BAGUS ARI WIBAWA Als. GUS UNENG', 'KKADE...   

                    penuntut_umum                              hakim  \
18   ['Kadek Adi Pramarta, S.H.']                    I Made Bagiarta   
388   ['Ida Kade Widiatmika, SH']  Anak Agung Ngurah Budhi Dharmawan   

     jumlah_saksi                                    putusan_hukuman  \
18              4  ['MOCH HASAN BASRI alias BAS~Pidana Denda Rp.5...   
388             4  ['IDA BAGUS ARI WIBAWA Als. GUS UNENG~Pidana P...   

                                  cleaned_barang_bukti  \
18   1 (satu) pucuk senapan angin warna hitam rakit...   
388  1 (satu) buah laptop merk axioo warna hitam. 1...   

                          

In [82]:
df_clean = df_clean[df_clean['terdakwa_len'] == df_clean['putusan_len']]

In [83]:
def match_terdakwa_to_putusan(df):
    combined_terdakwa_putusan_list = []
    for index, row in df.iterrows():
        for terdakwa in row['transformed_terdakwa']:
            matched = False
            for putusan_hukuman in row['transformed_putusan_hukuman']:
                if terdakwa in putusan_hukuman:
                    temp_list = [row['nomor_perkara'], row['klasifikasi_perkara'], row['penuntut_umum'], row['hakim'], row['jumlah_saksi'], row['cleaned_barang_bukti'], row['cleaned_dakwaan'],terdakwa, putusan_hukuman] 
                    combined_terdakwa_putusan_list.append(temp_list)
                    matched = True
                    break
            if not matched:
                print(f'no match {terdakwa} at index {index}')
    df_combined = pd.DataFrame(combined_terdakwa_putusan_list, columns=['nomor_perkara', 'klasifikasi_perkara', 'penuntut_umum', 'hakim', 'jumlah_saksi', 'cleaned_barang_bukti', 'cleaned_dakwaan', 'terdakwa', 'putusan_hukuman'])

    return df_combined

df_clean_exploded = match_terdakwa_to_putusan(df_clean)
print(df_clean_exploded)

                nomor_perkara                            klasifikasi_perkara  \
0     118/Pid.Sus/2024/PN Sgr                                      Narkotika   
1     117/Pid.Sus/2024/PN Sgr                                      Narkotika   
2     117/Pid.Sus/2024/PN Sgr                                      Narkotika   
3     112/Pid.Sus/2024/PN Sgr                                      Narkotika   
4     112/Pid.Sus/2024/PN Sgr                                      Narkotika   
...                       ...                                            ...   
1195    237/Pid.B/2017/PN Sgr                                    Penggelapan   
1196    236/Pid.B/2017/PN Sgr                       Kejahatan terhadap Nyawa   
1197    227/Pid.B/2017/PN Sgr                                    Penggelapan   
1198  226/Pid.Sus/2017/PN Sgr                                      Narkotika   
1199  223/Pid.Sus/2017/PN Sgr  Perlindungan dan Pengelolaan Lingkungan Hidup   

                      penuntut_umum    

In [84]:
df_clean_exploded.iloc[1]['putusan_hukuman']

'MADE SUDAMA ALIAS KARTOLO~Subsider Penjara (1 Tahun 2 Bulan )'

In [85]:
type(df_clean_exploded.iloc[1]['putusan_hukuman'])

str

In [86]:
print(len(df_clean_exploded['terdakwa']))
print(len(df_clean_exploded['putusan_hukuman']))

1200
1200


In [87]:
for i in range(10):
    print(i, ".", df_clean_exploded.iloc[i]['putusan_hukuman'])

0 . I NYOMAN SUARTA~Subsider Penjara (2 Bulan )
Pidana Denda Rp.1.000.000.000,00
Pidana Penjara Waktu Tertentu (4 Tahun 4 Bulan )
1 . MADE SUDAMA ALIAS KARTOLO~Subsider Penjara (1 Tahun 2 Bulan )
2 . PUTU WISNU ALIAS WISNU~Subsider Penjara (10 Bulan )
3 . I KOMANG TEKEN Alias TEKEN~Pidana Penjara Waktu Tertentu (9 Bulan )
4 . KADEK ALBET SANJAYA Alias ALBET~Pidana Penjara Waktu Tertentu (9 Bulan )
5 . ANDRIANI~Pidana Penjara Waktu Tertentu (1 Tahun 2 Bulan )
6 . I NYOMAN IWAN MAHADI alias MANG IWAN~Pidana Penjara Waktu Tertentu (9 Bulan )
7 . KETUT SUBAGIA~Pidana Penjara Waktu Tertentu (4 Bulan )
8 . KOMANG ADITYA~Pidana Penjara Waktu Tertentu (4 Bulan )
9 . MUCH.ARIFIN~Pidana Penjara Waktu Tertentu (1 Tahun 4 Bulan )


In [88]:
IXA = 90
print(df_clean_exploded.iloc[IXA]['putusan_hukuman'])
print(df_clean_exploded.iloc[IXA]['nomor_perkara'])

PUTU SATRIA~Pidana Penjara Waktu Tertentu (4 Bulan )
23/Pid.B/2024/PN Sgr


### Data transformation: Extract Multiple Penuntut Umum

In [89]:
# Get the indices of rows where any column contains NaN values
null_indices = df_clean_exploded[df_clean_exploded.isnull().any(axis=1)].index

# Display the indices of rows with NaN values
print(null_indices)

print(df_clean_exploded.iloc[890]['cleaned_barang_bukti'])
print(df_clean_exploded.iloc[890]['nomor_perkara'])


Index([704], dtype='int64')
1 (satu) buah Handphone merk VIVO V15 warna hitam
242/Pid.B/2019/PN Sgr


In [90]:
df_clean_exploded

Unnamed: 0,nomor_perkara,klasifikasi_perkara,penuntut_umum,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,terdakwa,putusan_hukuman
0,118/Pid.Sus/2024/PN Sgr,Narkotika,"['I MADE SUTAPA,S.H.']",I Made Bagiarta,3,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...,I NYOMAN SUARTA,I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\nP...
1,117/Pid.Sus/2024/PN Sgr,Narkotika,"['I MADE SUTAPA,S.H.']",Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,MADE SUDAMA ALIAS KARTOLO,MADE SUDAMA ALIAS KARTOLO~Subsider Penjara (1 ...
2,117/Pid.Sus/2024/PN Sgr,Narkotika,"['I MADE SUTAPA,S.H.']",Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,PUTU WISNU ALIAS WISNU,PUTU WISNU ALIAS WISNU~Subsider Penjara (10 Bu...
3,112/Pid.Sus/2024/PN Sgr,Narkotika,"['Kadek Adi Pramarta, S.H.']",I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,I KOMANG TEKEN Alias TEKEN,I KOMANG TEKEN Alias TEKEN~Pidana Penjara Wakt...
4,112/Pid.Sus/2024/PN Sgr,Narkotika,"['Kadek Adi Pramarta, S.H.']",I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,KADEK ALBET SANJAYA Alias ALBET,KADEK ALBET SANJAYA Alias ALBET~Pidana Penjara...
...,...,...,...,...,...,...,...,...,...
1195,237/Pid.B/2017/PN Sgr,Penggelapan,"['IMAM EKA SETYAWAN, SH.']",Ni Made Dewi Sukrani,4,1 (satu) buah Buku Simpanan atau Tabungan Kopr...,Kesatu \nPrimair.\n- Bahwa terdakwa AYU PUTU...,Ayu Putu Erlyandani,Ayu Putu Erlyandani~Pidana Penjara Waktu Terte...
1196,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"['I Nyoman Sulitra, SH.,MH.']",Ni Made Dewi Sukrani,10,"1 (satu) buah pedang dengan panjang 55 cm, den...",Kesatu\n- Bahwa terdakwa KETUT MAHARDIKA Als ...,Ketut Mahardika Alias Kelet,Ketut Mahardika Alias Kelet~Pidana Penjara Wak...
1197,227/Pid.B/2017/PN Sgr,Penggelapan,"['I GEDE PUTU ASTAWA, SH.']",Ida Bagus Bama Dewa. P,3,1(satu) buah tas Kalep warna merah merk Son Al...,Pertama Pasal 372 Kitab Undang-undang Hukum P...,Iwan Hermato Alias Iwan,Iwan Hermato Alias Iwan~Pidana Kurungan (1 Tah...
1198,226/Pid.Sus/2017/PN Sgr,Narkotika,"['I GEDE PUTU ASTAWA, SH.']",Sudar,2,1(satu) bungkus rokok merk Sampoerna Mild Men...,Pertama Pasal 112 ayat (1) UU RI No. 35 tahun...,Komang Irwin Pranata Alias Erwin,Komang Irwin Pranata Alias Erwin~Pidana Penjar...


In [91]:
df_clean_exploded['penuntut_umum']

0              ['I MADE SUTAPA,S.H.']
1              ['I MADE SUTAPA,S.H.']
2              ['I MADE SUTAPA,S.H.']
3        ['Kadek Adi Pramarta, S.H.']
4        ['Kadek Adi Pramarta, S.H.']
                    ...              
1195       ['IMAM EKA SETYAWAN, SH.']
1196    ['I Nyoman Sulitra, SH.,MH.']
1197      ['I GEDE PUTU ASTAWA, SH.']
1198      ['I GEDE PUTU ASTAWA, SH.']
1199    ['I Nyoman Sulitra, SH.,MH.']
Name: penuntut_umum, Length: 1200, dtype: object

In [92]:
# Count unique data types in the column
unique_types = df_clean_exploded['penuntut_umum'].apply(type).unique()

# Display the unique types
print(f"Unique data types in the column: {unique_types}")

Unique data types in the column: [<class 'str'>]


In [93]:
df_clean_exploded['penuntut_umum'] = df_clean_exploded['penuntut_umum'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [94]:
# Count unique data types in the column
unique_types = df_clean_exploded['penuntut_umum'].apply(type).unique()

# Display the unique types
print(f"Unique data types in the column: {unique_types}")

Unique data types in the column: [<class 'list'>]


In [95]:
# Count unique data types in the column
unique_types = df_clean_exploded['penuntut_umum'].apply(len).unique()

# Display the unique types
print(f"Unique data types in the column: {unique_types}")

Unique data types in the column: [1 2]


In [96]:
# Count the length of each list in the 'penuntut_umum' column
list_lengths = df_clean_exploded['penuntut_umum'].apply(len)

# Get unique list lengths
unique_lengths = list_lengths.unique()

# Print out the unique lengths and the corresponding indices
print(f"Unique lengths of data in the 'penuntut_umum' column: {unique_lengths}")
print("Indices for each unique length:")

# Display indices for each unique length
for length in unique_lengths:
    # Find indices where the list length matches the current length
    indices = list_lengths[list_lengths == length].index
    print(f"Length {length}: Indices - {indices.tolist()}")


Unique lengths of data in the 'penuntut_umum' column: [1 2]
Indices for each unique length:
Length 1: Indices - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 20

In [97]:
for index, row in df_clean_exploded.iterrows():
    data = row['penuntut_umum']
    if len(data) > 1:
        print(index, row['penuntut_umum'])


132 ['IDA KADE WIDIATMIKA, SH', 'IDA KADE WIDIATMIKA, SH']
302 ['IDA KADE WIDIATMIKA, SH', 'IDA KADE WIDIATMIKA, SH']
309 ['ISNARTI JAYANINGSIH, SH.', 'I MADE HERI PERMANA PUTRA,SH.']
339 ['Komang Tirtawati, S.H.', 'KOMANG TIRTA WATI, S.H.']
343 ['IDA KADE WIDIATMIKA, SH', 'IDA KADE WIDIATMIKA, SH']
762 ['PUTU ANDY SUTADHARMA, SH.', 'Gusti Putu Karmawan, S.H.']


In [98]:
# df_clean_exploded['penuntut_umum'] = df_clean_exploded['penuntut_umum'].apply(lambda x: list(set([name.lower() for name in x])))
df_clean_exploded['penuntut_umum'] = df_clean_exploded['penuntut_umum'].apply(lambda x: list(set(x)))
df_clean_exploded['penuntut_umum'] = df_clean_exploded['penuntut_umum'].apply(lambda x: sorted(x))

In [99]:
for index, row in df_clean_exploded.iterrows():
    data = row['penuntut_umum']
    if len(data) > 1:
        print(index, row['penuntut_umum'])

309 ['I MADE HERI PERMANA PUTRA,SH.', 'ISNARTI JAYANINGSIH, SH.']
339 ['KOMANG TIRTA WATI, S.H.', 'Komang Tirtawati, S.H.']
762 ['Gusti Putu Karmawan, S.H.', 'PUTU ANDY SUTADHARMA, SH.']


In [100]:
# Explode the 'penuntut_umum' column so that each value in the list gets its own row
df_clean_penuntut_exploded = df_clean_exploded.explode('penuntut_umum', ignore_index=True)


In [101]:
df_clean_penuntut_exploded

Unnamed: 0,nomor_perkara,klasifikasi_perkara,penuntut_umum,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,terdakwa,putusan_hukuman
0,118/Pid.Sus/2024/PN Sgr,Narkotika,"I MADE SUTAPA,S.H.",I Made Bagiarta,3,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...,I NYOMAN SUARTA,I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\nP...
1,117/Pid.Sus/2024/PN Sgr,Narkotika,"I MADE SUTAPA,S.H.",Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,MADE SUDAMA ALIAS KARTOLO,MADE SUDAMA ALIAS KARTOLO~Subsider Penjara (1 ...
2,117/Pid.Sus/2024/PN Sgr,Narkotika,"I MADE SUTAPA,S.H.",Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,PUTU WISNU ALIAS WISNU,PUTU WISNU ALIAS WISNU~Subsider Penjara (10 Bu...
3,112/Pid.Sus/2024/PN Sgr,Narkotika,"Kadek Adi Pramarta, S.H.",I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,I KOMANG TEKEN Alias TEKEN,I KOMANG TEKEN Alias TEKEN~Pidana Penjara Wakt...
4,112/Pid.Sus/2024/PN Sgr,Narkotika,"Kadek Adi Pramarta, S.H.",I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,KADEK ALBET SANJAYA Alias ALBET,KADEK ALBET SANJAYA Alias ALBET~Pidana Penjara...
...,...,...,...,...,...,...,...,...,...
1198,237/Pid.B/2017/PN Sgr,Penggelapan,"IMAM EKA SETYAWAN, SH.",Ni Made Dewi Sukrani,4,1 (satu) buah Buku Simpanan atau Tabungan Kopr...,Kesatu \nPrimair.\n- Bahwa terdakwa AYU PUTU...,Ayu Putu Erlyandani,Ayu Putu Erlyandani~Pidana Penjara Waktu Terte...
1199,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"I Nyoman Sulitra, SH.,MH.",Ni Made Dewi Sukrani,10,"1 (satu) buah pedang dengan panjang 55 cm, den...",Kesatu\n- Bahwa terdakwa KETUT MAHARDIKA Als ...,Ketut Mahardika Alias Kelet,Ketut Mahardika Alias Kelet~Pidana Penjara Wak...
1200,227/Pid.B/2017/PN Sgr,Penggelapan,"I GEDE PUTU ASTAWA, SH.",Ida Bagus Bama Dewa. P,3,1(satu) buah tas Kalep warna merah merk Son Al...,Pertama Pasal 372 Kitab Undang-undang Hukum P...,Iwan Hermato Alias Iwan,Iwan Hermato Alias Iwan~Pidana Kurungan (1 Tah...
1201,226/Pid.Sus/2017/PN Sgr,Narkotika,"I GEDE PUTU ASTAWA, SH.",Sudar,2,1(satu) bungkus rokok merk Sampoerna Mild Men...,Pertama Pasal 112 ayat (1) UU RI No. 35 tahun...,Komang Irwin Pranata Alias Erwin,Komang Irwin Pranata Alias Erwin~Pidana Penjar...


### Data Transformation: Sentences to Month

In [102]:
pattern = r"Pidana Penjara Waktu Tertentu\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"
pattern_2 = r"Subsider Penjara\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"
pattern_3 = r"Subsider Kurungan\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"

for index, row in df_clean_penuntut_exploded.iterrows():
    # Use re.search to find the match
    text = row["putusan_hukuman"]
    match = re.search(pattern, text, re.IGNORECASE)
    match_2 = re.search(pattern_2, text, re.IGNORECASE)
    match_3 = re.search(pattern_3, text, re.IGNORECASE)

    # Extract and print the result
    if match:
        extracted_duration = match.group(0)
        print(index,"-", extracted_duration)

    elif match_2:
        extracted_duration = match_2.group(0)
        print(index,"-", extracted_duration)

    elif match_3:
        extracted_duration = match_3.group(0)
        print(index,"-", extracted_duration)
        
    else:
        print(f"{index} - No match found.")

0

 - Pidana Penjara Waktu Tertentu (4 Tahun 4 Bulan )
1 - Subsider Penjara (1 Tahun 2 Bulan )
2 - Subsider Penjara (10 Bulan )
3 - Pidana Penjara Waktu Tertentu (9 Bulan )
4 - Pidana Penjara Waktu Tertentu (9 Bulan )
5 - Pidana Penjara Waktu Tertentu (1 Tahun 2 Bulan )
6 - Pidana Penjara Waktu Tertentu (9 Bulan )
7 - Pidana Penjara Waktu Tertentu (4 Bulan )
8 - Pidana Penjara Waktu Tertentu (4 Bulan )
9 - Pidana Penjara Waktu Tertentu (1 Tahun 4 Bulan )
10 - Pidana Penjara Waktu Tertentu (1 Tahun 2 Bulan )
11 - Pidana Penjara Waktu Tertentu (1 Tahun )
12 - Pidana Penjara Waktu Tertentu (9 Bulan )
13 - Pidana Penjara Waktu Tertentu (9 Bulan )
14 - Pidana Penjara Waktu Tertentu (9 Bulan )
15 - Pidana Penjara Waktu Tertentu (1 Tahun )
16 - Pidana Penjara Waktu Tertentu (1 Tahun 3 Bulan )
17 - Pidana Penjara Waktu Tertentu (6 Tahun )
18 - Pidana Penjara Waktu Tertentu (4 Tahun )
19 - Pidana Penjara Waktu Tertentu (4 Tahun )
20 - Pidana Penjara Waktu Tertentu (6 Bulan )
21 - Pidana Penjara Wa

In [103]:
pattern = r"Pidana Penjara Waktu Tertentu\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"
pattern_2 = r"Subsider Penjara\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"
pattern_3 = r"Subsider Kurungan\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"
pattern_4 = r"Pidana Kurungan\s*\(\s*(\d+)\s*(Tahun)?\s*(\d*)\s*(Bulan)?\s*\)"

# pattern_5 = r"Pidana Penjara Waktu Tertentu\s*\(\s*(\d+)\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"
# pattern_6 = r"Subsider Penjara\s*\(\s*(\d+)\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"
# pattern_7 = r"Subsider Kurungan\s*\(\s*(\d+)\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"
# pattern_8 = r"Pidana Kurungan\s*\(\s*(\d+)\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"

pattern_5 = r"~Pidana Penjara Waktu Tertentu\s*\(\s*(\d+)\s*(\d*)\s*(Tahun)?\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"
pattern_6 = r"~Subsider Penjara\s*\(\s*(\d+)\s*(\d*)\s*(Tahun)?\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"
pattern_7 = r"~Subsider Kurungan\s*\(\s*(\d+)\s*(\d*)\s*(Tahun)?\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"
pattern_8 = r"~Pidana Kurungan\s*\(\s*(\d+)\s*(\d*)\s*(Tahun)?\s*(Bulan)?\s*(\d*)\s*(Hari)?\s*\)"

extracted_data = []

for index, row in df_clean_penuntut_exploded.iterrows():
    text = row["putusan_hukuman"]
    match = re.search(pattern, text, re.IGNORECASE)
    match_2 = re.search(pattern_2, text, re.IGNORECASE)
    match_3 = re.search(pattern_3, text, re.IGNORECASE)
    match_4 = re.search(pattern_4, text, re.IGNORECASE)

    match_5 = re.search(pattern_5, text, re.IGNORECASE)
    match_6 = re.search(pattern_6, text, re.IGNORECASE)
    match_7 = re.search(pattern_7, text, re.IGNORECASE)
    match_8 = re.search(pattern_8, text, re.IGNORECASE)

    if match:
        extracted_duration = match.group(0)
    elif match_2:
        extracted_duration = match_2.group(0)
    elif match_3:
        extracted_duration = match_3.group(0)
    elif match_4:
        extracted_duration = match_4.group(0)
    elif match_5:
        extracted_duration = match_5.group(0)
    elif match_6:
        extracted_duration = match_6.group(0)
    elif match_7:
        extracted_duration = match_7.group(0)
    elif match_8:
        extracted_duration = match_8.group(0)
    else:
        extracted_duration = None
    
    print(index, extracted_duration)
    df_clean_penuntut_exploded.loc[index,'extracted_duration'] = extracted_duration



0 Pidana Penjara Waktu Tertentu (4 Tahun 4 Bulan )


1 Subsider Penjara (1 Tahun 2 Bulan )
2 Subsider Penjara (10 Bulan )
3 Pidana Penjara Waktu Tertentu (9 Bulan )
4 Pidana Penjara Waktu Tertentu (9 Bulan )
5 Pidana Penjara Waktu Tertentu (1 Tahun 2 Bulan )
6 Pidana Penjara Waktu Tertentu (9 Bulan )
7 Pidana Penjara Waktu Tertentu (4 Bulan )
8 Pidana Penjara Waktu Tertentu (4 Bulan )
9 Pidana Penjara Waktu Tertentu (1 Tahun 4 Bulan )
10 Pidana Penjara Waktu Tertentu (1 Tahun 2 Bulan )
11 Pidana Penjara Waktu Tertentu (1 Tahun )
12 Pidana Penjara Waktu Tertentu (9 Bulan )
13 Pidana Penjara Waktu Tertentu (9 Bulan )
14 Pidana Penjara Waktu Tertentu (9 Bulan )
15 Pidana Penjara Waktu Tertentu (1 Tahun )
16 Pidana Penjara Waktu Tertentu (1 Tahun 3 Bulan )
17 Pidana Penjara Waktu Tertentu (6 Tahun )
18 Pidana Penjara Waktu Tertentu (4 Tahun )
19 Pidana Penjara Waktu Tertentu (4 Tahun )
20 Pidana Penjara Waktu Tertentu (6 Bulan )
21 Pidana Penjara Waktu Tertentu (1 Tahun 6 Bulan )
22 Pidana Penjara Waktu Tertentu (1 Tahun 3 Bulan )
23 Pidana 

In [104]:
df_clean_penuntut_exploded.loc[1199, 'putusan_hukuman']

'Ketut Mahardika Alias Kelet~Pidana Penjara Waktu Tertentu (2 Tahun 8 Bulan )'

Not Catching Pidana Penjara Waktu Tertentu dengan Tahun Bulan dan Hari

In [105]:
# Filter rows where 'extracted_duration' is null
null_rows = df_clean_penuntut_exploded[df_clean_penuntut_exploded["extracted_duration"].isnull()]

# Display rows with null values in 'extracted_duration'
print(null_rows['putusan_hukuman'])

null_rows

489                          Wayan Lanus~Pidana Bersyarat
820                   NYOMAN SUKRIATMAJA~Pidana Bersyarat
1076    Ketut Bagus Jolinda Atmaja Alias Jolinda~Pidan...
1108    Nyoman Sudiarta alias Comek~Pidana Penjara Wak...
1139    Gusti Ngurah Darma Putra Alias Cuplis~Pidana B...
1140    Gusti Putu Abdiyasa Alias Gusti Abdi~Pidana Be...
Name: putusan_hukuman, dtype: object


Unnamed: 0,nomor_perkara,klasifikasi_perkara,penuntut_umum,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,terdakwa,putusan_hukuman,extracted_duration
489,170/Pid.B/2020/PN Sgr,Penganiayaan,"I MADE HERI PERMANA PUTRA,SH.",I Gede Karang Anggayasa,4,satu buah pedang beserta sarungnya dirampas un...,Bahwa ia Terdakwa Wayan Lanus pada hari Kamis ...,Wayan Lanus,Wayan Lanus~Pidana Bersyarat,
820,172/Pid.B/2020/PN Sgr,Penganiayaan,"MADE JUNI ARTINI, S.H.",I Gede Karang Anggayasa,4,- 1 (satu) buah kayu (potongan alu) - 1 ...,Bahwa ia terdakwa NYOMAN SUKRIATMAJA pada hari...,NYOMAN SUKRIATMAJA,NYOMAN SUKRIATMAJA~Pidana Bersyarat,
1076,139/Pid.B/2018/PN Sgr,Penggelapan,"MADE JUNI ARTINI, S.H.",Sudar,2,----------------------------------------------...,- Bahwa terdakwa KETUT BAGUS JOLINDA ATMAJA pa...,Ketut Bagus Jolinda Atmaja Alias Jolinda,Ketut Bagus Jolinda Atmaja Alias Jolinda~Pidan...,
1108,123/Pid.B/2018/PN Sgr,Kejahatan Perjudian,"I KETUT KINDRA, SH",Sudar,3,1 (satu) bendel Kupon putih berisi tulisan ang...,"dakwaan, terdakwa NYOMAN SUDIARTA Als COMEK y...",Nyoman Sudiarta alias Comek,Nyoman Sudiarta alias Comek~Pidana Penjara Wak...,
1139,82/Pid.B/2018/PN Sgr,Kejahatan Terhadap Ketertiban Umum,"I GEDE PUTU ASTAWA, SH.",Sudar,3,1(satu) buah baju kaos warna hitam milik korba...,"dakwaan tersebut diatas, yaitu berawal kesalah...",Gusti Ngurah Darma Putra Alias Cuplis,Gusti Ngurah Darma Putra Alias Cuplis~Pidana B...,
1140,82/Pid.B/2018/PN Sgr,Kejahatan Terhadap Ketertiban Umum,"I GEDE PUTU ASTAWA, SH.",Sudar,3,1(satu) buah baju kaos warna hitam milik korba...,"dakwaan tersebut diatas, yaitu berawal kesalah...",Gusti Putu Abdiyasa Alias Gusti Abdi,Gusti Putu Abdiyasa Alias Gusti Abdi~Pidana Be...,


In [106]:
#TEMPORARY
df_clean_penuntut_exploded = df_clean_penuntut_exploded.dropna(subset=['extracted_duration'])

In [109]:
df_extracted = df_clean_penuntut_exploded.copy()

pattern_0 = r"\(?\s*(\d+)?\s*Tahun?\s*(\d+)?\s*Bulan?\s*(\d+)\s*Hari?\s*\)?"
pattern = r"\(?\s*(\d+)?\s*Tahun?\s*(\d+)?\s*Bulan?\s*\)?"
pattern_tahun = r"\s*(\d+)\s*Tahun"
pattern_bulan = r"\s*(\d+)\s*Bulan"

# Initialize new columns in the DataFrame for years and months
df_extracted['Tahun'] = None
df_extracted['Bulan'] = None
df_extracted['Hari'] = None

# Iterate over each row and extract years and months
for index, row in df_extracted.iterrows():
    text = row["putusan_hukuman"]
    match = re.search(pattern, text, re.IGNORECASE)
    match_2 = re.search(pattern_tahun, text, re.IGNORECASE)
    match_3 = re.search(pattern_bulan, text, re.IGNORECASE)
    match_0 = re.search(pattern_0, text, re.IGNORECASE)
    
    if match:
        tahun = int(match.group(1)) if match.group(1) else 0  # Extract "Tahun" as an integer
        bulan = int(match.group(2)) if match.group(2) else 0  # Extract "Bulan" as an integer, default to 0 if empty
        hari = 0

    elif match_0:
        tahun = int(match.group(1))  # Extract "Tahun" as an integer
        bulan = int(match.group(2)) if match.group(2) else 0  # Extract "Bulan" as an integer, default to 0 if empty
        hari = int(match.group(3)) if match.group(3) else 0

    elif match_2:
        tahun = int(match_2.group(1))
        bulan = 0
        hari = 0

    elif match_3:
        tahun = 0
        bulan = int(match_3.group(1))
        hari = 0

    else:
        tahun = None
        bulan = None
        hari = None

    df_extracted.loc[index, 'Tahun'] = tahun
    df_extracted.loc[index, 'Bulan'] = bulan
    df_extracted.loc[index, 'Hari'] = hari
# Display the updated DataFrame
print(df_extracted)

                nomor_perkara                            klasifikasi_perkara  \
0     118/Pid.Sus/2024/PN Sgr                                      Narkotika   
1     117/Pid.Sus/2024/PN Sgr                                      Narkotika   
2     117/Pid.Sus/2024/PN Sgr                                      Narkotika   
3     112/Pid.Sus/2024/PN Sgr                                      Narkotika   
4     112/Pid.Sus/2024/PN Sgr                                      Narkotika   
...                       ...                                            ...   
1198    237/Pid.B/2017/PN Sgr                                    Penggelapan   
1199    236/Pid.B/2017/PN Sgr                       Kejahatan terhadap Nyawa   
1200    227/Pid.B/2017/PN Sgr                                    Penggelapan   
1201  226/Pid.Sus/2017/PN Sgr                                      Narkotika   
1202  223/Pid.Sus/2017/PN Sgr  Perlindungan dan Pengelolaan Lingkungan Hidup   

                  penuntut_umum        

In [110]:
# Check if any column contains NaN values
df_extracted.isnull().any()

nomor_perkara           False
klasifikasi_perkara     False
penuntut_umum           False
hakim                   False
jumlah_saksi            False
cleaned_barang_bukti     True
cleaned_dakwaan         False
terdakwa                False
putusan_hukuman         False
extracted_duration      False
Tahun                   False
Bulan                   False
Hari                    False
dtype: bool

In [111]:
# Check for NaN values in 'extracted_duration' and drop the rows
df_extracted = df_extracted.dropna(subset=['extracted_duration'])
df_extracted = df_extracted.dropna(subset=['cleaned_barang_bukti']) # TEMPORARY


In [112]:
# Check if any column contains NaN values
df_extracted.isnull().any()

nomor_perkara           False
klasifikasi_perkara     False
penuntut_umum           False
hakim                   False
jumlah_saksi            False
cleaned_barang_bukti    False
cleaned_dakwaan         False
terdakwa                False
putusan_hukuman         False
extracted_duration      False
Tahun                   False
Bulan                   False
Hari                    False
dtype: bool

In [113]:
df_extracted['total_pidana_penjara_bulan'] = (df_extracted['Tahun'] * 12 + df_extracted['Bulan'].fillna(0).astype(int)).astype(int)

  df_extracted['total_pidana_penjara_bulan'] = (df_extracted['Tahun'] * 12 + df_extracted['Bulan'].fillna(0).astype(int)).astype(int)


In [114]:
df_extracted

Unnamed: 0,nomor_perkara,klasifikasi_perkara,penuntut_umum,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,terdakwa,putusan_hukuman,extracted_duration,Tahun,Bulan,Hari,total_pidana_penjara_bulan
0,118/Pid.Sus/2024/PN Sgr,Narkotika,"I MADE SUTAPA,S.H.",I Made Bagiarta,3,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...,I NYOMAN SUARTA,I NYOMAN SUARTA~Subsider Penjara (2 Bulan )\nP...,Pidana Penjara Waktu Tertentu (4 Tahun 4 Bulan ),4,4,0,52
1,117/Pid.Sus/2024/PN Sgr,Narkotika,"I MADE SUTAPA,S.H.",Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,MADE SUDAMA ALIAS KARTOLO,MADE SUDAMA ALIAS KARTOLO~Subsider Penjara (1 ...,Subsider Penjara (1 Tahun 2 Bulan ),1,2,0,14
2,117/Pid.Sus/2024/PN Sgr,Narkotika,"I MADE SUTAPA,S.H.",Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,PUTU WISNU ALIAS WISNU,PUTU WISNU ALIAS WISNU~Subsider Penjara (10 Bu...,Subsider Penjara (10 Bulan ),0,10,0,10
3,112/Pid.Sus/2024/PN Sgr,Narkotika,"Kadek Adi Pramarta, S.H.",I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,I KOMANG TEKEN Alias TEKEN,I KOMANG TEKEN Alias TEKEN~Pidana Penjara Wakt...,Pidana Penjara Waktu Tertentu (9 Bulan ),0,9,0,9
4,112/Pid.Sus/2024/PN Sgr,Narkotika,"Kadek Adi Pramarta, S.H.",I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,KADEK ALBET SANJAYA Alias ALBET,KADEK ALBET SANJAYA Alias ALBET~Pidana Penjara...,Pidana Penjara Waktu Tertentu (9 Bulan ),0,9,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198,237/Pid.B/2017/PN Sgr,Penggelapan,"IMAM EKA SETYAWAN, SH.",Ni Made Dewi Sukrani,4,1 (satu) buah Buku Simpanan atau Tabungan Kopr...,Kesatu \nPrimair.\n- Bahwa terdakwa AYU PUTU...,Ayu Putu Erlyandani,Ayu Putu Erlyandani~Pidana Penjara Waktu Terte...,Pidana Penjara Waktu Tertentu (10 Bulan ),0,10,0,10
1199,236/Pid.B/2017/PN Sgr,Kejahatan terhadap Nyawa,"I Nyoman Sulitra, SH.,MH.",Ni Made Dewi Sukrani,10,"1 (satu) buah pedang dengan panjang 55 cm, den...",Kesatu\n- Bahwa terdakwa KETUT MAHARDIKA Als ...,Ketut Mahardika Alias Kelet,Ketut Mahardika Alias Kelet~Pidana Penjara Wak...,Pidana Penjara Waktu Tertentu (2 Tahun 8 Bulan ),2,8,0,32
1200,227/Pid.B/2017/PN Sgr,Penggelapan,"I GEDE PUTU ASTAWA, SH.",Ida Bagus Bama Dewa. P,3,1(satu) buah tas Kalep warna merah merk Son Al...,Pertama Pasal 372 Kitab Undang-undang Hukum P...,Iwan Hermato Alias Iwan,Iwan Hermato Alias Iwan~Pidana Kurungan (1 Tah...,Pidana Kurungan (1 Tahun ),1,0,0,12
1201,226/Pid.Sus/2017/PN Sgr,Narkotika,"I GEDE PUTU ASTAWA, SH.",Sudar,2,1(satu) bungkus rokok merk Sampoerna Mild Men...,Pertama Pasal 112 ayat (1) UU RI No. 35 tahun...,Komang Irwin Pranata Alias Erwin,Komang Irwin Pranata Alias Erwin~Pidana Penjar...,Pidana Penjara Waktu Tertentu (11 Bulan ),0,11,0,11


In [None]:
df_extracted.to_csv(f'../Data/STAGE 2 PREPROCESSING/STAGE_2_PREPROCESSING_{database_name}.csv', index=False)

## Data Validation

In [3]:
df_extracted = pd.read_csv(f'../Data/STAGE 2 PREPROCESSING/STAGE_2_PREPROCESSING_{database_name}.csv')

selected_data = ['klasifikasi_perkara', 'terdakwa', 'hakim', 'jumlah_saksi', 'cleaned_barang_bukti', 'cleaned_dakwaan', 'total_pidana_penjara_bulan']
df_extracted = df_extracted[selected_data].copy()

df_extracted.head(10)

Unnamed: 0,klasifikasi_perkara,terdakwa,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,total_pidana_penjara_bulan
0,Kejahatan Perjudian,KETUT SUBAGIA,I Made Bagiarta,3,1 (satu) buah papan bola bergambar 1 (satu) bu...,DAKWAAN\n PERTAMA \n- Bahwa Terdakwa KETUT SU...,4
1,Kejahatan Perjudian,KOMANG ADITYA,I Made Bagiarta,3,1 (satu) buah papan bola bergambar 1 (satu) bu...,DAKWAAN\n PERTAMA \n- Bahwa Terdakwa KETUT SU...,4
2,Narkotika,I NYOMAN SUARTA,I Made Bagiarta,3,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN \nKESATU\n Bahwa ia Terdak...,52
3,Narkotika,RICO JAYADI,I Made Bagiarta,3,1 (satu) buah dompet warna hijau 20 (dua puluh...,DAKWAAN \nKESATU\n ...,72
4,Narkotika,SANG PUTU WIDIANA,I Gusti Made Juliartawan,4,1(satu) buah HP merk Realme warna hitam putih,Dakwaan\nKe Satu\n- Bahwa terdakwa SANG PUTU...,48
5,Narkotika,WAYAN DARNA MASTONO,I Gusti Made Juliartawan,4,6 (enam) Paket plastik klip bening yang berisi...,dakwaan petugas Sat Narkoba Polres Buleleng m...,48
6,Narkotika,MADE SUDAMA ALIAS KARTOLO,Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,14
7,Narkotika,PUTU WISNU ALIAS WISNU,Made Hermayanti Muliartha,3,1 (satu) buah plastik klip bening berisi butir...,DAKWAAN \nKESATU\n Bahwa ia Terdakw...,10
8,Narkotika,MUCH.ARIFIN,I Gusti Made Juliartawan,3,1 (satu) pipet plastik didalamnya terdapat 2 (...,DAKWAAN\nKesatu \n-Bahwa Terdakwa MUCH ARIFIN...,16
9,Narkotika,I KOMANG TEKEN Alias TEKEN,I Made Bagiarta,4,1. 1 (satu) buah buah pipet kaca berisi residu...,KESATU\nBahwa Terdakwa 1. I Komang Teken Alias...,9


In [5]:
def validate_schema(df):
    print("**Schema Validation**")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Data types:\n{df.dtypes}\n")

def check_nulls(df):
    print("**Null Value Analysis**")
    null_counts = df.isnull().sum()
    print(f"Null counts per column:\n{null_counts}\n")

def check_duplicates(df):
    print("**Duplicate Rows**")
    duplicates = df[df.duplicated()]
    print(f"Number of duplicate rows: {len(duplicates)}\n")
    if not duplicates.empty:
        print(f"Duplicate rows:\n{duplicates}\n")

def summarize_statistics(df):
    print("**Statistics Summary**")
    stats = df.describe(include="all").transpose()
    print(tabulate(stats, headers='keys', tablefmt='pretty'))
    


In [6]:
validate_schema(df_extracted)

**Schema Validation**
Columns: ['klasifikasi_perkara', 'terdakwa', 'hakim', 'jumlah_saksi', 'cleaned_barang_bukti', 'cleaned_dakwaan', 'total_pidana_penjara_bulan']
Data types:
klasifikasi_perkara           object
terdakwa                      object
hakim                         object
jumlah_saksi                   int64
cleaned_barang_bukti          object
cleaned_dakwaan               object
total_pidana_penjara_bulan     int64
dtype: object



In [8]:
# Iterate through each column in the DataFrame
for column in df_extracted.columns:
    # Get unique data types in the column
    types_in_column = df_extracted[column].apply(type).unique()
    # Print the column name and its unique data types
    print(f"Data types in column '{column}' -> {types_in_column}")


Data types in column 'klasifikasi_perkara' -> [<class 'str'>]
Data types in column 'terdakwa' -> [<class 'str'>]
Data types in column 'hakim' -> [<class 'str'>]
Data types in column 'jumlah_saksi' -> [<class 'int'>]
Data types in column 'cleaned_barang_bukti' -> [<class 'str'>]
Data types in column 'cleaned_dakwaan' -> [<class 'str'>]
Data types in column 'total_pidana_penjara_bulan' -> [<class 'int'>]


In [9]:
check_nulls(df_extracted)

**Null Value Analysis**
Null counts per column:
klasifikasi_perkara           0
terdakwa                      0
hakim                         0
jumlah_saksi                  0
cleaned_barang_bukti          0
cleaned_dakwaan               0
total_pidana_penjara_bulan    0
dtype: int64



In [10]:
check_duplicates(df_extracted)

**Duplicate Rows**
Number of duplicate rows: 3

Duplicate rows:
    klasifikasi_perkara                             terdakwa  \
157         Penggelapan              KOMANG AGUS SULENDRAWAN   
234           Narkotika  I GEDE EKA ANGGA WIJANA alias ANGGA   
995          Pembunuhan                  I Ketut Budi Astawa   

                        hakim  jumlah_saksi  \
157  I Gusti Made Juliartawan             3   
234           I Made Bagiarta             3   
995          Mayasari Oktavia             6   

                                  cleaned_barang_bukti  \
157  1 (satu) unit Sepeda motor Jenis Honda BEAT, T...   
234  2 (dua) paket gulungan aluminium foil warna si...   
995  1 (satu) potong baju kaos warna hitam beri nod...   

                                       cleaned_dakwaan  \
157  - Bahwa ia Terdakwa KOMANG AGUS SULENDRAWAN Al...   
234  Bahwa  terdakwa  I GEDE EKA ANGGA WIJANA alias...   
995  Primair  \n \n-Bahwa ia terdakwa I KETUT BUDI ...   

     total_pidana_penjar

In [11]:
row1 = df_extracted.loc[157]
row2 = df_extracted.loc[234]
print(row1.equals(row2))


False


In [9]:
differences = row1 != row2
print(differences[differences]) 


klasifikasi_perkara           True
terdakwa                      True
hakim                         True
cleaned_barang_bukti          True
cleaned_dakwaan               True
total_pidana_penjara_bulan    True
dtype: bool


In [10]:
summarize_statistics(df_extracted)

**Statistics Summary**
+----------------------------+--------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------------+--------------------+-----+-----+------+------+-------+
|                            | count  | unique |                                                                                   top                                                                                    | freq |        mean        |        std         | min | 25% | 50%  | 75%  |  max  |
+----------------------------+--------+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------------+--------------------+-----+-----+------+------+-------+
|    klasifikasi_perkara     |  1196  |   33   |                    

In [11]:
validation_results = pd.DataFrame()

validation_results = df_extracted.select_dtypes(include="number").apply(
    lambda x: x[(x < x.quantile(0.01)) | (x > x.quantile(0.99))]
)

print(tabulate(validation_results, headers='keys', tablefmt='pretty'))

+------+--------------+----------------------------+
|      | jumlah_saksi | total_pidana_penjara_bulan |
+------+--------------+----------------------------+
|  93  |     nan      |           144.0            |
|  96  |     15.0     |            nan             |
| 113  |     nan      |           144.0            |
| 144  |     nan      |           168.0            |
| 146  |     13.0     |            nan             |
| 220  |     nan      |           156.0            |
| 263  |     12.0     |            nan             |
| 279  |     nan      |           144.0            |
| 280  |     nan      |           144.0            |
| 448  |     13.0     |            nan             |
| 449  |     13.0     |            nan             |
| 450  |     13.0     |            nan             |
| 480  |     13.0     |            nan             |
| 563  |     12.0     |           156.0            |
| 743  |     nan      |           720.0            |
| 859  |     nan      |           168.0       

# Exploratory Data Analysis

# Text Summarization

In [5]:
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("CUDA version:", torch.version.cuda)

CUDA available: True
Device count: 1
CUDA version: 12.1


In [4]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.1214, 0.0298, 0.0406],
        [0.8481, 0.8833, 0.1258],
        [0.8622, 0.3680, 0.7690],
        [0.6251, 0.0697, 0.8667],
        [0.2934, 0.7982, 0.9171]])


In [15]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Sep_12_02:55:00_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.77
Build cuda_12.6.r12.6/compiler.34841621_0


In [16]:
!nvidia-smi

Fri Dec  6 12:56:58 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.14                 Driver Version: 566.14         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   49C    P8              4W /   76W |     670MiB /   8188MiB |     13%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Text Summarization "t5-base-indonesian-summarization-cased"

In [7]:
from transformers import pipeline

from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu
import torch
from bert_score import score
from nltk.tokenize import word_tokenize
from nltk.translate import meteor_score
import nltk

import time
from torch import float16

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### CLEAN \N

In [8]:
df_extracted['cleaned_dakwaan'] = df_extracted['cleaned_dakwaan'].str.replace('\n', ' ', regex=True)

In [9]:
df_extracted['cleaned_dakwaan']

0       DAKWAAN   PERTAMA  - Bahwa Terdakwa KETUT SUBA...
1       DAKWAAN   PERTAMA  - Bahwa Terdakwa KETUT SUBA...
2       DAKWAAN   KESATU             Bahwa ia Terdakwa...
3       DAKWAAN   KESATU                              ...
4       Dakwaan Ke Satu - Bahwa  terdakwa  SANG PUTU W...
                              ...                        
1191    Kesatu  Pasal 83 ayat (1) huruf b UU RI No. 18...
1192    Perbuatan terdakwa sebagaimana diatur dan dian...
1193    Pasal 351 ayat (1) Kitab Undang-undang Hukum p...
1194    Pertama  Pasal 187 ayat (1) KUHP dan Kedua  Pa...
1195          Pasal 362 Kitab Undang-undang Hukum Pidana.
Name: cleaned_dakwaan, Length: 1196, dtype: object

In [None]:
    # "cahya/bert2bert-indonesian-summarization",
    # "rowjak/bert-indonesian-news-summarization"
    # "cahya/bert2gpt-indonesian-summarization",

model_names = [
    "cahya/t5-base-indonesian-summarization-cased",
    "panggi/t5-small-indonesian-summarization-cased",
    "interstellarx95/mt5-small-finetuned-indonesian-text-summarization-one-epoch",
    "interstellarx95/mt5-small-finetuned-indonesian-text-summarization-v3",
    "panggi/t5-base-indonesian-summarization-cased",
    "rayendito/mt5-small-finetuned-xl-sum-indonesia",
    "interstellarx95/mt5-small-finetuned-indonesian-text-summarization-three-epochs",
]

In [None]:
# pipelines = {
#     model_name: pipeline(
#         "summarization",
#         model=model_name,
#         use_fast=True,
#         device="cuda",
#         batch_size=batch_size,
#         torch_dtype="auto"
#     )
#     for model_name in model_names
# }

Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "_name_or_path": "cahya/gpt2-small-indonesian-522M",
  "activation_func

KeyboardInterrupt: 

In [None]:
# import warnings
# from transformers import pipeline, AutoTokenizer
# pipelines ={}

# for model_name in model_names:
#     try:
#         with warnings.catch_warnings(record=True) as caught_warnings:
#             warnings.simplefilter("always")  # Capture all warnings
#             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

#             # Check if the specific sentencepiece warning is raised
#             sentencepiece_warning = any(
#                 "The sentencepiece tokenizer that you are converting to a fast tokenizer" in str(w.message)
#                 for w in caught_warnings
#             )

#             if sentencepiece_warning:
#                 print(f"Warning for {model_name}: Sentencepiece tokenizer fallback detected. Using slow tokenizer.")
#                 use_fast = False
#             else:
#                 use_fast = True

#     except Exception as e:
#         print(f"Fast tokenizer not supported for {model_name} due to error: {str(e)}. Using slow tokenizer.")
#         use_fast = False

#     # Create pipeline with determined tokenizer type
#     pipelines[model_name] = pipeline(
#         "summarization",
#         model=model_name,
#         use_fast=use_fast,
#         device=0,
#     )



The encoder model config class: <class 'transformers.models.bert.configuration_bert.BertConfig'> is different from the decoder model config class: <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'>. It is not recommended to use the `AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder specific tokenizer classes.
Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0



Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

Config of the decoder: <class 'transformers.models.bert.modeling_bert.BertLMHeadModel'> is overwritten by shared decoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "add_cross_attent

In [20]:
pipelines

{'cahya/t5-base-indonesian-summarization-cased': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243ddf0ece0>,
 'panggi/t5-small-indonesian-summarization-cased': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243e05602e0>,
 'interstellarx95/mt5-small-finetuned-indonesian-text-summarization-one-epoch': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243d667f610>,
 'interstellarx95/mt5-small-finetuned-indonesian-text-summarization-v3': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243e056dae0>,
 'cahya/bert2gpt-indonesian-summarization': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243e54cbfd0>,
 'panggi/t5-base-indonesian-summarization-cased': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243e54cbe20>,
 'rayendito/mt5-small-finetuned-xl-sum-indonesia': <transformers.pipelines.text2text_generation.SummarizationPipeline at 0x243d38195d0>,


In [17]:
import pprint

pprint.pprint(pipelines)

{'cahya/bert2gpt-indonesian-summarization': <transformers.pipelines.text2text_generation.SummarizationPipeline object at 0x00000243E04B5EA0>,
 'cahya/t5-base-indonesian-summarization-cased': <transformers.pipelines.text2text_generation.SummarizationPipeline object at 0x00000243D3818FA0>,
 'interstellarx95/mt5-small-finetuned-indonesian-text-summarization-one-epoch': <transformers.pipelines.text2text_generation.SummarizationPipeline object at 0x00000243D4E7FA30>,
 'interstellarx95/mt5-small-finetuned-indonesian-text-summarization-three-epochs': <transformers.pipelines.text2text_generation.SummarizationPipeline object at 0x00000243D6723010>,
 'interstellarx95/mt5-small-finetuned-indonesian-text-summarization-v3': <transformers.pipelines.text2text_generation.SummarizationPipeline object at 0x00000243D4E3DCC0>,
 'panggi/t5-base-indonesian-summarization-cased': <transformers.pipelines.text2text_generation.SummarizationPipeline object at 0x00000243D4BB4880>,
 'panggi/t5-small-indonesian-summ

In [10]:
df_extracted

Unnamed: 0,klasifikasi_perkara,terdakwa,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,total_pidana_penjara_bulan
0,Kejahatan Perjudian,KETUT SUBAGIA,I Made Bagiarta,3,1 (satu) buah papan bola bergambar 1 (satu) bu...,DAKWAAN PERTAMA - Bahwa Terdakwa KETUT SUBA...,4
1,Kejahatan Perjudian,KOMANG ADITYA,I Made Bagiarta,3,1 (satu) buah papan bola bergambar 1 (satu) bu...,DAKWAAN PERTAMA - Bahwa Terdakwa KETUT SUBA...,4
2,Narkotika,I NYOMAN SUARTA,I Made Bagiarta,3,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN KESATU Bahwa ia Terdakwa...,52
3,Narkotika,RICO JAYADI,I Made Bagiarta,3,1 (satu) buah dompet warna hijau 20 (dua puluh...,DAKWAAN KESATU ...,72
4,Narkotika,SANG PUTU WIDIANA,I Gusti Made Juliartawan,4,1(satu) buah HP merk Realme warna hitam putih,Dakwaan Ke Satu - Bahwa terdakwa SANG PUTU W...,48
...,...,...,...,...,...,...,...
1191,Perlindungan dan Pengelolaan Lingkungan Hidup,Jumat Ariyanto,Ida Bagus Bama Dewa. P,3,1 (satu) 1 (satu) batang kayu balang-balang be...,Kesatu Pasal 83 ayat (1) huruf b UU RI No. 18...,12
1192,Kejahatan Perjudian,I KETUT ANA SAPUTRA Alias BENCUT,Sudar,1,36 (tiga puluh enam) lembar kertas rekapan,Perbuatan terdakwa sebagaimana diatur dan dian...,4
1193,Penganiayaan,Sahabudin Alias Udin,Ni Luh Suantini,2,1. 1 (satu) buah tabung gas elpiji ukuran 3 (t...,Pasal 351 ayat (1) Kitab Undang-undang Hukum p...,9
1194,Kejahatan yang Membahayakan Keamananan Umum Ba...,Putu Kristian Damanta,A.A. Sagung Yuni Wulantrisna,4,-1 (satu)unit sepeda motor merk Yamaha Jupiter...,Pertama Pasal 187 ayat (1) KUHP dan Kedua Pa...,24


In [10]:
avg_length = df_extracted['cleaned_dakwaan'].apply(lambda text: len(text.split())).mean()

print(f"Average length of cleaned_dakwaan: {avg_length:.2f} words")

Average length of cleaned_dakwaan: 691.86 words


In [10]:
import datetime
import psutil
import os
def log_memory_usage():
    print(f"CPU usage: {psutil.cpu_percent()}%, Memory usage: {psutil.virtual_memory().percent}%")
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")
        print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MB")

save_dir = '../Data/STAGE 3 PREPROCESSING'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)



In [16]:
import warnings
from transformers import pipeline, AutoTokenizer
import time
import datetime
import torch
from torch.amp import autocast

warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
warnings.filterwarnings("ignore", message="Your max_length is set to \d+, but your input_length is only \d+", category=UserWarning, module="transformers")
chunk_size = 450
batch_size = 2048

for model_name in model_names:
    pipelines = {}
    try:
        with warnings.catch_warnings(record=True) as caught_warnings:
            warnings.simplefilter("always")  # Capture all warnings
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

            # Check for the sentencepiece warning
            sentencepiece_warning = any(
                "The sentencepiece tokenizer that you are converting to a fast tokenizer" in str(w.message)
                for w in caught_warnings
            )

            if sentencepiece_warning:
                print(f"Warning for {model_name}: Sentencepiece tokenizer fallback detected. Using slow tokenizer.")
                use_fast = False
            else:
                use_fast = True

    except Exception as e:
        print(f"Fast tokenizer not supported for {model_name} due to error: {str(e)}. Using slow tokenizer.")
        use_fast = False

    # Create the summarization pipeline
    pipelines[model_name] = pipeline(
        "summarization",
        model=model_name,
        use_fast=use_fast,
        device=0,  # Change this to `-1` if not using GPU
    )

    for key, pipe in pipelines.items():
        start_time = time.time()
        summarized_texts = []  # To store summaries for each row

        for index, row in df_extracted.iterrows():
            # print(f"Processing row {index + 1}/{len(df_extracted)} for model {key}")
            if index % 100 == 0:
                print(f"Processing row {index + 1}/{len(df_extracted)} - {index + 100}/{len(df_extracted)} for model {key}")

            text = row['cleaned_dakwaan']

            # Split the text into manageable chunks
            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

            # Summarize each batch of chunks
            summaries = []
            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i + batch_size]

                # Dynamically calculate max_length and min_length
                valid_lengths = [len(chunk.split()) for chunk in batch if chunk.strip()]
                if valid_lengths:
                    batch_max_length = min(max(max(valid_lengths) // 3, 100), 200)  # Adjust max range
                    batch_min_length = max(batch_max_length // 2, 50)  # Raise min range
                else:
                    batch_max_length = 100  # Default max for very short chunks
                    batch_min_length = 50  # Default min for very short chunks

                try:
                    with autocast(device_type='cuda'):  # Enable mixed-precision for efficiency
                        results = pipe(
                            batch,
                            max_length=batch_max_length,
                            min_length=batch_min_length,
                            do_sample=False,
                        )
                        summaries.extend([result.get("generated_text", result.get("summary_text", "")) for result in results])
                except Exception as e:
                    print(f"Error with model {key} on batch {i // batch_size}: {e}")
                    summaries.extend(["" for _ in batch])  # Add empty summaries for failed batches

            # Combine all summaries for the current row
            summarized_texts.append(" ".join(summaries))

        # Save summarized texts in the DataFrame
        df_extracted[f'{key}_summarization_dakwaan'] = summarized_texts

        # Save intermediate results to avoid data loss
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
        df_extracted.to_csv(save_dir, index=False)
        print(f"\nCHECKPOINT! Data Saved for model {key} at {save_dir}")

        # Log memory usage (assuming log_memory_usage is implemented elsewhere)
        # log_memory_usage()

        # Free GPU memory
        del pipe
        torch.cuda.empty_cache()
        print(f"Model {key} - Execution time: {time.time() - start_time:.2f} seconds\n")

    # Free pipelines dictionary
    del pipelines
    torch.cuda.empty_cache()


NameError: name 'model_names' is not defined

# single model


In [None]:
import warnings
from transformers import BertTokenizer, EncoderDecoderModel
import time
import torch
import datetime
from torch.amp import autocast

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("cahya/bert2bert-indonesian-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained("cahya/bert2bert-indonesian-summarization").to('cuda')

# Parameters
chunk_size = 450
batch_size = 16  # Number of chunks per batch for model inference

# To store summaries
summarized_texts = []

start_time = time.time()

# Iterate over rows
for index, row in df_extracted.iterrows():
    if index % 100 == 0:
        print(f"Processing row {index + 1}/{len(df_extracted)} - {index + 100}/{len(df_extracted)}")
    text = row['cleaned_dakwaan']

    # Split the text into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Summarize each batch of chunks
    summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]

        # Tokenize the batch
        encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=chunk_size).to('cuda')

        # Dynamic length adjustment
        valid_lengths = [len(chunk.split()) for chunk in batch if chunk.strip()]
        if valid_lengths:
            batch_max_length = min(max(max(valid_lengths) // 3, 100), 200)  # Adjust max range
            batch_min_length = max(batch_max_length // 2, 50)  # Raise min range
        else:
            batch_max_length = 100  # Default max for very short chunks
            batch_min_length = 50  # Default min for very short chunks

        try:
            with autocast(device_type='cuda'):
            # Generate summaries
                summary_ids = model.generate(
                    input_ids=encoded_input['input_ids'],
                    attention_mask=encoded_input['attention_mask'],  # Include this line
                    min_length=batch_min_length,
                    max_length=batch_max_length,
                    num_beams=10,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True,
                    no_repeat_ngram_size=2,
                    use_cache=True,
                    do_sample=True,
                    temperature=0.8,
                    top_k=50,
                    top_p=0.95
                )
                summaries.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids])
        except Exception as e:
            print(f"Error processing batch {i // batch_size}: {e}")
            summaries.extend(["" for _ in batch])  # Add empty summaries for failed batches

    # Combine all summaries for the current row
    # print(summaries)
    summarized_texts.append(" ".join(summaries))
    del encoded_input, summary_ids
    torch.cuda.empty_cache()

# Save summarized texts in the DataFrame
df_extracted['cahya_bert2bert_summarization_dakwaan'] = summarized_texts

# Save intermediate results to avoid data loss
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
df_extracted.to_csv(save_dir, index=False)
print(f"\nCHECKPOINT! Data Saved for model {model_name} at {save_dir}")
print(f"Summarization completed in {time.time() - start_time:.2f} seconds")



Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

Config of the decoder: <class 'transformers.models.bert.modeling_bert.BertLMHeadModel'> is overwritten by shared decoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "add_cross_attent

Processing row 1/1196 - 100/1196


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Processing row 101/1196 - 200/1196
Processing row 201/1196 - 300/1196
Processing row 301/1196 - 400/1196
Processing row 401/1196 - 500/1196
Processing row 501/1196 - 600/1196
Processing row 601/1196 - 700/1196
Processing row 701/1196 - 800/1196
Processing row 801/1196 - 900/1196
Processing row 901/1196 - 1000/1196
Processing row 1001/1196 - 1100/1196
Processing row 1101/1196 - 1200/1196


OSError: Cannot save file into a non-existent directory: '..\Data\STAGE_3_PREPROCESSING'

In [12]:
# Save intermediate results to avoid data loss
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
df_extracted.to_csv(save_dir, index=False)
print(f"\nCHECKPOINT! Data Saved for model cahya/bert2bert at {save_dir}")
print(f"Summarization completed in {time.time() - start_time:.2f} seconds")


CHECKPOINT! Data Saved for model cahya/bert2bert at ../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_20241207_101016.csv
Summarization completed in 8207.40 seconds


In [None]:
import warnings
from transformers import BertTokenizer, EncoderDecoderModel
import time
import torch
import datetime
from torch.amp import autocast

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("rowjak/bert-indonesian-news-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained("rowjak/bert-indonesian-news-summarization").to('cuda')

# Parameters
chunk_size = 450
batch_size = 32  # Number of chunks per batch for model inference

# To store summaries
summarized_texts = []

start_time = time.time()

# Iterate over rows
for index, row in df_extracted.iterrows():
    if index % 100 == 0:
        print(f"Processing row {index + 1}/{len(df_extracted)} - {index + 100}/{len(df_extracted)}")
    text = row['cleaned_dakwaan']

    # Split the text into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Summarize each batch of chunks
    summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]

        # Tokenize the batch
        encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=chunk_size).to('cuda')

        # Dynamic length adjustment
        valid_lengths = [len(chunk.split()) for chunk in batch if chunk.strip()]
        if valid_lengths:
            batch_max_length = min(max(max(valid_lengths) // 3, 100), 200)  # Adjust max range
            batch_min_length = max(batch_max_length // 2, 50)  # Raise min range
        else:
            batch_max_length = 100  # Default max for very short chunks
            batch_min_length = 50  # Default min for very short chunks

        try:
            with autocast(device_type='cuda'):
            # Generate summaries
                summary_ids = model.generate(
                    input_ids=encoded_input['input_ids'],
                    attention_mask=encoded_input['attention_mask'],
                    num_beams=2,
                    repetition_penalty=2.5, 
                    length_penalty=1.0, 
                    early_stopping=True,
                    no_repeat_ngram_size=2,
                    use_cache=True,
                    max_length=batch_max_length,
                    min_length=batch_min_length)

                summaries.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids])
        except Exception as e:
            print(f"Error processing batch {i // batch_size}: {e}")
            summaries.extend(["" for _ in batch])  # Add empty summaries for failed batches

    # Combine all summaries for the current row
    # print(summaries)
    summarized_texts.append(" ".join(summaries))
    del encoded_input, summary_ids
    torch.cuda.empty_cache()

# Save summarized texts in the DataFrame
df_extracted['rowjak/bert-indonesian-news-summarization'] = summarized_texts

# Save intermediate results to avoid data loss
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
df_extracted.to_csv(save_dir, index=False)
print(f"\nCHECKPOINT! Data Saved for model at {save_dir}")
print(f"Summarization completed in {time.time() - start_time:.2f} seconds")



Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

Config of the decoder: <class 'transformers.models.bert.modeling_bert.BertLMHeadModel'> is overwritten by shared decoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "add_cross_attent

Processing row 1/1196 - 100/1196
Processing row 101/1196 - 200/1196
Processing row 201/1196 - 300/1196
Processing row 301/1196 - 400/1196
Processing row 401/1196 - 500/1196
Processing row 501/1196 - 600/1196
Processing row 601/1196 - 700/1196
Processing row 701/1196 - 800/1196
Processing row 801/1196 - 900/1196
Processing row 901/1196 - 1000/1196
Processing row 1001/1196 - 1100/1196
Processing row 1101/1196 - 1200/1196


NameError: name 'model_name' is not defined

In [14]:
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
df_extracted.to_csv(save_dir, index=False)
print(f"\nCHECKPOINT! Data Saved for model at {save_dir}")


CHECKPOINT! Data Saved for model at ../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_20241207_105235.csv


In [15]:
import warnings
from transformers import BertTokenizer, EncoderDecoderModel
import time
import torch
import datetime
from torch.amp import autocast

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("cahya/bert2gpt-indonesian-summarization")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained("cahya/bert2gpt-indonesian-summarization").to('cuda')

# Parameters
chunk_size = 450
batch_size = 32  # Number of chunks per batch for model inference

# To store summaries
summarized_texts = []

start_time = time.time()

# Iterate over rows
for index, row in df_extracted.iterrows():
    if index % 100 == 0:
        print(f"Processing row {index + 1}/{len(df_extracted)} - {index + 100}/{len(df_extracted)}")
    text = row['cleaned_dakwaan']

    # Split the text into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Summarize each batch of chunks
    summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]

        # Tokenize the batch
        encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=chunk_size).to('cuda')

        # Dynamic length adjustment
        valid_lengths = [len(chunk.split()) for chunk in batch if chunk.strip()]
        if valid_lengths:
            batch_max_length = min(max(max(valid_lengths) // 3, 100), 200)  # Adjust max range
            batch_min_length = max(batch_max_length // 2, 50)  # Raise min range
        else:
            batch_max_length = 100  # Default max for very short chunks
            batch_min_length = 50  # Default min for very short chunks

        try:
            with autocast(device_type='cuda'):
            # Generate summaries
                summary_ids = model.generate(
                    input_ids=encoded_input['input_ids'],
                    attention_mask=encoded_input['attention_mask'],  # Include this line
                    min_length=batch_min_length,
                    max_length=batch_max_length,
                    num_beams=10,
                    repetition_penalty=2.5,
                    length_penalty=1.0,
                    early_stopping=True,
                    no_repeat_ngram_size=2,
                    use_cache=True,
                    do_sample=True,
                    temperature=0.8,
                    top_k=50,
                    top_p=0.95
                )
                summaries.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in summary_ids])
        except Exception as e:
            print(f"Error processing batch {i // batch_size}: {e}")
            summaries.extend(["" for _ in batch])  # Add empty summaries for failed batches

    # Combine all summaries for the current row
    # print(summaries)
    summarized_texts.append(" ".join(summaries))
    del encoded_input, summary_ids
    torch.cuda.empty_cache()

# Save summarized texts in the DataFrame
df_extracted['cahya_bert2gpt_summarization_dakwaan'] = summarized_texts

# Save intermediate results to avoid data loss
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
df_extracted.to_csv(save_dir, index=False)
print(f"\nCHECKPOINT! Data Saved for model at {save_dir}")
print(f"Summarization completed in {time.time() - start_time:.2f} seconds")



Config of the encoder: <class 'transformers.models.bert.modeling_bert.BertModel'> is overwritten by shared encoder config: BertConfig {
  "_name_or_path": "cahya/bert-base-indonesian-1.5G",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "_name_or_path": "cahya/gpt2-small-indonesian-522M",
  "activation_func

Processing row 1/1196 - 100/1196
Processing row 101/1196 - 200/1196
Processing row 201/1196 - 300/1196
Processing row 301/1196 - 400/1196
Processing row 401/1196 - 500/1196
Processing row 501/1196 - 600/1196
Processing row 601/1196 - 700/1196
Processing row 701/1196 - 800/1196
Processing row 801/1196 - 900/1196
Processing row 901/1196 - 1000/1196
Processing row 1001/1196 - 1100/1196
Processing row 1101/1196 - 1200/1196

CHECKPOINT! Data Saved for model at ../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_20241207_122142.csv
Summarization completed in 5332.07 seconds


In [27]:
del encoded_input
del summary_ids
del model
del tokenizer
torch.cuda.empty_cache()

NameError: name 'summary_ids' is not defined

In [None]:
# dont run!

import warnings
from transformers import pipeline, AutoTokenizer
import time
import datetime
import torch
from torch.amp import autocast

chunk_size = 450
batch_size = 2048

for model_name in model_names:
    break
    pipelines = {}
    try:
        with warnings.catch_warnings(record=True) as caught_warnings:
            warnings.simplefilter("always")  # Capture all warnings
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

            # Check for the sentencepiece warning
            sentencepiece_warning = any(
                "The sentencepiece tokenizer that you are converting to a fast tokenizer" in str(w.message)
                for w in caught_warnings
            )

            if sentencepiece_warning:
                print(f"Warning for {model_name}: Sentencepiece tokenizer fallback detected. Using slow tokenizer.")
                use_fast = False
            else:
                use_fast = True

    except Exception as e:
        print(f"Fast tokenizer not supported for {model_name} due to error: {str(e)}. Using slow tokenizer.")
        use_fast = False

    # Create the summarization pipeline
    pipelines[model_name] = pipeline(
        "summarization",
        model=model_name,
        use_fast=use_fast,
        device=0,  # Change this to `-1` if not using GPU
    )

    for key, pipe in pipelines.items():
        start_time = time.time()
        summarized_texts = []  # To store summaries for each row

        for index, row in df_extracted.iterrows():
            # Print progress every 100 rows
            print(f"Processing row {index + 1}/{len(df_extracted)} for model {key}")
            # if index % 50 == 0:
                # print(f"Processing row {index + 1}/{len(df_extracted)} - {index + 50}/{len(df_extracted)} for model {key}")

            text = row['cleaned_dakwaan']

            # Split the text into manageable chunks
            chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

            # Summarize each batch of chunks
            summaries = []
            for i in range(0, len(chunks), batch_size):
                batch = chunks[i:i + batch_size]

                # Dynamically calculate max_length and min_length
                valid_lengths = [len(chunk.split()) for chunk in batch if chunk.strip()]
                if valid_lengths:
                    batch_max_length = max(min(max(length // 3 for length in valid_lengths), 150), 30)
                    batch_min_length = max(batch_max_length // 2, 20)
                else:
                    batch_max_length = 30
                    batch_min_length = 20

                try:
                    with autocast(device_type='cuda'):  # Enable mixed-precision for efficiency
                        results = pipe(
                            batch,
                            max_length=batch_max_length,
                            min_length=batch_min_length,
                            do_sample=False,
                        )
                        summaries.extend([result.get("generated_text", result.get("summary_text", "")) for result in results])
                except Exception as e:
                    print(f"Error with model {key} on batch {i // batch_size}: {e}")
                    summaries.extend(["" for _ in batch])  # Add empty summaries for failed batches

            # Combine all summaries for the current row
            summarized_texts.append(" ".join(summaries))

        # Save summarized texts in the DataFrame
        df_extracted[f'{key}_summarization_dakwaan'] = summarized_texts

        # Save intermediate results to avoid data loss
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        save_dir = f'../Data/STAGE_3_PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
        df_extracted.to_csv(save_dir, index=False)
        print(f"\nCHECKPOINT! Data Saved for model {key} at {save_dir}")

        # Log memory usage (assuming log_memory_usage is implemented elsewhere)
        # log_memory_usage()

        # Free GPU memory
        del pipes
        torch.cuda.empty_cache()
        print(f"Model {key} - Execution time: {time.time() - start_time:.2f} seconds\n")

    # Free pipelines dictionary
    del pipelines
    torch.cuda.empty_cache()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Processing row 1/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 2/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 3/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


Processing row 4/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


Processing row 5/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 6/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


Processing row 7/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 8/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 9/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


Processing row 10/1196 for model cahya/t5-base-indonesian-summarization-cased


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processing row 11/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 12/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)


Processing row 13/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


Processing row 14/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Processing row 15/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Processing row 16/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Processing row 17/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Processing row 18/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 19/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 20/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 21/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 22/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 23/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


Processing row 24/1196 for model cahya/t5-base-indonesian-summarization-cased


Your max_length is set to 30, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


Processing row 25/1196 for model cahya/t5-base-indonesian-summarization-cased
Processing row 26/1196 for model cahya/t5-base-indonesian-summarization-cased


: 

In [11]:
del pipe
torch.cuda.empty_cache()

In [12]:

del pipelines
torch.cuda.empty_cache()

In [24]:
save_dir = f'../Data/STAGE 3 PREPROCESSING/SINGARAJA_TEST_STAGE_3_PREPROCESSING_{timestamp}.csv'
df_extracted.to_csv(save_dir, index=False)

In [22]:
df_extracted

Unnamed: 0,klasifikasi_perkara,terdakwa,hakim,jumlah_saksi,cleaned_barang_bukti,cleaned_dakwaan,total_pidana_penjara_bulan,cahya/t5-base-indonesian-summarization-cased_summarization_dakwaan
0,Kejahatan Perjudian,KETUT SUBAGIA,I Made Bagiarta,3,1 (satu) buah papan bola bergambar 1 (satu) bu...,DAKWAAN PERTAMA - Bahwa Terdakwa KETUT SUBA...,4,DAKWAAN PERTAMA- Bahwa Terdakwa KETUT SUBAGIA ...
1,Kejahatan Perjudian,KOMANG ADITYA,I Made Bagiarta,3,1 (satu) buah papan bola bergambar 1 (satu) bu...,DAKWAAN PERTAMA - Bahwa Terdakwa KETUT SUBA...,4,DAKWAAN PERTAMA- Bahwa Terdakwa KETUT SUBAGIA ...
2,Narkotika,I NYOMAN SUARTA,I Made Bagiarta,3,38 (tiga puluh delapan) potongan pipet plastik...,DAKWAAN KESATU Bahwa ia Terdakwa...,52,DAKWAAN KESATU Bahwa ia Terdakwa I NYOMAN SUAR...
3,Narkotika,RICO JAYADI,I Made Bagiarta,3,1 (satu) buah dompet warna hijau 20 (dua puluh...,DAKWAAN KESATU ...,72,DAKWAAN KESATU Bahwa ia Terdakwa RICO JAYADI p...
4,Narkotika,SANG PUTU WIDIANA,I Gusti Made Juliartawan,4,1(satu) buah HP merk Realme warna hitam putih,Dakwaan Ke Satu - Bahwa terdakwa SANG PUTU W...,48,Dakwaan Ke Satu- Bahwa terdakwa SANG PUTU WIDI...
...,...,...,...,...,...,...,...,...
1191,Perlindungan dan Pengelolaan Lingkungan Hidup,Jumat Ariyanto,Ida Bagus Bama Dewa. P,3,1 (satu) 1 (satu) batang kayu balang-balang be...,Kesatu Pasal 83 ayat (1) huruf b UU RI No. 18...,12,Kesatu Pasal 83 ayat(1) huruf b UU RI No. 18 t...
1192,Kejahatan Perjudian,I KETUT ANA SAPUTRA Alias BENCUT,Sudar,1,36 (tiga puluh enam) lembar kertas rekapan,Perbuatan terdakwa sebagaimana diatur dan dian...,4,Perbuatan terdakwa sebagaimana diatur dan dian...
1193,Penganiayaan,Sahabudin Alias Udin,Ni Luh Suantini,2,1. 1 (satu) buah tabung gas elpiji ukuran 3 (t...,Pasal 351 ayat (1) Kitab Undang-undang Hukum p...,9,Pasal 351 ayat( 1) Kitab Undang-undang Hukum p...
1194,Kejahatan yang Membahayakan Keamananan Umum Ba...,Putu Kristian Damanta,A.A. Sagung Yuni Wulantrisna,4,-1 (satu)unit sepeda motor merk Yamaha Jupiter...,Pertama Pasal 187 ayat (1) KUHP dan Kedua Pa...,24,Pasal 187 ayat(1) KUHP dan Kedua Pasal 363 aya...


In [None]:
pipe = pipeline("text2text-generation", model="panggi/t5-small-indonesian-summarization-cased", use_fast=False, device="cuda", batch_size=2)

# Track the start time
start_time = time.time()

# Initialize an empty list to store the summaries
summarized_texts = []

# Iterate through the rows of the DataFrame
for index, row in df_extracted.iterrows():
    text = row['cleaned_dakwaan']

    # Chunk the text into smaller pieces
    chunk_size = 512
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    try:
        # Summarize each chunk
        summaries = [pipe(chunk, max_length=30, min_length=20, do_sample=False)[0]['generated_text'] for chunk in chunks]
    except KeyError:
        summaries = [pipe(chunk, max_length=30, min_length=20, do_sample=False)[0]['summary_text'] for chunk in chunks]

    # Combine the summarized chunks into one string
    full_summary = " ".join(summaries)

    # Append the summarized text to the list
    summarized_texts.append(full_summary)

# Add the summarized texts as a new column
df_extracted['second_model_summarization_dakwaan'] = summarized_texts

# Print execution time
print(f"Execution time: {time.time() - start_time} seconds")


### DOCS

In [524]:
from transformers import pipeline

# pipe = pipeline("text2text-generation", model="facebook/mbart-large-50", use_fast=False, device="cuda", batch_size=2)
# pipe = pipeline("text2text-generation", model="indobenchmark/indobart-large")
# pipe = pipeline("summarization", model="mrm8488/t5-base-finetuned-indonesian-summarization")
# pipe = pipeline("text2text-generation", model="google/mt5-large", use_fast=False)

# tensorflow_summarization_models = [
#     "thonyyy/pegasus_indonesian_base-finetune",
#     "thonyyy/pegasus_indonesian_base-pretrain"
# ]

pytorch_summarization_models = [
    # "cahya/bert2bert-indonesian-summarization",
    "cahya/t5-base-indonesian-summarization-cased",
    "panggi/t5-small-indonesian-summarization-cased",
    "interstellarx95/mt5-small-finetuned-indonesian-text-summarization-one-epoch",
    "interstellarx95/mt5-small-finetuned-indonesian-text-summarization-v3",
    # "cahya/bert2gpt-indonesian-summarization",
    "panggi/t5-base-indonesian-summarization-cased",
    "rayendito/mt5-small-finetuned-xl-sum-indonesia",
    "interstellarx95/mt5-small-finetuned-indonesian-text-summarization-three-epochs",
    # "rowjak/bert-indonesian-news-summarization"
]


# tensorflow_summarization_pipes = [
#     pipeline("summarization", model=model, from_tf=True, device=0, batch_size=4)
#     for model in tensorflow_summarization_models
# ]

In [None]:
def evaluate_meteor(generated, reference):
    # Tokenize the generated and reference text
    tokenized_generated = word_tokenize(generated)
    tokenized_reference = word_tokenize(reference)
    
    # Evaluate METEOR score
    score = meteor_score.meteor_score([tokenized_reference], tokenized_generated)
    return score

def evaluate_rouge(generated, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    scores = scorer.score(reference, generated)
    return scores

def evaluate_bleu(generated, reference):
    reference_tokens = reference.split()
    generated_tokens = generated.split()
    score = sentence_bleu([reference_tokens], generated_tokens)
    return score

# BERTScore evaluation
def evaluate_bertscore(generated, reference):
    P, R, F1 = score([generated], [reference], lang="en")
    return F1.item()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kalea\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pipe = pipeline("summarization", model="cahya/bert2bert-indonesian-summarization", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="cahya/t5-base-indonesian-summarization-cased", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="panggi/t5-small-indonesian-summarization-cased", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="interstellarx95/mt5-small-finetuned-indonesian-text-summarization-one-epoch", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="interstellarx95/mt5-small-finetuned-indonesian-text-summarization-v3", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="cahya/bert2gpt-indonesian-summarization", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="panggi/t5-base-indonesian-summarization-cased", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="rayendito/mt5-small-finetuned-xl-sum-indonesia", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="interstellarx95/mt5-small-finetuned-indonesian-text-summarization-three-epochs", use_fast=False, device="cuda", batch_size=8)
pipe = pipeline("summarization", model="rowjak/bert-indonesian-news-summarization", use_fast=False, device="cuda", batch_size=8)


In [None]:
import time

text = df_extracted.loc[0,'cleaned_dakwaan']
print(len(text))
reference_summary = """Terdakwa Ketut Subagia dan Komang Aditya pada 2 Juni 2024 sekitar pukul 00.30 WITA di pesisir pantai Gondol, Desa Penyabangan, Kecamatan Gerokgak, Kabupaten Buleleng, Bali, diduga mengadakan permainan judi bola adil secara ilegal. Ketut Subagia bertindak sebagai penyelenggara dan pemodal, sementara Komang Aditya membantu sebagai pekerja yang merapikan uang taruhan serta menyerahkannya kepada pemenang. Permainan melibatkan papan bergambar, bola karet, perlak bergambar, dan cek kayu sebagai pengganti uang taruhan. Barang bukti yang disita meliputi papan bola adil, tas, perlak, bola karet, kayu penyeimbang, bedak bayi, cek kayu dalam berbagai denominasi, lap, dan uang tunai Rp1.108.000. Permainan tersebut dilakukan dengan sistem taruhan yang memberikan hadiah kelipatan sembilan kali lipat dari jumlah taruhan jika menang, namun peserta yang kalah kehilangan uangnya. Aktivitas ini berlangsung selama beberapa jam dengan melibatkan sekitar 20 pemain dan keuntungan digunakan untuk kebutuhan sehari-hari. Namun, kegiatan ini dilakukan tanpa izin resmi dan melanggar hukum, sehingga kedua terdakwa ditangkap dan diancam pidana berdasarkan Pasal 303 KUHP juncto Undang-Undang Nomor 7 Tahun 1974 tentang Penertiban Perjudian."""

for i, model in enumerate(pytorch_summarization_models):
    print(model)
    pipe = pipeline("summarization", model=model, device="cuda", batch_size=4, use_fast=False) 
    chunk_size = 512 

    start_time = time.time()

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    try:
        summaries = [pipe(chunk, max_length=30, min_length=20, do_sample=False)[0]['generated_text'] for chunk in chunks]
    except KeyError:
        summaries = [pipe(chunk, max_length=30, min_length=20, do_sample=False)[0]['summary_text'] for chunk in chunks]

    full_summary = " ".join(summaries)
    end_time = time.time()

    time_taken = end_time - start_time

    print("------------------------------------------------------")

    rouge_scores = evaluate_rouge(full_summary, reference_summary)
    print("ROUGE Scores:", rouge_scores)
    meteor_score_value = evaluate_meteor(full_summary, reference_summary)
    print("METEOR Score:", meteor_score_value)
    bleu_score_value = evaluate_bleu(full_summary, reference_summary)
    print("BLEU Score:", bleu_score_value)
    bertscore_value = evaluate_bertscore(full_summary, reference_summary)
    print("BERTScore F1:", bertscore_value)

    # Display ROUGE scores
    print(len(full_summary))
    print(f"Summary: using {model}: {full_summary}")
    print("------------------------------------------------------")

    del pipe
    torch.cuda.empty_cache()

    print(f"Time taken for summarization: {time_taken:.2f} seconds\n\n")


21579
cahya/t5-base-indonesian-summarization-cased


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


------------------------------------------------------
ROUGE Scores: {'rouge1': Score(precision=0.12121212121212122, recall=0.5847953216374269, fmeasure=0.20080321285140565), 'rouge2': Score(precision=0.04611650485436893, recall=0.2235294117647059, fmeasure=0.07645875251509053), 'rougeL': Score(precision=0.08242424242424243, recall=0.39766081871345027, fmeasure=0.13654618473895586), 'rougeLsum': Score(precision=0.08242424242424243, recall=0.39766081871345027, fmeasure=0.13654618473895586)}
METEOR Score: 0.30761823728726106
BLEU Score: 0.017997592188981122


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.8316876292228699
5486
Summary: using cahya/t5-base-indonesian-summarization-cased: DAKWAAN PERTAMA - Bahwa Terdakwa KETUT SUBAGIA bersama dengan Terdakwa KOMANG ADITYA pada SUBAGIA bersama dengan Terdakwa KOMANG ADITYA pada saat terdakwa KETUT SUBAGIA bersama dengan Terdakwa UT SUBAGIA dan Terdakwa KOMANG ADITYA sedang berlangsung permainan perjudian bola adil di pesisir pantai Gondol, Desa Penya Dua buah bola bergambar, satu (satu) buah tas kain warna abu-abu, satu (satu) buah tas karung warna Sebanyak 44 (empat puluh empat) buah papan cek kayu berwarna biru dengan nomor 20 (duapuluh), 17 (tujuh belas) KETUT SUBAGIA dalam permainan judi bola adil tersebut berperan sebagai penyelenggara yang mengadakan permainan judi bola adil dan juga sebagai penyelenggara yang mengadakan permainan KOMANG ADITYA sebagai pekerja dan merapikan uang serta memberikan uang kepada pemenang permainan bola adil. Bahwa dalam permainan judi bola adil Pasangan uang taruhan judi bola adil 1 (satu)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.8341520428657532
5476
Summary: using panggi/t5-small-indonesian-summarization-cased: Bahwa Terdakwa KETUT SUBAGIA bersama dengan terdakwa KOMANG ADITYA pada hari Minggu tanggal 02 Juni 2024 udi bola adil dan menjadikan sebagai pencaharian, atau dengan sengaja turut serta dalam suatu perusahaan untuk itu, yang dilakukan antara lain dengan cara sebagai UT SUBAGIA dan Terdakwa KOMANG ADITYA sedang berlangsung permainan perjudian bola adil di pesisir pantai Gondol, Desa Penya bola bergambar, 1 (satu) buah tas kain warna abu-abu. 1 biji biji bola karet warna hitam, 2 biji kacang uh), 44 (empat puluh empat) buah papan cek kayu berwarna biru dengan nomor 20 'duapuluh - duapuluh Rp. 50.000,- (lima puluh ribu rupiah) 4 [empat) lembar uang pecahan Rp yang 20.00, hingga pemodal, mengambil uang - uang taruhan yang berada di atas perlak, menggelindingkan bola, sedangkan Terdakwa KOMANG pan bola adil 1 (satu ) buah perlak bergambar digunakan untuk pasangan uang taruhan judi bola saks

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.826798677444458
4507
Summary: using interstellarx95/mt5-small-finetuned-indonesian-text-summarization-one-epoch: KeTUT SUBAGIA bersama dengan Terdakwa KOMANG ADITYA pada hari Minggu tanggal 02 Juni 2024 sekitar jam 0 KeTUT SUBAGIA bersama dengan Terdakwa KOMANG ADITYA pada saat terdakwa KETSUBAGIA bersama dengan Terdakwa KOMANG KeTUT SUBAGIA dan Terdakwa KOMANG ADITYA sedang berlangsung permainan perjudian bola adil di pesisir pantai Kembaran cek kayu berwarna hijau dengan nomor 5 (lima) buah papan cek kayu berwarna hijau dengan Kediaman uang pecahan Rp. 1.108.000,- (satu juta seratus delapan ribu rupiah) terdiri dari 8 ( KeTUT SUBAGIA dalam permainan judi bola adil tersebut berperan sebagai penyelenggara yang mengadakan permainan judi bola adil dan Kepolisian pemodal, mengambil uang - uang taruhan yang berada di atas perlak, menggelindingkan bola Plak bergambar digunakan untuk pasangan uang taruhan judi bola adil 1 (satu) buah perlak bergambar digunakan Kegunaan papan 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.8306238651275635
4355
Summary: using interstellarx95/mt5-small-finetuned-indonesian-text-summarization-v3: Bahwa Terdakwa KETUT SUBAGIA bersama dengan Terdakwa KOMANG ADITYA pada hari Minggu tanggal 02 Juni 2024 udi bola adil dan menjadikan sebagai pencaharian, atau dengan sengaja turut serta dalam suatu perusahaan UT SUBAGIA dan Terdakwa KOMANG ADITYA sedang berlangsung permainan perjudian bola adil di pesisir pantai Bola bergambar, 1(satu) buah tas kain warna abu-abu, 1(satu) buah perlak bergambar, uh), 44(empat puluh empat) buah papan cek kayu berwarna biru dengan nomor 20(duapuluh Rp. 50.000,-(lima puluh ribu rupiah) 4(empat) lembar uang pecahan Rp. 50.000,- Pemodal, mengambil uang - uang taruhan yang berada di atas perlak, menggelindingkan bola, sedangkan Pan bola adil 1(satu) buah perlak bergambar digunakan untuk pasangan uang taruhan bola adil 1(satu s kain warna hitam digunakan untuk penyimpan bola 4(empat) buah kayu penyeimbang digunakan untuk menyeimbangkan n 

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.8033068180084229
176
Summary: using panggi/t5-base-indonesian-summarization-cased:                 Bahwa di papan bola adil terdapat 4 warna yaitu merah, kuning hijau dan hitam adapun gambarnya yaitu berbentuk segitiga, palang, bola,                          
------------------------------------------------------
Time taken for summarization: 17.71 seconds


rayendito/mt5-small-finetuned-xl-sum-indonesia
------------------------------------------------------
ROUGE Scores: {'rouge1': Score(precision=0.13314037626628075, recall=0.5380116959064327, fmeasure=0.2134570765661253), 'rouge2': Score(precision=0.043478260869565216, recall=0.17647058823529413, fmeasure=0.06976744186046512), 'rougeL': Score(precision=0.06801736613603473, recall=0.27485380116959063, fmeasure=0.10904872389791181), 'rougeLsum': Score(precision=0.06801736613603473, recall=0.27485380116959063, fmeasure=0.10904872389791181)}
METEOR Score: 0.3027963518759939
BLEU Score: 0.01799532323977946


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.8162375688552856
4444
Summary: using rayendito/mt5-small-finetuned-xl-sum-indonesia: Seorang terdakwa kecelakaan pertama - telah mendakwa seorang terdakwa kecelakaan pertama - setelah Polisi dan saksi Polisi dan saksi Polisi dan saksi Polisi dan saksi Polisi dan saksi Polisi dan Seorang pemain bola adil di Bali telah berlangsung permainan perjudian bola adil di pesisir pantai Gondol, Kec Sebuah tas kain warna hitam, 1 (satu) buah tas kain warna hitam, 1 (satu) buah tas Sebuah lembar uang pecahan Rp. 1.108.000,- (satu juta seratus delapan ribu rupiah) terdiri dari Seorang terdakwa keTUT SUBAGIA dalam permainan judi bola adil di Indonesia, telah mendakwa terdakwa keTUT Suba Pemenang permainan bola adil di Indonesia, telah menentukan kemenangan dalam permainan bola adil, karena seorang pemodal Permainan bola adil 1 (satu) buah tas karung warna biru garis merah digunakan untuk pasangan uang Sebuah papan cek kayu berwarna hitam digunakan untuk penyimpan bola adil setelah dig

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.8330612778663635
4361
Summary: using interstellarx95/mt5-small-finetuned-indonesian-text-summarization-three-epochs: Bahwa Terdakwa KETUT SUBAGIA bersama dengan Terdakwa KOMANG ADITYA pada hari Minggu tanggal 02 Juni 2024 udi bola adil dan menjadikan sebagai pencaharian, atau dengan sengaja turut serta dalam suatu perusahaan UT SUBAGIA dan Terdakwa KOMANG ADITYA sedang berlangsung permainan perjudian bola adil di pesisir pantai buah tas kain warna abu-abu, 1 (satu) buah perlak bergambar, 1 (satu) buah perlak uang pecahan Rp. 1.108.000,- (satu juta seratus delapan ribu rupiah) terdiri dari 8 (delapan Terdakwa KETUT SUBAGIA dalam permainan judi bola adil tersebut berperan sebagai penyelenggara yang mengadakan permainan judi bola Pemodal, mengambil uang - uang taruhan yang berada di atas perlak, menggelindingkan bola, sedangkan PAN bola adil 1 (satu) buah perlak bergambar digunakan untuk pasangan uang taruhan bola adil 1 (satu s kain warna hitam digunakan untuk penyimpan b