In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import pickle

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


In [2]:
df = pd.read_csv('../Data/STAGE 4 FINAL MERGED DATA/STAGE_4_MERGED_FINAL_ENCODED.csv')

In [3]:
model_scenario_name = "BiLSTM_12"
max_len = 1024

model_save_path = f'../Model/{model_scenario_name}'
tokenizer_save_path = f'../Model/{model_scenario_name}_tokenizer.pkl'
# history_training_save_path = f'../Model/{model_scenario_name}_training_history.pkl'
    
loaded_model = tf.keras.models.load_model(model_save_path)

with open(tokenizer_save_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [4]:
def stratified_split(df: pd.DataFrame, split_size: float = 0.8) -> tuple[pd.DataFrame, pd.DataFrame]:
    unique_groups = df.groupby(["klasifikasi_perkara_encoded", "penuntut_umum_encoded", "hakim_encoded"]).apply(lambda x: x.index.tolist()).to_dict()
    
    print(unique_groups)
    train_idx, test_idx = [], []
    
    for indices in unique_groups.values():        
        if len(indices) == 1:
            train_idx.extend(indices)
        else:
            train, test = train_test_split(indices, train_size=split_size, random_state=42)
            train_idx.extend(train)
            test_idx.extend(test)
    
    return df.loc[train_idx], df.loc[test_idx]

In [5]:
train_df, test_df = stratified_split(df)

  unique_groups = df.groupby(["klasifikasi_perkara_encoded", "penuntut_umum_encoded", "hakim_encoded"]).apply(lambda x: x.index.tolist()).to_dict()


{(0, 0, 0): [0, 20, 42], (0, 0, 2): [2], (0, 1, 0): [19, 37], (0, 1, 4): [35, 36], (0, 4, 1): [5], (0, 4, 3): [10], (0, 5, 21): [843], (0, 7, 1): [9], (0, 8, 0): [15, 17], (0, 9, 0): [16, 18], (0, 11, 4): [22, 23], (0, 12, 19): [775, 776, 813], (0, 12, 21): [774], (0, 12, 22): [715], (0, 12, 23): [806], (0, 13, 0): [32], (0, 13, 3): [34], (0, 14, 0): [38], (0, 14, 14): [437, 501, 595, 609], (0, 14, 16): [422], (0, 14, 18): [474, 665], (0, 14, 19): [783, 784, 785, 786], (0, 14, 20): [432, 438, 518, 578, 608, 724, 745], (0, 14, 21): [639, 822], (0, 15, 4): [33], (0, 16, 0): [39], (0, 17, 0): [40, 41], (0, 19, 7): [46], (0, 20, 36): [1051], (0, 21, 52): [1612], (0, 21, 54): [1630, 1632], (0, 22, 10): [241, 268, 269, 270, 271, 303], (0, 22, 14): [259, 381, 415, 416, 538, 545, 577, 621], (0, 22, 16): [308, 314, 315, 383, 386, 417], (0, 22, 18): [654, 655, 656], (0, 22, 19): [548, 762], (0, 22, 20): [446, 570, 571, 572, 573, 690, 725], (0, 22, 22): [719], (0, 23, 10): [52, 54], (0, 24, 10): 

In [6]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s\(\)]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
stopword_factory = StopWordRemoverFactory()
stop_words = set(stopword_factory.get_stop_words())

def remove_stopwords(text, stop_words):
    return ' '.join([word for word in text.split() if word not in stop_words])

In [29]:
def independent_klasifikasi_perkara_metrics(model, klasifikasi_perkara_value, train_df, test_df):

    test_df = test_df.loc[test_df['klasifikasi_perkara_encoded'] == klasifikasi_perkara_value].copy()
    print(len(test_df))

    test_df['concat_text'] = test_df[['terdakwa', 'summarized_dakwaan']].apply(lambda x: '. '.join(x), axis=1)

    y_test = test_df['total_pidana_penjara_bulan']

    test_df['normalized_text'] = test_df['concat_text'].apply(normalize_text)

    test_df['stopword_removal'] = test_df['normalized_text'].apply(lambda x: remove_stopwords(x, stop_words))

    test_sequences = tokenizer.texts_to_sequences(test_df['stopword_removal'])

    X_test_texts = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

    test_numerical = tf.constant(test_df[['klasifikasi_perkara_encoded', 'penuntut_umum_encoded', 'hakim_encoded', 'jumlah_saksi', 'maks_penjara_berdasarkan_pasal']].values, dtype=tf.float32)

    evaluation_result = model.evaluate([X_test_texts, test_numerical], y_test)
    return evaluation_result


In [30]:
klasifikasi_perkara_mapping_path = f'../Mapping/klasifikasi_perkara_mapping.pkl'

with open(klasifikasi_perkara_mapping_path, 'rb') as handle:
    klasifikasi_perkara_mapping = pickle.load(handle)

klasifikasi_perkara_mapping

{'Pencurian': 0,
 'Penipuan': 1,
 'Narkotika': 2,
 'Penggelapan': 3,
 'Kejahatan Perjudian': 4}

In [31]:
klasifikasi_perkara_unique_values = set(klasifikasi_perkara_mapping.values())

klasifikasi_perkara_unique_values

{0, 1, 2, 3, 4}

In [32]:
inverse_klasifikasi_perkara_mapping = {v: k for k, v in klasifikasi_perkara_mapping.items()}

In [34]:
for klasifikasi_perkara_value in klasifikasi_perkara_unique_values:

    print(f'Nilai Metrik Evaluasi pada Klasifikasi {inverse_klasifikasi_perkara_mapping.get(klasifikasi_perkara_value, klasifikasi_perkara_value)}')
    independent_klasifikasi_perkara_metrics(model=loaded_model, klasifikasi_perkara_value=klasifikasi_perkara_value, train_df=train_df, test_df=test_df)
    print("\n")


Nilai Metrik Evaluasi pada Klasifikasi Pencurian
490


Nilai Metrik Evaluasi pada Klasifikasi Penipuan
23


Nilai Metrik Evaluasi pada Klasifikasi Narkotika
439


Nilai Metrik Evaluasi pada Klasifikasi Penggelapan
50


Nilai Metrik Evaluasi pada Klasifikasi Kejahatan Perjudian
51




In [27]:
test_df

Unnamed: 0,klasifikasi_perkara_encoded,penuntut_umum_encoded,hakim_encoded,jumlah_saksi,maks_penjara_berdasarkan_pasal,terdakwa,summarized_dakwaan,total_pidana_penjara_bulan
0,0,0,0,6,80,GEDE DARMAYASA,Terdakwa Gede Darmayasa didakwa telah melakuka...,5.000000
37,0,1,0,6,80,Gede Kastawa,Terdakwa Gede Kastawa didakwa melakukan serang...,30.000000
36,0,1,4,3,84,GUNAWAN,"Pada Minggu, 11 Desember 2022 sekitar pukul 00...",8.000000
17,0,8,0,6,144,ROSITA EVAYANTI DEWI,Terdakwa I Ketut Joni Adnyana Adi Putra dan Te...,14.000000
18,0,9,0,6,144,ROSITA EVAYANTI DEWI,Terdakwa I Ketut Joni Adnyana Adi Putra dan Te...,14.000000
...,...,...,...,...,...,...,...,...
2466,4,221,80,3,120,Nyoman Sudiarta alias Comek,Terdakwa NYOMAN SUDIARTA alias COMEK telah men...,30.500000
2375,4,222,80,1,120,I Nyoman Sentana,"Pada hari Minggu, 18 November 2018, sekitar pu...",3.666667
4093,4,259,119,2,120,NA'IM Bin Alm SARNAN,"Pada hari Minggu, 09 Juli 2023, sekitar pukul ...",8.000000
4086,4,259,119,2,120,RASMAN Bin RASIM,"Pada Minggu, 09 Juli 2023, sekitar pukul 00.15...",8.000000


In [28]:
loaded_model.predict([X_test_texts, test_numerical])

NameError: name 'X_test_texts' is not defined