<a href="https://colab.research.google.com/github/AkhdanFirdaus/bmn-model/blob/main/final_ta_klasifikasi_teks_bert_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Klasifikasi Teks Laporan Mobil Dinas dengan BERT

# 4.3 Data Preparation

In [None]:
# Mount Drive

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
location_invoice = '/content/drive/MyDrive/TA/dataset/raw_invoice.csv'
location_sukucadang = '/content/drive/MyDrive/TA/dataset/raw_sukucadang.csv'
location_kerusakan = '/content/drive/MyDrive/TA/dataset/raw_kategori_kerusakan.csv'
location_laporan = '/content/drive/MyDrive/TA/dataset/raw_laporan.csv'

## 4.3.1 Pemilahan Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
data_invoice = pd.read_csv(location_invoice)[['Uraian Pekerjaan', 'Harga Satuan']]
data_sukucadang =  pd.read_csv(location_sukucadang)[['Komponen', 'Keterangan']]
data_kerusakan = pd.read_csv(location_kerusakan)[['Kerusakan', 'Keterangan', 'Kategori']]
data_laporan = pd.read_csv(location_laporan)[['Laporan', 'Masalah']]

In [None]:
print('Invoice = ', data_invoice.shape)
print('Sukucadang = ', data_sukucadang.shape)
print('Kerusakan = ', data_kerusakan.shape)
print('Laporan = ', data_laporan.shape)

Invoice =  (526, 2)
Sukucadang =  (210, 2)
Kerusakan =  (11, 3)
Laporan =  (138, 2)


### Preview

In [None]:
data_invoice.head()

Unnamed: 0,Uraian Pekerjaan,Harga Satuan
0,Shell HX-6 10/40 4Ltr,420000
1,Filter Oli YZZE1,60000
2,Filter Udara Avanza VVTi,175000
3,Service Rem Depan + Belakang,300000
4,Tune Up Injection Cleaner,350000


In [None]:
data_sukucadang.head()

Unnamed: 0,Komponen,Keterangan
0,Shell HX-6 10/40 4Ltr,Oli mesin untuk kendaraan roda empat dengan b...
1,Filter Oli YZZE1,Filter oli untuk kendaraan roda empat yang di...
2,Filter Udara Avanza VVTi,Filter udara kualitas tinggi untuk kendaraan ...
3,Service Rem Depan + Belakang,Layanan perawatan rem depan dan belakang pada...
4,Tune Up Injection Cleaner,Layanan tune up dengan teknologi injection Cl...


In [None]:
data_kerusakan.head()

Unnamed: 0,Kerusakan,Keterangan,Kategori
0,Masalah Sistem Knalpot,Sistem pembuangan bertanggung jawab untuk meng...,Filter Udara;Oli;Bahan Bakar;Cleaner;Pemasangan
1,Masalah Transmisi,Masalah transmisi kendaraan roda empat adalah ...,Oli;Transmisi;Pemasangan;Kopling;Joint;Bearing...
2,Masalah Suspensi,Suspensi kendaraan roda empat adalah kumpulan ...,Shockbreaker;Spooring;Balancing;Hidrolik;Pemas...
3,Gangguan Listrik,Gangguan listrik pada kendaraan roda empat ter...,Lampu;Listrik;Pemasangan
4,Masalah Sistem Bahan Bakar,Masalah sistem bahan bakar kendaraan roda empa...,Bahan Bakar;Cleaner;Pemasangan;Kruk As


In [None]:
data_laporan.head()

Unnamed: 0,Laporan,Masalah
0,Saya melihat indikator check engine menyala di...,Masalah Mesin
1,"Saat saya menginjak rem, terdengar suara berde...",Kegagalan Rem
2,Saya merasakan getaran yang tidak normal saat ...,Masalah Kemudi;Masalah Ban
3,"Mobil saya stir nya tidak stabil, lalu pada sa...",Gangguan Listrik;Masalah Kemudi;Masalah Mesin
4,kaca mobil tidak bisa bergerak,"Kerusakan Aksesoris Interior, Eksterior"


## 4.3.2 Pembersihan Data

In [None]:
sample_texts = data_laporan.head(3)['Laporan']
print(sample_texts[0])
print(sample_texts[1])
print(sample_texts[2])

Saya melihat indikator check engine menyala di dashboard mobil saya.
Saat saya menginjak rem, terdengar suara berdecit yang tidak biasa.
Saya merasakan getaran yang tidak normal saat mengemudi di kecepatan tinggi.


### Casefolding

In [None]:
def casefolding(val):
  return str(val).lower()

In [None]:
hasil_casefolding = [casefolding(sample) for sample in sample_texts]
print(hasil_casefolding[0])
print(hasil_casefolding[1])
print(hasil_casefolding[2])

saya melihat indikator check engine menyala di dashboard mobil saya.
saat saya menginjak rem, terdengar suara berdecit yang tidak biasa.
saya merasakan getaran yang tidak normal saat mengemudi di kecepatan tinggi.


In [None]:
data_invoice = data_invoice.applymap(casefolding)
data_sukucadang = data_sukucadang.applymap(casefolding)
data_kerusakan = data_kerusakan.applymap(casefolding)
data_laporan = data_laporan.applymap(casefolding)

### Cleaning

In [None]:
import re

def cleaning(val):
  # Membersihkan Whitespace
  val = re.sub(r'\s+', ' ', val)

  # Hanya Mengambil karakter alfanumerik
  val = re.sub("[^a-zA-Z0-9]", " ", val)

  return val

In [None]:
hasil_cleaning = [cleaning(sample) for sample in hasil_casefolding]
print(hasil_cleaning[0])
print(hasil_cleaning[1])
print(hasil_cleaning[2])

saya melihat indikator check engine menyala di dashboard mobil saya 
saat saya menginjak rem  terdengar suara berdecit yang tidak biasa 
saya merasakan getaran yang tidak normal saat mengemudi di kecepatan tinggi 


In [None]:
data_invoice = data_invoice.applymap(cleaning)
data_sukucadang = data_sukucadang.applymap(cleaning)
data_kerusakan = data_kerusakan.applymap(cleaning)
data_laporan = data_laporan.applymap(cleaning)

### Stemming

In [None]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()

def stemming(val):
  return stemmer.stem(str(val))

In [None]:
hasil_stemming = [stemming(sample) for sample in hasil_cleaning]
print(hasil_stemming[0])
print(hasil_stemming[1])
print(hasil_stemming[2])

saya lihat indikator check engine nyala di dashboard mobil saya
saat saya injak rem dengar suara decit yang tidak biasa
saya rasa getar yang tidak normal saat kemudi di cepat tinggi


In [None]:
data_invoice = data_invoice.applymap(stemming)
data_sukucadang = data_sukucadang.applymap(stemming)
data_kerusakan = data_kerusakan.applymap(stemming)
data_laporan = data_laporan.applymap(stemming)

### Stopword Removal

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stopword_remover = StopWordRemoverFactory().create_stop_word_remover()

def stopwordremove(val):
  return stopword_remover.remove(str(val))

In [None]:
hasil_stopword_remove = [stopwordremove(sample) for sample in hasil_stemming]
print(hasil_stopword_remove[0])
print(hasil_stopword_remove[1])
print(hasil_stopword_remove[2])

lihat indikator check engine nyala dashboard mobil
saya injak rem dengar suara decit tidak biasa
rasa getar tidak normal kemudi cepat tinggi


In [None]:
data_invoice = data_invoice.applymap(stopwordremove)
data_sukucadang = data_sukucadang.applymap(stopwordremove)
data_kerusakan = data_kerusakan.applymap(stopwordremove)
data_laporan = data_laporan.applymap(stopwordremove)

### Filtering

In [None]:
data_invoice = data_invoice.drop_duplicates(subset=['Uraian Pekerjaan'])
data_sukucadang = data_sukucadang.drop_duplicates(subset=['Komponen'])
data_kerusakan = data_kerusakan.drop_duplicates(subset=['Kerusakan'])
data_laporan = data_laporan.drop_duplicates(subset=['Laporan'])

In [None]:
print('Invoice = ', data_invoice.shape)
print('Sukucadang = ', data_sukucadang.shape)
print('Kerusakan = ', data_kerusakan.shape)
print('Laporan = ', data_laporan.shape)

Invoice =  (199, 2)
Sukucadang =  (200, 2)
Kerusakan =  (11, 3)
Laporan =  (121, 2)


### Preview

In [None]:
data_invoice

Unnamed: 0,Uraian Pekerjaan,Harga Satuan
0,shell hx 6 10 40 4ltr,420 000
1,filter oli yzze1,60 000
2,filter udara avanza vvti,175 000
3,service rem depan belakang,300 000
4,tune up injection cleaner,350 000
...,...,...
520,bohlam forglamp h11 55watt,75 000
521,joint copel,650 000
522,karet stabilizer r l,325 000
523,sikring 20 ah,20 000


In [None]:
data_sukucadang

Unnamed: 0,Komponen,Keterangan
0,shell hx 6 10 40 4ltr,oli mesin kendara roda empat bobot 4 liter fun...
1,filter oli yzze1,filter oli kendara roda empat guna nyaring kot...
2,filter udara avanza vvti,filter udara kualitas tinggi kendara roda empa...
3,service rem depan belakang,layan awat rem depan belakang kendara roda empat
4,tune up injection cleaner,layan tune up teknologi injection cleaner bers...
...,...,...
205,joint copel,joint copel kendara roda empat
206,karet stabilizer r l,karet stabilizer kendara roda empat kanan kiri
207,sikring 20 ah,sikring kapasitas 20 ah kendara roda empat
208,mobil derek towing,layan derek kendara roda empat


In [None]:
data_kerusakan.head()

Unnamed: 0,Kerusakan,Keterangan,Kategori
0,masalah sistem knalpot,sistem buang tanggung jawab hilang gas limbah ...,filter udara oli bahan bakar cleaner pasang
1,masalah transmisi,masalah transmisi kendara roda empat kondisi m...,oli transmisi pasang kopling joint bearing pas...
2,masalah suspensi,suspensi kendara roda empat kumpul komponen fu...,shockbreaker spooring balancing hidrolik pasang
3,ganggu listrik,ganggu listrik kendara roda empat jadi ada mas...,lampu listrik pasang
4,masalah sistem bahan bakar,masalah sistem bahan bakar kendara roda empat ...,bahan bakar cleaner pasang kruk as


In [None]:
data_laporan.head()

Unnamed: 0,Laporan,Masalah
0,lihat indikator check engine nyala dashboard m...,masalah mesin
1,saya injak rem dengar suara decit tidak biasa,gagal rem
2,rasa getar tidak normal kemudi cepat tinggi,masalah kemudi masalah ban
3,mobil stir nya stabil lalu menstater kadang ka...,ganggu listrik masalah kemudi masalah mesin
4,kaca mobil bisa gerak,rusa aksesoris interior eksterior


## 4.3.3 Konstruksi Data

### Penentuan Fitur

Akan dibuat sebuah dataset untuk mendeteksi konteks
1. Menggabungkan data invoice dan suku cadang
2. menggabungkan beberapa atribut untuk memperkuat konteks:
- 'Uraian Pekerjaan', 'Kategori', 'Total'
- 'Komponen', 'Keterangan'
- 'Kerusakan', 'Keterangan', 'Kategori'
- 'Laporan', 'Kategori'
3. menggabungkan data diatas dengan sukucadang berdasar pekerjaan

dataset 1:
dataset_1 = konteks,

dataset 2:

### Pemilihan Atribut

In [None]:
data_invoice_construct = data_invoice.copy().rename(columns={
    'Uraian Pekerjaan': 'suku_cadang',
    'Harga Satuan': 'biaya'
})

data_sukucadang_construct = data_sukucadang.copy().rename(columns={
    'Komponen': 'suku_cadang',
    'Keterangan': 'kategori'
})

# Menggabungkan data invoice dan suku cadang
dataset_1 = pd.merge(
    data_invoice_construct,
    data_sukucadang_construct,
    on='suku_cadang',
    how='left'
)

dataset_1['konteks'] = ''

dataset_1.head()

Unnamed: 0,suku_cadang,biaya,kategori,konteks
0,shell hx 6 10 40 4ltr,420 000,oli mesin kendara roda empat bobot 4 liter fun...,
1,filter oli yzze1,60 000,filter oli kendara roda empat guna nyaring kot...,
2,filter udara avanza vvti,175 000,filter udara kualitas tinggi kendara roda empa...,
3,service rem depan belakang,300 000,layan awat rem depan belakang kendara roda empat,
4,tune up injection cleaner,350 000,layan tune up teknologi injection cleaner bers...,


In [None]:
dataset_2 = data_laporan.copy().rename(columns={
    'Laporan': 'konteks',
    'Masalah': 'masalah'
})
dataset_2['suku_cadang'] = ''
dataset_2['kategori'] = ''
dataset_2['biaya'] = ''
dataset_2 = dataset_2[['suku_cadang', 'kategori', 'biaya', 'konteks', 'masalah']]
dataset_2.head()

Unnamed: 0,suku_cadang,kategori,biaya,konteks,masalah
0,,,,lihat indikator check engine nyala dashboard m...,masalah mesin
1,,,,saya injak rem dengar suara decit tidak biasa,gagal rem
2,,,,rasa getar tidak normal kemudi cepat tinggi,masalah kemudi masalah ban
3,,,,mobil stir nya stabil lalu menstater kadang ka...,ganggu listrik masalah kemudi masalah mesin
4,,,,kaca mobil bisa gerak,rusa aksesoris interior eksterior


## 4.3.4 Pelabelan Data

### Penentuan Label

In [None]:
# Ekstraksi label kerusakan dan level dari dataset
LABELS = data_kerusakan['Kerusakan'].tolist()
print("Label kerusakan = ", LABELS)

Label kerusakan =  ['masalah sistem knalpot', 'masalah transmisi', 'masalah suspensi', 'ganggu listrik', 'masalah sistem bahan bakar', 'masalah kemudi', 'masalah ban', 'masalah mesin', 'gagal rem', 'masalah sistem dingin', 'rusa aksesoris interior eksterior']


### Penerapan Label

In [None]:
# Menambahkan Kolom Label
dataset_1_labeled = dataset_1.copy()
dataset_1_labeled[LABELS] = 0
dataset_1_labeled.head()

Unnamed: 0,suku_cadang,biaya,kategori,konteks,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior
0,shell hx 6 10 40 4ltr,420 000,oli mesin kendara roda empat bobot 4 liter fun...,,0,0,0,0,0,0,0,0,0,0,0
1,filter oli yzze1,60 000,filter oli kendara roda empat guna nyaring kot...,,0,0,0,0,0,0,0,0,0,0,0
2,filter udara avanza vvti,175 000,filter udara kualitas tinggi kendara roda empa...,,0,0,0,0,0,0,0,0,0,0,0
3,service rem depan belakang,300 000,layan awat rem depan belakang kendara roda empat,,0,0,0,0,0,0,0,0,0,0,0
4,tune up injection cleaner,350 000,layan tune up teknologi injection cleaner bers...,,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Menambahkan Kolom Label
dataset_2_labeled = dataset_2.copy()
dataset_2_labeled[LABELS] = 0
dataset_2_labeled.head()

Unnamed: 0,suku_cadang,kategori,biaya,konteks,masalah,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior
0,,,,lihat indikator check engine nyala dashboard m...,masalah mesin,0,0,0,0,0,0,0,0,0,0,0
1,,,,saya injak rem dengar suara decit tidak biasa,gagal rem,0,0,0,0,0,0,0,0,0,0,0
2,,,,rasa getar tidak normal kemudi cepat tinggi,masalah kemudi masalah ban,0,0,0,0,0,0,0,0,0,0,0
3,,,,mobil stir nya stabil lalu menstater kadang ka...,ganggu listrik masalah kemudi masalah mesin,0,0,0,0,0,0,0,0,0,0,0
4,,,,kaca mobil bisa gerak,rusa aksesoris interior eksterior,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Fungsi Labeling
def labeling(val, key):
  labels = {}
  # update_konteks = konteks

  for index, row in data_kerusakan.iterrows():
    array1 = row[key].split(' ')
    array2 = val.split(' ')

    kerusakan = row['Kerusakan']
    konteks_tambahan = row['Keterangan']

    kondisi = 1 if any(x in array2 for x in array1) else 0

    print(kondisi, ' - ', row[key], ' - ', val)
    # update_konteks += ' ' + konteks_tambahan if kondisi == 1 else ""
    labels.update({kerusakan: kondisi})

  # labels.update({'konteks': update_konteks})
  return labels

# Fungsi Labeling
def labeling2(val, key):
  labels = {}

  for index, row in data_kerusakan.iterrows():
    array1 = row[key].split(' ')
    array2 = val.replace('masalah', '') if 'masalah' in val else val

    kerusakan = row['Kerusakan']
    konteks_tambahan = row['Keterangan']

    kondisi = 1 if any(x in array2 for x in array1) else 0

    print(kondisi, ' - ', row[key], ' - ', val)
    labels.update({kerusakan: kondisi})

  return labels

In [None]:
for index, row in dataset_1_labeled.iterrows():
  kategori = row['kategori']
  update_value = labeling(kategori, 'Kategori')
  dataset_1_labeled.loc[index, list(update_value.keys())] = list(update_value.values())

1  -  filter udara oli bahan bakar cleaner pasang  -  oli mesin kendara roda empat bobot 4 liter fungsi bagai lumas mesin mobil teknologi sintetis bantu lindung mesin kendara dari kotor sisa bakar aus beri performa maksimal tahan lama oli telah teknologi active cleansing technology mampu jaga mesin tetap bersih mesin selalu asa baru itu oli dapat kurang gejala knocking mesin turbocharger
1  -  oli transmisi pasang kopling joint bearing pasang cleaner  -  oli mesin kendara roda empat bobot 4 liter fungsi bagai lumas mesin mobil teknologi sintetis bantu lindung mesin kendara dari kotor sisa bakar aus beri performa maksimal tahan lama oli telah teknologi active cleansing technology mampu jaga mesin tetap bersih mesin selalu asa baru itu oli dapat kurang gejala knocking mesin turbocharger
0  -  shockbreaker spooring balancing hidrolik pasang  -  oli mesin kendara roda empat bobot 4 liter fungsi bagai lumas mesin mobil teknologi sintetis bantu lindung mesin kendara dari kotor sisa bakar aus

In [None]:
for index, row in dataset_2_labeled.iterrows():
  masalah = row['masalah']
  update_value = labeling2(masalah, 'Kerusakan')
  dataset_2_labeled.loc[index, list(update_value.keys())] = list(update_value.values())

0  -  masalah sistem knalpot  -  masalah mesin
0  -  masalah transmisi  -  masalah mesin
0  -  masalah suspensi  -  masalah mesin
0  -  ganggu listrik  -  masalah mesin
0  -  masalah sistem bahan bakar  -  masalah mesin
0  -  masalah kemudi  -  masalah mesin
0  -  masalah ban  -  masalah mesin
1  -  masalah mesin  -  masalah mesin
0  -  gagal rem  -  masalah mesin
0  -  masalah sistem dingin  -  masalah mesin
0  -  rusa aksesoris interior eksterior  -  masalah mesin
0  -  masalah sistem knalpot  -  gagal rem
0  -  masalah transmisi  -  gagal rem
0  -  masalah suspensi  -  gagal rem
0  -  ganggu listrik  -  gagal rem
0  -  masalah sistem bahan bakar  -  gagal rem
0  -  masalah kemudi  -  gagal rem
0  -  masalah ban  -  gagal rem
0  -  masalah mesin  -  gagal rem
1  -  gagal rem  -  gagal rem
0  -  masalah sistem dingin  -  gagal rem
0  -  rusa aksesoris interior eksterior  -  gagal rem
0  -  masalah sistem knalpot  -  masalah kemudi masalah ban
0  -  masalah transmisi  -  masalah kemudi

In [None]:
dataset_1_labeled.shape

(199, 15)

In [None]:
dataset_1_labeled.head()

Unnamed: 0,suku_cadang,biaya,kategori,konteks,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior
0,shell hx 6 10 40 4ltr,420 000,oli mesin kendara roda empat bobot 4 liter fun...,,1,1,0,0,1,0,0,1,0,0,0
1,filter oli yzze1,60 000,filter oli kendara roda empat guna nyaring kot...,,1,1,0,0,0,0,0,1,0,1,0
2,filter udara avanza vvti,175 000,filter udara kualitas tinggi kendara roda empa...,,1,0,0,0,0,0,0,1,0,1,0
3,service rem depan belakang,300 000,layan awat rem depan belakang kendara roda empat,,0,0,0,0,0,0,0,0,1,0,0
4,tune up injection cleaner,350 000,layan tune up teknologi injection cleaner bers...,,1,1,0,0,1,0,0,1,0,0,0


In [None]:
dataset_2_labeled = dataset_2_labeled.drop(columns=['masalah'])
dataset_2_labeled

Unnamed: 0,suku_cadang,kategori,biaya,konteks,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior
0,,,,lihat indikator check engine nyala dashboard m...,0,0,0,0,0,0,0,1,0,0,0
1,,,,saya injak rem dengar suara decit tidak biasa,0,0,0,0,0,0,0,0,1,0,0
2,,,,rasa getar tidak normal kemudi cepat tinggi,0,0,0,0,0,1,1,0,0,0,0
3,,,,mobil stir nya stabil lalu menstater kadang ka...,0,0,0,1,0,1,0,1,0,0,0
4,,,,kaca mobil bisa gerak,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,,,,kondisi ban kendara terlalu licin perlu ganti,0,0,0,0,0,0,1,0,0,0,0
134,,,,mesin kendara keluar suara decit perlu baik,0,0,0,0,0,0,0,1,0,0,0
135,,,,kabel gas kendara terlalu kendor perlu ganti,1,1,0,1,1,0,0,0,0,1,0
136,,,,kondisi ac kendara terlalu dingin perlu setel,1,0,0,0,1,0,0,0,0,1,0


In [None]:
dataset_2_labeled.shape

(121, 15)

## 4.3.5 Integrasi Data

In [None]:
# Menggabungkan data
dataset_gabungan = pd.merge(dataset_1_labeled, dataset_2_labeled, on=LABELS, how='left')
dataset_gabungan.head()

Unnamed: 0,suku_cadang_x,biaya_x,kategori_x,konteks_x,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior,suku_cadang_y,kategori_y,biaya_y,konteks_y
0,shell hx 6 10 40 4ltr,420 000,oli mesin kendara roda empat bobot 4 liter fun...,,1,1,0,0,1,0,0,1,0,0,0,,,,
1,filter oli yzze1,60 000,filter oli kendara roda empat guna nyaring kot...,,1,1,0,0,0,0,0,1,0,1,0,,,,
2,filter udara avanza vvti,175 000,filter udara kualitas tinggi kendara roda empa...,,1,0,0,0,0,0,0,1,0,1,0,,,,
3,service rem depan belakang,300 000,layan awat rem depan belakang kendara roda empat,,0,0,0,0,0,0,0,0,1,0,0,,,,saya injak rem dengar suara decit tidak biasa
4,service rem depan belakang,300 000,layan awat rem depan belakang kendara roda empat,,0,0,0,0,0,0,0,0,1,0,0,,,,rem kendara kerja baik perlu rem jarak jauh


In [None]:
dataset_gabungan = dataset_gabungan.drop(columns=['konteks_x', 'suku_cadang_y', 'biaya_y', 'kategori_y'])
dataset_gabungan = dataset_gabungan.rename(columns={'suku_cadang_x': 'suku_cadang', 'biaya_x': 'biaya', 'kategori_x': 'kategori', 'konteks_y': 'konteks'})
dataset_gabungan = dataset_gabungan[['suku_cadang', 'biaya', 'konteks'] + LABELS]

dataset_gabungan.head()

Unnamed: 0,suku_cadang,biaya,konteks,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior
0,shell hx 6 10 40 4ltr,420 000,,1,1,0,0,1,0,0,1,0,0,0
1,filter oli yzze1,60 000,,1,1,0,0,0,0,0,1,0,1,0
2,filter udara avanza vvti,175 000,,1,0,0,0,0,0,0,1,0,1,0
3,service rem depan belakang,300 000,saya injak rem dengar suara decit tidak biasa,0,0,0,0,0,0,0,0,1,0,0
4,service rem depan belakang,300 000,rem kendara kerja baik perlu rem jarak jauh,0,0,0,0,0,0,0,0,1,0,0


In [None]:
# Menghapus rows yang tidak memiliki satupun label bernilai 1
has_label_1 = dataset_gabungan.iloc[:, 4:].any(axis=1)
dataset = dataset_gabungan[has_label_1].reset_index(drop=True)
print(dataset.shape)

(688, 14)


In [None]:
# Fill NaN values in konteks column using values from other dataset
dataset['konteks'] = dataset.apply(lambda row: row['konteks'] if pd.notna(row['konteks']) else data_sukucadang.loc[data_sukucadang['Komponen'] == row['suku_cadang'], 'Keterangan'].values[0], axis=1)
dataset.head()

Unnamed: 0,suku_cadang,biaya,konteks,masalah sistem knalpot,masalah transmisi,masalah suspensi,ganggu listrik,masalah sistem bahan bakar,masalah kemudi,masalah ban,masalah mesin,gagal rem,masalah sistem dingin,rusa aksesoris interior eksterior
0,shell hx 6 10 40 4ltr,420 000,oli mesin kendara roda empat bobot 4 liter fun...,1,1,0,0,1,0,0,1,0,0,0
1,filter oli yzze1,60 000,filter oli kendara roda empat guna nyaring kot...,1,1,0,0,0,0,0,1,0,1,0
2,filter udara avanza vvti,175 000,filter udara kualitas tinggi kendara roda empa...,1,0,0,0,0,0,0,1,0,1,0
3,service rem depan belakang,300 000,saya injak rem dengar suara decit tidak biasa,0,0,0,0,0,0,0,0,1,0,0
4,service rem depan belakang,300 000,rem kendara kerja baik perlu rem jarak jauh,0,0,0,0,0,0,0,0,1,0,0


In [None]:
dataset[LABELS].sum()

masalah sistem knalpot               104
masalah transmisi                    218
masalah suspensi                      31
ganggu listrik                       111
masalah sistem bahan bakar            74
masalah kemudi                        31
masalah ban                           70
masalah mesin                        163
gagal rem                             52
masalah sistem dingin                 88
rusa aksesoris interior eksterior    122
dtype: int64

In [None]:
dataset[LABELS].sum().sum()

1064

In [None]:
dataset.to_csv('dataset.csv', index=False)

# 4.4 Modeling

In [None]:
!pip install transformers tensorflow

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.4 MB/s[0m eta [36m0:00:0

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
BERT_NAME = 'indobenchmark/indobert-lite-base-p1'

In [None]:
def prediction_each_labels(scenario, predictions, y_test):
  binary_predictions = np.where(predictions >= threshold, 1, 0)

  # Compute precision, recall, and F1-score for each label
  label_precisions = precision_score(y_test, binary_predictions, average=None)
  label_recalls = recall_score(y_test, binary_predictions, average=None)
  label_f1_scores = f1_score(y_test, binary_predictions, average=None)

  print("Scenario: ", scenario)
  # Print accuracy metrics for each label
  for label, precision, recall, f1 in zip(LABELS, label_precisions, label_recalls, label_f1_scores):
      print(f"Label: {label}")
      print(f"Precision: {precision}")
      print(f"Recall: {recall}")
      print(f"F1-score: {f1}")
      print()

## 4.4.1 Skenario Modeling

### Balancing Oversampling

In [None]:
from sklearn.utils import resample
df = dataset.copy()

labels_summary = dataset.iloc[:, 3:].sum()
labels_summary

max_count = labels_summary.max()

balanced_df = pd.DataFrame(columns=df.columns)
for label in labels_summary.index:
  label_count = labels_summary[label]
  if label_count < max_count:
    label_df = df[df[label] == 1]
    oversampled_df = resample(label_df, replace=True, n_samples=max_count, random_state=42)
    balanced_df = pd.concat([balanced_df, oversampled_df], ignore_index=True)
  else:
    label_df = df[df[label == 1]].sample(n=max_count, random_state=42)
    balanced_df = pd.concat([balanced_df, label_df], ignore_index=True)

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_df

KeyError: ignored

### Balancing SMOTE

In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)
X_sm, y_sm

### Tokenisasi

In [None]:
class Preprocess():
    def __init__(self, max_len=128):
        self.stemmer = StemmerFactory().create_stemmer()
        self.stopword = StopWordRemoverFactory().create_stop_word_remover()
        self.tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
        self.max_len = max_len

    def casefolding(self, val):
        return str(val).lower()

    def stemming(self, val):
        return self.stemmer.stem(str(val))

    def stopwordremove(self, val):
        return self.stopword.remove(str(val))

    def tokenizing(self, val):
        return self.tokenizer.encode_plus(
            val,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='tf'
        )

    def single_preprocessing(self, sentence):
      input = self.casefolding(sentence)
      input = self.stemming(input)
      input = self.stopwordremove(input)
      output = self.tokenizing(input)
      return {
          'input_ids': tf.convert_to_tensor(np.asarray(output['input_ids']).squeeze(), dtype=tf.int32),
          'attention_mask': tf.convert_to_tensor(np.asarray(output['attention_mask']).squeeze(), dtype=tf.int32),
      }

    def preprocessing(self, sentences):
        input_ids, attention_mask = [], []
        # melakukan looping nilai sentence
        for sentence in sentences:
            # pemanggilan fungsi casefolding, stemming, stopwordremove, tokenizing secara berurutan
            input = self.casefolding(sentence)
            input = self.stemming(input)
            input = self.stopwordremove(input)
            output = self.tokenizing(input)
            input_ids.append(output['input_ids'])
            attention_mask.append(output['attention_mask'])

        # mengembalikan hasil preprocessing dengan nilai multiple
        return {
            'input_ids': tf.convert_to_tensor(np.asarray(input_ids).squeeze(), dtype=tf.int32),
            'attention_mask': tf.convert_to_tensor(np.asarray(attention_mask).squeeze(), dtype=tf.int32)
        }

    def preprocess_get_token(self, sentences, display_len=20):
        # pemanggilan fungsi preprocessing
        tokenized = self.preprocessing(sentences)
        # mengembalikan hasil preprocessing berbentuk token
        return [self.tokenizer.convert_ids_to_tokens(tokenized['input_ids'][i][:display_len]) for i in range(len(sentences))]

In [None]:
preprocess = Preprocess()
sample_texts = hasil_stopword_remove[0]
preprocess.single_preprocessing(sample_texts)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


{'input_ids': <tf.Tensor: shape=(128,), dtype=int32, numpy=
 array([    2,  1173,  5659,  1980,  6637, 14906, 26259,   895,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   

In [None]:
sample_texts = hasil_stopword_remove

preprocess = Preprocess()
hasil_tokenisasi = preprocess.preprocessing(sample_texts)
hasil_token = preprocess.preprocess_get_token(sample_texts)

print('shape = ', hasil_tokenisasi['input_ids'].shape, hasil_tokenisasi['attention_mask'].shape)
print('\n')
print('input_ids = ', hasil_tokenisasi['input_ids'][0][:20])
print('\n')
print('attention_mask = ', hasil_tokenisasi['attention_mask'][0][:20])
print('\n')
print('token = ', hasil_token)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


shape =  (3, 128) (3, 128)


input_ids =  tf.Tensor(
[    2  1173  5659  1980  6637 14906 26259   895     3     0     0     0
     0     0     0     0     0     0     0     0], shape=(20,), dtype=int32)


attention_mask =  tf.Tensor([1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0], shape=(20,), dtype=int32)


token =  [['[CLS]', 'lihat', 'indikator', 'check', 'engine', 'nyala', 'dashboard', 'mobil', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[CLS]', 'inj', '##ak', 'rem', 'dengar', 'suara', 'dec', '##it', 'biasa', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'], ['[CLS]', 'rasa', 'getar', 'normal', 'kemudi', 'cepat', 'tinggi', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']]


### Split Dataset

In [None]:

def split_dataset(token_max_len, test_split, batch_size):
  preprocess = Preprocess(max_len=token_max_len)
  X_train, X_test, y_train, y_test = train_test_split(
    dataset['konteks'],
    dataset[LABELS],
    test_size=test_split,
    shuffle=True,
    random_state=42
  )

  X_train = preprocess.preprocessing(X_train)
  X_test = preprocess.preprocessing(X_test)

  train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
  test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)


  return train_dataset, test_dataset, y_test

## 4.4.2 Pembangunan Model

In [None]:
bert_model = TFBertModel.from_pretrained(BERT_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading tf_model.h5:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

Some layers from the model checkpoint at indobenchmark/indobert-lite-base-p1 were not used when initializing TFBertModel: ['albert', 'sop_classifier', 'predictions']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at indobenchmark/indobert-lite-base-p1 and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def classification_model(bert_encoder, num_labels, max_len, learning_rate):
  input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
  attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

  encoding_layer = bert_encoder(input_ids, attention_mask)[0]

  l = tf.keras.layers.GlobalAveragePooling1D(name='pooling_layer')(encoding_layer)
  l = tf.keras.layers.Dropout(0.1, name='dropout_layer')(l)
  l = tf.keras.layers.Dense(num_labels, activation='sigmoid', name='output_layer')(l)

  model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[l])

  OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=learning_rate)
  LOSS = tf.keras.losses.BinaryCrossentropy()
  METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy')]

  model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)

  return model

### Skenario 1

#### Split Dataset

In [None]:
MAX_LEN=128
BATCH_SIZE=32
TRAIN_SPLIT=0.8
TEST_SPLIT=0.2

train_dataset_1, test_dataset_1, y_test_1 = split_dataset(token_max_len=MAX_LEN, test_split=TEST_SPLIT, batch_size=BATCH_SIZE)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


#### Hyperparameter

In [None]:
EPOCHS=10
LEARNING_RATE=2e-05

#### Build Model dan Training

In [None]:
model_1 = classification_model(
    bert_encoder=bert_model,
    num_labels=len(LABELS),
    max_len=MAX_LEN,
    learning_rate=LEARNING_RATE,
)

In [None]:
model_1.summary()

In [None]:
model_1.fit(
    train_dataset_1,
    epochs=EPOCHS,
    validation_data=test_dataset_1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7b24b04d87c0>

#### Model Evaluation

In [None]:
model_1.evaluate(test_dataset_1)



[0.005590539425611496, 0.9986824989318848]

In [None]:
predictions_1 = model_1.predict(test_dataset_1)  # Get model predictions
prediction_each_labels(scenario=1, predictions=predictions_1, y_test=y_test_)

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 11), dtype=tf.int64, name=None))>

#### Save Model

In [None]:
model_json = model_1.to_json()
with open('model_1.json', 'w') as json_file:
  json_file.write(model_json)

model_1.save_weights('model_1_weight.keras')
model_1.save('model_1.keras')

In [None]:
!cp model_1.json '/content/drive/MyDrive/TA'
!cp model_1_weight.keras '/content/drive/MyDrive/TA'
!cp model_1.keras '/content/drive/MyDrive/TA'

### Skenario 2

#### Split Dataset

In [None]:
MAX_LEN=128
BATCH_SIZE=32
TRAIN_SPLIT=0.8
TEST_SPLIT=0.2

train_dataset_2, test_dataset_2, y_test_2 = split_dataset(token_max_len=MAX_LEN, test_split=TEST_SPLIT, batch_size=BATCH_SIZE)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


#### Hyperparameter

In [None]:
EPOCHS=30
LEARNING_RATE=2e-05

#### Build Model dan Training

In [None]:
model_2 = classification_model(
    bert_encoder=bert_model,
    num_labels=len(LABELS),
    max_len=MAX_LEN,
    learning_rate=LEARNING_RATE,
)

In [None]:
model_2.summary()

In [None]:
model_2.fit(
    train_dataset_2,
    epochs=EPOCHS,
    validation_data=test_dataset_2,
)

Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7b25c88c3b20>

#### Model Evaluation

In [None]:
model_2.evaluate(test_dataset_2)



[0.0046027773059904575, 0.9986824989318848]

In [None]:
predictions_2 = model_2.predict(test_dataset_2)  # Get model predictions
prediction_each_labels(scenario=2, predictions=predictions_2, y_test=y_test_2)

#### Save Model

In [None]:
!rm model_2.json
!rm model_2_weight.keras
!rm model_2.keras

In [None]:
model_json = model_2.to_json()
with open('model_2.json', 'w') as json_file:
  json_file.write(model_json)

model_2.save_weights('model_2_weight.keras')
model_2.save('model_2.keras')

In [None]:
!cp model_2.json '/content/drive/MyDrive/TA'
!cp model_2_weight.keras '/content/drive/MyDrive/TA'
!cp model_2.keras '/content/drive/MyDrive/TA'

### Skenario 3

#### Split Dataset

In [None]:
MAX_LEN=128
BATCH_SIZE=32
TRAIN_SPLIT=0.8
TEST_SPLIT=0.2

train_dataset_3, test_dataset_3, y_test_3 = split_dataset(token_max_len=MAX_LEN, test_split=TEST_SPLIT, batch_size=BATCH_SIZE)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


#### Hyperparameters

In [None]:
EPOCHS=20
LEARNING_RATE=2e-05

#### Build Model dan Training

In [None]:
model_3 = classification_model(
    bert_encoder=bert_model,
    num_labels=len(LABELS),
    max_len=MAX_LEN,
    learning_rate=LEARNING_RATE,
)

In [None]:
model_3.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109081344   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [None]:
model_3.fit(
    train_dataset_3,
    epochs=EPOCHS,
    validation_data=test_dataset_3,
)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7b23885ac9a0>

#### Model Evaluation

In [None]:
model_3.evaluate(test_dataset_3)



[0.0044221701100468636, 0.9986824989318848]

In [None]:
predictions_3 = model_3.predict(test_dataset_3)  # Get model predictions
prediction_each_labels(scenario=3, predictions=predictions_3, y_test=y_test_3)

Scenario:  3
Label: masalah sistem knalpot
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: masalah transmisi
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: masalah suspensi
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: ganggu listrik
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: masalah sistem bahan bakar
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: masalah kemudi
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: masalah ban
Precision: 0.95
Recall: 1.0
F1-score: 0.9743589743589743

Label: masalah mesin
Precision: 1.0
Recall: 0.9761904761904762
F1-score: 0.9879518072289156

Label: gagal rem
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: masalah sistem dingin
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Label: rusa aksesoris interior eksterior
Precision: 1.0
Recall: 1.0
F1-score: 1.0



#### Save Model

In [None]:
!rm model_3.json
!rm model_3_weight.keras
!rm model_3.keras

rm: cannot remove 'model_3.json': No such file or directory
rm: cannot remove 'model_3_weight.keras': No such file or directory
rm: cannot remove 'model_3.keras': No such file or directory


In [None]:
model_json = model_3.to_json()
with open('model_3.json', 'w') as json_file:
  json_file.write(model_json)

model_3.save_weights('model_3_weight.keras')
model_3.save('model_3.keras')

In [None]:
!cp model_3.json '/content/drive/MyDrive/TA'
!cp model_3_weight.keras '/content/drive/MyDrive/TA'
!cp model_3.keras '/content/drive/MyDrive/TA'

### Skenario 4

#### Split Dataset

In [None]:
MAX_LEN=128
BATCH_SIZE=32
TRAIN_SPLIT=0.8
TEST_SPLIT=0.2

train_dataset_4, test_dataset_4, y_test_4 = split_dataset(token_max_len=MAX_LEN, test_split=TEST_SPLIT, batch_size=BATCH_SIZE)

#### Hyperparameters

In [None]:
EPOCHS=20
LEARNING_RATE=2e-05

#### Build Model dan Training

In [None]:
model_4 = classification_model(
    bert_encoder=bert_model,
    num_labels=len(LABELS),
    max_len=MAX_LEN,
    learning_rate=LEARNING_RATE,
)

In [None]:
model_4.summary()

In [None]:
model_4.fit(
    train_dataset_3,
    epochs=EPOCHS,
    validation_data=test_dataset_3,
)

#### Model Evaluation

In [None]:
model_4.evaluate(test_dataset_4)

In [None]:
predictions_4 = model_4.predict(test_dataset_4)  # Get model predictions
prediction_each_labels(scenario=4, predictions=predictions_4, y_test=y_test_4)

# 4.5 Model Evaluation

In [None]:
print(model_1.evaluate(test_dataset_1))
print(model_2.evaluate(test_dataset_2))
print(model_3.evaluate(test_dataset_3))

[0.3863104581832886, 0.8129117488861084]
[0.10986222326755524, 0.970355749130249]
[0.0044221701100468636, 0.9986824989318848]


In [None]:
inputs = ['AC mobil tidak dingin', 'Rem pada mobil tidak bekerja', 'Kaca spion rusak harus diganti']

preprocess = Preprocess()
tokenized = preprocess.preprocessing(inputs)
predictions = model.predict(tokenized)

# Define the threshold
threshold = 0.5

# Apply the threshold to the predictions
binary_predictions = np.where(predictions > threshold, 1, 0)

# Print the binary predictions
print(LABELS)
print(binary_predictions)

In [None]:
# Assuming you have predictions and ground truth labels
predictions = model.predict(test_dataset)  # Get model predictions
threshold = 0.5  # Set threshold for label prediction

# Convert probability predictions to binary predictions
binary_predictions = np.where(predictions >= threshold, 1, 0)

# Compute precision, recall, and F1-score for each label
label_precisions = precision_score(y_test, binary_predictions, average=None)
label_recalls = recall_score(y_test, binary_predictions, average=None)
label_f1_scores = f1_score(y_test, binary_predictions, average=None)

# Print accuracy metrics for each label
for label, precision, recall, f1_score in zip(LABELS, label_precisions, label_recalls, label_f1_scores):
    print(f"Label: {label}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1_score}")
    print()


In [None]:
!python --version

# 4.6 Deployment

In [None]:
model_json_path = '/content/drive/MyDrive/TA/model_2.json'
model_weight_path = '/content/drive/MyDrive/TA/model_2_weight.keras'
model_path = '/content/drive/MyDrive/TA/model_2.keras'

In [None]:
from keras.models import model_from_json

In [None]:
def initmodel():
  json_file = open(model_json_path, 'r')
  loaded_model_json = json_file.read()
  json_file.close()
  loaded_model = model_from_json(loaded_model_json)
  loaded_model.load_weights(model_weight_path)
  print('loaded from disk')

In [None]:
class Process():
  def __init__(self):
    self.model = tf.keras.models.load_model(
        model_path,
        custom_objects={'TFBertModel': TFBertModel.from_pretrained(BERT_NAME)},
        compile=False
    )
    self.threshold = 0.5

  def rounded_predictions(self, inputs):
    predictions = self.model.predict(inputs)
    return np.where(predictions > self.threshold, 1, 0)

  def measure_severity(self, inputs, labels):
    return {}

  def predict(self, inputs):
    predictions = self.model.predict(inputs)
    return predictions

In [None]:
preprocess = Preprocess()
process = Process()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some layers from the model checkpoint at indobenchmark/indobert-lite-base-p1 were not used when initializing TFBertModel: ['predictions', 'albert', 'sop_classifier']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenc

In [None]:
inputs = ['mesin tidak bisa menyala', 'AC mobil tidak dingin', 'Mesin bersuara kasar', 'ban bocor dan harus diganti dengan yang baru', 'asap knalpot berwarna putih', 'kaca spion pecah dan harus diganti', 'lampu sen kendaraan tidak berfungsi']
tokenized = preprocess.preprocessing(inputs)
predictions = process.predict(tokenized)

print(predictions)



[[5.7581360e-03 3.4292528e-04 1.1853851e-05 4.7294979e-04 4.0363078e-03
  2.8953236e-05 2.3000853e-06 9.9991870e-01 4.5643435e-05 2.4791766e-04
  7.5313210e-04]
 [9.9117762e-01 3.4052016e-05 1.6319862e-05 5.7285564e-05 7.7381301e-01
  1.2171114e-05 2.8914443e-05 4.2167326e-06 9.4593415e-05 9.9900705e-01
  8.3200671e-02]
 [8.7896525e-04 5.6536461e-04 2.0502155e-05 1.8198411e-04 5.5323355e-04
  2.3590741e-05 3.0188050e-05 9.9998331e-01 8.4890817e-05 1.5073216e-04
  4.3375748e-03]
 [2.6745334e-01 2.7247134e-04 1.3167493e-05 1.7715325e-06 3.8541973e-01
  1.2378296e-03 9.9052191e-01 1.3124907e-03 3.5302513e-05 1.9427199e-02
  1.5668347e-05]
 [9.9764794e-01 2.3928756e-05 1.3295289e-05 4.0559247e-05 9.9876618e-01
  2.7809350e-04 5.2364605e-05 1.6323893e-05 2.0670118e-04 9.9884951e-01
  1.3656662e-02]
 [2.6080935e-04 1.2404699e-04 7.6049888e-05 3.4940499e-06 4.5508679e-04
  2.6743283e-04 4.2134398e-05 3.0116341e-04 6.3191226e-05 2.8927089e-04
  9.9995136e-01]
 [2.6165679e-05 3.1972144e-05 2.25

## Implementasi pada Flask

In [None]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)

preprocess = Preprocess()
process = Process()

@app.route("/hello")
def hello():
  return jsonify({
      'data': 'Hello world'
  })

@app.route("/predict", methods=['POST'])
def predict():
  body = request.get_json()
  inputs = body.get('inputs')
  tokenized = preprocess.preprocessing(inputs)
  predictions = process.predict(tokenized)
  return jsonify({
      'data': predictions
  })

app.run()

## Implementasi Rumus

In [None]:
# menentukan tingkat kerusakan

LABEL_WEIGHTS = dict.fromkeys(LABELS, 0)

def measure_severity(predictions):
  outputs = []
  for i in range(len(predictions)):
      prediction = predictions[i]
      total_bobot = 0
      total_masalah = 0
      label_detected = []

      for j in range(len(prediction)):
          if prediction[j] == 1:
              total_bobot += labels[j]['bobot']
              total_masalah += 1
              label_detected.append(labels[j]['label'])

      output = {
          'label': label_detected,
          'prediction': prediction,
          'severity': total_bobot / total_masalah,
          'accuracy': 0
      }
      outputs.append(output)

  return {
      'outputs': outputs,
      'sum_severity': sum([output['severity'] for output in outputs]) / len(outputs)
  }

### Export Requirements

In [None]:
!pip freeze > requirements.txt

In [None]:
!cp requirements.txt '/content/drive/MyDrive/Kuliah/TA Akhdan Musyaffa Firdaus/Bimbingan - Akhdan Musyaffa Firdaus/Produk/Model'

In [None]:
!pip install session-info
import session_info
session_info.show()
!pipreqs

# Custom Dataset and Classifier

In [None]:
class CustomDataset(tf.keras.utils.Sequence):
  def __init__(self, data):
    self.data = data
    self.preprocess = Preprocess()

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    row = self.data.iloc[index]

    konteks = row['konteks']
    suku_cadang = row['suku_cadang']
    biaya = row['biaya']
    labels = row[LABELS].tolist()

    output = self.preprocess.single_preprocessing(konteks)

    return {
      'input_ids': output['input_ids'],
      'attention_mask': output['attention_mask'],
      'labels': labels,
      'suku_cadang': suku_cadang,
      'biaya': biaya
    }

In [None]:
class ClassificationModelWithRecommendation(tf.keras.Model):
  def __init__(self, bert_encoder, num_labels, num_spare_parts, max_len, learning_rate):
    super(ClassificationModelWithRecommendation, self).__init__()
    self.bert_encoder = bert_encoder
    self.num_labels = num_labels
    self.num_spare_parts = num_spare_parts
    self.max_len = max_len
    self.learning_rate = learning_rate

    # Init classification submodel
    self.classification_model = self.build_classification_model()

    # Init spareparts prediction submodel
    self.spare_part_model = self.build_spare_part_model()

    # Init estimated price submodel
    self.price_estimation_model = self.build_price_estimation_model()

  def build_classification_model(self):
    input_ids = tf.keras.layers.Input(shape=(self.max_len,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(self.max_len,), dtype=tf.int32, name='attention_mask')

    encoding_layer = self.bert_encoder(input_ids, attention_mask)[0]

    l = tf.keras.layers.GlobalAveragePooling1D(name='pooling_layer')(encoding_layer)
    l = tf.keras.layers.Dropout(0.1, name='dropout_layer')(l)
    l = tf.keras.layers.Dense(self.num_labels, activation='sigmoid', name='output_layer')(l)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[l])

    OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    LOSS = tf.keras.losses.BinaryCrossentropy()
    METRICS = [tf.keras.metrics.BinaryAccuracy(name='accuracy')]

    model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)

    return model

  def build_spare_part_model(self):
    spare_part_input = tf.keras.layers.Input(shape=(self.num_labels,), name='suku_cadang')
    spare_part_output = tf.keras.layers.Dense(self.num_spare_parts, activation='softmax', name='spare_part_output')(spare_part_input)
    model = tf.keras.Model(inputs=spare_part_input, outputs=spare_part_output)
    model.compile(
      optimizer='adam',
      loss='categorical_crossentropy',  # You may need to adapt this based on your spare part data
      metrics=['accuracy']
    )
    return model

  def build_price_estimation_model(self):
    price_input = tf.keras.layers.Input(shape=(self.num_labels,), name='biaya')
    price_output = tf.keras.layers.Dense(self.num_spare_parts, activation='linear', name='price_output')(price_input)
    model = tf.keras.Model(inputs=price_input, outputs=price_output)
    model.compile(
      optimizer='adam',
      loss='mean_squared_error',  # You may need to adapt this based on your price data
      metrics=['mse']
    )
    return model

  def call(self, inputs):
    classification_output = self.classification_model(inputs)
    spare_part_probs = self.spare_part_model(inputs['suku_cadang'])
    price_estimations = self.price_estimation_model(inputs['biaya'])

    return classification_output, spare_part_probs, price_estimations


model_with_recommendation = ClassificationModelWithRecommendation(
    bert_encoder=bert_model,
    num_labels=len(LABELS),
    num_spare_parts=len(data_sukucadang),
    max_len=MAX_LEN,
    learning_rate=LEARNING_RATE
)

In [None]:
from sklearn.model_selection import train_test_split

# Assuming your data is stored in a DataFrame called "df"
train_df, temp_df = train_test_split(dataset, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print("Training set:", len(train_df), "samples")
print("Validation set:", len(val_df), "samples")
print("Test set:", len(test_df), "samples")

Training set: 481 samples
Validation set: 103 samples
Test set: 104 samples


In [None]:
train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)
test_dataset = CustomDataset(test_df)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.


In [None]:
train_dataset[0]

{'input_ids': <tf.Tensor: shape=(128,), dtype=int32, numpy=
 array([   2, 1699, 4068, 3168,    3,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0], dtype=int32)>,
 'attention_mask': <tf.Tensor: shape=(128,), dtype=int32, nu

In [None]:
# Compile the model
model_with_recommendation.compile(
  loss=[tf.keras.losses.BinaryCrossentropy(), tf.keras.losses.CategoricalCrossentropy(), tf.keras.losses.MeanSquaredError()],
  optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
  metrics=['accuracy']
)

In [None]:
# Now, you can start training the model
history = model_with_recommendation.fit(
    train_dataset,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=val_dataset
)


  inputs = self._flatten_to_reference_inputs(inputs)


ValueError: ignored

### Model Summary