**Model Pengkategorisasian Secara Otomatis**

In [1260]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,classification_report
import joblib
import warnings
import os
warnings.filterwarnings('ignore')

**Persiapkan Dataset**

In [1261]:
data = pd.read_csv(r'C:\Users\jovan\OneDrive\Documents\Vs Code\Python VSCODE\Bangkit\Capstone\Data\dataset_final.csv')
# Memilih kolom tertentu dari dataframe
df = data[['nama', 'kategori']]

# Kolom 'Makanan dan Minuman'
df['Makanan dan Minuman'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Makanan dan Minuman' in row.values else 0, axis=1)

# Kolom 'Shopping'
df['Shopping'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Shopping' in row.values else 0, axis=1)

# Kolom 'Hiburan'
df['Hiburan'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Hiburan' in row.values else 0, axis=1)

# Kolom 'Lainnya'
df['Lainnya'] = pd.DataFrame(df['kategori']).apply(lambda row: 1 if 'Lainnya' in row.values else 0, axis=1)

In [1262]:
df

Unnamed: 0,nama,kategori,Makanan dan Minuman,Shopping,Hiburan,Lainnya
0,Abon,Makanan dan Minuman,1,0,0,0
1,Abon haruwan,Makanan dan Minuman,1,0,0,0
2,Agar-agar,Makanan dan Minuman,1,0,0,0
3,Akar tonjong segar,Makanan dan Minuman,1,0,0,0
4,Aletoge segar,Makanan dan Minuman,1,0,0,0
...,...,...,...,...,...,...
6006,Penghapus,Shopping,0,1,0,0
6007,Penggaris,Shopping,0,1,0,0
6008,Rautan Pensil,Shopping,0,1,0,0
6009,Pensil Warna,Shopping,0,1,0,0


**Split Dataset**

In [1263]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split

In [1264]:
def multilabel_train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None):
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size, train_size=train_size, random_state=random_state, stratify=None, shuffle=shuffle)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=0.25)

    cv = MultilabelStratifiedShuffleSplit(test_size=n_test, train_size=n_train, random_state=123)
    train, test = next(cv.split(X=arrays[0], y=stratify))

    return [(_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays]


In [1265]:
df = df.drop_duplicates()

print(df[df.index.duplicated()])


Empty DataFrame
Columns: [nama, kategori, Makanan dan Minuman, Shopping, Hiburan, Lainnya]
Index: []


In [1266]:
X = df['nama']
Y = df.drop(columns=['nama','kategori'])
X_train,X_test,Y_train,Y_test = multilabel_train_test_split(np.array(X), np.array(Y),test_size=0.2)

**Vectorized Dataset**

In [1267]:
# Create vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Vectorizer X_train
X_train_values = tfidf_vectorizer.fit_transform(X_train)

# Vectorizer X_test
X_test_values = tfidf_vectorizer.transform(X_test)


In [1268]:
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier


**SVC**

In [1269]:
Model = LinearSVC(random_state=42)
ModelSVC = MultiOutputClassifier(Model)
ModelSVC.fit(X_train_values, Y_train)

In [1270]:
predict = ModelSVC.predict(X_test_values)

In [1271]:
accuracy_score(predict,Y_test)

0.882903981264637

In [1272]:
f1_score(predict,Y_test,average='micro')

0.8986889153754469

In [1273]:
recall_score(predict,Y_test,average='micro')

0.9106280193236715

In [1274]:
precision_score(predict,Y_test,average='micro')

0.8870588235294118

In [1275]:
classification_report(predict,Y_test)

'              precision    recall  f1-score   support\n\n           0       0.99      0.88      0.93       304\n           1       0.17      1.00      0.29         1\n           2       0.81      0.99      0.89        77\n           3       0.58      0.97      0.73        32\n\n   micro avg       0.89      0.91      0.90       414\n   macro avg       0.64      0.96      0.71       414\nweighted avg       0.92      0.91      0.91       414\n samples avg       0.88      0.88      0.88       414\n'

In [1276]:
joblib.dump(ModelSVC,'ModelFinal.pkl')


['ModelFinal.pkl']

**Result**

In [1277]:
Y_train_columns = Y.columns

In [1278]:
def modelFuction(nama,tfidf_vectorizer):

    loadmodel = joblib.load('ModelFinal.pkl')

    text_vectorizer = tfidf_vectorizer.transform([nama])
    prediction = loadmodel.predict(text_vectorizer)

    re = []

    for pre in np.where(prediction==1)[1]:

        re.append(Y_train_columns[pre])

    return  re

In [1279]:
j = modelFuction("Bayar BBM Rp.5000",tfidf_vectorizer)
j

['Lainnya']

In [1280]:
print(df.columns)

Index(['nama', 'kategori', 'Makanan dan Minuman', 'Shopping', 'Hiburan',
       'Lainnya'],
      dtype='object')


In [1281]:
# Minta input dari pengguna untuk teks yang ingin diprediksi
user_input = input("Masukkan teks yang ingin diprediksi: ")

# Panggil fungsi modelFuction dengan input dari pengguna
j = modelFuction(user_input, tfidf_vectorizer)

# Cetak hasil prediksi
print("Hasil Prediksi:", j)


Hasil Prediksi: ['Makanan dan Minuman']


In [1282]:
print(df.shape)
print(df.head)

(2134, 6)
<bound method NDFrame.head of                     nama             kategori  Makanan dan Minuman  Shopping  \
0                   Abon  Makanan dan Minuman                    1         0   
1           Abon haruwan  Makanan dan Minuman                    1         0   
2              Agar-agar  Makanan dan Minuman                    1         0   
3     Akar tonjong segar  Makanan dan Minuman                    1         0   
4          Aletoge segar  Makanan dan Minuman                    1         0   
...                  ...                  ...                  ...       ...   
6006           Penghapus             Shopping                    0         1   
6007           Penggaris             Shopping                    0         1   
6008       Rautan Pensil             Shopping                    0         1   
6009        Pensil Warna             Shopping                    0         1   
6010             Pemutih             Shopping                    0         1   
