In [10]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle

# Load the preprocessed corpus from CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
data = pd.read_csv('D:/kuliah/Skripsi/ipynb/Data/textpreprocessing3.csv',  encoding="unicode_escape")

# Split data into features (text column) and target (label column)
X = data['text_stemmed']
y = data['Category']

# Convert text data to numerical features (raw term counts).
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split, so test documents contribute to the vocabulary — confirm
# this is intended for the reported evaluation.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed; the bare open() used before left that to the garbage
# collector.
filename = 'word_vectorizer.sav'
with open(filename, 'wb') as file:
    pickle.dump(vectorizer, file)

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train decision tree classifier.
# random_state is pinned (same seed as the split) so that re-running the cell
# yields the same tree and the same metrics; without it the fitted tree can
# differ from run to run.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

# Persist the trained classifier, again closing the file deterministically
filename = 'finalized_models_withstemmed.sav'
with open(filename, 'wb') as file:
    pickle.dump(clf, file)

# Make predictions on test data
y_pred = clf.predict(X_test)
print(y_pred)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix computed from y_test and y_pred
print(confusion_matrix(y_test, y_pred))

['news' 'finance' 'news' ... 'news' 'otomotif' 'news']
Accuracy: 0.7821565407772304
              precision    recall  f1-score   support

     finance       0.72      0.73      0.72      2249
        food       0.86      0.84      0.85       705
      health       0.71      0.72      0.71       805
        news       0.85      0.87      0.86      5139
    otomotif       0.68      0.62      0.65       714
       sport       0.75      0.69      0.72       370
      travel       0.66      0.62      0.64       980

    accuracy                           0.78     10962
   macro avg       0.75      0.73      0.74     10962
weighted avg       0.78      0.78      0.78     10962

[[1634   14   63  367   62   24   85]
 [  30  592   27   21    4    1   30]
 [  76   21  578   91    9    4   26]
 [ 358   24   72 4464   80   19  122]
 [  72    9   26  117  442   18   30]
 [  22    5    9   37   19  255   23]
 [  89   26   42  163   33   18  609]]


In [None]:
import numpy as np
# Recompute the confusion matrix for the current test predictions and export
# it as CSV.
a = confusion_matrix(y_test, y_pred)
# fmt="%d" writes the counts as plain integers; savetxt's default format
# ('%.18e') would store each count in scientific notation.
np.savetxt("confusion_matrix.csv", a, delimiter=",", fmt="%d")


In [1]:
# Accuracy recorded for each preprocessing variant (variant name in parentheses).

# Stemmed text (text_stemmed)
# ['news' 'oto' 'news' ... 'news' 'oto' 'news']
# Accuracy: 0.7817916438606094

# Rare words removed (text_wo_stopfreqrare)
# ['news' 'news' 'health' ... 'finance' 'oto' 'news']
# Accuracy: 0.7768655354862252

# Frequent words removed (text_wo_stopfreq)
# ['sport' 'news' 'health' ... 'finance' 'oto' 'news']
# Accuracy: 0.7747673782156541

# Punctuation removed (text_wo_punct)
# ['news' 'news' 'health' ... 'health' 'oto' 'news']
# Accuracy: 0.7687465790914066

(54809, 8)

In [12]:
import pickle

# Restore the pickled vectorizer and classifier from disk.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open('word_vectorizer.sav', 'rb') as vectorizer_fh:
    vektor = pickle.load(vectorizer_fh)

with open('finalized_models_withstemmed.sav', 'rb') as model_fh:
    model = pickle.load(model_fh)

# Single sample document to classify
data = ["Sejurus kemudian, mobil langsung menabrak moveable concrete barrier (MCB) beton di sebelah kiri jalan. Hal ini mengakibatkan ban mobil Mercy listrik ini tersangkut.Tak ada korban jiwa dalam kecelakaan mobil listrik ini. Sementara mobil Mercy listrik mengalami kerusakan pada bagian bumper."]

# Vectorize the sample with the loaded vectorizer
X = vektor.transform(data)

# Predict the category with the loaded model and show it
predictions = model.predict(X)
print(predictions)


['otomotif']


In [11]:
# Load the classifier and the vectorizer stored in the two pickle files below.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open('finalized_models_withstemmedtitle.sav', 'rb') as file:
    model = pickle.load(file)

with open('word_vectorizers.sav', 'rb') as file:
    vectorizer = pickle.load(file)

# Single sample document to classify
data = ["Kementerian Keuangan (Kemenkeu) mencatat posisi utang pemerintah hingga 31 Mei 2023 mencapai Rp 7.787,51 triliun atau 37,85% terhadap produk domestik bruto (PDB). Jumlah itu turun Rp 62,38 triliun dari posisi utang bulan sebelumnya yang sebesar Rp 7.849,89 triliun.Baik secara nominal maupun rasio, posisi utang hingga Mei 2023 mengalami penurunan dibandingkan bulan sebelumnya. Hal itu dikarenakan pemerintah membayar utang lebih besar dibandingkan menerbitkan utang baru"]

# Bug fix: transform with the vectorizer loaded in THIS cell. The previous
# code called `vektor.transform`, a name this cell never defines (it is only
# assigned in another cell, from 'word_vectorizer.sav'), so the vectorizer
# loaded above was ignored and the cell failed on a fresh kernel.
X = vectorizer.transform(data)

# Use the loaded model to make predictions
predictions = model.predict(X)
print(predictions)

['finance']
