In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import tensorflow as tf
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from nltk.stem import SnowballStemmer
from sklearn.preprocessing import LabelEncoder
import joblib

# Download the NLTK resources used by the preprocessing functions:
#   punkt / punkt_tab -> tokenizer models behind word_tokenize (newer NLTK
#                        releases load 'punkt_tab' instead of 'punkt')
#   stopwords         -> the Indonesian stop-word list
#   wordnet           -> data behind WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Read the article table, then keep only the id, medicine name and
# efficacy-text columns in a separate frame that is exported for inspection.
df = pd.read_csv('tabel_artikel_1.csv')
khasiat = df[['id', 'nama_obat', 'khasiat']].copy()
khasiat.to_csv('khasiat.csv')

In [None]:
# Shuffle the dataset
# NOTE(review): `khasiat` was extracted from `df` before this line, and the
# later cells in this notebook work on `khasiat`, so this shuffle does not
# change the row order of the data that is actually vectorized and trained on.
df = shuffle(df, random_state=42)

def preProcessing(df, col):
    """Clean the text column ``col`` of ``df`` in place and return ``df``.

    Each value goes through the same pipeline: lowercase, strip digits and
    ``$``, strip ASCII punctuation, tokenize, drop Indonesian stop words,
    lemmatize every remaining token, and join the tokens back into a single
    space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean. The column is overwritten, so the
        caller's frame is modified.
    col : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col`` replaced by the cleaned text.
        Missing values become empty strings.
    """
    # Build the per-call helpers once instead of once per row / per pass.
    stop_words = set(stopwords.words('indonesian'))
    lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clean(text):
        # Remove numbers and '$' first, then any remaining punctuation
        text = re.sub(r'\d+|\$', '', text.lower())
        text = text.translate(punctuation_table)
        tokens = [word for word in word_tokenize(text) if word not in stop_words]
        return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

    # Missing values would otherwise reach re.sub as float NaN and raise
    # TypeError, so they are normalised to empty strings before cleaning.
    df[col] = df[col].fillna('').astype(str).apply(clean)
    return df

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df['text'], df['category'], test_size=0.2, random_state=42)

def plot(data, title, col='khasiat'):
    """Draw a bar chart of how often each distinct value occurs in a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to summarise.
    title : str
        Title placed above the chart.
    col : str, default 'khasiat'
        Column whose value counts are plotted. The default keeps the
        original behaviour of always plotting the ``khasiat`` column.

    The bars are drawn on a new 8x4 pyplot figure. Nothing is returned and
    the function does not call ``plt.show()`` itself.
    """
    val_count = data[col].value_counts()
    plt.figure(figsize=(8,4))
    plt.bar(val_count.index, val_count.values)
    plt.title(title)



In [None]:
# Clean the `khasiat` text column with the preprocessing pipeline defined above
# (the function overwrites the column and returns the same frame).
khasiat = preProcessing(khasiat, 'khasiat')
# Export the cleaned table to its own file so the raw export in
# 'khasiat.csv' is left untouched.
khasiat.to_csv('khasiat2.csv')

In [None]:
# TF-IDF vectorization
# The vectorizer is fitted on every row of the cleaned `khasiat` column (no
# train/test split is applied in this notebook). The same fitted object is
# later saved to disk and reused to transform the prediction input.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(khasiat['khasiat'])

In [None]:
# Train a Naive Bayes classifier that maps the TF-IDF features of the
# `khasiat` text to the medicine name. The target has to be the label column
# `nama_obat` (the same labels the neural network in this notebook uses);
# fitting against khasiat['khasiat'] would treat every distinct text as its
# own class and teach the model to predict its own input.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, khasiat['nama_obat'])

In [None]:
# Prepare the training targets for the Keras model

# Convert labels to numerical values.
# fit_transform learns the classes and encodes the column in one step, so a
# separate fit() call beforehand is redundant.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(khasiat['nama_obat'])
# Distinct medicine names in encoder order, kept as a standalone array
all_labels = label_encoder.classes_.copy()

# Convert labels to one-hot encoding (one column per medicine name), as
# required by the categorical_crossentropy loss used for training.
num_classes = len(label_encoder.classes_)
y_train_encoded = tf.keras.utils.to_categorical(train_labels_encoded, num_classes=num_classes)
num_classes

In [None]:
# Training configuration, named so the values are easy to find and tune
SEED = 42          # same value used for the sklearn shuffle earlier
EPOCHS = 1000
BATCH_SIZE = 32

# Seed TensorFlow's global random generator so repeated runs of the notebook
# start from comparable conditions instead of a fresh random state each time.
tf.random.set_seed(SEED)

# model.fit is given a dense array, so convert the TF-IDF matrix once and
# reuse it for both the input shape and the training call.
X_train_dense = X_train_tfidf.toarray()

# Define a simple neural network model: one hidden ReLU layer followed by a
# softmax over all medicine names.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_dense.shape[1],)),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile and train the model
# NOTE(review): training uses every row and no validation data, so the
# reported accuracy is training accuracy only.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_dense, y_train_encoded, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

In [None]:
# Pull the per-epoch training metrics out of the Keras History object
train_accuracy = history.history['accuracy']
train_loss = history.history['loss']

# One figure per metric: (values, legend label, figure title, y-axis label)
curves = (
    (train_accuracy, 'Train Accuracy', 'Model Accuracy', 'Accuracy'),
    (train_loss, 'Train Loss', 'Model Loss', 'Loss'),
)

for values, legend_label, figure_title, y_label in curves:
    plt.figure(figsize=(8, 4))
    plt.plot(values, label=legend_label)
    plt.title(figure_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Save the model in HDF5 format (.h5); this is the file reloaded below
model.save('artikel-obat-model.h5')

# Export the model a second time through tf.saved_model.save.
# NOTE(review): a SavedModel export normally produces a directory, so the
# '.pb' suffix here presumably names a folder rather than a single file -
# confirm against the TensorFlow version in use.
tf.saved_model.save(model, 'artikel-obat-model.pb')

# Save the vectorizer and label encoder
# Both fitted objects are needed at prediction time: the vectorizer to turn
# text into features and the encoder to map class ids back to medicine names.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

# Load the saved model
# Rebinds `model` to the copy read back from the .h5 file, so the prediction
# cell runs against the persisted model.
model = tf.keras.models.load_model('artikel-obat-model.h5')


In [None]:
def preprocess_input_text(input_text):
    """Clean a single query string the same way the training text was cleaned.

    Steps: lowercase, remove digits and ``$``, remove ASCII punctuation,
    tokenize, drop Indonesian stop words, lemmatize each token, and join the
    tokens with single spaces.

    Parameters
    ----------
    input_text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, space-separated text (empty if nothing survives).
    """
    # Normalise case and strip numbers, '$' and punctuation
    cleaned = re.sub(r'\d+|\$', '', input_text.lower())
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    # Split into tokens, then filter out Indonesian stop words
    tokens = word_tokenize(cleaned)
    indonesian_stop_words = set(stopwords.words('indonesian'))
    kept_tokens = []
    for token in tokens:
        if token not in indonesian_stop_words:
            kept_tokens.append(token)

    # Lemmatize what is left and stitch it back into one string
    lemmatizer = WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, kept_tokens))

In [14]:
# Preprocess the input data. The raw text is kept under its own name so the
# report below shows what the user actually typed, not the cleaned version.
raw_input_text = 'usus buntu'
input_text = preprocess_input_text(raw_input_text)

# Transform the input text using the same vectorizer used during training
input_text_tfidf = vectorizer.transform([input_text])

# Make predictions using the loaded model
predictions = model.predict(input_text_tfidf.toarray())

# Get the top-k predicted labels and their probabilities, most probable
# first: argsort is ascending, so reverse it before taking the first k.
top_k = 5
top_k_indices = np.argsort(predictions[0])[::-1][:top_k]
top_k_probabilities = predictions[0, top_k_indices]

# Convert the predicted label indices to category labels
predicted_labels = label_encoder.inverse_transform(top_k_indices)

# Print the predicted labels and their probabilities. The header uses the
# number of rows actually produced, so it stays correct when `top_k` changes
# or when there are fewer than `top_k` classes.
out = [
    f"{label}: {probability:.4f}"
    for label, probability in zip(predicted_labels, top_k_probabilities)
]
print(f"User input: {raw_input_text}")
print(f"Top {len(out)} Predicted Labels:")
print('\n'.join(out))


User input: usus buntu
Top 5 Predicted Labels:
Bidara Upas: 0.7517
Rumput Mutiara: 0.1455
Ekor Kucing: 0.0623
Pegagan: 0.0176
Arairut (Garut): 0.0062
