<a href="https://colab.research.google.com/github/AbrahamWillemH/capstone-catbot/blob/main/catbot_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [None]:
# Show the GPU attached to this runtime, if any (on a CPU-only runtime the
# command is missing and the shell reports "command not found").
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
# Install the TensorFlow/Keras stack used below (versions are not pinned).
# NOTE(review): nothing in the visible notebook imports from `keras-models` or
# `keras-preprocessing` directly -- confirm these two packages are needed.
!pip install tensorflow tensorflow_hub tensorflow_text keras tf_keras keras-preprocessing keras-models



In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Looking at the dataset

In [None]:
# Path of the raw question / focus_area CSV, relative to the working directory.
dataset = "data.csv"

## Load Dataset

In [None]:
# Load the raw CSV. on_bad_lines='skip' drops rows pandas cannot parse instead
# of raising, so malformed lines disappear without any message.
df=pd.read_csv(dataset, on_bad_lines='skip')
df.head()

Unnamed: 0,question,focus_area
0,What is dermatitis in cats?,Dermatitis
1,What are the common symptoms of dermatitis in ...,Dermatitis
2,How can I treat dermatitis in my cat?,Dermatitis
3,Is dermatitis contagious to humans?,Dermatitis
4,What are the main causes of dermatitis in cats?,Dermatitis


In [None]:
# Preview the first rows again (same frame as the previous cell).
df.head()

Unnamed: 0,question,focus_area
0,What is dermatitis in cats?,Dermatitis
1,What are the common symptoms of dermatitis in ...,Dermatitis
2,How can I treat dermatitis in my cat?,Dermatitis
3,Is dermatitis contagious to humans?,Dermatitis
4,What are the main causes of dermatitis in cats?,Dermatitis


In [None]:
# Count the non-null values per focus area to check how balanced the classes are.
df.groupby('focus_area').count()


Unnamed: 0_level_0,question
focus_area,Unnamed: 1_level_1
Dermatitis,66
Eye sickness,66
Hairloss,66
Lump and bump,65
Scabies,66


In [None]:
# Look at 10 random rows (no seed is set, so the rows differ on every run).
df.sample(10)

Unnamed: 0,question,focus_area
143,"What role do hormones play in coat health?”,Ha...",Hairloss
100,What are the newest pharmaceutical treatments ...,Scabies
231,Can lumps indicate neurological conditions?,Lump and bump
284,What are the newest diagnostic imaging techniq...,Eye sickness
277,“How often should I have my cat's eyes checked...,Eye sickness
260,What are the long-term management strategies f...,Lump and bump
113,What are the economic considerations for scabi...,Scabies
119,What are the psychological support strategies ...,Scabies
28,Can hormonal changes cause dermatitis in cats?,Dermatitis
218,Can lumps indicate metabolic disorders?,Lump and bump


# Tokenization

In [None]:
import nltk
# Download the 'punkt_tab' tokenizer data (skipped when already up to date).
# NOTE(review): presumably required by word_tokenize in the next cell -- confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize
def tokenize_text(text):
    """Split a cell value into NLTK word tokens.

    The value is converted with ``str()`` first, so non-string cells are
    tokenized by their string representation.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    return word_tokenize(str(text))
# Add a '<column>_tokens' column for every column that exists before the loop
# starts, then write the widened frame to disk.
original_columns = list(df.columns)
for column_name in original_columns:
    token_lists = df[column_name].apply(tokenize_text)
    df[column_name + '_tokens'] = token_lists
df.to_csv("tokenized_data.csv", index=False)

## Load Tokenized Dataset

In [None]:
# Reload the file written by the tokenization cell; the token lists come back
# as their string representation because CSV stores plain text only.
data = pd.read_csv("tokenized_data.csv")

## Transform Into Lowercase

In [None]:
def lowercase_text(text):
    """Return the string form of ``text`` converted to lowercase.

    Any value is accepted: it is passed through ``str()`` before lowercasing.
    """
    as_string = str(text)
    lowered = as_string.lower()
    return lowered
# Lowercase every cell of the reloaded frame. This rebinds `df`: from here on
# it is the all-string, lowercased copy of `data`, not the raw frame loaded at
# the top of the notebook.
df = data.map(lowercase_text)

# Removing noise

In [None]:
import re

def remove_noise(text):
    """Strip markup-like tags and punctuation from a value.

    The value is converted with ``str()``; then two substitutions run in order:
    first anything between ``<`` and the nearest ``>`` is removed, then every
    character that is neither a word character nor whitespace is removed.
    """
    cleaned = str(text)
    for pattern in (r'<.*?>', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned
# Apply noise removal to every cell of the lowercased frame.
dff = df.map(remove_noise)

In [None]:
# Re-check the per-class counts after lowercasing and noise removal.
dff.groupby('focus_area').count()

Unnamed: 0_level_0,question,question_tokens,focus_area_tokens
focus_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dermatitis,66,66,66
eye sickness,66,66,66
hairloss,66,66,66
lump and bump,65,65,65
scabies,66,66,66


## Applying Stopwords

In [None]:
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus (skipped when already up to date) and keep the
# English list as a set, which remove_stopwords uses for membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated word that is an English stopword.

    Words are compared in lowercase against the module-level ``stop_words``
    set; the surviving words keep their original casing and order and are
    joined back together with single spaces.
    """
    kept_words = []
    for word in text.split():
        if word.lower() in stop_words:
            continue  # Stopword: leave it out of the result
        kept_words.append(word)
    return ' '.join(kept_words)

# Apply stopword removal to all columns. That includes 'focus_area', so any
# label containing a stopword is shortened as well (presumably
# 'lump and bump' becomes 'lump bump' -- verify against stdf).
stdf = dff.map(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Persist the stopword-filtered frame; `stdf` is the frame the training cells use.
stdf.to_csv("stopwords_data.csv", index=False)

## Applying Porter Stemmer

In [None]:
from nltk.stem import PorterStemmer
# NOTE(review): this cell only uses word_tokenize and PorterStemmer; nothing
# visible in the notebook uses a part-of-speech tagger -- confirm this
# download is actually needed.
nltk.download('averaged_perceptron_tagger')
stemmer = PorterStemmer()
def stem_text(text):
    """Stem every token of a value with the module-level Porter stemmer.

    The value is converted with ``str()``, tokenized with
    ``nltk.word_tokenize``, each token is stemmed, and the stems are joined
    with single spaces.
    """
    tokens = nltk.word_tokenize(str(text))
    return ' '.join(map(stemmer.stem, tokens))
# Stem every cell and save the result. Note that the training cells below fit
# on `stdf` (unstemmed); `stmdf` is only consumed by the spell-check cell.
stmdf = stdf.map(stem_text)
stmdf.to_csv("stemmed_data.csv", index=False)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Applying TextBlob

In [None]:
from textblob import TextBlob
def spell_check_text(text):
    """Return the TextBlob spelling-corrected version of a value as a string.

    The value is converted with ``str()`` before being wrapped in a
    ``TextBlob``; the corrected blob is converted back to ``str``.
    """
    return str(TextBlob(str(text)).correct())
# Spell-correct every cell of the stemmed frame.
# NOTE(review): `sss` is not read by any later cell in the visible notebook.
sss = stmdf.map(spell_check_text)

## Remove bad lines

In [None]:
# Save the lowercased, noise-free frame (the stage before stopword removal).
dff.to_csv("noise_tokenized_data.csv", index=False)

In [None]:
# Keep only the rows whose 'question' does not begin with a whitespace character.
leading_space_mask = df['question'].str.match(r'^\s')
cleaned_df = df.loc[~leading_space_mask]

# Write the filtered rows to their own CSV file.
cleaned_df.to_csv("cleaned_tokenized_data.csv", index=False)

print(f"Cleaned dataset saved to 'cleaned_tokenized_data.csv'. Remaining rows: {len(cleaned_df)}")

Cleaned dataset saved to 'cleaned_tokenized_data.csv'. Remaining rows: 329


# Preparing Data for Training

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Build the vocabulary from the stopword-filtered 'question' column and turn
#    each question into a list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(stdf['question'])  # Learn the word -> index mapping
X = tokenizer.texts_to_sequences(stdf['question'])  # One list of word indices per question

# 2. Pad sequences to ensure uniform length
max_len = 100  # Fixed sequence length; the inference cells must use the same value
X_padded = pad_sequences(X, maxlen=max_len, padding='post', truncating='post')

# 3. Convert labels to numerical format
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(stdf['focus_area'])  # Encode the 'focus_area' column as integer class ids

# 4. Check the shape of X_padded and y
print(f"Shape of X_padded: {X_padded.shape}")
print(f"Shape of y: {y.shape}")


Shape of X_padded: (329, 100)
Shape of y: (329,)


# Building the Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

# Hyperparameters
embedding_dim = 100  # Dimension for word embeddings
num_classes = len(label_encoder.classes_)  # Number of distinct 'focus_area' labels

# Build the model
model = Sequential()

# 1. Embedding layer (+1 because Keras tokenizer indices start at 1; 0 is padding)
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_dim,
                    input_length=X_padded.shape[1]))

# 2. Flatten the embedding output
model.add(Flatten())

# 3. Dense layer with ReLU activation
model.add(Dense(128, activation='relu'))

# 4. Dropout layer to reduce overfitting
model.add(Dropout(0.2))

# 5. Output layer with softmax activation
model.add(Dense(num_classes, activation='softmax'))

# Compile the model (sparse loss because `y` holds integer class ids)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])

# Summary of the model
model.summary()

# Shuffle the samples once, with a fixed seed, before training.
# Keras carves `validation_split` out of the LAST rows of the arrays it is
# given, before any shuffling. The rows of this dataset appear to be grouped by
# focus area, so without this step the validation set is a single class that
# the model never sees during training (val_accuracy stays at 0).
shuffle_seed = 42
shuffled_idx = np.random.default_rng(shuffle_seed).permutation(len(X_padded))
X_shuffled = X_padded[shuffled_idx]
y_shuffled = y[shuffled_idx]

# Train the model
model.fit(X_shuffled, y_shuffled, epochs=20, batch_size=32, validation_split=0.2)




Epoch 1/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.2525 - loss: 1.5506 - val_accuracy: 0.0000e+00 - val_loss: 3.2837
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.2894 - loss: 1.4200 - val_accuracy: 0.0000e+00 - val_loss: 4.0317
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.3302 - loss: 1.3594 - val_accuracy: 0.0000e+00 - val_loss: 4.6596
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.4296 - loss: 1.2974 - val_accuracy: 0.0000e+00 - val_loss: 4.7020
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6279 - loss: 1.1686 - val_accuracy: 0.0000e+00 - val_loss: 4.8955
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.8065 - loss: 1.0256 - val_accuracy: 0.0000e+00 - val_loss: 4.9607
Epoch 7/20
[1m9/9[0m

<keras.src.callbacks.history.History at 0x7ff25e73c700>

In [None]:
# Print the layer/parameter summary again now that the model has been trained.
model.summary()

# Testing the model

In [None]:
# The lowercase_text / remove_noise / remove_stopwords helpers defined in the
# preprocessing cells earlier in this notebook are reused here rather than
# redefined, so training and inference cannot drift apart.

# Define a new question
new_question = "how to handle scabies"

# Preprocess BEFORE tokenizing, in the same order used for the training data
# (lowercase -> remove noise -> remove stopwords), so the model input goes
# through the same pipeline as the text the tokenizer was fitted on.
new_question = remove_stopwords(remove_noise(lowercase_text(new_question)))

# Tokenize and pad the preprocessed text
new_question_seq = tokenizer.texts_to_sequences([new_question])
new_question_padded = pad_sequences(new_question_seq, maxlen=max_len, padding='post', truncating='post')

# Predict
new_prediction = model.predict(new_question_padded)
confidence_scores = new_prediction[0]  # Confidence scores for each class
max_confidence = np.max(confidence_scores)  # Get the highest confidence score
predicted_label_idx = confidence_scores.argmax()  # Index of the predicted class

# Set confidence threshold.
# The softmax scores sum to 1, so the highest score can never be below
# 1 / num_classes. With the 5 focus areas in this dataset that floor is 0.2,
# which made the previous threshold of 0.2 impossible to miss and the
# "not sure" branch unreachable. 0.5 requires a majority of the probability mass.
confidence_threshold = 0.5

print(f"Question: {new_question}")
if max_confidence >= confidence_threshold:
    predicted_label = label_encoder.inverse_transform([predicted_label_idx])[0]
    print(f"Predicted Focus Area: {predicted_label} (Confidence: {max_confidence:.2f})")
else:
    print("Predicted Focus Area: Tidak yakin (Confidence terlalu rendah)")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
Question: handle scabies
Predicted Focus Area: scabies (Confidence: 0.98)


# Save the Model

In [None]:
import pickle

# Persist the fitted tokenizer so inference can reuse the exact same vocabulary.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    tokenizer_file.write(pickle.dumps(tokenizer))


In [None]:
# Save the trained model to 'best5.h5' in the working directory.
model.save('best5.h5')



In [None]:
!pip install tensorflowjs

!tensorflowjs_converter --input_format keras \
                        my_model.keras \
                        tfjs_model

# Load the Model and Use It

In [None]:
from tensorflow.keras.models import load_model

# Load the file written by model.save('best5.h5') in the "Save the Model"
# section; 'best3.h5' is not produced anywhere in this notebook.
model = load_model('best5.h5')



In [None]:
import pickle

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Restore the tokenizer that was fitted during training. A fresh Tokenizer()
# has an empty vocabulary, so every question would map to an empty sequence.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook itself produced.
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

max_len = 100  # Must match the padding length used when the model was trained

# Rebuild the label mapping. A fresh LabelEncoder() is unfitted and cannot
# inverse_transform, so refit it on the same 'focus_area' column that was used
# for training (saved to stopwords_data.csv by the stopword-removal step).
label_encoder = LabelEncoder()
label_encoder.fit(pd.read_csv("stopwords_data.csv")['focus_area'])

In [None]:
new_question = "How to deal with scabies"
# TODO: add the preprocessing used for the training data here
# (lowercasing, noise removal, stopword removal).

# Convert the question to word indices and pad to the training sequence length.
new_question_seq = tokenizer.texts_to_sequences([new_question])
new_question_padded = pad_sequences(new_question_seq, maxlen=max_len, padding='post', truncating='post')

# Pick the class with the highest score and map it back to its label text.
prediction = model.predict(new_question_padded)
predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])[0]
print(f"Question: {new_question}")
print(f"Predicted Focus Area: {predicted_label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step
Question: How to deal with scabies
Predicted Focus Area: scabies
