In [21]:
import json
import pickle
import random
import warnings

# Silence library warnings before the heavy third-party imports can emit them
warnings.filterwarnings('ignore')

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import GlobalMaxPooling1D, Dense, Activation, Dropout, Embedding,Conv1D
from keras.models import Sequential
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

lemmatizer = WordNetLemmatizer()

In [22]:
# Read the question/answer pairs and discard every row that has a missing value
data = pd.read_csv('./stsv.csv').dropna()

In [23]:
# Preview the first rows of the loaded frame (columns: Unnamed: 0, question, answer)
data.head()

Unnamed: 0,question,answer
0,How much is the tuition?,"On average, each credit (tc) has a tuition fee..."
1,How much is the tuition?,"On average, each credit (tc) has a tuition fee..."
2,When is the deadline to pay tuition?,"On average, each credit (tc) has a tuition fee..."
3,School tuition?,"On average, each credit (tc) has a tuition fee..."
4,What are the school's tuition fees?,"On average, each credit (tc) has a tuition fee..."


In [24]:
# EDA: each distinct answer text is later used as one class label,
# so count how many questions map to every answer.
answer_column = data['answer']
intent_counts = answer_column.value_counts()
intent_counts

answer
Portal account for students to register for courses, look up timetables, exam schedules, view study scores, training scores, evaluate course surveys...                                                                              72
Hello, how can BeeBot help you?                                                                                                                                                                                                      68
Use email MSSV@student.hcmus.edu.vn to send to the Technical Department support@fit.hcmus.edu.vn for support.                                                                                                                        61
Hello, Please contact the Technical Department support@fit.hcmus.edu.vn for answers.                                                                                                                                                 60
If you have any problems with Moodle and the Faculty website, ple

In [25]:
# Data Preparation: questions are the model inputs, answers are the target classes
questions_text = data['question'].astype(str)
answers_text = data['answer'].astype(str)
X, y = questions_text.values, answers_text.values

# Map every distinct answer string to an integer class id
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [26]:
# Prepare text data for CNN: integer-encode every question, then pad or
# truncate (both at the end) to a fixed length of 20 tokens.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=5000)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(
    X_seq,
    maxlen=20,
    truncating='post',
    padding='post',
)

In [27]:
# Hold out 20% of the padded sequences for the CNN; the seed keeps the split repeatable
cnn_split = train_test_split(
    X_padded,
    y_encoded,
    random_state=42,
    test_size=0.2,
)
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = cnn_split

In [28]:
# Define CNN Model Architecture
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# dropout masks and batch shuffling are repeatable, matching the seeded
# train/test splits used elsewhere in this notebook.
tf.keras.utils.set_random_seed(42)

cnn_model = Sequential([
    # Vocabulary size and sequence length are taken from the tokenizer and the
    # padded data instead of repeating the literals, so they cannot drift apart.
    Embedding(input_dim=tokenizer.num_words, output_dim=16, input_length=X_padded.shape[1]),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(24, activation='relu'),
    Dropout(0.5),
    # One softmax unit per distinct answer known to the label encoder
    Dense(len(label_encoder.classes_), activation='softmax')
])

In [29]:
# Targets are integer class ids (not one-hot vectors), hence the sparse loss
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
cnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 16)            80000     
                                                                 
 conv1d_1 (Conv1D)           (None, 16, 128)           10368     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_2 (Dense)             (None, 24)                3096      
                                                                 
 dropout_1 (Dropout)         (None, 24)                0         
                                                                 
 dense_3 (Dense)             (None, 67)                1675      
                                                      

In [30]:
# Train CNN Model; the held-out split is passed as validation data for per-epoch monitoring
cnn_model.fit(
    X_train_cnn,
    y_train_cnn,
    batch_size=64,
    epochs=10,
    validation_data=(X_test_cnn, y_test_cnn),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x200636cda50>

In [31]:
# Prepare text data for SVM using TF-IDF over unigrams and bigrams,
# keeping at most 5000 features and materialising a dense array.
svm_questions = data['question'].astype(str)
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = tfidf.fit_transform(svm_questions).toarray()

In [32]:
# Train-test split for SVM, using the same test_size and random_state as the CNN split
svm_split = train_test_split(
    X_tfidf,
    y_encoded,
    random_state=42,
    test_size=0.2,
)
X_train_svm, X_test_svm, y_train_svm, y_test_svm = svm_split

In [33]:
# Define and train SVM Model
# probability=True enables predict_proba (needed by the CNN+SVM ensemble). The
# probability estimates come from an internal randomised cross-validation, so
# random_state is pinned to keep them repeatable, like the seeded data splits.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train_svm, y_train_svm)

In [34]:
# Save models and other components
# The Keras model uses its own serializer; everything else is pickled.
cnn_model.save('student_advisory_cnn_model.h5')

svm_model_path = 'student_advisory_svm_model.sav'
tokenizer_path = 'tokenizer.pickle'
tfidf_path = 'tfidf.pickle'
label_encoder_path = 'label_encoder.pickle'

# (object, destination) pairs, written in the same order as before
pickled_artifacts = (
    (svm_model, svm_model_path),
    (tokenizer, tokenizer_path),
    (tfidf, tfidf_path),
    (label_encoder, label_encoder_path),
)
for artifact, artifact_path in pickled_artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [36]:
# Function to make a prediction using CNN and SVM
def predict_question_category(question: str) -> str:
    """Return the answer predicted for ``question`` by the CNN + SVM ensemble.

    Uses the notebook-level ``tokenizer``, ``cnn_model``, ``tfidf``,
    ``svm_model`` and ``label_encoder`` objects.

    Args:
        question: Raw question text.

    Returns:
        The decoded label (an answer string) with the highest averaged
        probability across the two models.
    """
    # Preprocess for CNN with the same padding settings that were used for training
    seq = tokenizer.texts_to_sequences([question])
    padded = pad_sequences(seq, maxlen=20, padding='post', truncating='post')
    # verbose=0 suppresses the per-call Keras progress bar
    cnn_pred = np.asarray(cnn_model.predict(padded, verbose=0))

    # Preprocess for SVM
    tfidf_vector = tfidf.transform([question]).toarray()
    svm_raw = svm_model.predict_proba(tfidf_vector)

    # predict_proba only has one column per class the SVM saw while fitting
    # (svm_model.classes_, which are encoded label ids here). The CNN emits one
    # column per encoder class, so scatter the SVM columns into that layout;
    # classes the SVM never saw keep probability 0. Without this, a class missing
    # from the training split makes the two arrays ragged or misaligned.
    svm_pred = np.zeros(cnn_pred.shape, dtype=float)
    svm_pred[:, np.asarray(svm_model.classes_, dtype=int)] = svm_raw

    # Combine CNN and SVM predictions by averaging the aligned probabilities
    combined_pred = np.mean([cnn_pred, svm_pred], axis=0)
    pred_label_index = np.argmax(combined_pred)
    pred_label = label_encoder.inverse_transform([pred_label_index])

    return pred_label[0]

In [39]:

# Load the trained CNN model, SVM model, tokenizer, TF-IDF vectorizer, and label encoder
def load_model_and_resources(cnn_model_path, svm_model_path, tokenizer_path, tfidf_path, encoder_path):
    """Restore the saved CNN and the four pickled helper objects from disk.

    Args:
        cnn_model_path: File passed to ``tf.keras.models.load_model``.
        svm_model_path: Pickle file holding the SVM model.
        tokenizer_path: Pickle file holding the tokenizer.
        tfidf_path: Pickle file holding the TF-IDF vectorizer.
        encoder_path: Pickle file holding the label encoder.

    Returns:
        ``(cnn_model, svm_model, tokenizer, tfidf, label_encoder)`` on success.
        If any step raises, the error is printed and a tuple of five ``None``
        values is returned instead.
    """
    def _unpickle(path):
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    try:
        restored_cnn = tf.keras.models.load_model(cnn_model_path)
        # Same loading order as the parameters: SVM, tokenizer, TF-IDF, encoder
        restored_svm, restored_tokenizer, restored_tfidf, restored_encoder = [
            _unpickle(path)
            for path in (svm_model_path, tokenizer_path, tfidf_path, encoder_path)
        ]
    except Exception as e:
        print(f"An error occurred while loading the resources: {e}")
        return None, None, None, None, None

    return restored_cnn, restored_svm, restored_tokenizer, restored_tfidf, restored_encoder

# Example usage
if __name__ == "__main__":
    # Paths to the saved model and resources
    cnn_model_path = 'student_advisory_cnn_model.h5'
    svm_model_path = 'student_advisory_svm_model.sav'
    tokenizer_path = 'tokenizer.pickle'
    tfidf_path = 'tfidf.pickle'
    encoder_path = 'label_encoder.pickle'

    # Load into a temporary tuple first: unpacking straight into the notebook
    # names would replace the in-memory models with None when loading fails.
    loaded_resources = load_model_and_resources(
        cnn_model_path, svm_model_path, tokenizer_path, tfidf_path, encoder_path
    )

    # The loader signals failure with None values, so test for None explicitly
    # rather than relying on the truthiness of model/estimator objects.
    if all(resource is not None for resource in loaded_resources):
        # predict_question_category reads these notebook-level names
        cnn_model, svm_model, tokenizer, tfidf, label_encoder = loaded_resources
        question = "Can students who pay off debt receive tuition exemptions?"
        predicted_category = predict_question_category(question)
        print(f"Predicted Answer: {predicted_category}")
    else:
        print("Resources were not loaded properly.")


Predicted Answer: According to regulations, only students who are ethnic minorities and belong to poor or near-poor households are eligible for tuition exemption and reduction. Tuition exemption and reduction only applies to official courses, non-debt repayment courses. Tuition fee exemption.
