## Nguyễn Đức Đạt - 23000109
### Bước 0: Thiết lập môi trường và tải dữ liệu

In [4]:
import pandas as pd
# Dữ liệu có thể được phân tách bằng tab và không có header
df_train = pd.read_csv('../data/data/train.csv', sep='\t', header=None, names=['text', 'intent'])
df_val = pd.read_csv('../data/data/val.csv', sep='\t', header=None, names=['text', 'intent'])
df_test = pd.read_csv('../data/data/test.csv', sep='\t', header=None, names=['text', 'intent'])

# tách data thành 2 cột text và intent
df_train[['text', 'intent']] = df_train['text'].str.rsplit(',', n=1, expand=True)
df_val[['text', 'intent']] = df_val['text'].str.rsplit(',', n=1, expand=True)
df_test[['text', 'intent']] = df_test['text'].str.rsplit(',', n=1, expand=True)

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

df_train.head()

Train shape: (8955, 2)
Validation shape: (1077, 2)
Test shape: (1077, 2)


Unnamed: 0,text,intent
0,text,category
1,what alarms do i have set right now,alarm_query
2,checkout today alarm of meeting,alarm_query
3,report alarm settings,alarm_query
4,see see for me the alarms that you have set to...,alarm_query


In [5]:
# Tiền xử lí
from sklearn.preprocessing import LabelEncoder

# Loại bỏ trích dẫn khỏi intent
df_train['intent'] = df_train['intent'].str.replace('"', '', regex=False)
df_val['intent'] = df_val['intent'].str.replace('"', '', regex=False)
df_test['intent'] = df_test['intent'].str.replace('"', '', regex=False)

# Khởi tạo labelencoder
label_encoder = LabelEncoder()

# huấn luyện cột intent
df_train['intent_encoded'] = label_encoder.fit_transform(df_train['intent'])
df_val['intent_encoded'] = label_encoder.transform(df_val['intent'])
df_test['intent_encoded'] = label_encoder.transform(df_test['intent'])

# Hiển thị vài dòng đầu
display(df_train.head())

# Hiển thị ánh xạ sau khi huấn luyện
print("Label mapping:")
for i, intent in enumerate(label_encoder.classes_):
    print(f"{intent}: {i}")

Unnamed: 0,text,intent,intent_encoded
0,text,category,9
1,what alarms do i have set right now,alarm_query,0
2,checkout today alarm of meeting,alarm_query,0
3,report alarm settings,alarm_query,0
4,see see for me the alarms that you have set to...,alarm_query,0


Label mapping:
alarm_query: 0
alarm_remove: 1
alarm_set: 2
audio_volume_down: 3
audio_volume_mute: 4
audio_volume_up: 5
calendar_query: 6
calendar_remove: 7
calendar_set: 8
category: 9
cooking_recipe: 10
datetime_convert: 11
datetime_query: 12
email_addcontact: 13
email_query: 14
email_querycontact: 15
email_sendemail: 16
general_affirm: 17
general_commandstop: 18
general_confirm: 19
general_dontcare: 20
general_explain: 21
general_joke: 22
general_negate: 23
general_praise: 24
general_quirky: 25
general_repeat: 26
iot_cleaning: 27
iot_coffee: 28
iot_hue_lightchange: 29
iot_hue_lightdim: 30
iot_hue_lightoff: 31
iot_hue_lighton: 32
iot_hue_lightup: 33
iot_wemo_off: 34
iot_wemo_on: 35
lists_createoradd: 36
lists_query: 37
lists_remove: 38
music_likeness: 39
music_query: 40
music_settings: 41
news_query: 42
play_audiobook: 43
play_game: 44
play_music: 45
play_podcasts: 46
play_radio: 47
qa_currency: 48
qa_definition: 49
qa_factoid: 50
qa_maths: 51
qa_stock: 52
recommendation_events: 53
re

### Nhiệm vụ 1: (Warm-up Ôn bài cũ) Pipeline TF-IDF + Logistic Regression

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Tạo một pipeline với TfidfVectorizer và LogisticRegression
tfidf_lr_pipeline = make_pipeline(TfidfVectorizer(max_features=5000), LogisticRegression(max_iter=1000))

# Huấn luyện trên tập train
tfidf_lr_pipeline.fit(df_train['text'], df_train['intent_encoded'])

# Dự đoán trên tập test
yPred = tfidf_lr_pipeline.predict(df_test['text'])

# Tính các thông số đánh giá
accuracy = accuracy_score(df_test['intent_encoded'], yPred)
precision = precision_score(df_test['intent_encoded'], yPred, average='weighted')
recall = recall_score(df_test['intent_encoded'], yPred, average='weighted')
f1 = f1_score(df_test['intent_encoded'], yPred, average='weighted')
report = classification_report(df_test['intent_encoded'], yPred)
# In
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Classification report: {report}")

Accuracy: 0.8356545961002786
Precision: 0.8405884523852166
Recall: 0.8356545961002786
F1-score: 0.834645398866917
Classification report:               precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       1.00      0.73      0.84        11
           2       0.81      0.89      0.85        19
           3       1.00      0.75      0.86         8
           4       0.92      0.80      0.86        15
           5       0.93      1.00      0.96        13
           6       0.48      0.53      0.50        19
           7       0.89      0.89      0.89        19
           8       0.82      0.74      0.78        19
           9       0.00      0.00      0.00         1
          10       0.59      0.68      0.63        19
          11       0.67      0.75      0.71         8
          12       0.74      0.89      0.81        19
          13       0.78      0.88      0.82         8
          14       0.83      0.79      0.81        1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Nhiệm vụ 2: (Warm-up Ôn bài cũ) Pipeline Word2Vec (Trung bình) + DenseLayer

In [7]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

#  Huấn luyện mô hình Word2Vec trên dữ liệu text của bạn
sentences = [text.split() for text in df_train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
#  Viết hàm để chuyển mỗi câu thành vector trung bình
def sentence_to_avg_vector(text, model):
    vec_size = model.vector_size
    if text is None:
        return np.zeros(vec_size, dtype=np.float32)
    tokens = [t for t in str(text).lower().split() if t in model.wv.key_to_index]
    if len(tokens) == 0:
        return np.zeros(vec_size, dtype=np.float32)
    vecs = [model.wv[t] for t in tokens]
    return np.mean(vecs, axis=0).astype(np.float32)
    
# Tạo dữ liệu train/val/test X_train_avg, X_val_avg, X_test_avg
X_train_avg = np.stack([sentence_to_avg_vector(t, w2v_model) for t in df_train['text']])
X_val_avg   = np.stack([sentence_to_avg_vector(t, w2v_model) for t in df_val['text']])
X_test_avg  = np.stack([sentence_to_avg_vector(t, w2v_model) for t in df_test['text']])

y_train = df_train['intent_encoded'].astype(int).values
y_val   = df_val['intent_encoded'].astype(int).values
y_test  = df_test['intent_encoded'].astype(int).values

num_classes = len(label_encoder.classes_)

#  Xây dựng mô hình Sequential của Keras
model = Sequential([Dense(128, activation='relu', input_shape=(w2v_model.vector_size,)), 
    Dropout(0.5), Dense(num_classes, activation='softmax')])

#  Compile, huấn luyện và đánh giá mô hình
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train_avg, y_train, 
    validation_data=(X_val_avg, y_val), epochs=30, batch_size=32, callbacks=[es], verbose=2)

# Đánh giá trên tập test
loss, acc = model.evaluate(X_test_avg, y_test, verbose=0)
print(f"Test loss: {loss:.4f}  Test accuracy: {acc:.4f}")

y_pred_probs = model.predict(X_test_avg)
y_pred = np.argmax(y_pred_probs, axis=1)

print("Classification report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject