<a href="https://colab.research.google.com/github/DVPhong/studentFeedback_sentimentAnalysis/blob/main/studentFeedback_sentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
%%capture
pip install datasets transformers underthesea

In [60]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from underthesea import word_tokenize

In [61]:
from datasets import load_dataset

# Download the student-feedback corpus and drop the `topic` column in one
# chained expression; only the sentence text and its sentiment are kept.
dataset = load_dataset("uitnlp/vietnamese_students_feedback").remove_columns(['topic'])

# Describe Dataset
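Total: about 16k rows (11,426 train / 1,583 validation / 3,166 test) <br>
label: Positive - Neutral - Negative <br>
Positive and Negative labels account for about 90% of the data <br>
topic: lecture - facilities - teaching program - others (this column is dropped above; only the sentiment label is used)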


The data has already been mostly cleaned.

In [62]:
# Display the DatasetDict to inspect the available splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['sentence', 'sentiment'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['sentence', 'sentiment'],
        num_rows: 3166
    })
})

In [63]:
# Peek at the first three training examples (sentences and their sentiment ids).
dataset['train'][:3]

{'sentence': ['slide giáo trình đầy đủ .',
  'nhiệt tình giảng dạy , gần gũi với sinh viên .',
  'đi học đầy đủ full điểm chuyên cần .'],
 'sentiment': [2, 2, 0]}

In [64]:
def loadDataset(dataset):
  """Convert the train/validation/test splits into pandas DataFrames.

  Each split is read from `dataset` by its key ('train', 'validation', 'test'),
  turned into a DataFrame, and its 'sentiment' column is renamed to 'label'.

  Returns:
    A tuple `(train_df, val_df, test_df)`.
  """
  split_names = ('train', 'validation', 'test')
  frames = [
      pd.DataFrame(dataset[split]).rename(columns={'sentiment': 'label'})
      for split in split_names
  ]
  return tuple(frames)

# Materialise the three splits as DataFrames with columns `sentence` and `label`.
train_data, val_data, test_data = loadDataset(dataset)

In [65]:
# Show the first rows of the training frame as a sanity check.
train_data.head()

Unnamed: 0,sentence,label
0,slide giáo trình đầy đủ .,2
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",2
2,đi học đầy đủ full điểm chuyên cần .,0
3,chưa áp dụng công nghệ thông tin và các thiết ...,0
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",2


### Clean for BOW and TFIDF

In [66]:
# Count how often each whitespace-separated token occurs in every split.
def _token_counts(frame):
  """Return token -> occurrence count for the `sentence` column of `frame`."""
  return pd.Series(' '.join(frame['sentence']).split()).value_counts()

freq_train, freq_val, freq_test = (
    _token_counts(frame) for frame in (train_data, val_data, test_data)
)
freq_train.head(10)

.        11009
,         6827
viên      4803
giảng     3711
dạy       3156
thầy      3095
sinh      3082
học       2940
bài       2336
tình      2266
Name: count, dtype: int64

In [67]:
# Tokens that occur at most 10 times in the training split.
# NOTE(review): the variable name says "less_5" but the threshold used here is 10.
less_5_freq_train = freq_train[freq_train <= 10].index
less_5_freq_train

Index(['chí', 'chả', 'tóm', 'trưởng', 'tân', 'trị', 'khắt', 'kit', 'môi',
       'phạm',
       ...
       'mỉm', 'tựu', '9dot5', 'case', 'study', 'av1', 'av2', 'av3', 'kệ',
       'ráng'],
      dtype='object', length=1707)

In [68]:
# Remove words that occur infrequently in the dataset
def remove_less_freq(df:pd.DataFrame, freq:int, reference:pd.DataFrame=None):
  """Drop rare words from the `sentence` column and return a new DataFrame.

  Words are whitespace-separated tokens. A word is kept only if it occurs more
  than `freq` times in the frame the counts are taken from.

  Args:
    df: frame with a `sentence` column of strings. It is NOT modified.
    freq: words occurring `freq` times or fewer are removed.
    reference: optional frame whose `sentence` column supplies the word counts
      (e.g. the training split, so that validation/test are filtered with
      training statistics). Defaults to `df` itself, which matches the
      original behaviour.

  Returns:
    A copy of `df` whose `sentence` column has the rare words removed.
  """
  source = df if reference is None else reference
  freq_data = pd.Series(' '.join(source['sentence']).split()).value_counts()
  # Keep-set instead of drop-set: a word absent from `reference` has count 0
  # and must be dropped too. With `reference=None` both forms are equivalent.
  frequent_words = set(freq_data[freq_data > freq].index)

  def filter_sentence(sentence:str):
    return ' '.join([word for word in sentence.split() if word in frequent_words])

  # `assign` builds a new frame, so the caller's DataFrame is left untouched
  # and re-running the calling cell stays idempotent.
  return df.assign(sentence=df['sentence'].apply(filter_sentence))

# Strip words seen 5 times or fewer from every split. Each split is filtered
# using the word counts of that same split.
train_data, val_data, test_data = [
    remove_less_freq(split, 5) for split in (train_data, val_data, test_data)
]

In [69]:
# Recompute the training word counts after filtering and list the words that
# still occur at most 10 times.
# NOTE(review): despite the "less_5" name the threshold here is 10, and this
# cell stores a Series of counts whereas the earlier cell stored only the index.
freq_train = pd.Series(' '.join(train_data['sentence']).split()).value_counts()
less_5_freq_train = freq_train[freq_train <=10]
less_5_freq_train

trưởng    10
chả       10
bó        10
tóm       10
copy      10
          ..
vẽ         6
rãi        6
lao        6
đặn        6
thất       6
Name: count, Length: 168, dtype: int64

# Vectorization
> BOW

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams; terms appearing in more than 85% of
# the documents are ignored.
vectorizer = CountVectorizer(
    lowercase=True,
    max_df=0.85,
    ngram_range=(1, 2),
)
# Learn the vocabulary on the training sentences only, then reuse it for test.
bow_train_features = vectorizer.fit_transform(train_data['sentence'])
bow_test_features = vectorizer.transform(test_data['sentence'])

In [71]:
#vectorizer.get_feature_names_out()

In [72]:
# (number of training sentences, number of unigram + bigram features)
bow_train_features.shape

(11426, 29452)

> TF-IDF

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams; terms appearing in more than 85% of documents are ignored.
tfidf_vectorizer = TfidfVectorizer(lowercase = True, max_df = 0.85)
# Keep the resulting matrices (previously the fit_transform result was only
# displayed and then lost), mirroring what the BoW cell does.
tfidf_train_features = tfidf_vectorizer.fit_transform(train_data['sentence'])
tfidf_test_features = tfidf_vectorizer.transform(test_data['sentence'])
tfidf_train_features


<11426x955 sparse matrix of type '<class 'numpy.float64'>'
	with 131299 stored elements in Compressed Sparse Row format>

In [74]:
# Mapping of each learned term to its column index in the TF-IDF matrix.
# NOTE(review): this prints the whole vocabulary; consider showing only a sample.
tfidf_vectorizer.vocabulary_

{'slide': 611,
 'giáo': 217,
 'trình': 737,
 'đầy': 923,
 'đủ': 947,
 'nhiệt': 493,
 'tình': 774,
 'giảng': 223,
 'dạy': 190,
 'gần': 238,
 'gũi': 237,
 'với': 847,
 'sinh': 609,
 'viên': 822,
 'đi': 893,
 'học': 290,
 'điểm': 895,
 'chuyên': 58,
 'cần': 148,
 'chưa': 78,
 'áp': 882,
 'dụng': 199,
 'công': 131,
 'nghệ': 471,
 'thông': 680,
 'tin': 708,
 'và': 828,
 'các': 121,
 'thiết': 659,
 'bị': 37,
 'hỗ': 296,
 'trợ': 759,
 'cho': 55,
 'việc': 824,
 'thầy': 692,
 'bài': 17,
 'hay': 250,
 'có': 129,
 'nhiều': 491,
 'tập': 798,
 'ví': 832,
 'dụ': 198,
 'ngay': 462,
 'trên': 735,
 'lớp': 410,
 'đảm': 921,
 'bảo': 32,
 'thời': 701,
 'gian': 213,
 'lên': 380,
 'tích': 776,
 'cực': 165,
 'trả': 747,
 'lời': 411,
 'câu': 127,
 'hỏi': 292,
 'của': 158,
 'thường': 686,
 'xuyên': 865,
 'đặt': 928,
 'em': 204,
 'sẽ': 641,
 'môn': 436,
 'này': 522,
 'nhưng': 503,
 'lại': 392,
 'kỳ': 356,
 'kế': 352,
 'tiếp': 714,
 'lượng': 389,
 'quá': 581,
 'dài': 179,
 'không': 325,
 'thu': 666,
 'hiệu': 255

> phoBERT

In [None]:
# Word-segment every training sentence with underthesea; with format='text'
# the syllables of a multi-syllable word are joined by underscores.
# NOTE(review): the same computation is repeated as `train_data_phobert` in a
# later cell, so `z` only serves as a preview here.
z = train_data.apply(lambda x: word_tokenize(x['sentence'], format='text'), axis=1)
z.head()

0                            slide giáo_trình đầy_đủ .
1       nhiệt_tình giảng_dạy , gần_gũi với sinh_viên .
2                      đi học đầy_đủ điểm chuyên cần .
3    chưa áp_dụng công_nghệ_thông_tin và các thiết_...
4    thầy giảng bài hay , có nhiều bài_tập ví_dụ ng...
dtype: object

In [None]:
# Load the tokenizer that matches the pretrained "vinai/phobert-base" checkpoint.
phobertTokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
# Word-segment each training sentence (one string per row) before sub-word tokenization.
train_data_phobert = train_data.apply(lambda x: word_tokenize(x['sentence'], format = 'text'), axis=1)



In [None]:
input_ids = []
attention_masks = []

for sent in train_data_phobert:
    # Calling the tokenizer directly (the replacement for the deprecated
    # `encode_plus`) will:
    #   (1) Tokenize the sentence.
    #   (2) Add the model's special start/end tokens around it
    #       (presumably `<s>` ... `</s>` for this RoBERTa-style tokenizer,
    #       not BERT's `[CLS]`/`[SEP]` -- TODO confirm).
    #   (3) Map tokens to their IDs.
    #   (4) Pad or truncate the sentence to `max_length`.
    #   (5) Create the attention mask that marks padding positions.
    encoded_dict = phobertTokenizer(
                        sent,
                        add_special_tokens = True, # add the special start/end tokens
                        max_length = 64, # only few sentences have the length > 64 (14)
                        truncation = True,
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'tf',
                   )
    # Each entry is a tensor for a single sentence; the lists grow by one per sentence.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

11426
11426


In [None]:
# The sentiment labels take three values (0, 1, 2), so the classification head
# must have three outputs; the library default would create only two.
phobert = TFAutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=3)

tf_model.h5:   0%|          | 0.00/740M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation

In [92]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [76]:
# A function to print the evaluation result of a classifier
def printResult(y_pred, y_prob, y_true=None):
    """Print accuracy, confusion matrix and classification report.

    Args:
        y_pred: predicted labels, aligned with `y_true`.
        y_prob: predicted probabilities; accepted for call compatibility but
            currently not used by this function.
        y_true: ground-truth labels. Defaults to `test_data["label"]` so that
            existing two-argument calls keep evaluating against the test split.
    """
    if y_true is None:
        y_true = test_data["label"]
    acc = accuracy_score(y_true, y_pred)
    # Result
    print("Accuracy: {:.2f}".format(acc*100),end='\n\n')
    cm = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:\n', cm)
    print(classification_report(y_true, y_pred))


# Naive bayes

In [77]:
model = MultinomialNB() #class_prior=np.array([0.45, 0.1, 0.45])
# Training
# The sparse BoW matrix is passed as-is: densifying it with `.toarray()` would
# allocate a full (n_samples x n_features) array without changing the result.
model.fit(bow_train_features, train_data["label"])

# Evaluation
y_pred_bow_nb = model.predict(bow_test_features)

# Column 1 of predict_proba, i.e. the probability of the second class in `model.classes_`.
y_prob_bow_nb = model.predict_proba(bow_test_features)[:,1]

In [78]:
# Report the Naive Bayes (BoW) metrics on the test split.
printResult(y_pred_bow_nb, y_prob_bow_nb)

Accuracy: 85.79

Confusion Matrix:
 [[1312    0   97]
 [ 104    4   59]
 [ 188    2 1400]]
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      1409
           1       0.67      0.02      0.05       167
           2       0.90      0.88      0.89      1590

    accuracy                           0.86      3166
   macro avg       0.79      0.61      0.60      3166
weighted avg       0.85      0.86      0.84      3166



In [79]:
from imblearn.over_sampling import RandomOverSampler

# Work on a copy so the original training frame stays as it is.
train_data2 = train_data.copy()
ros = RandomOverSampler(random_state=1)
# The sampler expects a 2-D feature array, so the sentences are passed as a
# single-column array and flattened again afterwards.
X_resampled, y_resampled = ros.fit_resample(
    train_data2[['sentence']].to_numpy(),
    train_data2['label'],
)

# Create a new DataFrame with the resampled data
resampled_data = pd.DataFrame({
    'sentence': X_resampled.ravel(),
    'label': y_resampled,
})

print(f"Original data length: {len(train_data2)}")
print(f"Resampled data length: {len(resampled_data)}")

Original data length: 11426
Resampled data length: 16929


In [80]:
# Inspect the over-sampled training frame; the rows appended at the end are
# the duplicated examples.
resampled_data

Unnamed: 0,sentence,label
0,slide giáo trình đầy đủ .,2
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",2
2,đi học đầy đủ điểm chuyên cần .,0
3,chưa áp dụng công nghệ thông tin và các thiết ...,0
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",2
...,...,...
16924,không có hoạt động giảng dạy nào không hài lòng .,1
16925,cô đi bài tập hơi chậm nhưng như vậy cũng được .,1
16926,không có gì ạ .,1
16927,cô dạy hay nhưng hơi nhanh .,1


In [81]:
# Fit a fresh vectorizer on the over-sampled training sentences. Previously
# `vectorizer_2` was created but the old `vectorizer` was refitted instead,
# leaving `vectorizer_2` unused and silently overwriting the first vocabulary.
vectorizer_2 = CountVectorizer(lowercase=True, max_df = 0.85,ngram_range=(1,2))
bow_train_features = vectorizer_2.fit_transform(resampled_data['sentence'])
bow_test_features = vectorizer_2.transform(test_data['sentence'])

In [82]:
# Naive Bayes on the over-sampled data with fixed class priors.
model_2 = MultinomialNB(class_prior=np.array([0.4, 0.2, 0.4])) #
# Training
model_2.fit(bow_train_features, resampled_data['label'])

# Evaluation
# The sparse test matrix is used directly, consistent with the sparse matrix
# passed to `fit`; no dense copy is needed.
y_pred_bow_nb = model_2.predict(bow_test_features)

# Column 1 of predict_proba, i.e. the probability of the second class in `model_2.classes_`.
y_prob_bow_nb = model_2.predict_proba(bow_test_features)[:,1]

Applying random over-sampling to the minority (neutral) label. <br>
Recall for the neutral label increased significantly (0.02 → 0.27), while accuracy decreased slightly (85.79% → 84.18%).

In [83]:
# Report the metrics of the Naive Bayes model trained on the over-sampled data.
printResult(y_pred_bow_nb, y_prob_bow_nb)

Accuracy: 84.18

Confusion Matrix:
 [[1248   69   92]
 [  77   45   45]
 [ 169   49 1372]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1409
           1       0.28      0.27      0.27       167
           2       0.91      0.86      0.89      1590

    accuracy                           0.84      3166
   macro avg       0.67      0.67      0.67      3166
weighted avg       0.84      0.84      0.84      3166



# Decision Tree

In [89]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree (and the reported metrics)
# reproducible across runs.
model = DecisionTreeClassifier(random_state=1)
# Training
model.fit(bow_train_features, resampled_data["label"])

# Evaluation
# The sparse test matrix is used directly, as in `fit`; no dense copy is needed.
y_pred_bow_dt = model.predict(bow_test_features)

# Column 1 of predict_proba, i.e. the probability of the second class in `model.classes_`.
y_prob_bow_dt = model.predict_proba(bow_test_features)[:,1]

In [90]:
# Report the Decision Tree (BoW) metrics on the test split.
printResult(y_pred_bow_dt, y_prob_bow_dt)

Accuracy: 83.99

Confusion Matrix:
 [[1194   82  133]
 [  51   73   43]
 [ 137   61 1392]]
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1409
           1       0.34      0.44      0.38       167
           2       0.89      0.88      0.88      1590

    accuracy                           0.84      3166
   macro avg       0.70      0.72      0.71      3166
weighted avg       0.85      0.84      0.84      3166



# MLP

In [93]:
# Small multi-layer perceptron (hidden layers of 5 and 2 units). A fixed
# random_state makes weight initialisation, and thus the metrics, reproducible.
model = MLPClassifier(solver='adam', alpha=2e-4, hidden_layer_sizes=(5, 2), max_iter=400, random_state=1)

model.fit(bow_train_features, resampled_data["label"])

# Evaluation
# The sparse test matrix is used directly, as in `fit`; no dense copy is needed.
y_pred_bow_mlp = model.predict(bow_test_features)

# Column 1 of predict_proba, i.e. the probability of the second class in `model.classes_`.
y_prob_bow_mlp = model.predict_proba(bow_test_features)[:,1]

In [None]:
# Report the MLP (BoW) metrics on the test split.
printResult(y_pred_bow_mlp, y_prob_bow_mlp)