<a href="https://colab.research.google.com/github/DNThuan/AspectBasedSentimentAnalysis/blob/main/Aspect_Based_Sentiment_Analysis_Hotel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone -q https://github.com/DNThuan/AspectBasedSentimentAnalysis.git
!pwd
!ls

/content
AspectBasedSentimentAnalysis  sample_data


# 1) Read data

In [2]:
import numpy as np

In [3]:
def read_data(path):
  with open(path) as f:
    data = f.read().split("\n\n")
  f.close()
  for i in range(len(data)):
    temp=data[i].split("\n")
    data[i]=temp[1:3]
  return data

In [4]:
path_train = "/content/AspectBasedSentimentAnalysis/VLSP2018_SA_Hotel/1-VLSP2018-SA-Hotel-train (7-3-2018).txt"
path_dev =  "/content/AspectBasedSentimentAnalysis/VLSP2018_SA_Hotel/2-VLSP2018-SA-Hotel-dev (7-3-2018).txt"
path_test = "/content/AspectBasedSentimentAnalysis/VLSP2018_SA_Hotel/3-VLSP2018-SA-Hotel-test (8-3-2018).txt"

train = np.array(read_data(path_train))
dev = np.array(read_data(path_dev))
test = np.array(read_data(path_test))

print("Train: ",train.shape)
print("Dev: ",dev.shape)
print("Test: ",test.shape)


Train:  (3000, 2)
Dev:  (2000, 2)
Test:  (600, 2)


#2) Preprocessing data

## 2.1 Review data

### 2.1.1 Delete emoji

In [5]:
!pip install -q emoji
import emoji
from sklearn.preprocessing import  FunctionTransformer

def Delete_emoji(texts):
    return np.array([emoji.get_emoji_regexp().sub('', text) for text in texts])

delete_emoji = FunctionTransformer(Delete_emoji)

[?25l[K     |██                              | 10 kB 22.3 MB/s eta 0:00:01[K     |███▉                            | 20 kB 28.8 MB/s eta 0:00:01[K     |█████▉                          | 30 kB 15.5 MB/s eta 0:00:01[K     |███████▊                        | 40 kB 11.6 MB/s eta 0:00:01[K     |█████████▋                      | 51 kB 5.5 MB/s eta 0:00:01[K     |███████████▋                    | 61 kB 5.6 MB/s eta 0:00:01[K     |█████████████▌                  | 71 kB 5.4 MB/s eta 0:00:01[K     |███████████████▍                | 81 kB 6.0 MB/s eta 0:00:01[K     |█████████████████▍              | 92 kB 5.8 MB/s eta 0:00:01[K     |███████████████████▎            | 102 kB 5.1 MB/s eta 0:00:01[K     |█████████████████████▏          | 112 kB 5.1 MB/s eta 0:00:01[K     |███████████████████████▏        | 122 kB 5.1 MB/s eta 0:00:01[K     |█████████████████████████       | 133 kB 5.1 MB/s eta 0:00:01[K     |███████████████████████████     | 143 kB 5.1 MB/s eta 0:00:01[K 

### 2.1.2 Replace value of money by special character

In [6]:
import re

def Replace_Symbol(texts):
  texts_result = []
  for text in texts:
    distance_pattern = "([0-9.,]{1,9}?.km)|([0-9.,]{1,9}?.cây số)|([0-9.,]{1,9}?.cây)|([0-9.,]{1,9}?.mét)|([0-9.,]{1,3}?.m)"
    text_result = re.sub(distance_pattern, 'khoang_cach', text)
    money_pattern = "(\d{1,3}k.{0})|([0-9.]{1,9}?.vnd)|([0-9.]{1,9}?.việt nam đồng)|([0-9.]{1,9}?.đồng)"
    text_result = re.sub(money_pattern, 'gia_tien', text_result)
    texts_result.append(text_result)
  return texts_result
replace_symbol = FunctionTransformer(Replace_Symbol)

### 2.1.3 Delete special character

In [7]:
def Delete_Special_Character(texts):
  texts_result = []
  for text in texts:
    special_character_pattern = "[+=<>@#$%^&~]"
    text_result = re.sub(special_character_pattern, '', text)
    words = text_result.split()
    text_result = ' '.join(words)
    texts_result.append(text_result)
  return texts_result
delete_special_character = FunctionTransformer(Delete_Special_Character)

### 2.1.4 Normalize elongate words

In [8]:
def Normalize_Elongate_Words(texts):
  texts_result = []
  for text in texts:
    elongate_pattern = r"(\w)\1*"
    text_result = re.sub(elongate_pattern, r'\1', text)
    texts_result.append(text_result)
  return texts_result
normalize_elongate_words = FunctionTransformer(Normalize_Elongate_Words)

### 2.1.5 Replace negative words

In [9]:
def Replace_Negative_Words(texts):
  texts_result = []
  for text in texts:
    negative_pattern = r"\bkh|\bko|\bkhg|\bkhong|\bk|\bhông|\bhem"
    text_result = re.sub(negative_pattern, 'không', text)
    texts_result.append(text_result)
  return texts_result
replace_negative_words = FunctionTransformer(Replace_Negative_Words)

## 2.2 Tag data

### 2.2.1 Label separation

In [10]:
# Tìm vị trí các cặp dấu ngoặc
# Input: một nhãn dạng string
def find_start_end(label):
  start = 0
  end = 0
  lst_start=[]
  lst_end=[]
  for index ,char in enumerate(label):
    if char == "{":
      start = index
      lst_start.append(start)
    elif char == "}":
      end = index
      lst_end.append(end)
  return tuple(zip(lst_start,lst_end))

In [11]:
# Xóa dấu ngoặc
# Input: một nhãn dạng string
def Label_str_to_list(label):
  index = tuple(find_start_end(label))
  aspect_temp=[]
  polarity_temp=[]
  for i in index:
    temp = label[i[0]+1:i[1]].replace(" ","").split(",")
    aspect_temp.append(temp[0])
    polarity_temp.append(temp[1])
  return aspect_temp, polarity_temp

In [12]:
# Tách thành 2 tầng label/ 1 tầng
def separate_label(labels):
  aspect= []
  polarity = []
  SA = []
  for label in labels:
    temp = Label_str_to_list(label)
    aspect.append(temp[0])
    polarity.append(temp[1])

    sa_temp= []
    for i in range(len(temp[0])):
      sa = temp[0][i]+":"+temp[1][i]
      sa_temp.append(sa)
    SA.append(sa_temp)

  return np.array(aspect, dtype=object), np.array(polarity, dtype=object), np.array(SA, dtype=object)

### 2.2.2 Binary Label

In [13]:
# Load list label
import json

def read_label(path):
  with open(path) as f:
    data = json.load(f)
  f.close()
  return data

aspect_path = "/content/AspectBasedSentimentAnalysis/Label/aspect.json"
polarity_path = "/content/AspectBasedSentimentAnalysis/Label/polarity.json"
SA_path = "/content/AspectBasedSentimentAnalysis/Label/SA.json"

aspect_labels = read_label(aspect_path)
polarity_labels = read_label(polarity_path)
SA_labels = read_label(SA_path)


In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
transform_label = MultiLabelBinarizer().fit([SA_labels])


In [15]:
def make_dict_classes(list_classes):
  dic_classes = dict()
  for index, classes in enumerate(list_classes):
    dic_classes[index] = classes
  return dic_classes

In [16]:
def show_label(label_binary):
  dic_classes = make_dict_classes(transform_label.classes_)
  lst_index = []
  for index, value in enumerate(label_binary):
    if value == 1:
      lst_index.append(dic_classes[index])
  print(lst_index)

# 3) Training

## 3.1 Get data

In [17]:
X_train, y_train = train[:,0], train[:,1]
X_dev,   y_dev   = dev[:,0],   dev[:,1]
X_test,  y_test  = test[:,0],  test[:,1]

In [18]:
print(X_train[0], type(X_train[0]))

Rộng rãi KS mới nhưng rất vắng. Các dịch vụ chất lượng chưa cao và thiếu. <class 'numpy.str_'>


In [19]:
print(y_train[1],type(y_train[1]))

{LOCATION#GENERAL, positive} <class 'numpy.str_'>


## 3.2 Transform label

### 3.1.1 Nomalizer label

In [20]:
y_train_merged = separate_label(y_train)[2]
y_dev_merged = separate_label(y_dev)[2]
y_test_merged = separate_label(y_test)[2]

### 3.1.2 Covert label to binary

In [21]:
y_train_tf = transform_label.transform(y_train_merged)
y_dev_tf = transform_label.transform(y_dev_merged)
y_test_tf = transform_label.transform(y_test_merged)

## 3.3 Make Pipeline

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

preproceesing_data = make_pipeline(delete_emoji,
                              replace_symbol,
                              delete_special_character,
                              normalize_elongate_words,
                              replace_negative_words,
                              TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2), stop_words='english'))

RFC = RandomForestClassifier(random_state=42)

model = make_pipeline(preproceesing_data,
                      MultiOutputClassifier(RFC, n_jobs=-1))
model.fit(X_train,y_train_tf)

Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('functiontransformer-1',
                                  FunctionTransformer(accept_sparse=False,
                                                      check_inverse=True,
                                                      func=<function Delete_emoji at 0x7fe6325c4e60>,
                                                      inv_kw_args=None,
                                                      inverse_func=None,
                                                      kw_args=None,
                                                      validate=False)),
                                 ('functiontransformer-2',
                                  FunctionTransformer(accept_sparse=False,
                                                      check_inverse=True,
                                                      func=...
                                            

In [23]:
from sklearn import metrics

def score(y_true, y_pred):
  print("Precison: ",metrics.precision_score(y_true, y_pred, average='micro'))
  print("Recall: ",metrics.recall_score(y_true, y_pred, average='micro'))
  print("F1: ",metrics.f1_score(y_true, y_pred, average='micro'))


In [27]:
predict = model.predict(X_dev)
score(y_dev_tf,predict)

Precison:  0.8813775510204082
Recall:  0.3886935733370834
F1:  0.5394749682834


In [28]:
print(metrics.classification_report(y_dev_tf,predict, target_names=transform_label.classes_))

                                         precision    recall  f1-score   support

        FACILITIES#CLEANLINESS:negative       0.00      0.00      0.00         8
         FACILITIES#CLEANLINESS:neutral       0.00      0.00      0.00         0
        FACILITIES#CLEANLINESS:positive       0.00      0.00      0.00        15
            FACILITIES#COMFORT:negative       0.00      0.00      0.00        37
             FACILITIES#COMFORT:neutral       0.00      0.00      0.00         0
            FACILITIES#COMFORT:positive       0.00      0.00      0.00        32
    FACILITIES#DESIGN&FEATURES:negative       1.00      0.03      0.05       119
     FACILITIES#DESIGN&FEATURES:neutral       0.00      0.00      0.00         8
    FACILITIES#DESIGN&FEATURES:positive       0.85      0.12      0.21        93
            FACILITIES#GENERAL:negative       0.00      0.00      0.00         3
             FACILITIES#GENERAL:neutral       0.00      0.00      0.00         1
            FACILITIES#GENE

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
predict = model.predict(X_test)
score(y_test_tf,predict)

Precison:  0.8646209386281588
Recall:  0.37074303405572756
F1:  0.5189599133261105


In [30]:
print(metrics.classification_report(y_test_tf,predict, target_names=transform_label.classes_))

                                         precision    recall  f1-score   support

        FACILITIES#CLEANLINESS:negative       0.00      0.00      0.00         3
         FACILITIES#CLEANLINESS:neutral       0.00      0.00      0.00         0
        FACILITIES#CLEANLINESS:positive       0.00      0.00      0.00         2
            FACILITIES#COMFORT:negative       0.00      0.00      0.00         6
             FACILITIES#COMFORT:neutral       0.00      0.00      0.00         0
            FACILITIES#COMFORT:positive       0.00      0.00      0.00        20
    FACILITIES#DESIGN&FEATURES:negative       0.00      0.00      0.00        22
     FACILITIES#DESIGN&FEATURES:neutral       0.00      0.00      0.00         5
    FACILITIES#DESIGN&FEATURES:positive       0.67      0.05      0.10        38
            FACILITIES#GENERAL:negative       0.00      0.00      0.00         1
             FACILITIES#GENERAL:neutral       0.00      0.00      0.00         1
            FACILITIES#GENE

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
text = "Khách sạn đẹp quá. Nhân viên thân thiện, ví trí thuận tiện. Nhưng giá hơi cao, nên chỉ đánh giá 4 sao."
result = model.predict([text])
show_label(result[0])

['HOTEL#DESIGN&FEATURES:positive', 'LOCATION#GENERAL:positive', 'SERVICE#GENERAL:positive']
