In [79]:
import pandas as pd
import numpy as np

In [80]:
# Folder on the mounted Google Drive that holds every CSV used in this notebook.
data_path = "/content/drive/MyDrive/DS102/DoAn/data/"

# The three label sheets that are compared against each other below.
NA = f"{data_path}NAnh_TrueVocab.csv"
QA = f"{data_path}QuocAnh_TrueVocab.csv"
DT = f"{data_path}tueje_vocab.csv"

In [81]:
# Mount Google Drive at /content/drive so the CSV paths under that prefix can
# be read; this import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
# Read the three label sheets, in the same order as before, into one
# DataFrame each.
NA_df, QA_df, DT_df = (pd.read_csv(csv_path) for csv_path in (NA, QA, DT))

In [83]:
# Preview the first rows of one label sheet (columns: word, label).
QA_df.head()

Unnamed: 0,word,label
0,room,3
1,staff,2
2,hotel,6
3,location,1
4,breakfast,5


In [84]:
def processLabel(labels):
  """Parse one raw "label" cell into a list of integer labels.

  Args:
    labels: Either a comma-separated string such as "1, 2" or a single label.
      A single label may arrive as a string ("3") or as an integer, which is
      what pandas yields when a whole column holds only one number per row.

  Returns:
    A list of ints, in the order they appear in the cell.

  Raises:
    ValueError: If any comma-separated piece is not a valid integer
      (including an empty cell or a trailing comma).
  """
  # Coerce to str first so integer cells do not fail on len()/replace();
  # spaces are removed exactly as before so "1, 2" and "1,2" parse the same.
  text = str(labels).replace(" ", "")
  return [int(label) for label in text.split(",")]

# Parse the "label" column of every sheet. The result is indexed as
# labels[sheet][row] and each entry is the list of ints for that row.
dfs = [NA_df, QA_df, DT_df]
labels = [df["label"].apply(processLabel).to_list() for df in dfs]

In [85]:
# Spot check: concatenate the label lists that the three sheets hold for row 14.
labels[0][14] + labels[1][14] + labels[2][14]

[1, 2, 6, 2]

# Inter-annotator agreement (IAA) with Fleiss' kappa

In [86]:
# Number of distinct label values; labels are 1-based, so label k is counted
# in column k - 1.
label_num = 6

# Build one count row per word: how many times each label was assigned to it
# across all sheets (a sheet may contribute more than one label per word).
labels_vec = []
for word_idx in range(len(labels[0])):
  counts_row = [0] * label_num
  for sheet_labels in labels:
    for label in sheet_labels[word_idx]:
      counts_row[label - 1] += 1
  labels_vec.append(counts_row)

In [87]:
# Turn the list of per-word count rows into a 2-D array (one row per word,
# one column per label) and display it.
labels_vec = np.array(labels_vec)
labels_vec

array([[0, 0, 3, 0, 0, 0],
       [0, 3, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 1],
       ...,
       [2, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 3],
       [2, 0, 0, 0, 0, 1]])

In [88]:
def fleiss_kappa(labels):
  """Compute Fleiss' kappa for a table of rating counts.

  Args:
    labels: 2-D array-like of shape (n_items, n_categories). Entry [i, j] is
      the number of ratings that put item i into category j. Rows may have
      different totals; each row is normalised by its own number of ratings.

  Returns:
    The kappa statistic (P - Pe) / (1 - Pe) as a float.

  Notes:
    A row with fewer than two ratings, or a table in which all ratings fall
    into a single category (Pe == 1), leads to a division by zero.
  """
  # Work on the argument itself (not on a notebook-level variable) so the
  # function gives the right answer for whatever table is passed in.
  labels = np.asarray(labels)

  # Expected agreement: squared share of all ratings that went to each category.
  Pj = np.sum(labels, axis=0) / np.sum(labels)
  Pe = np.sum(Pj ** 2)

  # Observed agreement per item: agreeing rating pairs over all rating pairs.
  n = np.sum(labels, axis=1)
  Pi = (np.sum(labels ** 2, axis=1) - n) / (n * (n - 1))
  P = np.sum(Pi) / labels.shape[0]

  k = (P - Pe) / (1 - Pe)
  return k

In [89]:
# Agreement score for the per-word label count matrix built above.
fleiss_kappa(labels_vec)

0.5505094255127742

# True labels

In [90]:
# For every word, take the label with the highest count as its final label.
# Add one because labels start from 1 while column indices start from 0.
# NOTE(review): np.argmax returns the first maximum, so a tie between labels
# goes to the lowest label number — confirm this is the intended tie-break.
true_labels = np.argmax(labels_vec, axis=1) + 1
true_labels

array([3, 2, 1, ..., 1, 6, 1])

In [91]:
# Print how many words ended up with each final label. The unique values get
# their own name so that the per-sheet `labels` list created earlier in the
# notebook is not overwritten (re-running the count-matrix cell after this one
# would otherwise fail).
unique_labels, counts = np.unique(true_labels, return_counts=True)
for label, count in zip(unique_labels, counts):
  print(f"label {label}: {count}")

label 1: 192
label 2: 187
label 3: 170
label 4: 70
label 5: 70
label 6: 312


In [92]:
# Pair every word with its final label and keep only rows whose label is not 6.
# The word column is taken from QA_df, so this relies on all sheets listing
# the words in the same row order.
words = QA_df["word"].to_numpy()
hotel_ids = np.where(true_labels != 6)

word_col = words[:, np.newaxis]
label_col = true_labels[:, np.newaxis]
vocab = np.concatenate([word_col, label_col], axis=1)[hotel_ids]
vocab.shape

(689, 2)

In [93]:
# Wrap the filtered (word, label) pairs in a DataFrame and preview it.
vocab_df = pd.DataFrame(vocab, columns=["word", "label"])
vocab_df.head()

Unnamed: 0,word,label
0,room,3
1,staff,2
2,hotel,1
3,location,1
4,breakfast,5


In [94]:
# Save the filtered vocabulary next to the input sheets, leaving out the
# DataFrame index column.
vocab_path = f"{data_path}true_labels.csv"
vocab_df.to_csv(vocab_path, index=False)