In [1]:
import warnings
warnings.filterwarnings(action='ignore') 
import numpy as np
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
# metrics
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

Using TensorFlow backend.


In [2]:
import pickle
with open('./data/train.pickle', 'rb') as f:
    train = pickle.load(f)

with open('./data/val.pickle', 'rb') as f:
    val = pickle.load(f)

with open('./data/test.pickle', 'rb') as f:
    test = pickle.load(f)

In [3]:
np.random.shuffle(train)
np.random.shuffle(val)
np.random.shuffle(test)

## input x

In [5]:
train_bert = [ text[0][0] + ' ' + text[0][1] + ' ' + text[0][2] for text in train]
val_bert = [ text[0][0] + ' ' + text[0][1] + ' ' + text[0][2]  for text in val]
test_bert = [ text[0][0] + ' ' + text[0][1] + ' ' + text[0][2]  for text in test]

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case = False)

tokenizer_train = [tokenizer.tokenize(s) for s in train_bert]
tokenizer_val = [tokenizer.tokenize(s) for s in val_bert]
tokenizer_test = [tokenizer.tokenize(s) for s in test_bert]

In [7]:
word_dic = list(set([t for token in tokenizer_train for t in token]))

In [8]:
len(word_dic)

1697

In [9]:
i2w = {i+1:word for i, word in enumerate(word_dic)}
w2i = {word:i+1 for i, word in enumerate(word_dic)}
i2w[0] = '[UNK]'
w2i['[UNK]'] = 0

In [10]:
train_x = []

In [11]:
for sentence in train_bert:
    token = tokenizer.tokenize(sentence)
    bow = np.zeros(len(w2i)+1)
    for t in token:
        if t in w2i.keys():
            bow[w2i[str(t)]] += 1
    train_x.append(bow)

In [12]:
len(train_x), len(train_bert)

(11053, 11053)

In [13]:
for i in range(len(train_bert)):
    if int(sum(train_x[i])) != len(tokenizer.tokenize(train_bert[i])):
        print(train_bert[i])

In [14]:
# val, test
val_x = []
test_x = []

for sentence in val_bert:
    token = tokenizer.tokenize(sentence)
    bow = np.zeros(len(w2i)+1)
    for t in token:
        if t in w2i.keys():
            bow[w2i[str(t)]] += 1
    val_x.append(bow)
    
    
for sentence in test_bert:
    token = tokenizer.tokenize(sentence)
    bow = np.zeros(len(w2i)+1)
    for t in token:
        if t in w2i.keys():
            bow[w2i[str(t)]] += 1
    test_x.append(bow)

## input y

In [16]:
train_y = [t[1] for t in train]
val_y = [t[1] for t in val]
test_y = [t[1] for t in test]

# model

## RF

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf = RandomForestClassifier()

In [21]:
rf.fit(train_x, train_y)

RandomForestClassifier()

In [22]:
idx_0 = [i for i, y in enumerate(train_y) if y==0] # end
idx_1 = [i for i, y in enumerate(train_y) if y==1] # love

In [23]:
# Accuracy
print(rf.score(train_x, train_y))
print(rf.score(val_x, val_y))
print('test')
print(rf.score(test_x, test_y)*100)

0.994571609517778
0.758125
test
70.61004381530165


In [24]:
test_pred = rf.predict(test_x)

In [25]:
# Precision
print(precision_score(test_y, test_pred, pos_label=1)*100, precision_score(test_y, test_pred, pos_label=0)*100)

# Recall
print(recall_score(test_y, test_pred, pos_label=1)*100, recall_score(test_y, test_pred, pos_label=0)*100)

# fl score
print(f1_score(test_y, test_pred, pos_label=1)*100, f1_score(test_y, test_pred, pos_label=0)*100)

73.62869198312237 67.831715210356
67.81088082901555 73.64722417427969
70.60013486176669 70.61994609164421


## SVM

In [27]:
from sklearn.svm import SVC

In [28]:
svm = SVC(kernel = 'linear')

In [29]:
svm.fit(train_x, train_y)

SVC(kernel='linear')

In [30]:
# Accuracy
print(svm.score(train_x, train_y))
print(svm.score(val_x, val_y))
print('test')
print(svm.score(test_x, test_y)*100)

0.9258119967429657
0.691875
test
69.19447253117627


In [31]:
test_pred = svm.predict(test_x)

In [32]:
# Precision
print(precision_score(test_y, test_pred, pos_label=1)*100, precision_score(test_y, test_pred, pos_label=0)*100)

# Recall
print(recall_score(test_y, test_pred, pos_label=1)*100, recall_score(test_y, test_pred, pos_label=0)*100)

# fl score
print(f1_score(test_y, test_pred, pos_label=1)*100, f1_score(test_y, test_pred, pos_label=0)*100)

71.66437414030261 66.82088565763384
67.48704663212435 71.04708362614195
69.51300867244828 68.8692098092643


## LR

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [42]:
lr = LogisticRegression()

In [43]:
lr.fit(train_x, train_y)

LogisticRegression()

In [47]:
# Accuracy
print(lr.score(train_x, train_y))
print(lr.score(val_x, val_y))
print(lr.score(test_x, test_y)*100)

0.9135981181579662
0.71
69.63262554769128


In [45]:
y_predict = lr.predict(test_x)
test_pred = np.transpose([1 if pred > 0.5 else 0 for pred in y_predict])

In [46]:
# Precision
print(precision_score(test_y, test_pred, pos_label=1)*100, precision_score(test_y, test_pred, pos_label=0)*100)

# Recall
print(recall_score(test_y, test_pred, pos_label=1)*100, recall_score(test_y, test_pred, pos_label=0)*100)

# fl score
print(f1_score(test_y, test_pred, pos_label=1)*100, f1_score(test_y, test_pred, pos_label=0)*100)

72.24913494809688 67.1484888304862
67.61658031088082 71.82009838369642
69.85613917698227 69.40577249575551
