In [60]:
import json

json_data = {}
with open('./labelled_data.json') as f:
    try:
        json_data = json.load(f)
    finally:
        f.close()

In [61]:
tweets = list(map(lambda t: t['tweet'], json_data['tweets']))
labels = list(map(lambda t: t['label'], json_data['tweets']))

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.2, random_state=13)

enc = LabelEncoder()
y_train = enc.fit_transform(y_train)
y_test = enc.fit_transform(y_test)

print(y_train)
print(y_test)

[0 1 0 1 0 1 1 0 1 1 0 0 1 1 0 0]
[0 0 0 1 1]


In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
vect.fit(tweets)

X_train_tfidf = vect.transform(X_train)
X_test_tfidf = vect.transform(X_test)

print(vect.vocabulary_)

{'sudah': 250, 'baca': 19, 'surat': 254, 'belum': 30, 'kok': 117, 'aku': 7, 'ga': 73, 'liat': 130, 'kata': 104, 'yg': 293, 'arti': 14, 'perintah': 184, 'ya': 291, 'serius': 237, 'nanya': 163, 'lihat': 132, 'mui': 158, 'hanya': 81, 'suara': 248, 'ketidaksetujuannya': 113, 'dengan': 51, 'uu': 286, 'omnibus': 174, 'law': 125, 'dan': 47, 'sampai': 220, 'harap': 83, 'tiap': 269, 'dari': 49, 'kita': 114, 'wni': 289, 'hak': 78, 'right': 211, 'koordinator': 120, 'pusat': 202, 'bem': 31, 'seluruh': 230, 'indonesia': 89, 'juga': 97, 'lobi': 134, 'oleh': 172, 'orang': 175, 'yang': 292, 'bagai': 20, 'utus': 285, 'agar': 3, 'tak': 256, 'demo': 50, 'tolak': 271, 'diimingimingi': 57, 'akan': 4, 'biaya': 34, 'didik': 56, 'beri': 33, 'jumlah': 98, 'uang': 276, 'asal': 15, 'turun': 273, 'ke': 106, 'jalan': 94, 'uji': 279, 'material': 148, 'cipta': 43, 'kerja': 111, 'mahkamah': 138, 'konstitusi': 119, 'sama': 217, 'tidak': 270, 'efektif': 64, 'minta': 154, 'presiden': 195, 'keluar': 109, 'perpu': 187, 's

In [64]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_c = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_c.fit(X_train_tfidf, y_train)
pred = svm_c.predict(X_test_tfidf)

print(accuracy_score(pred, y_test))

0.6


In [65]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
pred = nb.predict(X_test_tfidf)

print(accuracy_score(pred, y_test))

0.6
