In [1]:
import json, random
import numpy as np
from codecs import open
from joblib import dump
from pythainlp.corpus import thai_stopwords
from pythainlp.tokenize import word_tokenize
from pythainlp.word_vector import sentence_vectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from os.path import abspath, join

In [2]:
def remove_duplicated_sent(sentence):
    """Drop repeated space-separated chunks from ``sentence``.

    The text is split on single spaces, only the first occurrence of each
    chunk is kept (original order preserved), and the surviving chunks are
    concatenated back together with no separator.
    """
    seen = set()
    unique_chunks = []
    for chunk in sentence.split(' '):
        if chunk in seen:
            continue
        seen.add(chunk)
        unique_chunks.append(chunk)
    return ''.join(unique_chunks)
def remove_stopwords(sentence):
    """Tokenize ``sentence`` and return it with Thai stop words removed.

    Tokens are produced by ``word_tokenize``; every token contained in
    ``thai_stopwords()`` is dropped and the remaining tokens are concatenated
    with no separator.
    """
    # Fetch the stop-word collection once instead of once per token.
    stopwords = thai_stopwords()
    return ''.join(word for word in word_tokenize(sentence) if word not in stopwords)
def clean_text(sentence):
    """Clean a raw sentence before it is vectorized.

    Only duplicate-chunk removal (``remove_duplicated_sent``) is applied.
    The stop-word removal step is currently commented out.
    """
    # sentence = remove_stopwords(sentence)  # disabled step, would run after de-duplication
    return remove_duplicated_sent(sentence)
def _load_labeled_vectors(filename, label):
    """Read one tweet JSON file from ``../dataset`` and return ``[(vector, label), ...]``."""
    path = abspath(join('..', 'dataset', filename))
    with open(path, 'r', encoding='utf-8-sig') as f:
        tweets = json.load(f)
    # clean each tweet's text, then convert the Thai sentence into a vector of shape (1, 300)
    return [(sentence_vectorizer(clean_text(tweet['text'])), label) for tweet in tweets]


def load_dataset(seed=None):
    """Load both tweet corpora and return shuffled feature vectors and labels.

    Tweets from ``useful-tweets.json`` are labelled 1 and tweets from
    ``useless-tweets.json`` are labelled 0. Each tweet's ``'text'`` field is
    cleaned with ``clean_text`` and vectorized with ``sentence_vectorizer``.

    Parameters
    ----------
    seed : optional
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        returned order is reproducible. When ``None`` (the default) the
        module-level ``random.shuffle`` is used, as before.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The stacked sentence vectors and the matching array of 0/1 labels,
        in the same shuffled order.
    """
    from itertools import zip_longest

    positives = _load_labeled_vectors('useful-tweets.json', 1)
    negatives = _load_labeled_vectors('useless-tweets.json', 0)

    # Interleave positive/negative samples (positive first at each index) and
    # let the longer list run on alone, so the pre-shuffle order is unchanged.
    # Samples are tuples, so None only ever marks zip_longest padding.
    dataset = [
        sample
        for pair in zip_longest(positives, negatives)
        for sample in pair
        if sample is not None
    ]

    if seed is None:
        random.shuffle(dataset)
    else:
        random.Random(seed).shuffle(dataset)

    vectors = np.array([vect for vect, _ in dataset])
    labels = np.array([label for _, label in dataset])
    return vectors, labels

In [3]:
# Build the feature/label arrays and hold out 30% of the samples for testing.
X, y = load_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
# Collapse the middle axis: (n_samples, 1, n_features) -> (n_samples, n_features).
X_train = X_train.reshape(-1, X_train.shape[2])
X_test = X_test.reshape(-1, X_test.shape[2])
print(f'X_train shape : {X_train.shape}\nX_test shape : {X_test.shape}\n')

X_train shape : (1006, 300)
X_test shape : (432, 300)



In [4]:
# Support vector machine classifier.
# `degree` and `gamma` are not passed: they have no effect with the linear kernel.
classifier = svm.SVC(C=1.0, kernel='linear')
classifier.fit(X_train, y_train)
# predict the labels on the held-out test split
y_pred = classifier.predict(X_test)
# accuracy_score takes (y_true, y_pred)
print("SVM Accuracy : ", accuracy_score(y_test, y_pred)*100)
dump(classifier, abspath(join('..', 'model', 'svm_classifier.joblib'))) # save model

SVM Accuracy :  92.5925925925926


['D:\\Coding\\Python\\BeneficialTweetCOVID-19\\model\\svm_classifier.joblib']

In [5]:
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier trained on X_train / y_train.
clf = RandomForestClassifier(max_depth=10, random_state=1, criterion='entropy')
clf.fit(X_train, y_train)
# predict the labels on the held-out test split
y_pred2 = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred)
print("RF Accuracy : ", accuracy_score(y_test, y_pred2)*100)
# Persist the random forest (`clf`), not the SVM stored in `classifier`.
dump(clf, abspath(join('..', 'model', 'rf_classifier.joblib'))) # save model

RF Accuracy :  93.05555555555556


['D:\\Coding\\Python\\BeneficialTweetCOVID-19\\model\\rf_classifier.joblib']