In [11]:
import pandas as pd 
import numpy as np
import nltk
# label encoding used in this notebook: 1 = positive sentiment, 0 = negative sentiment

In [12]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from collections import Counter
import string
import re
import stopwordsiso as sw

# one-time setup: uncomment to download the NLTK 'punkt' tokenizer data used by word_tokenize
# nltk.download('punkt')

# Sastrawi stemmer for Bahasa text
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# base stop words from stopwordsiso for the language codes "ms", "id" and "en"
stop_words_main = list(sw.stopwords(["ms", "id", "en"]))

# hand-picked additions such as informal short forms
stop_words_custom = [
    'kau', 'yg', 'mcm', 'gak', 'nak', 'ni', 'tu', 'la',
    'je', 'kat', 'ya', 'dgn', 'tau', 'org', 'rt', 'aja',
    'nk', 'dah', 'orang', 'sy', 'ga', 'kalo', 'kena',
]

# np.unique de-duplicates the combined list and returns it as a sorted array
STOP_WORDS = np.unique([*stop_words_main, *stop_words_custom])

def remove_emoji(text):
    """Return ``text`` with every character in the ranges below deleted.

    The ranges are concatenated into a single regex character class and each
    run of matching characters is replaced by the empty string.

    Note: the last range (U+24C2 to U+1F251) is very wide; besides enclosed
    characters it also covers non-emoji blocks such as CJK ideographs, so
    those characters are removed as well.
    """
    codepoint_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    char_class = "[" + "".join(codepoint_ranges) + "]+"
    matcher = re.compile(char_class, flags=re.UNICODE)
    return matcher.sub(r'', text)

def text_preprocessing(text):
    """Clean one raw text string before it is passed to the sentiment model.

    Steps, in order: remove digits, links, tokens containing ``@`` and
    emoji; tokenize with ``word_tokenize``; skip punctuation tokens; stem
    every remaining token with ``stemmer``; drop entries found in
    ``STOP_WORDS``; join the survivors with single spaces.

    Parameters
    ----------
    text : str
        Raw sentence or tweet.

    Returns
    -------
    str
        The cleaned sentence; an empty string when nothing survives.
    """
    # remove numbers
    text = re.sub(r'\d+', '', text)

    # remove links (raw string: '\S' inside a normal literal is an invalid
    # escape sequence and triggers a warning on recent Python versions)
    text = re.sub(r'http[s]?://\S+', '', text)

    # remove every token containing '@' (tweet handles such as @name).
    # \S instead of [^ ] so the match also stops at newlines and tabs and
    # does not swallow the word on the adjacent line.
    text = re.sub(r'\S*@\S*', '', text)

    # remove emoji
    text = remove_emoji(text)

    # tokenization
    tokens = word_tokenize(text)

    # stem every token that is not punctuation (the check is a substring
    # test against string.punctuation, as before)
    words = [stemmer.stem(token) for token in tokens if token not in string.punctuation]

    # remove stopwords (compared after stemming)
    cleaned = [str(word) for word in words if word not in STOP_WORDS]

    # join into one sentence; split()/join collapses any leftover extra whitespace
    return " ".join(" ".join(cleaned).split())

In [17]:
# Load the news test set, drop neutral rows and encode the label column
# as integers (1 = positive, 0 = negative).
news_raw = pd.read_csv('dataset/news-test-data.csv')

# Normalise case/whitespace once so 'Negative' and 'negative' (and any other
# casing) are treated alike instead of being patched one spelling at a time.
labels_normalized = news_raw['label'].str.strip().str.lower()
keep_rows = labels_normalized != 'neutral'

# .copy() yields an independent frame, so the column assignments below (and in
# later cells) do not write into a slice of news_raw (SettingWithCopyWarning).
test_data1 = news_raw.loc[keep_rows, ['text', 'label']].copy()

# Unknown or missing labels map to NaN, which makes astype('int64') raise
# rather than silently mislabel rows.
test_data1['label'] = (
    labels_normalized[keep_rows]
    .map({'positive': 1, 'negative': 0})
    .astype('int64')
)
print(test_data1.head(10))
print(len(test_data1))

                                                 text  label
0   Permohonan ini juga bertujuan memohon penjelas...      1
1   Projek ini bukti komitmen berterusan kerajaan ...      1
2   Menerusi pembentangan Bajet 2018 yang lalu, ke...      1
4   Pada masa sama, kerajaan negeri juga telah bua...      1
5   Kita bawa pembangunan, bukan sahaja untuk Saba...      1
6   Harapan kita agar isu berkaitan golongan muda ...      1
8   Kalau dah bersara dan tiada kaitan lagi dengan...      1
9   Kesediaan negara-negara ITRC untuk mematuhi ko...      1
10  Apa yang menjadi masalah kita, apabila masyara...      0
12  Jabatan Kehakiman (DOJ) Amerika Syarikat sebel...      0
2942


In [22]:
# `text_cleaning` is kept as a name for the cleaning callable; the lambda
# wrapper around text_preprocessing added nothing.
text_cleaning = text_preprocessing
# Series.apply already returns a Series aligned on the frame's index, so it is
# assigned directly instead of being wrapped in a one-column DataFrame.
test_data1['cleaned_Text'] = test_data1['text'].apply(text_cleaning)
test_data1.head(10)

Unnamed: 0,text,label,cleaned_Text
0,Permohonan ini juga bertujuan memohon penjelas...,1,mohon tuju mohon crcc pegang tuju khas spv
1,Projek ini bukti komitmen berterusan kerajaan ...,1,bukti komitmen raja baris sekutu jaga kebaji m...
2,"Menerusi pembentangan Bajet 2018 yang lalu, ke...",1,terusi bentang bajet raja sedia promosi tingka...
4,"Pada masa sama, kerajaan negeri juga telah bua...",1,raja bayar rm mangsa banjir
5,"Kita bawa pembangunan, bukan sahaja untuk Saba...",1,bawa bangun
6,Harapan kita agar isu berkaitan golongan muda ...,1,harap kait golong muda didik perhati manifesto
8,Kalau dah bersara dan tiada kaitan lagi dengan...,1,sara kait felda tindak sivil
9,Kesediaan negara-negara ITRC untuk mematuhi ko...,1,sedia itrc patuh kouta mekanisme impak positif...
10,"Apa yang menjadi masalah kita, apabila masyara...",0,gemar kongsi
12,Jabatan Kehakiman (DOJ) Amerika Syarikat sebel...,0,jabat hakim doj dakwa aset mewah beli curi mdb


In [23]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

def sentiment_model_predict(model,data_test_input,data_test_target):
    """Predict with ``model`` and print evaluation metrics for the result.

    Calls ``model.predict(data_test_input)`` and compares the predictions
    with ``data_test_target``. Prints accuracy, macro-averaged precision,
    recall and F1 (each as a percentage rounded to two decimals) followed
    by the confusion matrix. Nothing is returned.
    """
    predicted = model.predict(data_test_input)

    # compute everything first, then print, so output only appears once all
    # metrics have been evaluated
    matrix = confusion_matrix(data_test_target, predicted)
    scores = [
        ("Accuracy : ", accuracy_score(data_test_target, predicted)),
        ("Precision : ", precision_score(data_test_target, predicted, average="macro")),
        ("Recall : ", recall_score(data_test_target, predicted, average="macro")),
        ("F1-Score :", f1_score(data_test_target, predicted, average="macro")),
    ]

    for label, value in scores:
        print(label + str(round(value * 100, 2)))
    print(matrix)

In [26]:
import joblib

# model input (cleaned text) and ground-truth labels for the news test set
X_test, y_test = test_data1['cleaned_Text'], test_data1['label']

# load model
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source
joblib_SVM_model2 = joblib.load("model/bahasa_sentiment_svm_model2.pkl")
sentiment_model_predict(joblib_SVM_model2, X_test, y_test)

Accuracy : 61.01
Precision : 64.84
Recall : 67.02
F1-Score :60.54
[[ 736  158]
 [ 989 1059]]


In [33]:
import json

# number of rows, taken from the top of the file, that are evaluated
N_MEDIA_SAMPLES = 3000

# JSON text is UTF-8; pass the encoding explicitly so that reading the file
# does not depend on the platform's default encoding
with open('dataset/media-test-data-negatives.json', encoding='utf-8') as fopen:
    file_test = json.load(fopen)

# NOTE(review): the file name suggests every sample is negative; no label
# column is attached here - confirm before computing metrics against it.
# .copy() yields an independent frame, so columns added in later cells do not
# write into a slice returned by head() (SettingWithCopyWarning).
test_data2 = pd.DataFrame(file_test, columns=['text']).head(N_MEDIA_SAMPLES).copy()
print(test_data2.head(10))
print(len(test_data2))

                                                text
0  @Iwwannnnn apa lancau mintak izin kat awek . a...
1  Mas Bowo di marahin tukang parkir!\n\n"Ancene ...
2      Ala bodohnya jilat. Gersang sgt ka. Geli aku 
3  @lalajoeee Si lancau ni mmg kuat cari pasal. B...
4  Eyh babila\nDah pandai masok umah tu tutup la ...
5  Ilmu gratis tapi bukan berarti boleh ngebodoh ...
6  Makin melampo mintak murah !! Butoh !! Dah nak...
7  @ainohyeah Amboi marahnya.  Tapi durian tu nam...
8  SI ADAM NI NAFSU KUDA AKU RASA?!!! HA RETI PUL...
9  Bodohlah. Janji lain, buat lain. Aku dah lah b...
3000


In [34]:
# `text_cleaning` is kept as a name for the cleaning callable; the lambda
# wrapper around text_preprocessing added nothing.
text_cleaning = text_preprocessing
# Series.apply already returns a Series aligned on the frame's index, so it is
# assigned directly instead of being wrapped in a one-column DataFrame.
test_data2['cleaned_Text'] = test_data2['text'].apply(text_cleaning)
test_data2.head(10)

Unnamed: 0,text,cleaned_Text
0,@Iwwannnnn apa lancau mintak izin kat awek . a...,lancau mintak izin awek mak halang bini perempuan
1,"Mas Bowo di marahin tukang parkir!\n\n""Ancene ...",mas bowo marahin tukang parkir ancene capres c...
2,Ala bodohnya jilat. Gersang sgt ka. Geli aku,ala bodoh jilat gersang sgt ka geli
3,@lalajoeee Si lancau ni mmg kuat cari pasal. B...,lancau mmg kuat cari pasal ade cari pasal
4,Eyh babila\nDah pandai masok umah tu tutup la ...,eyh babila pandai masok umah tutup pintu shibe...
5,Ilmu gratis tapi bukan berarti boleh ngebodoh ...,ilmu gratis arti ngebodoh bodohin nih gym bawa...
6,Makin melampo mintak murah !! Butoh !! Dah nak...,melampo mintak murah butoh mcam trade kt kedai...
7,@ainohyeah Amboi marahnya. Tapi durian tu nam...,amboi marah durian nampak sedap
8,SI ADAM NI NAFSU KUDA AKU RASA?!!! HA RETI PUL...,adam nafsu kuda ha ret pulak allahuakhbar babi...
9,"Bodohlah. Janji lain, buat lain. Aku dah lah b...",bodoh janji takde ragam


In [41]:
import joblib

# load model
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source
joblib_SVM_model2 = joblib.load("model/bahasa_sentiment_svm_model2.pkl")

# predict on the cleaned text and store the result next to the input rows
pred_data = joblib_SVM_model2.predict(test_data2['cleaned_Text'])
test_data2['Predicted'] = pred_data

# how many rows were predicted per class
test_data2['Predicted'].value_counts()

0    2317
1     683
Name: Predicted, dtype: int64