In [478]:
import pandas

df_train = pandas.read_csv('./train.csv', quotechar="\'")
df_test = pandas.read_csv('./test.labeled.csv', quotechar="\'")

In [479]:
import re
import string
import demoji
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer

demoji.download_codes()

def clean_tweets(data):
    new_data = data.copy()

    username_hash = r'[#@]\w+'
    punctuation = '[%s]+' % re.escape(string.punctuation)
    special_char = r'[^0-9a-zA-Z\s]+'
    number = r'[0-9]+'
    space = r'\s{2,}'
    space_begin_end = r'^\s+|\s+$'
    url = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'
    char_ref = r'&\w+;'

    for i in range(len(new_data)):
        old_str = str(new_data[i])

        old_str = re.sub(username_hash, '', old_str)
        old_str = re.sub(url, '', old_str)
        old_str = re.sub(char_ref, ' ', old_str)
        old_str = re.sub(punctuation, '', old_str)
        old_str = re.sub(number, '', old_str)
        old_str = re.sub(space_begin_end, '', old_str)
        old_str = re.sub(space, '', old_str)
        old_str = demoji.replace(old_str, '')
        old_str = re.sub(special_char, '', old_str)

        new_data[i] = old_str

    return new_data

def case_fold(data):
    new_data = data.copy()
    return list(map(lambda s: s.lower(), new_data))

def tokenize(data):
    new_data = data.copy()
    return list(map(lambda s: s.split(' '), new_data))

def stem(data):
    new_data = data.copy()
    stemmer = StemmerFactory().create_stemmer()

    return list(map(lambda s: stemmer.stem(s), new_data))

Downloading emoji data ...
... OK (Got response in 0.44 seconds)
Writing emoji data to /Users/bangkodir/.demoji/codes.json ...
... OK


In [480]:
cleaned_train = clean_tweets(df_train['tweet'])
case_folded_train = case_fold(cleaned_train)
tweet_train = stem(case_folded_train)

cleaned_test = clean_tweets(df_test['tweet'])
case_folded_test = case_fold(cleaned_test)
tweet_test = stem(case_folded_test)

In [481]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(use_idf=True)
tweet_train_vect = tfidf_vect.fit_transform(tweet_train)
tweet_test_vect = tfidf_vect.transform(tweet_test)

In [482]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_c = SVC(kernel='linear')
svm_c.fit(tweet_train_vect, df_train['category'])

pred = svm_c.predict(tweet_test_vect)

pos = []
neg = []
total_raw_tweets = len(tweet_test)

for i in range(total_raw_tweets):
    tweet = tweet_test[i]
    if pred[i] == 'positif':
        pos.append(tweet)
    elif pred[i] == 'negatif':
        neg.append(tweet)

pos_percent = (len(pos) / total_raw_tweets) * 100
neg_percent = (len(neg) / total_raw_tweets) * 100
accuracy = accuracy_score(df_test['category'], pred) * 100

print(f"Positive Percentage : {pos_percent}")
print(f"Negative Percentage : {neg_percent}")
print(f"Classification Accuracy : {accuracy}")

Positive Percentage : 25.0
Negative Percentage : 75.0
Classification Accuracy : 80.06912442396313


In [484]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_words(data, n=25):
    cv = CountVectorizer()
    cv.fit(data)

    bag_words = cv.transform(data)
    sum_words = bag_words.sum(axis=0)

    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

def print_top_words(data):
    for word, n in data:
        print(f"{word} - {n} kata")

pos_top_words = get_top_words(pos)
neg_top_words = get_top_words(neg)

print("Kata Positif")
print_top_words(pos_top_words)
print("------")
print("Kata Negatif")
print_top_words(neg_top_words)

Kata Positif
law - 200 kata
omnibus - 199 kata
uu - 118 kata
yang - 108 kata
luhut - 73 kata
saya - 65 kata
kerja - 54 kata
ungkap - 52 kata
di - 49 kata
halal - 49 kata
dan - 49 kata
cipta - 48 kata
bentuk - 46 kata
awal - 44 kata
jadi - 43 kata
sertifikat - 38 kata
pak - 37 kata
ini - 36 kata
mulai - 34 kata
indonesia - 33 kata
menko - 32 kata
oleh - 31 kata
satu - 31 kata
ubah - 30 kata
saat - 30 kata
------
Kata Negatif
omnibus - 624 kata
law - 618 kata
yang - 261 kata
uu - 231 kata
dan - 178 kata
di - 175 kata
ini - 175 kata
yg - 158 kata
kerja - 151 kata
ada - 148 kata
cipta - 141 kata
tolak - 137 kata
tidak - 123 kata
itu - 95 kata
untuk - 92 kata
aksi - 89 kata
dari - 85 kata
jokowi - 82 kata
kata - 80 kata
rakyat - 70 kata
demo - 68 kata
nyata - 66 kata
ke - 64 kata
presiden - 63 kata
saya - 56 kata


In [485]:
# TODO : test our model here