In [2]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv("data/dataset_klasifikasi_topik.csv", sep=";")
df.head()

Unnamed: 0,tweet,label
0,MICE memiliki potensi yang sangat besar untuk ...,Ekonomi
1,"Paus Fransiskus pada Senin (13/9/2021), menyer...",Ekonomi
2,Pemerintah sudah menanggung bunga dan pokok ut...,Ekonomi
3,Menurut Ekonom Institute for Development of Ec...,Ekonomi
4,Kementerian Pariwisata dan Ekonomi Kreatif opt...,Ekonomi


In [8]:
len(df)

1250

In [9]:
df['label'].value_counts()

Politik       250
Ekonomi       250
Sosial        250
Kesehatan     250
Pendidikan    250
Name: label, dtype: int64

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.dropna(inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1250 entries, 0 to 1249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   1250 non-null   object
 1   label   1250 non-null   object
dtypes: object(2)
memory usage: 29.3+ KB


In [13]:
import re

with open("dict/id_stopwords.txt") as f:
    id_stopwords = f.read().splitlines()

with open("dict/en_stopwords.txt") as f:
    en_stopwords = f.read().splitlines()

stopwords = id_stopwords + en_stopwords

In [14]:
import re

# text preprocessing
def clean_text(text):
    if type(text) == np.float:
        return ""
    temp = text.lower()
    temp = re.sub("'", "", temp)
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp

df['cleaned'] = df['tweet'].apply(clean_text)

In [15]:
# what are the top words from each class?

for label in df['label'].unique():
    temp = df[df['label']==label]
    
    print(label)
    all_text = ' '.join(temp['cleaned'])
    tokens = all_text.split()
    
    top_words = pd.Series(tokens).value_counts().nlargest(10)
    print(top_words)
    print("\n")

Ekonomi
ekonomi       112
pandemi        60
keuangan       45
covid          45
19             45
pemerintah     40
indonesia      38
saham          36
2021           36
tahun          28
dtype: int64


Kesehatan
kesehatan     150
19             95
covid          91
penyakit       46
vaksinasi      39
vaksin         36
pandemi        33
protokol       31
masyarakat     30
indonesia      29
dtype: int64


Pendidikan
sekolah         95
mahasiswa       91
muka            53
tatap           52
universitas     46
siswa           44
2021            30
kuliah          29
tahun           28
pembelajaran    26
dtype: int64


Politik
partai        119
politik        76
presiden       57
pemilu         37
demokrasi      33
jokowi         32
kebijakan      28
pemerintah     26
pan            25
menteri        24
dtype: int64


Sosial
masyarakat    169
sosial         75
bantuan        53
19             29
covid          29
pandemi        27
pemerintah     26
indonesia      23
media          18
ppkm

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [17]:
X = df['cleaned'] # features
y = df['label'] # classes

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=99)

# vectorizer
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train.apply(clean_text))
vectorizer.get_feature_names()[400:420]

['batas',
 'batu',
 'batubara',
 'bau',
 'bawah',
 'bayar',
 'bayi',
 'bbm',
 'bbvet',
 'bca',
 'bea',
 'beasiswa',
 'beban',
 'bebas',
 'beberapa',
 'begini',
 'bei',
 'bekal',
 'bekasi',
 'bekerja']

In [25]:
X_test_vect = vectorizer.transform(X_test.apply(clean_text))

# classification
# clf = MultinomialNB()
# clf = SVC(kernel='rbf')
clf = RandomForestClassifier()
# clf = LogisticRegression()
clf.fit(X_train_vect, y_train)

RandomForestClassifier()

In [26]:
# result
y_pred = clf.predict(X_test_vect)
acc = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print(acc)
print(conf)
print("\n")


0.8913738019169329
[[59  3  0  0  4]
 [ 2 50  1  4  4]
 [ 1  0 58  0  1]
 [ 2  0  0 59  3]
 [ 6  2  0  1 53]]




In [27]:
# evaluate your predictions
for text, label in zip(X_test, y_test):
    pred = clf.predict(vectorizer.transform([text]))[0]
    if pred!=label:
        print(text, ">> should be", label.upper(), ">> predicted",pred.upper())
        print("\n")

pgri menekankan apapun alasannya pemerintah membina sekolah berskala kecil banyak diisi masyarakat kurang mampu terjadi lost generation masa mendatang >> should be PENDIDIKAN >> predicted SOSIAL


penyebaran varian 32 negara menjadi perhatian pemerintah presiden jokowi meminta jajarannya mencegah masuknya varian baru virus sars cov 2 disebut lebih berbahaya >> should be KESEHATAN >> predicted POLITIK


menkeu sri mulyani perayaan idul adha selalu mengingatkan pentingnya berbagi sesama termasuk masa pandemi >> should be SOSIAL >> predicted EKONOMI


anies dunia tercengan indonesia mampu kendalikan covid 19 anies kunci bangsa indonesia mengendalikan pandemi covid 19 kolaborasi pemerintah masyarakat melawan menangani covid 19 >> should be KESEHATAN >> predicted SOSIAL


pemerintah belajar momen kritis ledakan varian delta juli agustus 2021 ketidaksiapan menangani dampak kesehatan sosial ekonomi masyarakat >> should be KESEHATAN >> predicted EKONOMI


terpaksa isolasi mandiri gambaran inil