# Preparation

In [1]:
#importing libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer

# Prepare Dataset

In [2]:
# Load the raw question bank, drop the running-number column and discard
# every row that has at least one missing field.
dataset = (
    pd.read_csv('dataset.csv', header=0, engine='python')
    .drop(columns='No')
    .dropna()
)

# Store the difficulty category as a pandas categorical and expose its
# integer codes as the `label` column.
difficulty = dataset['kategori'].astype("category")
dataset['kategori'] = difficulty
dataset['label'] = difficulty.cat.codes

# Regex replacements applied one after another: newlines and tabs become a
# single space, and every double quote is padded with a space on both sides.
for pattern, replacement in (('\n', ' '), ('\t', ' '), ('"', ' " ')):
    dataset = dataset.replace(pattern, replacement, regex=True)

dataset.head()

Unnamed: 0,jenis,uraian,pertanyaan,opsi1,opsi2,opsi3,opsi4,kunci_jawaban,kategori,label
0,Makna kata/istilah,Padi yang luas menguning adalah hasil tanam ya...,Makna kata jasa pada paragraf di atas adalah,Pelayanan yang terbaik bagi kita.,Manfaat yang melimpah bagi kita,Perbuatan yang berguna bagi orang lain,Jerih payah yang sangat menguntungkan,Perbuatan yang berguna bagi orang lain,Mudah,0
1,Antonim/sinonim,Suara radio tetangga sebelah sangat keras pada...,Antonim kata keras pada kalimat tersebut adalah,pelan,lunak,kaku,kencang,pelan,Mudah,0
2,Informasi tersurat teks,Pembiasaan Hidup Sehat Sejak KecilPola hidup s...,Bagaimana cara memenuhi gizi anak pada usia ba...,Menambah gizi anak seiring bertambahnya usia a...,"Makan makanan bergizi, istirahat cukup, dan ol...",Mempunyai kekebalan yang baik terhadap seranga...,Pemenuhan gizi dengan pemberian ASI saat anak...,Pemenuhan gizi dengan pemberian ASI saat anak...,Sedang,1
3,Informasi tersirat teks,Mulai tahun 2016 kuota haji di Indonesssia dit...,Berdasarkan data kuota jumlah jamaah haji untu...,77,77.7,78,77800,77,Sedang,1
4,Informasi tersurat teks,Kebiasaan makan makanan bergizi dengan kadar s...,Kalimat dalam paragraf di atas yang tidak padu...,Kebutuhan gizi anak akan semakin bertambah sei...,Kebutuhan gizi anak baru terpenuhi saat anak b...,Tubuh anak mudah terserang berbagai macam peny...,Pertumbuhan dan perkembangan anak akan semakin...,Tubuh anak mudah terserang berbagai macam peny...,Sedang,1


In [3]:
# Columns whose text is joined (space-separated, in this order) into a single
# document per question.
text_columns = ['jenis', 'uraian', 'pertanyaan', 'opsi1', 'opsi2', 'opsi3', 'opsi4', 'kunci_jawaban']

df_x = dataset[text_columns[0]].map(str)
for column in text_columns[1:]:
    df_x = df_x + ' ' + dataset[column].map(str)

# Integer class labels taken from the same frame (and therefore the same index) as df_x.
train_y = dataset['label']

# Empty container; the pre-processing cell appends one cleaned document per row of df_x.
df_final = []

df_x.head()

0    Makna kata/istilah Padi yang luas menguning ad...
1    Antonim/sinonim Suara radio tetangga sebelah s...
2    Informasi tersurat teks Pembiasaan Hidup Sehat...
3    Informasi tersirat teks Mulai tahun 2016 kuota...
4    Informasi tersurat teks Kebiasaan makan makana...
dtype: object

# Data Pre-processing

In [4]:
# Download NLTK's stopword corpus; its Indonesian word list is loaded in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Download the WordNet data; a WordNetLemmatizer is instantiated in a later cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Lemmatizer applied to every surviving token in the pre-processing cell.
# NOTE(review): WordNet is an English lexical resource — confirm that lemmatizing
# Indonesian tokens with it has the intended effect.
lemmatizer = WordNetLemmatizer()

# Indonesian stopword list from the NLTK corpus; tokens found in it are dropped
# during pre-processing.
stopwords = nltk.corpus.stopwords.words('indonesian')

## Stopwords & Lemmatizer

In [7]:
#Pre-processing Dataset
# Start from an empty list every time so that re-running this cell does not
# append a second copy of every document to an already populated df_final.
df_final = []

# Build the stopword lookup once; the membership test below then no longer
# reconstructs a set for every single token.
stopword_set = set(stopwords)

for raw_text in df_x:
    # Keep ASCII letters only: digits, punctuation and any other character
    # are replaced by a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)
    tokens = letters_only.lower().split()
    # Discard stopwords, lemmatize what remains and rebuild a space-separated string.
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set]
    df_final.append(' '.join(kept))
print('OK')

OK


In [8]:
# Show one cleaned document as a sanity check.
# NOTE(review): 417 is a hard-coded position; it raises IndexError when df_final
# holds fewer than 418 entries.
df_final[417]

'tanda baca joya wow alangkah indahnya pemandangan pesisir pantai pasir putih perbaikan kesalahan penggunaan tanda baca kalimat joya terkagum wow alangkah indahnya pemandangan pesisir pantai pasir putih joya terkagum wow alangkah indahnya pemandangan pesisir pantai pasir putih joya terkagum wow alangkah indahnya pemandangan pesisir pantai pasir putih joya terkagum wow alangkah indahnya pemandangan pesisir pantai pasir putih joya terkagum wow alangkah indahnya pemandangan pesisir pantai pasir putih'

## TF-IDF

In [9]:
#tf idf
# Learn the vocabulary and IDF weights from the cleaned documents and vectorize
# them in a single pass. fit_transform() already returns the transformed matrix,
# so the extra transform() call on the same documents was redundant work and
# has been removed.
# NOTE(review): the vectorizer is fitted on every document before the train/test
# split done later in the notebook, so held-out documents contribute to the
# vocabulary and IDF statistics.
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(df_final)

# Report the shape of the document-term matrix.
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 418, n_features: 4477


In [10]:
# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame
# (one row per document, one integer-named column per feature).
# The bare `X_train_tf` expression that used to open this cell was dropped:
# only the last expression of a cell is displayed, so it had no effect.
test = pd.DataFrame(X_train_tf.toarray())
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4467,4468,4469,4470,4471,4472,4473,4474,4475,4476
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.052999,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.066326,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


# Generate CSV Dataset for Evaluation

In [11]:
# Attach the class labels by position. `test` carries a fresh 0..n-1 index,
# whereas `dataset` keeps its original row labels after dropna(); assigning the
# Series directly would align on those labels and could yield NaN or shifted
# labels whenever rows were dropped. to_numpy() strips the index, and pandas
# raises ValueError if the lengths ever disagree.
test['label'] = dataset['label'].to_numpy()

# Persist features + label for the evaluation cells.
test.to_csv('dataset_eval_tfidf.csv', index=False, encoding='utf-8')

print("Dataset for evauation successfully generated !")

Dataset for evauation successfully generated !


# Train & Test Dataset Split

In [12]:
from sklearn.model_selection import train_test_split

# Reload the TF-IDF feature table from the CSV file.
# `dataset` is rebound to the same frame, so from here on it no longer refers
# to the raw question data.
df_eval = pd.read_csv('dataset_eval_tfidf.csv', header=0, engine='python')
dataset = df_eval

# Separate the feature columns from the target column.
features = df_eval.drop(columns='label')
labels = df_eval['label']

# Hold out 15% of the rows for testing; the seed keeps the split fixed.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.15,
    random_state=42,
)
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((355, 4477), (355,), (63, 4477), (63,))

# Single Fold Evaluation

In [13]:
from sklearn.metrics import accuracy_score

# Fit a 300-tree random forest on the training split. The seed is fixed so the
# score below is reproducible; without it each execution draws different
# bootstrap samples and feature subsets and reports a different accuracy.
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(train_x, train_y)

# Accuracy on the held-out split (displayed as the cell output).
test_pred = model.predict(test_x)
accuracy_score(test_y, test_pred)

0.8095238095238095

In [14]:
#predicted y
# Same predict() call that produced `test_pred` in the previous cell, kept under a second name.
y_pred = model.predict(test_x)

In [15]:
# Per-class precision / recall / F1 on the held-out split.
# NOTE(review): target_names assumes label codes 0, 1, 2 map to Mudah, Sedang, Sulit — confirm against the category codes.
print(metrics.classification_report(test_y, y_pred, target_names=['Mudah', 'Sedang', 'Sulit']))

              precision    recall  f1-score   support

       Mudah       0.74      0.83      0.78        24
      Sedang       0.67      0.62      0.64        13
       Sulit       0.96      0.88      0.92        26

    accuracy                           0.81        63
   macro avg       0.79      0.78      0.78        63
weighted avg       0.82      0.81      0.81        63



# Machine Learning Classification Evaluation

## Random Forest Classifier 

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Repeated evaluation of a random forest on the fixed train/test split.
# Every repetition refits the model with its own seed (random_state=i): the runs
# still differ from one another, but the per-run scores and their averages are
# now identical on every execution of the notebook. The accuracy_score import
# was also moved out of the loop body.
n_fold = 10  # number of refit-and-score repetitions

# Per-run scores, one list per metric.
acc_runs, prec_runs, rec_runs, f1_runs = [], [], [], []

for i in range(n_fold):
    model = RandomForestClassifier(n_estimators=300, random_state=i)
    model.fit(train_x, train_y)
    test_pred = model.predict(test_x)

    temp_res = accuracy_score(test_y, test_pred)
    # Macro-averaged metrics; `support` is returned as None in this mode.
    precision, recall, f_value, support = precision_recall_fscore_support(
        test_y, test_pred, average='macro'
    )

    acc_runs.append(temp_res)
    prec_runs.append(precision)
    rec_runs.append(recall)
    f1_runs.append(f_value)

    print(f'Fold {i+1} Eval...   | ACC : {temp_res} | PREC : {precision} | REC : {recall} | F1-SCORE : {f_value} | SUPP : {support}')

# Arithmetic mean of each metric over all repetitions.
res = sum(acc_runs) / n_fold
prec_res = sum(prec_runs) / n_fold
rec_res = sum(rec_runs) / n_fold
f1_res = sum(f1_runs) / n_fold

print(f'Average Accuracy Score({n_fold} Times Evaluation) : {res}')
print(f'Average Precision Score({n_fold} Times Evaluation) : {prec_res}')
print(f'Average Recall Score({n_fold} Times Evaluation) : {rec_res}')
print(f'Average F1 Score({n_fold} Times Evaluation) : {f1_res}')

Fold 1 Eval...   | ACC : 0.8253968253968254 | PREC : 0.8065268065268066 | REC : 0.7905982905982908 | F1-SCORE : 0.7965811965811967 | SUPP : None
Fold 2 Eval...   | ACC : 0.8571428571428571 | PREC : 0.8392307692307691 | REC : 0.8301282051282052 | F1-SCORE : 0.8337254901960783 | SUPP : None
Fold 3 Eval...   | ACC : 0.8253968253968254 | PREC : 0.7986324786324787 | REC : 0.7905982905982908 | F1-SCORE : 0.7937254901960783 | SUPP : None
Fold 4 Eval...   | ACC : 0.8571428571428571 | PREC : 0.8392307692307691 | REC : 0.8301282051282052 | F1-SCORE : 0.8337254901960783 | SUPP : None
Fold 5 Eval...   | ACC : 0.8571428571428571 | PREC : 0.8392307692307691 | REC : 0.8301282051282052 | F1-SCORE : 0.8337254901960783 | SUPP : None
Fold 6 Eval...   | ACC : 0.8571428571428571 | PREC : 0.8392307692307691 | REC : 0.8301282051282052 | F1-SCORE : 0.8337254901960783 | SUPP : None
Fold 7 Eval...   | ACC : 0.8412698412698413 | PREC : 0.8287037037037037 | REC : 0.8173076923076922 | F1-SCORE : 0.8211764705882353

## Naive Bayes Classifier 

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Repeated evaluation of a multinomial Naive Bayes model on the fixed
# train/test split: refit, score, and average the metrics over all runs.
n_fold = 10  # number of refit-and-score repetitions

# Per-run scores, one list per metric.
acc_runs, prec_runs, rec_runs, f1_runs = [], [], [], []

for i in range(n_fold):
    model = MultinomialNB()
    model.fit(train_x, train_y)
    test_pred = model.predict(test_x)

    temp_res = accuracy_score(test_y, test_pred)
    # Macro-averaged precision, recall and F1 for this run.
    precision, recall, f_value, support = precision_recall_fscore_support(
        test_y, test_pred, average='macro'
    )

    acc_runs.append(temp_res)
    prec_runs.append(precision)
    rec_runs.append(recall)
    f1_runs.append(f_value)

    print(f'Fold {i+1} Eval...   | ACC : {temp_res} | PREC : {precision} | REC : {recall} | F1-SCORE : {f_value} | SUPP : {support}')

# Arithmetic mean of each metric over all repetitions.
res = sum(acc_runs) / n_fold
prec_res = sum(prec_runs) / n_fold
rec_res = sum(rec_runs) / n_fold
f1_res = sum(f1_runs) / n_fold

print(f'Average Accuracy Score({n_fold} Times Evaluation) : {res}')
print(f'Average Precision Score({n_fold} Times Evaluation) : {prec_res}')
print(f'Average Recall Score({n_fold} Times Evaluation) : {rec_res}')
print(f'Average F1 Score({n_fold} Times Evaluation) : {f1_res}')

Fold 1 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP : None
Fold 2 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP : None
Fold 3 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP : None
Fold 4 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP : None
Fold 5 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP : None
Fold 6 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP : None
Fold 7 Eval...   | ACC : 0.6190476190476191 | PREC : 0.607843137254902 | REC : 0.5993589743589745 | F1-SCORE : 0.5874258219727346 | SUPP

## SVM Classifier 

In [18]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Repeated evaluation of a linear-kernel SVM on the fixed train/test split:
# refit, score, and average the metrics over all runs.
n_fold = 10  # number of refit-and-score repetitions

# Per-run scores, one list per metric.
acc_runs, prec_runs, rec_runs, f1_runs = [], [], [], []

for i in range(n_fold):
    model = SVC(kernel='linear')
    model.fit(train_x, train_y)
    test_pred = model.predict(test_x)

    temp_res = accuracy_score(test_y, test_pred)
    # Macro-averaged precision, recall and F1 for this run.
    precision, recall, f_value, support = precision_recall_fscore_support(
        test_y, test_pred, average='macro'
    )

    acc_runs.append(temp_res)
    prec_runs.append(precision)
    rec_runs.append(recall)
    f1_runs.append(f_value)

    print(f'Fold {i+1} Eval...   | ACC : {temp_res} | PREC : {precision} | REC : {recall} | F1-SCORE : {f_value} | SUPP : {support}')

# Arithmetic mean of each metric over all repetitions.
res = sum(acc_runs) / n_fold
prec_res = sum(prec_runs) / n_fold
rec_res = sum(rec_runs) / n_fold
f1_res = sum(f1_runs) / n_fold

print(f'Average Accuracy Score({n_fold} Times Evaluation) : {res}')
print(f'Average Precision Score({n_fold} Times Evaluation) : {prec_res}')
print(f'Average Recall Score({n_fold} Times Evaluation) : {rec_res}')
print(f'Average F1 Score({n_fold} Times Evaluation) : {f1_res}')

Fold 1 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313 | SUPP : None
Fold 2 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313 | SUPP : None
Fold 3 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313 | SUPP : None
Fold 4 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313 | SUPP : None
Fold 5 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313 | SUPP : None
Fold 6 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313 | SUPP : None
Fold 7 Eval...   | ACC : 0.6190476190476191 | PREC : 0.6071428571428571 | REC : 0.5950854700854701 | F1-SCORE : 0.5979637646304313