# Sistem Rekomendasi Jurnal

In [1]:
import tkinter as tk
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Text Preprocessing
*Text preprocessing* atau prapemrosesan teks adalah serangkaian teknik yang digunakan untuk mempersiapkan dokumen teks untuk diproses oleh mesin pencari yang melibatkan beberapa tahap seperti:
- Special characters removal
- Stopwords removal
- Casefolding
- Tokenization
- Stemming

In [3]:
def remove_punctuation(text):
    # remove punctuation using string library
    text = "".join([char for char in text if char not in string.punctuation])
    # remove certain characters
    text = re.sub(r'\s+', ' ', text)
    return text

def preprocess_text(text):
    # Cleaning & Casefolding
    text = remove_punctuation(text.lower())

    # Tokenization
    words = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    return words


Berikut merupakan contoh hasil dari tahapan text preprocessing yang dilakukan:

In [4]:
text = "This is an example of text preprocessing using NLTK. It includes casefolding, tokenization, stopword removal, and stemming."
preprocessed_text = preprocess_text(text)
print(preprocessed_text)

['exampl', 'text', 'preprocess', 'use', 'nltk', 'includ', 'casefold', 'token', 'stopword', 'remov', 'stem']


### Feature Extraction
Ekstraksi fitur dilakukan dengan menerapkan tahap prapemrosesan kata pada data teks judul dan abstrak masing-masing dokumen yang terdapat dalam dataset, lalu menghitung vektor bobot TF-IDF untuk setiap data teks yang sudah terproses.

In [5]:
journalDtf = pd.read_csv("Datasets/data.csv")
journalDtf = journalDtf.drop('no', axis=1)
journalDtf['combined'] = journalDtf['judul'] + " " + journalDtf['abstrak']

features = []

# Melakukan text preprocessing pada setiap teks dokumen
for i in range(len(journalDtf)):
    features.append(preprocess_text(journalDtf.loc[i, 'combined']))

# Menghitung vector bobot TF-IDF masing-masing fitur 
vectorizer = TfidfVectorizer(analyzer=lambda x: x)
tfidfMatrix = vectorizer.fit_transform(features)

# Print contoh hasil preprocessing kata
print("Hasil prapemrosesan teks:")

for i in range(5):
    print(features[i])

# Print vektor tfidf dari dataset
print("\nHasil perhitungan vektor bobot TF-IDF:")
print(tfidfMatrix.toarray()[:5])

Hasil prapemrosesan teks:
['dynam', 'studi', 'export', 'china', 'south', 'korea', 'econom', 'growth', 'china', 'paper', 'appli', 'annual', 'data', '1998', '2016', 'search', 'dynam', 'oper', 'mechan', 'export', 'china', 'south', 'korea', 'econom', 'growth', 'china', 'vector', 'error', 'correct', 'model', 'util', 'conduct', 'empir', 'analysi', 'result', 'indic', 'longrun', 'relationship', 'specif', 'export', 'china', 'south', 'korea', 'increas', '1', 'per', 'cent', 'econom', 'growth', 'china', 'increas', '0769', 'per', 'cent', 'meanwhil', 'result', 'granger', 'causal', 'test', 'also', 'reveal', 'unidirect', 'causal', 'exist', '5', 'per', 'cent', 'signific', 'level', 'importantli', 'result', 'vector', 'error', 'correct', 'mechan', 'show', 'econom', 'growth', 'deriv', 'longrun', 'equilibrium', 'short', 'run', 'return', 'longrun', 'equilibrium', '22', 'percent']
['panel', 'approach', 'govern', 'expenditur', 'influenc', 'human', 'develop', 'index', 'studi', 'investig', 'influenc', 'govern', 

### Pencarian Query and Perbandingan Cosine Similarity
Pencarian dan perangkingan hasil pencarian query dilakukan dengan menghitung vektor bobot TF-IDF dan membandingkan seberapa dekat sudut vektor bobot query dengan vektor bobot lainnya yang ada pada dokumen dalam dataset. 

Perhitungan sudut ini dapat dilakukan dengan menghitung nilai $cos(\theta)$ dari sudut antar vektor query dan vektor dokumen dengan formula berikut:

$$cos(\theta) = \frac{\vec{a} \cdot \vec{b}}{|\vec{a}||\vec{b}|}$$

Hasil perhitungan kemudian akan digunakan sebagai acuan untuk mengurutkan hasil pencarian dokumen.

In [6]:
def getSearchQueryRangking(query):
    queryTfIdf = vectorizer.transform([preprocess_text(query)])
    cosine_sim = cosine_similarity(queryTfIdf, tfidfMatrix)

    return cosine_sim.argsort()[0][::-1]

Implementasi UI sederhana untuk sistem pencarian:

In [25]:
def showSearchResults():
    output_area.delete("1.0", "end")
    
    query = input_field.get("1.0", tk.END).strip()
    rangkingIndex = getSearchQueryRangking(query)

    for i in range(10):
        output_area.insert(tk.END, journalDtf.loc[rangkingIndex[i], 'judul'] + "\n\n")

window = tk.Tk()

input_field = tk.Text(window, height=1)
input_field.pack()

search_button = tk.Button(window, text="Search", command=showSearchResults)
search_button.pack()

output_area = tk.Text(window)
output_area.pack()

window.mainloop()

### Evaluasi

In [26]:
query = "Education"
relevantIndexes = [4, 5, 11, 12, 18, 19, 20]

rangkingIndex = getSearchQueryRangking(query)
searchResults = 10

confMatrix = [[0, 0], [0, 0]]
precision = 0
recall = 0
MAPval = 0


for i in  range(len(rangkingIndex)):
    if rangkingIndex[i] in relevantIndexes:
        confMatrix[0][0] += 1
        MAPval += confMatrix[0][0]/(confMatrix[0][0] + confMatrix[0][1])
    else:
        confMatrix[0][1] += 1
    
    confMatrix[1][0] = len(relevantIndexes) - confMatrix[0][0]
    confMatrix[1][1] = len(rangkingIndex) - i - 1

    if i == searchResults:
        precision = confMatrix[0][0] / (confMatrix[0][0] + confMatrix[0][1])
        recall = confMatrix[0][0] / (confMatrix[0][0] + confMatrix[1][0])

MAPval / confMatrix[0][0]

print("MAP      : ", MAPval)
print("Precision: ", precision)
print("Recall   : ", recall)
print(confMatrix)



MAP      :  3.1686507936507935
Precision:  0.45454545454545453
Recall   :  0.7142857142857143
[[6, 14], [1, 0]]
