Install library Sastrawi untuk preprocessing dan import library yang diperlukan

In [None]:
!pip install Sastrawi



In [None]:
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string, re
import math
import numpy as np
from numpy.linalg import norm

In [None]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

# PRE-PROCESSING




Fungsi tokenize untuk melakukan tokenization pada document yang meliputi mengubah document menjadi lowercase, menghapus tanda baca, menghapus spasi yang duplikat, menghapus angka

In [None]:
def tokenize(sentences):
    sentences_lower = sentences.lower()

    #hapus tanda baca
    new_sentences = sentences_lower.translate(str.maketrans("","",string.punctuation))

    #hapus spasi
    new_sentences = new_sentences.strip()

    #hapus spasi yang duplikat menjadi satu
    new_sentences = re.sub('\s+',' ', new_sentences)

    #hapus angka
    new_sentences = re.sub(r'\d+', '', new_sentences)

    #tokenization
    token = word_tokenize(new_sentences)

    return token

Proses filtering untuk menghapus stopword

In [None]:
def filter(token):
    #ambil stopword bahasa indonesia
    indonesian_stopword = set(stopwords.words("indonesian"))

    #filter token hasil tokenization dengan menghapus stopword
    token_without_stopword = []
    for word in token:
        if word not in indonesian_stopword:
            token_without_stopword.append(word)

    return token_without_stopword

Proses stemming untuk mengubah kata kerja menjadi kata baku
di proses stemming ini terdapat kata yang salah untuk UISI karena UISI seharusnya tidak distemming menjadi 'uis' dan juga custom stemming untuk gerakan karena harusnya menjadi 'gerak' bukan 'gera'

In [None]:
def stem(filtered_token):
    exception_word = ['uisi']
    custom_stem = {
        'gerakan': 'gerak'
    }

    #buat stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    #stem masing2 kata dari token
    stem_result = []
    for token in filtered_token:
        if token in exception_word:
            stem_result.append(token)
        else:
            stemmed_token = stemmer.stem(token)
            if token in custom_stem:
                stemmed_token = custom_stem[token]
            stem_result.append(stemmed_token)


    return stem_result

In [None]:
#@title Isi dokumen dan query
docs = {
    "d1": "Ardy Kholil Kharitsi, Guntur Anugroho Putra Abadi bangun pagi. Dia berkuliah di UISI Gresik",
    "d2": "UISI berjaya sebagai peringkat 3 Nasional PTN/ PTS Liga 3 bidang kemahasiswaan",
    "d3": "gerakan membangun Gresik kurang mendapat tanggapan masyarakat",
    "d4": "masyarakat diharapkan tanggap dengan persoalan di sekitar universitas",
    "d5": "usaha di sini sukses karena adanya Universitas Internasional Semen Indonesia (UISI)"
}

Q = "membangun gresik yang sukses dan berjaya"

In [None]:
#@title Preprocess document
preprocessed_docs = []
for doc in docs:
    tokenized = tokenize(docs[doc])
    filtered = filter(tokenized)
    stemmed = stem(filtered)
    preprocessed_docs.append(stemmed)

In [None]:
#@title Preprocess query
q_tokenized = tokenize(Q)
q_filtered = filter(q_tokenized)
q_stemmed = stem(q_filtered)
q_stemmed

['bangun', 'gresik', 'sukses', 'jaya']

In [None]:
#@title Masukkan query dalam docs
preprocessed_docs.append(q_stemmed)

In [None]:
#@title Print isi hasil akhir docs
for doc in preprocessed_docs:
  print(doc)

['ardy', 'kholil', 'kharitsi', 'guntur', 'anugroho', 'putra', 'abadi', 'bangun', 'pagi', 'kuliah', 'uisi', 'gresik']
['uisi', 'jaya', 'peringkat', 'nasional', 'ptn', 'pts', 'liga', 'bidang', 'mahasiswa']
['gerak', 'bangun', 'gresik', 'tanggap', 'masyarakat']
['masyarakat', 'harap', 'tanggap', 'universitas']
['usaha', 'sukses', 'universitas', 'internasional', 'semen', 'indonesia', 'uisi']
['bangun', 'gresik', 'sukses', 'jaya']


In [None]:
#@title Ambil semua terms untuk proses yang lain
terms = []
for doc in preprocessed_docs:
  for term in doc:
    terms.append(term)
terms = list(set(terms))
terms

['ptn',
 'bidang',
 'tanggap',
 'ardy',
 'bangun',
 'liga',
 'universitas',
 'gresik',
 'semen',
 'kholil',
 'nasional',
 'anugroho',
 'pagi',
 'putra',
 'uisi',
 'sukses',
 'peringkat',
 'kuliah',
 'masyarakat',
 'kharitsi',
 'jaya',
 'indonesia',
 'mahasiswa',
 'internasional',
 'harap',
 'usaha',
 'gerak',
 'abadi',
 'pts',
 'guntur']

# Inverted Index

Lakukan inverted index dengan menuliskan angka 1 jika term tersebut ada pada document dan angka 0 jika tidak ada

In [None]:
inverted_index = {}
for term in terms:
  doc = []
  for i in range(len(preprocessed_docs)):
    if term in preprocessed_docs[i]:
      doc.append(preprocessed_docs[i].count(term))
    else:
      doc.append(0)
  inverted_index[term] = doc

inverted_index

{'ptn': [0, 1, 0, 0, 0, 0],
 'bidang': [0, 1, 0, 0, 0, 0],
 'tanggap': [0, 0, 1, 1, 0, 0],
 'ardy': [1, 0, 0, 0, 0, 0],
 'bangun': [1, 0, 1, 0, 0, 1],
 'liga': [0, 1, 0, 0, 0, 0],
 'universitas': [0, 0, 0, 1, 1, 0],
 'gresik': [1, 0, 1, 0, 0, 1],
 'semen': [0, 0, 0, 0, 1, 0],
 'kholil': [1, 0, 0, 0, 0, 0],
 'nasional': [0, 1, 0, 0, 0, 0],
 'anugroho': [1, 0, 0, 0, 0, 0],
 'pagi': [1, 0, 0, 0, 0, 0],
 'putra': [1, 0, 0, 0, 0, 0],
 'uisi': [1, 1, 0, 0, 1, 0],
 'sukses': [0, 0, 0, 0, 1, 1],
 'peringkat': [0, 1, 0, 0, 0, 0],
 'kuliah': [1, 0, 0, 0, 0, 0],
 'masyarakat': [0, 0, 1, 1, 0, 0],
 'kharitsi': [1, 0, 0, 0, 0, 0],
 'jaya': [0, 1, 0, 0, 0, 1],
 'indonesia': [0, 0, 0, 0, 1, 0],
 'mahasiswa': [0, 1, 0, 0, 0, 0],
 'internasional': [0, 0, 0, 0, 1, 0],
 'harap': [0, 0, 0, 1, 0, 0],
 'usaha': [0, 0, 0, 0, 1, 0],
 'gerak': [0, 0, 1, 0, 0, 0],
 'abadi': [1, 0, 0, 0, 0, 0],
 'pts': [0, 1, 0, 0, 0, 0],
 'guntur': [1, 0, 0, 0, 0, 0]}

# TF-IDF

Melakukan perhitungan tf natural, tf logaritma, idf dan hasil tf-idf

In [None]:
#@title TF Natural
tf_nat = {}
for term in terms:
  term_total = 0
  for i in range(len(preprocessed_docs)):
    term_total += preprocessed_docs[i].count(term)
  tf_nat[term] = term_total

tf_nat

{'ptn': 1,
 'bidang': 1,
 'tanggap': 2,
 'ardy': 1,
 'bangun': 3,
 'liga': 1,
 'universitas': 2,
 'gresik': 3,
 'semen': 1,
 'kholil': 1,
 'nasional': 1,
 'anugroho': 1,
 'pagi': 1,
 'putra': 1,
 'uisi': 3,
 'sukses': 2,
 'peringkat': 1,
 'kuliah': 1,
 'masyarakat': 2,
 'kharitsi': 1,
 'jaya': 2,
 'indonesia': 1,
 'mahasiswa': 1,
 'internasional': 1,
 'harap': 1,
 'usaha': 1,
 'gerak': 1,
 'abadi': 1,
 'pts': 1,
 'guntur': 1}

In [None]:
#@title TF LOG10

tf_log = {}
for term in terms:
  term_total = 0
  for i in range(len(preprocessed_docs)):
    term_total += preprocessed_docs[i].count(term)
  tf_log[term] = 1+math.log10(term_total)

tf_log

{'ptn': 1.0,
 'bidang': 1.0,
 'tanggap': 1.3010299956639813,
 'ardy': 1.0,
 'bangun': 1.4771212547196624,
 'liga': 1.0,
 'universitas': 1.3010299956639813,
 'gresik': 1.4771212547196624,
 'semen': 1.0,
 'kholil': 1.0,
 'nasional': 1.0,
 'anugroho': 1.0,
 'pagi': 1.0,
 'putra': 1.0,
 'uisi': 1.4771212547196624,
 'sukses': 1.3010299956639813,
 'peringkat': 1.0,
 'kuliah': 1.0,
 'masyarakat': 1.3010299956639813,
 'kharitsi': 1.0,
 'jaya': 1.3010299956639813,
 'indonesia': 1.0,
 'mahasiswa': 1.0,
 'internasional': 1.0,
 'harap': 1.0,
 'usaha': 1.0,
 'gerak': 1.0,
 'abadi': 1.0,
 'pts': 1.0,
 'guntur': 1.0}

In [None]:
#@title IDF

idf = {}
for term in terms:
  N = len(preprocessed_docs)-1
  idf[term] = math.log10(N/tf_nat[term])

idf

{'ptn': 0.6989700043360189,
 'bidang': 0.6989700043360189,
 'tanggap': 0.3979400086720376,
 'ardy': 0.6989700043360189,
 'bangun': 0.22184874961635637,
 'liga': 0.6989700043360189,
 'universitas': 0.3979400086720376,
 'gresik': 0.22184874961635637,
 'semen': 0.6989700043360189,
 'kholil': 0.6989700043360189,
 'nasional': 0.6989700043360189,
 'anugroho': 0.6989700043360189,
 'pagi': 0.6989700043360189,
 'putra': 0.6989700043360189,
 'uisi': 0.22184874961635637,
 'sukses': 0.3979400086720376,
 'peringkat': 0.6989700043360189,
 'kuliah': 0.6989700043360189,
 'masyarakat': 0.3979400086720376,
 'kharitsi': 0.6989700043360189,
 'jaya': 0.3979400086720376,
 'indonesia': 0.6989700043360189,
 'mahasiswa': 0.6989700043360189,
 'internasional': 0.6989700043360189,
 'harap': 0.6989700043360189,
 'usaha': 0.6989700043360189,
 'gerak': 0.6989700043360189,
 'abadi': 0.6989700043360189,
 'pts': 0.6989700043360189,
 'guntur': 0.6989700043360189}

In [None]:
#@title TF-IDF

tf_idf = {}
for term in terms:
  tf_idf[term] = tf_log[term]*idf[term]

tf_idf

{'ptn': 0.6989700043360189,
 'bidang': 0.6989700043360189,
 'tanggap': 0.5177318877571058,
 'ardy': 0.6989700043360189,
 'bangun': 0.32769750339130055,
 'liga': 0.6989700043360189,
 'universitas': 0.5177318877571058,
 'gresik': 0.32769750339130055,
 'semen': 0.6989700043360189,
 'kholil': 0.6989700043360189,
 'nasional': 0.6989700043360189,
 'anugroho': 0.6989700043360189,
 'pagi': 0.6989700043360189,
 'putra': 0.6989700043360189,
 'uisi': 0.32769750339130055,
 'sukses': 0.5177318877571058,
 'peringkat': 0.6989700043360189,
 'kuliah': 0.6989700043360189,
 'masyarakat': 0.5177318877571058,
 'kharitsi': 0.6989700043360189,
 'jaya': 0.5177318877571058,
 'indonesia': 0.6989700043360189,
 'mahasiswa': 0.6989700043360189,
 'internasional': 0.6989700043360189,
 'harap': 0.6989700043360189,
 'usaha': 0.6989700043360189,
 'gerak': 0.6989700043360189,
 'abadi': 0.6989700043360189,
 'pts': 0.6989700043360189,
 'guntur': 0.6989700043360189}

Ambil tf idf kuadrat masing2 document

In [None]:
docs_tf_idf = []
for i in range(len(preprocessed_docs)-1):
  doc_tf_idf = []
  for term in preprocessed_docs[i]:
    doc_tf_idf.append(tf_idf[term]**2)
  docs_tf_idf.append(doc_tf_idf)

docs_tf_idf

[[0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.10738565372889143,
  0.4885590669614942,
  0.4885590669614942,
  0.10738565372889143,
  0.10738565372889143],
 [0.10738565372889143,
  0.2680463076005364,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942],
 [0.4885590669614942,
  0.10738565372889143,
  0.10738565372889143,
  0.2680463076005364,
  0.2680463076005364],
 [0.2680463076005364,
  0.4885590669614942,
  0.2680463076005364,
  0.2680463076005364],
 [0.4885590669614942,
  0.2680463076005364,
  0.2680463076005364,
  0.4885590669614942,
  0.4885590669614942,
  0.4885590669614942,
  0.10738565372889143]]

# Ambil tf idf kuadrat query
Pada tahap ini isi arraynya disesuaikan dengan isi masing-masing document sehingga memudahkan dalam proses cosine similarity nantinya

Ketika suatu term tidak ada pada salah satu document (query maupun dokumen yang dibandingkan) maka nilainya diisi 0

In [None]:
query_tf_idf = []
for i in range(len(preprocessed_docs)-1):
  q_doc_tf_idf = []
  for term in preprocessed_docs[i]:
    if term in q_stemmed:
      q_doc_tf_idf.append(tf_idf[term]**2)
    else:
      q_doc_tf_idf.append(0)
  query_tf_idf.append(q_doc_tf_idf)
query_tf_idf

[[0, 0, 0, 0, 0, 0, 0, 0.10738565372889143, 0, 0, 0, 0.10738565372889143],
 [0, 0.2680463076005364, 0, 0, 0, 0, 0, 0, 0],
 [0, 0.10738565372889143, 0.10738565372889143, 0, 0],
 [0, 0, 0, 0],
 [0, 0.2680463076005364, 0, 0, 0, 0, 0]]

# Panjang Vektor Tiap Dokumen

In [None]:
docs_vector = []

for doc in docs_tf_idf:
  docs_vector.append(norm(doc))

query_vector = []
for query in query_tf_idf:
  query_vector.append(norm(query))

docs_vector

[1.4774317896515274,
 1.3244660188230815,
 0.6367503162153996,
 0.6739706454773308,
 1.0535602366758567]

# COSINE SIMILARITY ANTARA QUERY DAN TIAP DOCUMENT

In [None]:
#@title Cosine Similarity
cos_sim = {}

for i in range(len(docs_tf_idf)):
  cos_sim[f"doc{i+1}"] = np.dot(query_tf_idf[i], docs_tf_idf[i])/(query_vector[i]*docs_vector[i])

# ubah nan menjadi 0
for sim in cos_sim:
  if np.isnan(cos_sim[sim]):
    cos_sim[sim] = 0

cos_sim

{'doc1': 0.10279070003192427,
 'doc2': 0.20238066042549127,
 'doc3': 0.23850203767519756,
 'doc4': 0,
 'doc5': 0.2544195369846753}

In [None]:
#@title Daftar dokumen teranking yang paling mirip dengan query Q
cos_sim_sorted = dict(sorted(cos_sim.items(), key=lambda item: item[1], reverse=True))
cos_sim_sorted

{'doc5': 0.2544195369846753,
 'doc3': 0.23850203767519756,
 'doc2': 0.20238066042549127,
 'doc1': 0.10279070003192427,
 'doc4': 0}