## preprocessing

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources used by the preprocessing functions below.
nltk.download('stopwords')
nltk.download('punkt')
# The recorded output of this cell shows 'punkt_tab' being fetched even though
# the code never asked for it: recent NLTK releases make word_tokenize load the
# 'punkt_tab' tables. Request it explicitly so a fresh-kernel run on a clean
# machine does not fail with a LookupError.
nltk.download('punkt_tab')

# Load the dataset (example: reading from a CSV file)
file_path = "CISI_ALL.csv"  # Replace with the path to your file
data = pd.read_csv(file_path)

# Fungsi preprocessing
def preprocess_text(text):
    """Normalise one text value into a space-joined string of stemmed tokens.

    Steps, in order: lower-case the text, delete every character that is not
    ``a``-``z`` or whitespace, tokenise with ``word_tokenize``, drop English
    stop words, and Porter-stem what is left.

    Args:
        text: The raw text. Any non-string value (``None``, NaN from a
            missing CSV cell, numbers) is treated as an empty document.

    Returns:
        str: The stemmed tokens joined by single spaces; ``""`` when the
        input is not a string or no token survives.
    """
    # Missing cells reach .apply() as float NaN; without this guard the
    # .lower() call below raises AttributeError and aborts the whole column.
    if not isinstance(text, str):
        return ""

    stop_words, stemmer = _stemming_resources()

    # 1. Case folding
    text = text.lower()

    # 2. Strip digits, punctuation and other special characters. Characters
    #    are deleted rather than replaced by a space, so words joined only by
    #    punctuation (e.g. a hyphen) fuse into a single token.
    text = re.sub(r'[^a-z\s]', '', text)

    # 3-5. Tokenise, drop stop words, then stem the survivors
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return " ".join(tokens)


def _stemming_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    cached = getattr(_stemming_resources, "_cached", None)
    if cached is None:
        # stopwords.words() reads the corpus file each time it is called, so
        # doing this once instead of once per row matters on large columns.
        cached = (frozenset(stopwords.words('english')), PorterStemmer())
        _stemming_resources._cached = cached
    return cached

# Destination of the stemmed dataset; named first so the save call and the
# status message below refer to the same value.
output_file = "preprocessed_dataset.csv"

# Run the stemming pipeline over every query and keep it next to the raw text
data['Preprocessed_Query'] = data['Query'].map(preprocess_text)

# Write the augmented frame to disk without the index column
data.to_csv(path_or_buf=output_file, index=False)

print(f"Preprocessing selesai. Dataset tersimpan di {output_file}.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OPTION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OPTION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\OPTION\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Preprocessing selesai. Dataset tersimpan di preprocessed_dataset.csv.


In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the NLTK resources needed for lemmatization if they are not present yet
import nltk
for resource in (
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',  # additional-language support
):
    nltk.download(resource)

# Fungsi untuk mendapatkan part of speech (POS) tag yang cocok untuk lemmatization
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own (as a one-element list) and only the first
    letter of the resulting tag is inspected: J, N, V and R select the
    adjective, noun, verb and adverb constants respectively. Any other tag
    falls back to the noun constant.
    """
    from nltk import pos_tag
    from nltk.corpus import wordnet

    tagged = pos_tag([word])
    full_tag = tagged[0][1]
    initial = full_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

# Fungsi preprocessing dengan Lemmatization
def preprocess_with_lemmatization(text):
    """Normalise one text value into a space-joined string of lemmatized tokens.

    Steps, in order: lower-case the text, delete every character that is not
    ``a``-``z`` or whitespace, tokenise with ``word_tokenize``, drop English
    stop words, and lemmatize each remaining token using the part of speech
    reported by ``get_wordnet_pos``.

    Args:
        text: The raw text. Any non-string value (``None``, NaN from a
            missing CSV cell, numbers) is treated as an empty document.

    Returns:
        str: The lemmas joined by single spaces; ``""`` when the input is not
        a string or no token survives.
    """
    # Missing cells arrive as float NaN; .lower() on them would raise
    # AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _lemmatization_resources()

    # Case folding
    text = text.lower()
    # Remove special characters, digits and punctuation (deleted, not spaced)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenise, drop stop words, then lemmatize with a per-word POS hint
    tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return " ".join(tokens)


def _lemmatization_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    cached = getattr(_lemmatization_resources, "_cached", None)
    if cached is None:
        # stopwords.words() re-reads the corpus file on every call, so build
        # the set once instead of once per document.
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _lemmatization_resources._cached = cached
    return cached

# Destination of the lemmatized dataset
output_file = "lemmatized1_dataset.csv"

# Replace the preprocessed column with the lemmatized version of every query
data['Preprocessed_Query'] = data['Query'].map(preprocess_with_lemmatization)

# Save the result, leaving the index out of the file
data.to_csv(path_or_buf=output_file, index=False)
print(f"Dataset dengan lemmatization disimpan di {output_file}.")


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\OPTION\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OPTION\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\OPTION\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Dataset dengan lemmatization disimpan di lemmatized1_dataset.csv.


## word embedding dengan gensim

In [12]:
from gensim.models import Word2Vec
import pandas as pd
import ast

# Load the dataset that has already been preprocessed
file_path = "lemmatized1_dataset.csv"  # Replace with the path to your preprocessed file
data = pd.read_csv(file_path)

# read_csv parses empty fields as NaN by default, so any query whose tokens
# were all removed during preprocessing comes back as a float instead of "".
# Restore the empty string so the later .split() calls keep working.
data['Preprocessed_Query'] = data['Preprocessed_Query'].fillna("")

# Pastikan teks telah diubah menjadi daftar token (jika belum)
def text_to_tokens(text):
    """Split a preprocessed string on whitespace into a list of tokens.

    Args:
        text: A space-separated token string. Non-string values (for example
            the NaN that an empty CSV field is read back as) are treated as
            an empty document.

    Returns:
        list[str]: The tokens, or ``[]`` for empty or non-string input.
    """
    if not isinstance(text, str):
        return []
    return text.split()

# Turn every preprocessed query into its list of tokens
data['Tokenized_Query'] = data['Preprocessed_Query'].map(text_to_tokens)

# Word2Vec is trained on a plain list of tokenized sentences
sentences = list(data['Tokenized_Query'])

# Training hyper-parameters, gathered in one place
w2v_params = {
    "vector_size": 100,  # dimensionality of the embedding vectors
    "window": 5,         # number of words before/after (context window)
    "min_count": 2,      # minimum occurrences for a word to be included
    "workers": 4,        # number of threads used for parallelisation
    "sg": 1,             # skip-gram (1) or CBOW (0)
}

# Train the Word2Vec model
w2v_model = Word2Vec(sentences=sentences, **w2v_params)

# Persist the trained model to a file
w2v_model.save("word2vec_model.model")
print("Model Word2Vec telah berhasil dilatih dan disimpan ke file.")

# Example: look up the vector of a single word
word = "library"  # Replace with the word you want to inspect
if word not in w2v_model.wv:
    print(f"Kata '{word}' tidak ditemukan dalam model Word2Vec.")
else:
    print(f"Vektor untuk kata '{word}':\n{w2v_model.wv[word]}")


Model Word2Vec telah berhasil dilatih dan disimpan ke file.
Vektor untuk kata 'library':
[ 0.03004955  0.02732495  0.3893054   0.26482025 -0.27534643 -0.39929053
  0.49276295  0.43581352 -0.05768473  0.01725278  0.38679412 -0.19372854
 -0.37777913  0.04664098 -0.12557149 -0.34858975  0.11002731 -0.15013285
 -0.19472124 -0.34154102  0.29538974  0.00332176  0.14704472  0.06487579
  0.16324086 -0.32565227  0.17737181  0.21696636 -0.19167136 -0.11441456
 -0.17317359  0.00981066  0.19798115 -0.07728899  0.02967602  0.29597965
  0.089549    0.05817123  0.05290496 -0.60210294 -0.2613334   0.22473913
 -0.12272639 -0.02433754 -0.04112124 -0.1941676   0.0311154   0.26438338
  0.0153446   0.25202006 -0.5001134  -0.3737789  -0.12476476  0.11853876
  0.29325184 -0.13920529  0.06084615  0.03410287  0.01571178  0.22537446
 -0.11768179  0.15422213  0.0294282  -0.2825325  -0.21761458  0.02165202
  0.24223983  0.2450874  -0.27064526  0.09994075  0.04349189  0.21470197
  0.5163331  -0.07059516  0.1611818

In [13]:
# Reload the trained model from the file written by the training cell
w2v_model = Word2Vec.load("word2vec_model.model")

# Example: look up the vector of a single word
word = "library"  # Replace with the word you want to inspect
if word not in w2v_model.wv:
    print(f"Kata '{word}' tidak ditemukan dalam model Word2Vec.")
else:
    print(f"Vektor untuk kata '{word}':\n{w2v_model.wv[word]}")

Vektor untuk kata 'library':
[ 0.03004955  0.02732495  0.3893054   0.26482025 -0.27534643 -0.39929053
  0.49276295  0.43581352 -0.05768473  0.01725278  0.38679412 -0.19372854
 -0.37777913  0.04664098 -0.12557149 -0.34858975  0.11002731 -0.15013285
 -0.19472124 -0.34154102  0.29538974  0.00332176  0.14704472  0.06487579
  0.16324086 -0.32565227  0.17737181  0.21696636 -0.19167136 -0.11441456
 -0.17317359  0.00981066  0.19798115 -0.07728899  0.02967602  0.29597965
  0.089549    0.05817123  0.05290496 -0.60210294 -0.2613334   0.22473913
 -0.12272639 -0.02433754 -0.04112124 -0.1941676   0.0311154   0.26438338
  0.0153446   0.25202006 -0.5001134  -0.3737789  -0.12476476  0.11853876
  0.29325184 -0.13920529  0.06084615  0.03410287  0.01571178  0.22537446
 -0.11768179  0.15422213  0.0294282  -0.2825325  -0.21761458  0.02165202
  0.24223983  0.2450874  -0.27064526  0.09994075  0.04349189  0.21470197
  0.5163331  -0.07059516  0.16118188  0.18950875  0.01418779 -0.07133402
 -0.25276366 -0.304926

In [16]:
import numpy as np

def document_vector(doc):
    """Average the word vectors of the words in a document.

    The document is split on whitespace and looked up in the notebook-level
    ``w2v_model``; words the model does not know are skipped. When no word is
    known, a zero vector of length ``w2v_model.vector_size`` is returned.
    """
    known_vectors = []
    for token in doc.split():
        if token in w2v_model.wv:
            known_vectors.append(w2v_model.wv[token])

    if not known_vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Example: embed the first document of the dataset
first_document = data['Preprocessed_Query'][0]
doc_vector = document_vector(first_document)
print(f"Vektor dokumen pertama:\n{doc_vector}")


Vektor dokumen pertama:
[-0.05298809  0.14883873  0.06761188  0.1690371  -0.02752658 -0.33220217
  0.11274979  0.33428666 -0.11857119  0.06671059  0.00186981 -0.18705064
 -0.08967867 -0.03201508 -0.00272982 -0.19893683  0.02848889 -0.31452104
 -0.03122917 -0.3317745   0.12665766  0.13904852  0.11094927  0.07895303
 -0.12379958 -0.1105877  -0.0526808  -0.03794871 -0.12756772 -0.02031747
  0.1555505  -0.0566194   0.05236406 -0.01288007 -0.15826122  0.25646824
  0.12720212 -0.22771665 -0.10631934 -0.40095854 -0.08801356 -0.08335936
 -0.01187212  0.03604522  0.17045309  0.01405964 -0.13940947  0.01502349
 -0.09380537  0.09588465 -0.05034627 -0.18002313 -0.1004464   0.03124702
 -0.02109029  0.02198052  0.16024938  0.0864702  -0.10872496 -0.00747664
 -0.03431132  0.1350171  -0.00344313 -0.08790675 -0.2598946   0.13713263
  0.07769086  0.1490937  -0.18448609  0.29003373 -0.06146934  0.09239207
  0.2964416  -0.01541163  0.14396985  0.10883104 -0.08803122  0.00863197
 -0.19573492 -0.12197206  0

In [17]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fungsi untuk membuat rata-rata vektor kata dalam dokumen
def average_vector(text, model):
    """Compute the mean word vector of ``text`` under a Word2Vec-style model.

    Args:
        text: Whitespace-separated tokens.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of every token found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    # Only words present in the model contribute to the average
    found = [model.wv[token] for token in text.split() if token in model.wv]
    if found:
        return np.mean(found, axis=0)
    # No matching word: fall back to the zero vector
    return np.zeros(model.vector_size)

# Preprocess semua dokumen dan hitung vektornya
def preprocess_and_vectorize_documents(documents, model):
    """Preprocess each document and convert it into its average word vector.

    Every document is passed through ``preprocess_with_lemmatization`` and the
    cleaned text is then embedded with ``average_vector``. The result is a
    list with one vector per input document, in input order.
    """
    return [
        average_vector(preprocess_with_lemmatization(raw_doc), model)
        for raw_doc in documents
    ]

# Fungsi untuk mencari dokumen relevan
def search_documents(query, model, doc_vectors, documents, top_n=5):
    """Rank documents by cosine similarity to a free-text query.

    Args:
        query: Raw query text; it is run through
            ``preprocess_with_lemmatization`` before being embedded.
        model: Word2Vec-style model handed to ``average_vector``.
        doc_vectors: One vector per document, aligned with ``documents``.
        documents: The documents to return, indexable by position.
        top_n: Maximum number of results to return.

    Returns:
        list[tuple]: ``(document, score)`` pairs ordered by descending score.
        Documents with equal scores keep their corpus order. An empty corpus
        yields ``[]``.
    """
    # Nothing to rank; also keeps the batched similarity call below from
    # being handed an empty matrix.
    if len(doc_vectors) == 0:
        return []

    # Preprocess and embed the query
    processed_query = preprocess_with_lemmatization(query)
    query_vector = average_vector(processed_query, model)

    # One batched call scores the query against every document, instead of
    # re-validating and re-normalising the same query vector once per document.
    scores = cosine_similarity([query_vector], doc_vectors)[0]

    # sorted() is stable, so ties stay in corpus order
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Keep the top-n hits and pair each with its document
    return [(documents[idx], score) for idx, score in ranked[:top_n]]

# Example usage
# 1. List every document. This column already holds the lemmatized text that
#    Word2Vec was trained on; empty rows are read back from CSV as NaN, so
#    normalise them to "" before splitting.
documents = data['Preprocessed_Query'].fillna("").tolist()

# 2. Compute the document vectors. The text is already preprocessed, so it is
#    embedded directly: running it through preprocess_with_lemmatization a
#    second time would POS-tag every word again and could rewrite tokens into
#    forms the model never saw during training.
doc_vectors = [average_vector(doc, w2v_model) for doc in documents]

# 3. Enter a query and search the documents
query = "library management"
results = search_documents(query, w2v_model, doc_vectors, documents)

# 4. Print the search results
print("Hasil Pencarian:")
for doc, score in results:
    print(f"Skor: {score:.4f}\nDokumen: {doc}\n")


Hasil Pencarian:
Skor: 0.9465
Dokumen: medical library assistance act analysis nlm extramural program cummings martin corn mary e imbalance medical library resource information need health professional lead reexamination mandate national library medicine legislation know medical library assistance act mlaa pass enable nlm initiate program assist nation medical library develop medical library network establishment regional medical library link nlm local institution national library medicine mlaa make available million medical library community competitive grant contract mechanism period july june total project execute resource research development training construction regional medical library publication special scientific project assessment give program impact national library medicine individual medical library aggregate program significantly improve library information service professional health user principal limitation inadequate funding accomplish level originally state objectiv