In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import heapq
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import get_data

# Fetch the NLTK resources used further down: the 'punkt' tokenizer data
# (for word_tokenize) and the 'stopwords' corpus (for stopwords.words).
nltk.download('punkt')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\badru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\badru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Raw corpus to index, taken from the project-local `get_data` module.
# NOTE(review): presumably an ordered collection of HTML/plain-text strings,
# one per journal entry — confirm against get_data.
documents = get_data.master_data

In [3]:

# Build the TF-IDF index over the corpus.
#
# Every raw document is cleaned (markup stripped, lowercased, non-letters
# blanked out), tokenized, filtered against the Indonesian stop-word list,
# stemmed with Sastrawi and re-joined into one whitespace-separated string.

# Loop-invariant resources: build them once instead of once per document.
stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

preprocessed_documents = []
for text in documents:
    # Drop HTML tags, then normalise case and line breaks/tabs.
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r'[\n\t\r]', ' ', text)
    # Anything that is not an ASCII letter or whitespace becomes a space.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    print(text)

    tokens = word_tokenize(text)
    # `text` is already lowercase at this point, so tokens are compared as-is.
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]

    preprocessed_document = " ".join(tokens)
    preprocessed_documents.append(preprocessed_document)

# Learn the vocabulary/IDF weights and vectorise the corpus in one step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

 implementasi teknologi blockchain dalam keamanan data transaksi digital   jurnal ini membahas penerapan teknologi blockchain untuk meningkatkan keamanan transaksi digital  melalui penelitian ini  dijelaskan bagaimana blockchain dapat mengurangi risiko keamanan data pada transaksi online 
 analisis sentimen pada media sosial menggunakan metode pembelajaran mesin   studi ini menginvestigasi berbagai teknik pembelajaran mesin untuk menganalisis sentimen dalam konten media sosial  penelitian ini mengevaluasi kinerja berbagai algoritma dalam mengklasifikasikan sentimen yang diekspresikan dalam konten online 
 teknik penyimpanan data yang memperhatikan kebijakan privasi pada sistem kesehatan   fokus pada bidang kesehatan  penelitian ini mengeksplorasi teknik penyimpanan data yang memperhatikan kebijakan privasi untuk menghasilkan wawasan yang berarti dari data medis sensitif sambil memastikan privasi dan kerahasiaan pasien 
 optimalisasi alokasi sumber daya pada lingkungan komputasi awan   

In [4]:
# Ad-hoc search: rank the indexed corpus against a single query string.
text = "blockchain"

# Clean the query the same way the corpus was cleaned: strip markup,
# lowercase, then blank out line breaks/tabs and every non-letter character.
text = BeautifulSoup(text, "html.parser").get_text().lower()
text = re.sub(r'[^a-zA-Z\s]', ' ', re.sub(r'[\n\t\r]', ' ', text))

# Indonesian stop words and the Sastrawi stemmer used to normalise tokens.
stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Tokenize, drop stop words, then stem whatever remains.
tokens = word_tokenize(text)
tokens = list(filter(lambda token: token.lower() not in stop_words, tokens))
tokens = list(map(stemmer.stem, tokens))

preprocessed_document = " ".join(tokens)
query = preprocessed_document

# Vectorise the query with the fitted vectorizer and compare it with every
# row of the corpus matrix.
query_vector = vectorizer.transform([query])
similarity_scores = cosine_similarity(query_vector, tfidf_matrix)

# Indices of the 10 highest-scoring documents, best first.
best_match_indices = heapq.nlargest(
    10,
    range(similarity_scores.shape[1]),
    key=lambda position: similarity_scores[0][position],
)
best_match_scores = [similarity_scores[0][index] for index in best_match_indices]

for index, score in zip(best_match_indices, best_match_scores):
    print(f"Jurnal ke: {index} memiliki kemiripan: {score*100}%")

Jurnal ke: 0 memiliki kemiripan: 46.24170308570497%
Jurnal ke: 31 memiliki kemiripan: 37.35589175334809%
Jurnal ke: 7 memiliki kemiripan: 36.72018284612384%
Jurnal ke: 21 memiliki kemiripan: 36.23420312731056%
Jurnal ke: 1 memiliki kemiripan: 0.0%
Jurnal ke: 2 memiliki kemiripan: 0.0%
Jurnal ke: 3 memiliki kemiripan: 0.0%
Jurnal ke: 4 memiliki kemiripan: 0.0%
Jurnal ke: 5 memiliki kemiripan: 0.0%
Jurnal ke: 6 memiliki kemiripan: 0.0%


In [5]:
def save_model(model, filepath):
    """Serialize ``model`` to ``filepath`` using joblib.

    Args:
        model: Object to persist; handed unchanged to ``joblib.dump``.
        filepath: Destination handed to ``joblib.dump`` as the file name.

    Returns:
        None. Whatever ``joblib.dump`` returns is discarded.
    """
    joblib.dump(value=model, filename=filepath)

def load_model(filepath):
    """Load and return the object stored at ``filepath`` via ``joblib.load``.

    Args:
        filepath: Location handed to ``joblib.load`` as the file name.

    Returns:
        The object reconstructed by ``joblib.load``.

    NOTE(review): joblib persistence is pickle-based, so this should only be
    pointed at files from a trusted source — confirm where the files come from.
    """
    restored = joblib.load(filename=filepath)
    return restored



In [6]:
# Persist the fitted vectorizer and the corpus TF-IDF matrix, in that order.
for artifact, target_path in (
    (vectorizer, "tfidf_document.joblib"),
    (tfidf_matrix, "tfidf_matrix.joblib"),
):
    save_model(artifact, target_path)

In [7]:
# Round-trip check: reload the persisted artifacts and run a query against them.
loaded_vectorizer = load_model("tfidf_document.joblib")
loaded_tfidf_matrix = load_model("tfidf_matrix.joblib")

text = "jurnal"

# Clean the query: strip markup, lowercase, then blank out line breaks/tabs
# and every character that is not a letter or whitespace.
text = BeautifulSoup(text, "html.parser").get_text().lower()
text = re.sub(r'[\n\t\r]', ' ', text)
text = re.sub(r'[^a-zA-Z\s]', ' ', text)

# Indonesian stop words and the Sastrawi stemmer used to normalise tokens.
stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Tokenize, discard stop words and stem the survivors in a single pass.
tokens = [
    stemmer.stem(token)
    for token in word_tokenize(text)
    if token.lower() not in stop_words
]

preprocessed_document = " ".join(tokens)
query = preprocessed_document

# Score the query against the reloaded matrix using the reloaded vectorizer.
query_vector = loaded_vectorizer.transform([query])
similarity_scores = cosine_similarity(query_vector, loaded_tfidf_matrix)

# Indices of the 5 highest-scoring documents, best first.
best_match_indices = heapq.nlargest(
    5,
    range(similarity_scores.shape[1]),
    key=lambda position: similarity_scores[0][position],
)
best_match_scores = [similarity_scores[0][index] for index in best_match_indices]

r = zip(best_match_indices, best_match_scores)
for index, score in r:
    print(f"Jurnal ke: {index} memiliki kemiripan: {score*100}%")


Jurnal ke: 0 memiliki kemiripan: 17.930919059190206%
Jurnal ke: 4 memiliki kemiripan: 14.944541856131133%
Jurnal ke: 1 memiliki kemiripan: 0.0%
Jurnal ke: 2 memiliki kemiripan: 0.0%
Jurnal ke: 3 memiliki kemiripan: 0.0%
