In [1]:
pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Data handling, regex/string utilities and the NLTK toolkit.
import pandas as pd
import numpy as np
import re
import nltk
import string
# Fetch the NLTK corpora/models this notebook relies on (skipped by NLTK when
# already up to date):
#   stopwords        -> stopwords.words('indonesian' / 'english')
#   punkt            -> tokenizer model behind word_tokenize
#   wordnet, omw-1.4 -> data for WordNetLemmatizer (omw-1.4 presumably only
#                       needed by some NLTK versions -- TODO confirm)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Sastrawi provides the stemmer for Bahasa Indonesia.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# TF-IDF vectorisation and the dot-product kernel used for similarity scores.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Matches HTML tags plus named, decimal and hexadecimal character entities.
# IGNORECASE because this pattern runs before case folding, so entities such as
# "&Eacute;" or "&#X1F;" must already be caught here.
_HTML_TAG_OR_ENTITY = re.compile(
    '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE
)

# Translation table mapping every ASCII punctuation character to a space.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)

# Lazily-filled cache of the NLP resources shared by all preprocessing() calls.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Build (once) and return the stopword set, lemmatizer and stemmer."""
    if not _PREPROCESSING_RESOURCES:
        # Build into a local dict first so a failure part-way through does not
        # leave a half-populated cache behind.
        resources = {
            "stopwords": set(stopwords.words('indonesian'))
            | set(stopwords.words('english')),
            "lemmatizer": WordNetLemmatizer(),
            "stemmer": StemmerFactory().create_stemmer(),
        }
        _PREPROCESSING_RESOURCES.update(resources)
    return _PREPROCESSING_RESOURCES


def preprocessing(sentence):
    """Normalise raw text into a space-separated string of base-form tokens.

    Steps, in order: strip HTML tags/entities, drop digits, lowercase, replace
    ASCII punctuation with spaces, collapse whitespace, tokenize, remove
    Indonesian and English stopwords, lemmatize with WordNet, then stem with
    the Sastrawi stemmer.

    Args:
        sentence (str): Raw text, possibly containing HTML markup.

    Returns:
        str: The processed tokens joined by single spaces ("" when no token
        survives).
    """
    resources = _get_preprocessing_resources()

    # Remove HTML tags and character entities
    sentence = _HTML_TAG_OR_ENTITY.sub(' ', sentence)

    # Remove numbers
    sentence = re.sub(r"\d+", "", sentence)

    # Case folding
    sentence = sentence.lower()

    # Replace punctuation with spaces in a single pass
    sentence = sentence.translate(_PUNCTUATION_TO_SPACE)

    # Trim, then collapse runs of whitespace into a single space
    sentence = re.sub(r'\s+', ' ', sentence.strip())

    # Tokenization
    tokens = nltk.tokenize.word_tokenize(sentence)

    # Remove stopwords (Bahasa Indonesia and English)
    bilingual_stopwords = resources["stopwords"]
    content_tokens = [token for token in tokens if token not in bilingual_stopwords]

    # Lemmatize (English) first, then stem (Bahasa Indonesia) each token
    lemmatize = resources["lemmatizer"].lemmatize
    stem = resources["stemmer"].stem
    base_word_tokens = [stem(lemmatize(token)) for token in content_tokens]

    # Combine the tokens into one string separated by single spaces
    return " ".join(base_word_tokens)

In [4]:
# Mount Google Drive at /gdrive (Colab only) so the dataset below can be read.
from google.colab import drive
drive.mount('/gdrive')
# Switch the working directory to the mount point; file_path below is absolute,
# so reading the spreadsheet does not depend on this.
%cd /gdrive

# Load the discussion dataset from the spreadsheet into a DataFrame.
file_path = r'/gdrive/My Drive/discussion_dummy_data_v2.xlsx'
df = pd.read_excel(file_path)
# One dict per row (column name -> value); search() iterates over this list.
dummy_list_of_dict = df.to_dict(orient='records')
# Displays every record as the cell output.
dummy_list_of_dict

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


[{'combined': 'error recyclerview data tidak tampil di fragment follower dan following terdapat error recyclerview no adapter attached saat menampilkan detail user di fragment follower dan following recyclerview no adapter attached fragment follower following',
  'combined_processed': 'error recyclerview data tampil fragment follower following error recyclerview adapter attached tampil detail user fragment follower following recyclerview adapter attached fragment follower following',
  'content': 'terdapat error recyclerview no adapter attached saat menampilkan detail user di fragment follower dan following',
  'keywords': 'recyclerview,no-adapter-attached,fragment,follower,following',
  'multinum_value': '[ 3.91798094e-03  2.42242347e-02 -5.84438967e-04 -2.99954467e-04\n  7.61578616e-04 -4.98731574e-03  1.85093991e-02  8.21453892e-03\n  1.20759513e-02 -5.19720721e-04  2.34728120e-03  2.28301007e-02\n  3.58644351e-02  1.00188572e-02 -5.74384211e-03  1.34939281e-02\n  9.22510808e-04 -1.

In [5]:
# Fit a TF-IDF model on the already-preprocessed text column and vectorise it
# in one step, producing a document-term matrix.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['combined_processed'])
# (number of documents, vocabulary size)
tfidf_matrix.shape

(15, 81)

In [6]:
# Pairwise dot products between all document vectors. TfidfVectorizer
# L2-normalises rows by default, so these dot products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.02353616, 0.15192858, 0.        , 0.22105632,
        0.57342976, 0.01645691, 0.0110249 , 0.04849889, 0.36597704,
        0.        , 0.18919502, 0.0431237 , 0.        , 0.03257435],
       [0.02353616, 1.        , 0.        , 0.        , 0.        ,
        0.06116161, 0.44510214, 0.46857032, 0.18068813, 0.04509022,
        0.        , 0.46537524, 0.        , 0.01998738, 0.        ],
       [0.15192858, 0.        , 1.        , 0.47137438, 0.22764705,
        0.09256814, 0.232362  , 0.        , 0.03195443, 0.09776162,
        0.        , 0.1860535 , 0.07935126, 0.58439959, 0.3034481 ],
       [0.        , 0.        , 0.47137438, 1.        , 0.        ,
        0.10472223, 0.        , 0.3013564 , 0.02911125, 0.        ,
        0.31765577, 0.        , 0.04551531, 0.63357746, 0.        ],
       [0.22105632, 0.        , 0.22764705, 0.        , 1.        ,
        0.19148066, 0.        , 0.        , 0.        , 0.        ,
        0.01729272, 0.26077087, 0.        , 

In [7]:
# Series mapping each title (index) to its row label in df (values).
# NOTE(review): Series.drop_duplicates() removes duplicated *values* (the row
# labels), not duplicated titles, so repeated titles would remain in the index
# -- confirm whether de-duplicating titles was the intent.
indices = pd.Series(df['combined_processed'].index, index=df['title']).drop_duplicates()

In [8]:
def search(query, cosine_sim=cosine_sim, threshold=0.1):
    """Return titles of documents similar to the first document matching `query`.

    The query is run through preprocessing(); the first record in
    `dummy_list_of_dict` whose 'combined_processed' text contains the processed
    query is used as the anchor document, and every document whose similarity
    to that anchor is at least `threshold` is returned (the anchor included),
    most similar first.

    Args:
        query (str): Free-text search query.
        cosine_sim: Square pairwise similarity matrix, indexed by document
            position. The default is bound when this cell is executed, so
            re-run the cell after recomputing the global `cosine_sim`.
        threshold (float): Minimum similarity score a document needs in order
            to be included.

    Returns:
        pandas.Series: Titles of the matching documents ordered by descending
        similarity; empty when the processed query is empty or no document
        contains it.
    """
    preprocessed_query = preprocessing(query)

    # A query made only of stopwords/punctuation becomes "" and would match
    # every document; treat it as "no results" instead of anchoring on row 0.
    if not preprocessed_query:
        return df['title'].iloc[[]]

    # Position of the first document whose processed text contains the query.
    # A plain substring test is used: the processed query is literal text, not
    # a regular expression.
    anchor_idx = next(
        (
            position
            for position, data in enumerate(dummy_list_of_dict)
            if preprocessed_query in data['combined_processed']
        ),
        None,
    )
    if anchor_idx is None:
        # Nothing matched: return an empty result rather than failing on an
        # unbound index.
        return df['title'].iloc[[]]

    # Similarity of every document to the anchor, most similar first
    # (sorted() is stable, so ties keep their original document order).
    sim_scores = sorted(
        enumerate(cosine_sim[anchor_idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep the positions of the documents that reach the threshold.
    doc_indices = [doc_idx for doc_idx, score in sim_scores if score >= threshold]

    # Return the titles of the selected documents.
    return df['title'].iloc[doc_indices]

In [9]:
# Example: titles of the documents similar to the first one whose processed
# text contains the processed query.
search('revisi submission')

10               revisi submission dianggap plagiarisme
7     submission ditolak karena implementasi parcela...
3     dark theme tidak jalan sehingga submission dit...
5                        penggunaan api pada submission
Name: title, dtype: object