# Implementasi Kode Pengambilan Dokumen dengan metode Boolean dan Vector (VSM)

###  dengan cara mengambil dokumen 10 dokumen dengan kecocokan paling tinggi dari query dari dokumen berupa Al-Qur'an 30 juz 

## # inisiasi library

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

# %load_ext autotime

In [2]:
title = "stories"
alpha = 0.3

## Taking all folders
mengambil folder yang dijadikan sebagai data dokumen

In [3]:
folders = [x[0] for x in os.walk(str(os.getcwd())+'/'+title+'/')]
folders[0] = folders[0][:len(folders[0])-1]

In [4]:
folders

['C:\\Users\\user\\Documents\\Python Scripts\\vsm-quran-in-english\\2. TF-IDF Ranking - Cosine Similarity VSM, Matching Score Boolean Model/stories']

## Collecting the Surah and Chapter

mengambil data set lewat html yang disusun di index.html yang terdapat di folder (quran30juz) dan disusun sesuai Nama surat dan isinya. 

In [5]:
dataset = []

c = False

for i in folders:
    file = open(i+"/index.html", 'r')
    text = file.read().strip()
    file.close()

    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)

    if c == False:
        file_name = file_name[2:]
        c = True
        
    print(len(file_name), len(file_title))

    for j in range(len(file_name)):
        dataset.append((str(i) +"/"+ str(file_name[j]), file_title[j]))

35 37


In [6]:
len(dataset)

35

In [7]:
N = len (dataset)

In [8]:
def print_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)

# Preprocessing

melakukan preprocessing di data yang sudah disusun.

In [9]:
def convert_lower_case(data):
    return np.char.lower(data)

In [10]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [11]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [12]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [13]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [14]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [15]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Extracting Data

mengekstrak data.

In [16]:
processed_text = []
processed_title = []

for i in dataset[:N]:
    file = open(i[0], 'r', encoding="utf8", errors='ignore')
    text = file.read().strip()
    file.close()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(i[1]))))

## Calculating DF for all Surah

menghitung DF untuk semua kata dalam isi surat.

In [17]:
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    tokens = processed_title[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
for i in DF:
    DF[i] = len(DF[i])

In [18]:
# DF

In [19]:
total_vocab_size = len(DF)

In [20]:
total_vocab_size

791

In [21]:
total_vocab = [x for x in DF]

In [22]:
print(total_vocab[:20])

['name', 'god', 'graciou', 'merci', 'frown', 'turn', 'away', 'blind', 'man', 'approach', 'know', 'perhap', 'seek', 'purifi', 'remind', 'messag', 'would', 'benefit', 'indiff', 'gave']


In [23]:
#inisiasi fungsi DF

In [24]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

### Calculating TF-IDF for content of surah, we will consider this as the actual tf-idf as we will add the title weight to this.

menghitung tf-idf dari isi surat. 

In [25]:
doc = 0

tf_idf = {}

for i in range(N):
    
    tokens = processed_text[i]
    
    counter = Counter(tokens + processed_title[i])
    words_count = len(tokens + processed_title[i])
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        
        tf_idf[doc, token] = tf*idf

    doc += 1

In [26]:
# tf_idf

### Calculating TF-IDF for Title of Surah

menghitung TF-IDF untuk nama surat

In [27]:
doc = 0

tf_idf_title = {}

for i in range(N):
    
    tokens = processed_title[i]
    counter = Counter(tokens + processed_text[i])
    words_count = len(tokens + processed_text[i])

    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        
        tf_idf_title[doc, token] = tf*idf

    doc += 1

In [28]:
# tf_idf_title

In [29]:
tf_idf[(0,"day")]

0.021483895395187726

In [30]:
tf_idf_title[(0,"event")]

0.017732342072982604

## Merging the TF-IDF according to weights

penggabungan TF-IDF nama surat dan isi surat.

In [31]:
for i in tf_idf:
    tf_idf[i] *= alpha

In [32]:
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [33]:
len(tf_idf)

1662

# TF-IDF Matching Score Ranking 
# Boolean Model

penilaian berdasarkan model BOOLEAN 

In [34]:
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score / Boolean Model")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    print("hasil pencarian dari queri, bobot terbesar adalah teks ke:")
    for i in query_weights[:10]:
        l.append(i[0])
    
    print(l)
    

matching_score(10, "We created man in the best design. Then reduced him to the lowest of the low.")

Matching Score / Boolean Model

Query: We created man in the best design. Then reduced him to the lowest of the low.

['creat', 'man', 'best', 'design', 'reduc', 'lowest', 'low']

hasil pencarian dari queri, bobot terbesar adalah teks ke:
[15, 16, 6, 33, 0, 18, 2, 19, 10, 9]


In [35]:
 print_doc(16)

('C:\\Users\\user\\Documents\\Python Scripts\\vsm-quran-in-english\\2. TF-IDF Ranking - Cosine Similarity VSM, Matching Score Boolean Model/stories/96.txt', 'The Soothing (ash-Sharh)')
In the name of God, the Gracious, the Merciful.

1. Read: In the Name of your Lord who created.

2. Created man from a clot.

3. Read: And your Lord is the Most Generous.

4. He who taught by the pen.

5. Taught man what he never knew.

6. In fact, man oversteps all bounds.

7. When he considers himself exempt.

8. But to your Lord is the return.

9. Have you seen him who prevents?

10. A servant when he prays?

11. Do you think he is upon guidance?

12. Or advocates righteousness?

13. Do you see how he disbelieved and turned away?

14. Does he not know that God sees?

15. No. If he does not desist, We will drag him by the forelock.

16. A deceitful, sinful forelock.

17. Let him call on his gang.

18. We will call the Guards.

19. No, do not obey him; but kneel down, and come near.


# TF-IDF Cosine Similarity Ranking
# Vector Model

penilaian berdasarkan model vektor

In [36]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

### Vectorising tf-idf

membuat tf-idf vektor 

In [37]:
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1])
        D[i[0]][ind] = tf_idf[i]
    except:
        pass

In [38]:
def gen_vector(tokens):

    Q = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))

        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [39]:
def cosine_similarity(k, query):
    print("Cosine Similarity / Vector Based Model")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector(tokens)
    
    for d in D:
        d_cosines.append(cosine_sim(query_vector, d))
        
    out = np.array(d_cosines).argsort()[-k:][::-1]
    
    print("")
    
    print("hasil pencarian dari queri, bobot terbesar adalah teks ke:")
    print(out)

#     for i in out:
#         print(i, dataset[i][0])

Q = cosine_similarity(10, "God's kindled Fire. That laps to the hearts. It closes in on them. In extended columns.")

Cosine Similarity / Vector Based Model

Query: God's kindled Fire. That laps to the hearts. It closes in on them. In extended columns.

['god', 'kindl', 'fire', 'lap', 'heart', 'close', 'extend', 'column']

hasil pencarian dari queri, bobot terbesar adalah teks ke:
[24 14 34 20 31 21  3  7 10 18]


In [40]:
print_doc(24)

('C:\\Users\\user\\Documents\\Python Scripts\\vsm-quran-in-english\\2. TF-IDF Ranking - Cosine Similarity VSM, Matching Score Boolean Model/stories/104.txt', 'Abudance (al-at-Takathur)')
In the name of God, the Gracious, the Merciful.

1. Woe to every slanderer backbiter.

2. Who gathers wealth and counts it over.

3. Thinking that his wealth has made him immortal.

4. By no means. He will be thrown into the Crusher.

5. And what will make you realize what the Crusher is?

6. God's kindled Fire.

7. That laps to the hearts.

8. It closes in on them.

9. In extended columns.
