<a href="https://colab.research.google.com/github/Achmad96/information-retrieval/blob/master/IR_Ayat_Alqur'an_id_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
BASE_URL = 'https://tanzil.net/trans/'

In [14]:
id_df = pd.read_csv(f'{BASE_URL}/id.indonesian', sep='|',names=['surah','verse','translations'])
id_df = id_df.iloc[:-11]
id_df['lang'] = 'id'
id_df.head()

Unnamed: 0,surah,verse,translations,lang
0,1,1.0,Dengan menyebut nama Allah Yang Maha Pemurah l...,id
1,1,2.0,"Segala puji bagi Allah, Tuhan semesta alam.",id
2,1,3.0,Maha Pemurah lagi Maha Penyayang.,id
3,1,4.0,Yang menguasai di Hari Pembalasan.,id
4,1,5.0,"Hanya Engkaulah yang kami sembah, dan hanya ke...",id


In [15]:
en_df = pd.read_csv(f'{BASE_URL}/en.sahih', sep='|', names=['surah','verse','translations'])
en_df = en_df.iloc[:-11]
en_df['lang'] = 'en'
en_df.head()

Unnamed: 0,surah,verse,translations,lang
0,1,1.0,"In the name of Allah, the Entirely Merciful, t...",en
1,1,2.0,"[All] praise is [due] to Allah, Lord of the wo...",en
2,1,3.0,"The Entirely Merciful, the Especially Merciful,",en
3,1,4.0,Sovereign of the Day of Recompense.,en
4,1,5.0,It is You we worship and You we ask for help.,en


In [16]:
df = pd.concat([id_df, en_df]).sort_values(by=['surah','verse']).reset_index(drop=True)
df.head()

Unnamed: 0,surah,verse,translations,lang
0,1,1.0,Dengan menyebut nama Allah Yang Maha Pemurah l...,id
1,1,1.0,"In the name of Allah, the Entirely Merciful, t...",en
2,1,2.0,"Segala puji bagi Allah, Tuhan semesta alam.",id
3,1,2.0,"[All] praise is [due] to Allah, Lord of the wo...",en
4,1,3.0,Maha Pemurah lagi Maha Penyayang.,id


In [17]:
vectorizer = TfidfVectorizer()

In [18]:
translations = df['translations']
vectorized_translations = vectorizer.fit_transform(translations)

In [19]:
print('Enter the queries (leave it empty if done): ')

x =  input()
queries = []
while x != '':
    queries.append(x)
    x = input()

Enter the queries (leave it empty if done): 
Muhammad
Allah



In [20]:
vectorized_input = vectorizer.transform(queries)

In [21]:
print('Calculate cosine similarity...')
similarity_scores = cosine_similarity(vectorized_input, vectorized_translations)

Calculate cosine similarity...


In [25]:
import textwrap
THRESHOLD = 0.3
TEXTWRAP_LENGTH = 150

for query_index, query_text in enumerate(queries):
  high_score_indices = np.where(similarity_scores[query_index] >= THRESHOLD)
  high_score_indices = high_score_indices[0]

  print(f"--- Results for Query {query_index + 1} ({len(high_score_indices)}) ---")
  wrapped_query_text = textwrap.fill(query_text, width=TEXTWRAP_LENGTH or 130)
  print(f'For Query: "{wrapped_query_text}"\n')
  if len(high_score_indices) == 0:
    print(f"No entries found with a similarity score greater or equals to {THRESHOLD} for this query.")
    continue
  for idx in high_score_indices:
    surah = df.loc[idx, 'surah']
    verse = df.loc[idx, 'verse']
    language = df.loc[idx, 'lang']
    translation = df.loc[idx, 'translations']
    score = similarity_scores[query_index, idx]
    translation_text = textwrap.fill(translation, width=TEXTWRAP_LENGTH or 130)
    print(f'Surah: {surah}')
    print(f'Verse: {verse}')
    print(f'Translations ({language}): {translation_text}')
    print(f'Score: {score:.4f}')
    print()
  print()

--- Results for Query 1 (11) ---
For Query: "Muhammad"

Surah: 21
Verse: 107.0
Translations (en): And We have not sent you, [O Muhammad], except as a mercy to the worlds.
Score: 0.3141

Surah: 35
Verse: 23.0
Translations (en): You, [O Muhammad], are not but a warner.
Score: 0.4400

Surah: 36
Verse: 3.0
Translations (en): Indeed you, [O Muhammad], are from among the messengers,
Score: 0.4172

Surah: 37
Verse: 174.0
Translations (id): Maka berpalinglah kamu (Muhammad) dari mereka sampai suatu ketika.
Score: 0.3263

Surah: 37
Verse: 174.0
Translations (en): So, [O Muhammad], leave them for a time.
Score: 0.4022

Surah: 48
Verse: 1.0
Translations (en): Indeed, We have given you, [O Muhammad], a clear conquest
Score: 0.3323

Surah: 51
Verse: 54.0
Translations (en): So leave them, [O Muhammad], for you are not to be blamed.
Score: 0.3095

Surah: 68
Verse: 2.0
Translations (en): You are not, [O Muhammad], by the favor of your Lord, a madman.
Score: 0.3076

Surah: 7
Verse: 184.0
Translations (