In [1]:
import pandas as pd
import numpy as np
import re
from itertools import groupby
import pickle

In [2]:
df = pd.read_excel("../IR_Spring2021_ph12_7k.xlsx")

In [3]:
def normalize(token):
    token = token.strip(".ء \u200b\u200c\u200d")
    token = re.sub("[ـ،؛,–—!٪؟+:_٫/.]", " ", token)
    token = re.sub("[*%&…#=\\\•;!|\-?]", " ", token)
    token = re.sub("[﴾﴿«»()<>\[\]“”'\"]", " ", token)
    token = re.sub("[\u200f\u200e\ufeff\u2067\u202a\u202b\u202c\u2069\xad]", " ", token)
    token = re.sub("\s+", " ", token)
    
    plurals = {
        "آداب": "ادب", "اطراف": "طرف", "حقایق": "حقیقت", "امواج": "موج",
        "مراکز": "مرکز", "اعماق": "عمق", "مواقع": "موقع", "اخبار": "خبر",
        "علما": "عالم", "آثار": "اثر", "مصارف": "مصرف", "علوم": "علم",
        "ادیان": "دین", "علائم": "علامت", "اسامی": "اسم", "مباحث": "مبحث",
        "دفاتر": "دفتر", "علل": "علت", "مذاهب": "مذهب", "عناصر": "عنصر",
        "مساجد": "مسجد", "روابط": "رابطه", "اعضا": "عضو", "عبارات": "عبارت",
        "موارد": "مورد", "مفاهیم": "مفهوم", "اشعار": "اشعار", "منابع": "منبع",
        "منبع": "قاعده", "فقها": "فقیه", "عجایب": "عجیب", "تصاویر": "تصویر"
    }
    for key, value in plurals.items():
        if key in token:
            token = token.replace(key, value)
            
    arabic_chars = ['\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652']
    token = re.sub("|".join(arabic_chars), "", token)
    
    postfixes_a = ["ها", "های", "هایی", "ی", "ان",
                   "تر", "ترین",
                   "گر"]
    postfixes_b = ["\u200bها", "\u200cها", "\u200bهای", "\u200cهای", 
                   "\u200bهایی", "\u200cهایی",
                   "\u200bشده", "\u200cشده",
                   "\u200bساز", "\u200cساز",
                   "\u200bکنندگان", "\u200cکنندگان",
                   "\u200bتر", "\u200cتر", "\u200bترین", "\u200cترین"]
    
    stemming_b = ["می\u200b", "می\u200c", "نمی\u200b", "نمی\u200c"]
    stemming_e = ["ات", "ام", "اش", 
                  "یم", "ید", "ند", 
                  "مان", "تان", "شان"]
    
    prefixes = ["با", "بی", "نا"]
    
    # idea: avoid wrong cut with half space
    token = re.sub("|".join([f"^{st}" for st in stemming_b]), " ", token)
    token = re.sub("|".join([f"{pf}$" for pf in postfixes_b]), " ", token)
    
    if len(token) > 5: # idea: avoid wrong cut with char limit
        token = re.sub("|".join([f"{pf}$" for pf in postfixes_a]), " ", token)
        token = re.sub("|".join([f"{st}$" for st in stemming_e]), " ", token)
        token = re.sub("|".join([f"^{pf}" for pf in prefixes]), " ", token)
    
    fa_digits = "۱۲۳۴۵۶۷۸۹۰١٢٣٤٥٦٧٨٩٠"
    en_digits = "12345678901234567890"
    token = token.translate(str.maketrans(fa_digits, en_digits))

    token = re.sub("[ئي]", "ی", token)
    token = token.strip(".ء \u200b\u200c\u200d")
    return token

In [4]:
def tokenize(doc):
    tokens = np.array(doc.content.split())
    doc_id = np.full((len(tokens)), doc.id)
    return np.column_stack((tokens, doc_id))

In [5]:
%%time
# saved
tokenized_docs = []
for tokens_doc in df.apply(tokenize, axis=1):
    tokenized_docs.extend(tokens_doc)

CPU times: user 2.75 s, sys: 220 ms, total: 2.97 s
Wall time: 3.78 s


In [6]:
%%time
# saved
normalized_docs = []
for [token, doc] in tokenized_docs:
    token = normalize(normalize(token))
    normalized_docs.append([token, doc])

CPU times: user 1min 19s, sys: 441 ms, total: 1min 19s
Wall time: 1min 32s


In [7]:
%%time
# saved
tokens_docs = sorted(normalized_docs, key=lambda token_doc: token_doc[0])

CPU times: user 1.25 s, sys: 16 ms, total: 1.26 s
Wall time: 1.26 s


In [10]:
%%time
# saved
inverted_indexes = {}
for [token, t_docs] in groupby(tokens_docs, key=lambda tokens_doc: tokens_doc[0]):
    docs = sorted(set(map(lambda item: int(item[1]), t_docs)))
    if len(docs) > 1300:
        continue
    elif re.sub("\s+", "", token).isdigit() and len(re.sub("\s+", "", token)) < 4:
        continue
    elif re.match("^http|^https|^video", token) and len(token) > 15:
        continue
    elif len(token) < 2:
        continue
    else:
        inverted_indexes.update({ token: docs })
        
inverted_indexes_file = open("inverted_indexes", "ab")
pickle.dump(inverted_indexes, inverted_indexes_file)
inverted_indexes_file.close()

CPU times: user 1.7 s, sys: 23.9 ms, total: 1.72 s
Wall time: 1.74 s


In [5]:
inverted_indexes_file = open("inverted_indexes", "rb")
inverted_indexes = pickle.load(inverted_indexes_file)

In [6]:
def search(query):
    tokens = query.split()
    result = {}
    top_rank = 0
    
    found_token = []
    for token in tokens:
        token = normalize(normalize(token))
        if token in inverted_indexes:
            found_token.append(token)
            token_docs = inverted_indexes[token]
            for doc in token_docs:
                if doc in result:
                    result[doc] += 1
                    if result[doc] > top_rank:
                        top_rank = result[doc]
                else:
                    result.update({ doc: 1 })
                    if top_rank == 0:
                        top_rank = 1

    res_df = df.loc[df["id"].isin(list(result.keys())), ["id", "url"]].copy()
    res_df["rank"] = res_df["id"].apply(lambda id: result[id])
    res_df = res_df[res_df["rank"] == top_rank]

    print(found_token)
    print(f"#Matched Tokens: {top_rank} \t #Result: {len(res_df)}")
    for index, row in res_df.iterrows():
        if result[row.id] == top_rank:
            print("=" * 80)
            print(f"#Doc {row.id}")
            print(row.url)

In [24]:
%%time
# search("تمرینات تیم تکواندو")
# search("ونتیلاتور")
# search("میخاییل")

# search("آب‌گرفتگی")
# search("راه‌اندازی")

# search("هفت تپه")
# search("المپیک توکیو")

# search("پاکت پریدنتال میکروب")
search("سد کشاورز آب‌گیری")

['سد', 'کشاورز', 'آب\u200cگیر']
#Matched Tokens: 2 	 #Result: 28
#Doc 2022
https://www.isna.ir/news/99060605261/دولت-با-همه-مشکلات-اقتصادی-و-تحریم-اجرای-طرح-های-توسعه-ای-را
#Doc 2121
https://www.isna.ir/news/99070604559/تخصیص-۱۶۰۰-میلیارد-به-اجرای-سد-باغان-و-خط-انتقال-آب-آن
#Doc 2163
https://www.isna.ir/news/99072216964/قالیباف-رسیدگی-به-مشکلات-محرومان-از-اولویت-های-مجلس-است
#Doc 2495
https://www.isna.ir/news/99122721287/ثبت-نام-۱۰۷۰نفر-کاندید-انتخابات-شورای-شهر-در-استان-کردستان
#Doc 2521
https://www.isna.ir/news/98011303929/لاریجانی-سیل-زدگان-مضطرب-نباشند-کمک-بلاعوض-و-تسهیلات-ارزان-قیمت
#Doc 2753
https://www.isna.ir/news/98050201134/سرکنسول-ترکمنستان-در-مشهد-برقراری-امنیت-کامل-در-مرزهای-مشترک
#Doc 2887
https://www.isna.ir/news/98070403255/بررسی-طرح-تشدید-مجازات-اسید-پاشی-و-آخرین-برنامه-ریزی-های-مراسم
#Doc 2901
https://www.isna.ir/news/98070907299/بررسی-2-تحقیق-و-تفحص-در-کمیسیون-کشاورزی
#Doc 3243
https://www.isna.ir/news/99012212100/دریچه-های-تحتانی-سد-الغدیر-باز-شد
#Doc 3662
https://w

In [27]:
inverted_indexes[normalize(normalize("انقلاب"))]

[20,
 64,
 277,
 351,
 354,
 502,
 505,
 714,
 738,
 903,
 913,
 1068,
 1254,
 1365,
 1377,
 1440,
 1738,
 1739,
 1740,
 1762,
 1778,
 1780,
 1783,
 1796,
 1801,
 1805,
 1809,
 1818,
 1819,
 1829,
 1837,
 1845,
 1849,
 1857,
 1871,
 1878,
 1882,
 1906,
 1907,
 1933,
 1937,
 1955,
 1958,
 1960,
 1983,
 1984,
 1997,
 2011,
 2012,
 2024,
 2033,
 2038,
 2052,
 2054,
 2063,
 2065,
 2094,
 2099,
 2134,
 2148,
 2153,
 2155,
 2157,
 2169,
 2171,
 2172,
 2179,
 2197,
 2212,
 2232,
 2233,
 2239,
 2274,
 2280,
 2283,
 2325,
 2331,
 2337,
 2339,
 2340,
 2342,
 2357,
 2373,
 2426,
 2430,
 2443,
 2444,
 2447,
 2471,
 2474,
 2476,
 2477,
 2497,
 2498,
 2512,
 2520,
 2524,
 2539,
 2541,
 2543,
 2552,
 2554,
 2560,
 2565,
 2569,
 2580,
 2582,
 2587,
 2588,
 2591,
 2613,
 2618,
 2648,
 2650,
 2651,
 2652,
 2676,
 2681,
 2690,
 2691,
 2693,
 2695,
 2701,
 2704,
 2706,
 2709,
 2714,
 2729,
 2731,
 2733,
 2735,
 2736,
 2737,
 2739,
 2756,
 2757,
 2761,
 2762,
 2772,
 2785,
 2787,
 2788,
 2791,
 2796,
 2797