In [1]:
import pandas as pd
import numpy as np
import re
from itertools import groupby
import math
import pickle
import heapq

In [2]:
%%time
df = pd.read_excel("IR00_dataset_ph3/IR00_3_11k News.xlsx")
# df_un_spvs = pd.read_excel("../IR_Spring2021_ph12_7k.xlsx")
df_un_spvs = pd.read_csv("IR_Spring2021_ph12_7k.csv")

CPU times: user 1.53 s, sys: 45.2 ms, total: 1.58 s
Wall time: 2.08 s


In [3]:
def normalize(token):
    token = token.strip(".ء \u200b\u200c\u200d")
    token = re.sub("[ـ،؛,–—!٪؟+:_٫/.]", " ", token)
    token = re.sub("[*%&…#=\\\•;!|\-?]", " ", token)
    token = re.sub("[﴾﴿«»()<>\[\]“”'\"]", " ", token)
    token = re.sub("[\u200f\u200e\ufeff\u2067\u202a\u202b\u202c\u2069\xad]", " ", token)
    token = re.sub("\s+", " ", token)
    
    plurals = {
        "آداب": "ادب", "اطراف": "طرف", "حقایق": "حقیقت", "امواج": "موج",
        "مراکز": "مرکز", "اعماق": "عمق", "مواقع": "موقع", "اخبار": "خبر",
        "علما": "عالم", "آثار": "اثر", "مصارف": "مصرف", "علوم": "علم",
        "ادیان": "دین", "علائم": "علامت", "اسامی": "اسم", "مباحث": "مبحث",
        "دفاتر": "دفتر", "علل": "علت", "مذاهب": "مذهب", "عناصر": "عنصر",
        "مساجد": "مسجد", "روابط": "رابطه", "اعضا": "عضو", "عبارات": "عبارت",
        "موارد": "مورد", "مفاهیم": "مفهوم", "اشعار": "اشعار", "منابع": "منبع",
        "منبع": "قاعده", "فقها": "فقیه", "عجایب": "عجیب", "تصاویر": "تصویر"
    }
    
    for key, value in plurals.items():
        if key in token:
            token = token.replace(key, value)
            
    arabic_chars = ['\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652']
    token = re.sub("|".join(arabic_chars), "", token)
    
    postfixes_a = ["ها", "های", "هایی", "ی", "ان",
                   "تر", "ترین",
                   "گر"]
    postfixes_b = ["\u200bها", "\u200cها", "\u200bهای", "\u200cهای", 
                   "\u200bهایی", "\u200cهایی",
                   "\u200bشده", "\u200cشده",
                   "\u200bساز", "\u200cساز",
                   "\u200bکنندگان", "\u200cکنندگان",
                   "\u200bتر", "\u200cتر", "\u200bترین", "\u200cترین"]
    
    stemming_b = ["می\u200b", "می\u200c", "نمی\u200b", "نمی\u200c"]
    stemming_e = ["ات", "ام", "اش", 
                  "یم", "ید", "ند", 
                  "مان", "تان", "شان"]
    
    prefixes = ["با", "بی", "نا"]
    
    # idea: avoid wrong cut with half space
    token = re.sub("|".join([f"^{st}" for st in stemming_b]), " ", token)
    token = re.sub("|".join([f"{pf}$" for pf in postfixes_b]), " ", token)
    
    if len(token) > 5: # idea: avoid wrong cut with char limit
        token = re.sub("|".join([f"{pf}$" for pf in postfixes_a]), " ", token)
        token = re.sub("|".join([f"{st}$" for st in stemming_e]), " ", token)
        token = re.sub("|".join([f"^{pf}" for pf in prefixes]), " ", token)
    
    fa_digits = "۱۲۳۴۵۶۷۸۹۰١٢٣٤٥٦٧٨٩٠"
    en_digits = "12345678901234567890"
    token = token.translate(str.maketrans(fa_digits, en_digits))

    token = re.sub("[ئي]", "ی", token)
    token = token.strip(".ء \u200b\u200c\u200d")
    return token

In [4]:
def tokenize(row):
    tokens = np.array(row.content.split())
    doc_id = np.full((len(tokens)), row.id)
    return np.column_stack((tokens, doc_id))

In [7]:
%%time
# saved
tokenized_docs = []
for tokens_doc in df.apply(tokenize, axis=1):
    tokenized_docs.extend(tokens_doc)

CPU times: user 4.59 s, sys: 432 ms, total: 5.02 s
Wall time: 5.02 s


In [8]:
%%time
# saved
normalized_docs = []
for [token, doc] in tokenized_docs:
    token = normalize(normalize(token))
    normalized_docs.append([token, doc])

CPU times: user 2min 10s, sys: 632 ms, total: 2min 10s
Wall time: 2min 10s


In [9]:
%%time
# saved
tokens2docs = sorted(normalized_docs, key=lambda token_doc: token_doc[0])

CPU times: user 2.06 s, sys: 56 ms, total: 2.12 s
Wall time: 2.13 s


In [10]:
%%time
# saved
inverted_indexes = {}
docs2tokens = {}
print(f"from {len(list(groupby(tokens2docs, key=lambda tokens2doc: tokens2doc[0])))} items...")
item = 0
for [token, t_docs] in groupby(tokens2docs, key=lambda tokens2doc: tokens2doc[0]):
    print(item, end="\r")
    item += 1
    docs = sorted(map(lambda item: int(item[1]), t_docs))
    
    tf = {}
    for doc in docs:
        if doc in tf:
            tf[doc] += 1
        else:
            tf[doc] = 1
    
    if len(tf) > 1300:
        continue
    elif re.sub("\s+", "", token).isdigit() and len(re.sub("\s+", "", token)) < 4:
        continue
    elif re.match("^http|^https|^video", token) and len(token) > 15:
        continue
    elif len(token) < 2:
        continue
    else:
        for doc in tf.keys():
            if doc in docs2tokens:
                docs2tokens[doc] += [token]
            else:
                docs2tokens[doc] = [token]
        inverted_indexes.update({ token: tf })

inverted_indexes_file = open("inverted_indexes", "ab")
pickle.dump(inverted_indexes, inverted_indexes_file)
inverted_indexes_file.close()

docs2tokens_file = open("docs2tokens", "ab")
pickle.dump(docs2tokens, docs2tokens_file)
docs2tokens_file.close()

from 66985 items...
CPU times: user 9.61 s, sys: 1.26 s, total: 10.9 s
Wall time: 9.98 s


In [5]:
inverted_indexes_file = open("inverted_indexes", "rb")
inverted_indexes = pickle.load(inverted_indexes_file)

docs2tokens_file = open("docs2tokens", "rb")
docs2tokens = pickle.load(docs2tokens_file)

In [6]:
for idx, row in df.iterrows():
    if row.id not in docs2tokens:
        docs2tokens[row.id] = []

In [7]:
idf_dict = {token: math.log(len(df) / len(docs)) for token, docs in inverted_indexes.items()}

In [8]:
%%time
token2index = {}
index2token = [0 for i in range(len(inverted_indexes))]
for idx, token in enumerate(inverted_indexes.keys()):
    token2index[token] = idx
    index2token[idx] = token

CPU times: user 17.4 ms, sys: 0 ns, total: 17.4 ms
Wall time: 17.2 ms


In [9]:
%%time
inverted_indexes_tf_idf = {}
for [token, docs_dict] in inverted_indexes.items():
    tf = {}
    for [doc, freq] in docs_dict.items():
        tf[doc] = (1 + math.log(freq)) * idf_dict[token]
    inverted_indexes_tf_idf[token] = tf

CPU times: user 449 ms, sys: 24 ms, total: 473 ms
Wall time: 472 ms


In [10]:
%%time
def compute_length(doc_id):
    length = 0
    tokens = docs2tokens[doc_id]
    for token in tokens:
        length += inverted_indexes_tf_idf[token][doc_id]
    return length
df["length"] = df["id"].apply(compute_length)

CPU times: user 549 ms, sys: 4.14 ms, total: 553 ms
Wall time: 552 ms


In [34]:
# saved
df_un_spvs["topic"] = ""

In [35]:
# saved
df_un_spvs.to_csv("IR_Spring2021_ph12_7k.csv", encoding='utf-8', index=False)

In [68]:
# saved
item = 0
def knn(row):
    global item
    print(item, end="\r")
    item += 1
    
    k = 5
    tokens = []
    for token in row.content.split():
        tokens.append(normalize(normalize(token)))

    q_tf = {}
    for token in tokens:
        if token in q_tf:
            q_tf[token] += 1
        else:
            q_tf[token] = 1
    
    
    result = {}
    for token in tokens:
        if token in inverted_indexes:
            docs = inverted_indexes[token].keys()
            doc_tf_idf = inverted_indexes_tf_idf[token]
            q_tf_idf = (1 + math.log(q_tf[token])) * idf_dict[token]
            
            for doc in docs:              
                if doc in result:
                    result[doc] += doc_tf_idf[doc] * q_tf_idf
                else:
                    result.update({ doc: doc_tf_idf[doc] * q_tf_idf })
              
    for doc in result:
        result[doc] /= df.loc[df["id"] == doc, "length"].values[0]
        
    res_df = df.loc[df["id"].isin(list(result.keys())), ["id", "topic"]].copy()
    res_df["rank"] = res_df["id"].apply(lambda id: result[id])

    final_df = res_df.loc[res_df["rank"].isin(heapq.nlargest(k, list(res_df["rank"])))].sort_values(by=["rank"], ascending=False)    
        
    return max(set(list(final_df["topic"])), key=list(final_df["topic"]).count)

In [69]:
# saved
df_un_spvs["topic"] = df_un_spvs.apply(knn, axis=1)

6999

In [70]:
# saved
df_un_spvs.to_csv("IR_Spring2021_ph12_7k.csv", encoding='utf-8', index=False)

In [21]:
# saved
df_un_spvs.groupby("topic")["topic"].value_counts()

topic      topic    
culture    culture       326
economy    economy      2125
health     health       1609
political  political    1182
sport      sport        1758
Name: topic, dtype: int64

In [14]:
un_spvs_inverted_indexes_file = open("../Phase-2/inverted_indexes", "rb")
un_spvs_inverted_indexes = pickle.load(un_spvs_inverted_indexes_file)

In [15]:
un_spvs_idf_dict = {token: math.log(len(df) / len(docs)) for token, docs in un_spvs_inverted_indexes.items()}

In [16]:
%%time
un_spvs_inverted_indexes_tf_idf = {}
for [token, docs_dict] in un_spvs_inverted_indexes.items():
    tf = {}
    for [doc, freq] in docs_dict.items():
        tf[doc] = (1 + math.log(freq)) * un_spvs_idf_dict[token]
    un_spvs_inverted_indexes_tf_idf[token] = tf

CPU times: user 295 ms, sys: 7.15 ms, total: 302 ms
Wall time: 301 ms


In [17]:
%%time
def get_doc_length(doc_id):
    content = df_un_spvs.loc[df_un_spvs["id"] == doc_id, "content"].values[0]
    tokens = []
    for token in content.split():
        tokens.append(normalize(normalize(token)))
    
    length = 0
    for token in tokens:
        if token in un_spvs_inverted_indexes:
            length += un_spvs_inverted_indexes[token][doc_id] ** 2
    return math.sqrt(length)

df_un_spvs["length"] = df_un_spvs["id"].apply(get_doc_length)

CPU times: user 1min 14s, sys: 0 ns, total: 1min 14s
Wall time: 1min 14s


In [20]:
def search(query):
    k = 10
    cat = None
    
    tokens = []
    for token in query.split():
        if "cat:" in token:
            cat = token.split("cat:")[1]
        else:
            tokens.append(normalize(normalize(token)))
    
    q_tf = {}
    for token in tokens:
        if token in q_tf:
            q_tf[token] += 1
        else:
            q_tf[token] = 1
            
    result = {}
    
    found_token = []
    for token in tokens:
        if token in un_spvs_inverted_indexes:
            found_token.append(token)
            
            docs = un_spvs_inverted_indexes[token].keys()
            doc_tf_idf = un_spvs_inverted_indexes_tf_idf[token]
            q_tf_idf = (1 + math.log(q_tf[token])) * un_spvs_idf_dict[token]
            
            for doc in docs:         
                if doc in result:
                    result[doc] += doc_tf_idf[doc] * q_tf_idf
                else:
                    result.update({ doc: doc_tf_idf[doc] * q_tf_idf })
              
    for doc in result:
        result[doc] /= df_un_spvs.loc[df_un_spvs["id"] == doc, "length"].values[0]
    
    res_df = df_un_spvs.loc[df_un_spvs["id"].isin(list(result.keys())), ["id", "url", "topic"]].copy()
    res_df["rank"] = res_df["id"].apply(lambda id: result[id])
    
    if cat:
        res_df = res_df[res_df["topic"] == cat]

    final_df = res_df.loc[res_df["rank"].isin(heapq.nlargest(k, list(res_df["rank"])))].sort_values(by=["rank"], ascending=False)
    
    print(found_token)
    for index, row in final_df.iterrows():
            print("=" * 80)
            print(f"#Doc {row.id}")
            print(row.url)
            print(row["rank"])
            print(row.topic)

In [25]:
%%time
# search("تمرینات تیم تکواندو")
# search("تیم ملی تکواندو")
# search("قهرمانی پرسپولیس cat:sport")
# search("کرونا cat:health")
# search("کرونا cat:economy")
# search("کرونا cat:political")
# search("انقلاب cat:culture")
search("انقلاب cat:economy")

['انقلاب']
#Doc 2134
https://www.isna.ir/news/99071007797/دستور-قضایی-بررسی-آتش-سوزی-بازارچه-ساحلی-بندر-دیلم-بازداشت-یک
1.493588868639915
economy
#Doc 4058
https://www.isna.ir/news/99100100431/زمان-قطعی-برای-بررسی-بودجه-در-فراکسیون-انقلاب-اسلامی-تعیین-نشده
1.1716925477371425
economy
#Doc 4186
https://www.isna.ir/news/99111511732/۲۶پروژه-با-۲۳۰۰میلیارد-تومان-در-کردستان-افتتاح-می-شود
1.0551402829936243
economy
#Doc 4209
https://www.isna.ir/news/99112518369/اراضی-آموزش-و-پرورش-در-روستاها-و-شهرهای-کوچک-سند-دار-می-شود
0.8271488361076161
economy
#Doc 5121
https://www.isna.ir/news/98090704741/هر-کدام-از-تحریم-های-اقتصادی-می-تواند-دولتی-را-سرنگون-کند
0.8041730503900254
economy
#Doc 4159
https://www.isna.ir/news/99110503344/کدام-بخش-های-اقتصاد-پس-از-کرونا-اوج-می-گیرند
0.7428574961865295
economy
#Doc 4326
https://www.isna.ir/news/98011907014/القای-جو-روانی-برای-افزایش-قیمت-ارز
0.6906604581868157
economy
#Doc 1983
https://www.isna.ir/news/99052317223/پرونده-فرودگاه-قدیم-غار-نمکدان-و-پلاژ-بانوان-ق