In [1]:
import pandas as pd
import numpy as np
import re
from itertools import groupby
import math
import pickle
import heapq

In [2]:
%%time
# df = pd.read_excel("IR00_dataset_ph3/IR00_3_11k News.xlsx")
df = pd.read_csv("IR00_dataset_ph3/IR00_3_11k News.csv")

CPU times: user 372 ms, sys: 35.8 ms, total: 408 ms
Wall time: 609 ms


In [3]:
def normalize(token):
    token = token.strip(".ء \u200b\u200c\u200d")
    token = re.sub("[ـ،؛,–—!٪؟+:_٫/.]", " ", token)
    token = re.sub("[*%&…#=\\\•;!|\-?]", " ", token)
    token = re.sub("[﴾﴿«»()<>\[\]“”'\"]", " ", token)
    token = re.sub("[\u200f\u200e\ufeff\u2067\u202a\u202b\u202c\u2069\xad]", " ", token)
    token = re.sub("\s+", " ", token)
    
    plurals = {
        "آداب": "ادب", "اطراف": "طرف", "حقایق": "حقیقت", "امواج": "موج",
        "مراکز": "مرکز", "اعماق": "عمق", "مواقع": "موقع", "اخبار": "خبر",
        "علما": "عالم", "آثار": "اثر", "مصارف": "مصرف", "علوم": "علم",
        "ادیان": "دین", "علائم": "علامت", "اسامی": "اسم", "مباحث": "مبحث",
        "دفاتر": "دفتر", "علل": "علت", "مذاهب": "مذهب", "عناصر": "عنصر",
        "مساجد": "مسجد", "روابط": "رابطه", "اعضا": "عضو", "عبارات": "عبارت",
        "موارد": "مورد", "مفاهیم": "مفهوم", "اشعار": "اشعار", "منابع": "منبع",
        "منبع": "قاعده", "فقها": "فقیه", "عجایب": "عجیب", "تصاویر": "تصویر"
    }
    
    for key, value in plurals.items():
        if key in token:
            token = token.replace(key, value)
            
    arabic_chars = ['\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652']
    token = re.sub("|".join(arabic_chars), "", token)
    
    postfixes_a = ["ها", "های", "هایی", "ی", "ان",
                   "تر", "ترین",
                   "گر"]
    postfixes_b = ["\u200bها", "\u200cها", "\u200bهای", "\u200cهای", 
                   "\u200bهایی", "\u200cهایی",
                   "\u200bشده", "\u200cشده",
                   "\u200bساز", "\u200cساز",
                   "\u200bکنندگان", "\u200cکنندگان",
                   "\u200bتر", "\u200cتر", "\u200bترین", "\u200cترین"]
    
    stemming_b = ["می\u200b", "می\u200c", "نمی\u200b", "نمی\u200c"]
    stemming_e = ["ات", "ام", "اش", 
                  "یم", "ید", "ند", 
                  "مان", "تان", "شان"]
    
    prefixes = ["با", "بی", "نا"]
    
    # idea: avoid wrong cut with half space
    token = re.sub("|".join([f"^{st}" for st in stemming_b]), " ", token)
    token = re.sub("|".join([f"{pf}$" for pf in postfixes_b]), " ", token)
    
    if len(token) > 5: # idea: avoid wrong cut with char limit
        token = re.sub("|".join([f"{pf}$" for pf in postfixes_a]), " ", token)
        token = re.sub("|".join([f"{st}$" for st in stemming_e]), " ", token)
        token = re.sub("|".join([f"^{pf}" for pf in prefixes]), " ", token)
    
    fa_digits = "۱۲۳۴۵۶۷۸۹۰١٢٣٤٥٦٧٨٩٠"
    en_digits = "12345678901234567890"
    token = token.translate(str.maketrans(fa_digits, en_digits))

    token = re.sub("[ئي]", "ی", token)
    token = token.strip(".ء \u200b\u200c\u200d")
    return token

In [4]:
def tokenize(row):
    tokens = np.array(row.content.split())
    doc_id = np.full((len(tokens)), row.id)
    return np.column_stack((tokens, doc_id))

In [7]:
%%time
# saved
tokenized_docs = []
for tokens_doc in df.apply(tokenize, axis=1):
    tokenized_docs.extend(tokens_doc)

CPU times: user 4.59 s, sys: 432 ms, total: 5.02 s
Wall time: 5.02 s


In [8]:
%%time
# saved
normalized_docs = []
for [token, doc] in tokenized_docs:
    token = normalize(normalize(token))
    normalized_docs.append([token, doc])

CPU times: user 2min 10s, sys: 632 ms, total: 2min 10s
Wall time: 2min 10s


In [9]:
%%time
# saved
tokens2docs = sorted(normalized_docs, key=lambda token_doc: token_doc[0])

CPU times: user 2.06 s, sys: 56 ms, total: 2.12 s
Wall time: 2.13 s


In [10]:
%%time
# saved
inverted_indexes = {}
docs2tokens = {}
print(f"from {len(list(groupby(tokens2docs, key=lambda tokens2doc: tokens2doc[0])))} items...")
item = 0
for [token, t_docs] in groupby(tokens2docs, key=lambda tokens2doc: tokens2doc[0]):
    print(item, end="\r")
    item += 1
    docs = sorted(map(lambda item: int(item[1]), t_docs))
    
    tf = {}
    for doc in docs:
        if doc in tf:
            tf[doc] += 1
        else:
            tf[doc] = 1
    
    if len(tf) > 1300:
        continue
    elif re.sub("\s+", "", token).isdigit() and len(re.sub("\s+", "", token)) < 4:
        continue
    elif re.match("^http|^https|^video", token) and len(token) > 15:
        continue
    elif len(token) < 2:
        continue
    else:
        for doc in tf.keys():
            if doc in docs2tokens:
                docs2tokens[doc] += [token]
            else:
                docs2tokens[doc] = [token]
        inverted_indexes.update({ token: tf })

inverted_indexes_file = open("inverted_indexes", "ab")
pickle.dump(inverted_indexes, inverted_indexes_file)
inverted_indexes_file.close()

docs2tokens_file = open("docs2tokens", "ab")
pickle.dump(docs2tokens, docs2tokens_file)
docs2tokens_file.close()

from 66985 items...
CPU times: user 9.61 s, sys: 1.26 s, total: 10.9 s
Wall time: 9.98 s


In [5]:
inverted_indexes_file = open("inverted_indexes", "rb")
inverted_indexes = pickle.load(inverted_indexes_file)

docs2tokens_file = open("docs2tokens", "rb")
docs2tokens = pickle.load(docs2tokens_file)

In [6]:
for idx, row in df.iterrows():
    if row.id not in docs2tokens:
        docs2tokens[row.id] = []

In [7]:
idf_dict = {token: math.log(len(df) / len(docs)) for token, docs in inverted_indexes.items()}

In [8]:
%%time
token2index = {}
index2token = [0 for i in range(len(inverted_indexes))]
for idx, token in enumerate(inverted_indexes.keys()):
    token2index[token] = idx
    index2token[idx] = token

CPU times: user 26 ms, sys: 110 µs, total: 26.1 ms
Wall time: 26.6 ms


In [9]:
%%time
inverted_indexes_tf_idf = {}
for [token, docs_dict] in inverted_indexes.items():
    tf = {}
    for [doc, freq] in docs_dict.items():
        tf[doc] = (1 + math.log(freq)) * idf_dict[token]
    inverted_indexes_tf_idf[token] = tf

CPU times: user 453 ms, sys: 19.7 ms, total: 473 ms
Wall time: 473 ms


In [10]:
%%time
# saved
def compute_length(doc_id):
    length = 0
    tokens = docs2tokens[doc_id]
    for token in tokens:
        length += inverted_indexes_tf_idf[token][doc_id]
    return length
df["length"] = df["id"].apply(compute_length)

CPU times: user 554 ms, sys: 0 ns, total: 554 ms
Wall time: 554 ms


In [16]:
# saved
df["cluster"] = -1

In [17]:
# saved
df.to_csv("IR00_dataset_ph3/IR00_3_11k News.csv", encoding='utf-8', index=False)

In [18]:
# saved
centroids_id = df.sample(10)["id"].values
centroids = []
for centroid_id in centroids_id:
    tokens = docs2tokens[centroid_id]
    doc_tf_idf = np.zeros(len(inverted_indexes))
    for token in tokens:
        doc_tf_idf[token2index[token]] = inverted_indexes_tf_idf[token][centroid_id]
    centroids.append(doc_tf_idf)
centroids = np.array(centroids)

In [19]:
def find_cluster(row):
    global item
    print(f"row {item + 1}", end="\r")
    item += 1
    similarities = np.zeros(len(centroids))
        
    tokens = docs2tokens[row.id]
    for index in range(len(centroids[0])):
        token = index2token[index]
        if token in tokens:
            token_tf_idf = inverted_indexes_tf_idf[token][row.id]
            similarities += centroids[:, index] * token_tf_idf
    
    similarities /= np.linalg.norm(centroids)
    return np.argmax(similarities)

In [20]:
%%time
# saved
item = 0
df["cluster"] = df.apply(find_cluster, axis=1)

CPU times: user 1h 6min 59s, sys: 52.3 s, total: 1h 7min 52s
Wall time: 19min


In [21]:
# saved
df.to_csv("IR00_dataset_ph3/IR00_3_11k News.csv", encoding='utf-8', index=False)

In [22]:
# saved
def update_centroids(centroids):
    item = 0
    print(f"embedding size: {len(inverted_indexes_tf_idf)}")
    new_centroids = np.zeros(centroids.shape)
    for token, docs_tf_idf in inverted_indexes_tf_idf.items():
        print(item, end="\r")
        item += 1
        index = token2index[token]
        for doc, tf_idf in docs_tf_idf.items():
            new_centroids[int(df[df["id"] == doc]["cluster"]), index] += tf_idf
    new_centroids /= np.array(list(df.groupby("cluster")["cluster"].value_counts()))[:, None]
    print("update centroids finished")
    return new_centroids

In [42]:
# saved
for i in range(4):
    print(f"iteration {i + 1}")
    centroids = update_centroids(centroids)
    item = 0
    print(f"doc size: {len(df)}")
    df["cluster"] = df.apply(find_cluster, axis=1)
    print("find cluster finished")

iteration 1
embedding size: 62582
update centroids finished
doc size: 11437
find cluster finished
iteration 2
embedding size: 62582
update centroids finished
doc size: 11437
find cluster finished
iteration 3
embedding size: 62582
update centroids finished
doc size: 11437
find cluster finished
iteration 4
embedding size: 62582
update centroids finished
doc size: 11437
find cluster finished


In [43]:
# saved
df.to_csv("IR00_dataset_ph3/IR00_3_11k News.csv", encoding='utf-8', index=False)

In [48]:
# saved
centroids_file = open("centroids", "ab")
pickle.dump(centroids, centroids_file)
centroids_file.close()

In [10]:
centroids_file = open("centroids", "rb")
centroids = pickle.load(centroids_file)

In [11]:
df.groupby("cluster")["cluster"].value_counts()

cluster  cluster
0        0          1422
1        1          1921
2        2          1594
3        3           947
4        4           511
5        5           877
6        6           467
7        7           401
8        8           274
9        9          3023
Name: cluster, dtype: int64

In [12]:
def cosine_similarity(a, b):
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    if np.isnan(similarity):
        return 0
    else:
        return similarity

In [13]:
def search(query):
    k = 10
    tokens = []
    for token in query.split():
        tokens.append(normalize(normalize(token)))
    
    q_tf = {}
    for token in tokens:
        if token in q_tf:
            q_tf[token] += 1
        else:
            q_tf[token] = 1
   
    q_tf_idf = np.zeros(len(inverted_indexes))
    for token in tokens:
        if token in token2index:
            q_tf_idf[token2index[token]] = (1 + math.log(q_tf[token])) * idf_dict[token]
        
    similarities = []
    for centroid in centroids:
        similarities.append(cosine_similarity(centroid, q_tf_idf))
        
    docs = df[df["cluster"] == np.argmax(similarities)]

    result = {}
    found_token = []
    for token in tokens:
        if token in inverted_indexes:
            found_token.append(token)
            docs = inverted_indexes[token].keys()
            doc_tf_idf = inverted_indexes_tf_idf[token]
            qt_tf_idf = q_tf_idf[token2index[token]]
            
            for doc in docs:              
                if doc in result:
                    result[doc] += doc_tf_idf[doc] * qt_tf_idf
                else:
                    result.update({ doc: doc_tf_idf[doc] * qt_tf_idf })
              
    for doc in result:
        result[doc] /= df.loc[df["id"] == doc, "length"].values[0]
    
    res_df = df.loc[df["id"].isin(list(result.keys())), ["id", "url"]].copy()
    res_df["rank"] = res_df["id"].apply(lambda id: result[id])

    final_df = res_df.loc[res_df["rank"].isin(heapq.nlargest(k, list(res_df["rank"])))].sort_values(by=["rank"], ascending=False)
    
    print(found_token)
    for index, row in final_df.iterrows():
            print("=" * 80)
            print(f"#Doc {row.id}")
            print(row.url)
            print(row["rank"])

In [20]:
%%time
# search_phase2("تمرینات تیم تکواندو")
# search("تیم ملی تکواندو")
# search("قهرمانی پرسپولیس")
# search("سرمایه گذاری در بازار بورس")
search("اعزام کاروان های اردوی راهیان نور")

['اعزام', 'کارو', 'اردوی', 'راهی', 'نور']
#Doc 9702
https://www.isna.ir/news/98021106170/حضور-لرستانی-ها-در-آیین-افتتاحیه-اردوهای-راهیان-نور-غرب-کشور
0.5887203775719336
#Doc 10656
https://www.isna.ir/news/98080904715/مراسم-افتتاحیه-راهیان-نور-دانش-آموزی-برگزار-می-شود
0.5392010695960446
#Doc 9258
https://www.isna.ir/news/99111410631/اهدای-نشان-ملی-خادمی-شهدا-به-سردار-مطهری
0.509420560187563
#Doc 10031
https://www.isna.ir/news/98041910370/خوزستانی-ها-به-مناطق-عملیاتی-غرب-کشور-می-روند
0.49476017278910245
#Doc 10043
https://www.isna.ir/news/98041910370/خوزستانی-ها-به-مناطق-عملیاتی-غرب-کشور-می-روند
0.49476017278910245
#Doc 9934
https://www.isna.ir/news/98032712899/حضور-دانشجویان-چهارمحال-و-بختیاری-در-یادمان-های-دفاع-مقدس
0.4833069266594536
#Doc 10318
https://www.isna.ir/news/98061005201/کارگاه-توانمندی-سازی-مدیران-کاروان-های-راهیان-نور-برگزار-شد
0.4781446686771322
#Doc 10025
https://www.isna.ir/news/98041708948/۲۵-هزار-دانش-آموز-کردستانی-به-راهیان-نور-می-روند
0.462709744090279
#Doc 8518
htt