# Модуль извлечения ключевых слов для оптимизации электронного документооборота

## Functions

In [38]:
import nltk
import nltk.data
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

my_stopwords = ["...",".","0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab","aab",'aaab', "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]
import pandas as pd
import numpy as np
# For cleaning the text
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import regex as re
import string
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Install SBERT

In [13]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

bert_model = SentenceTransformer('all-mpnet-base-v2')
#multi-qa-mpnet-base-dot-v1
#all-mpnet-base-v2

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## Define functions

In [34]:
from pickle import LONG1
################################ new
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize, sent_tokenize, pos_tag

def Preprocess_text_multi_lang_chunk(text,stopwords, lang="en"):
    # 1. Tokenise to alphabetic tokens
    text = remove_numbers(text)
    text = remove_http(text)
    text = remove_punctuation_without(text) #remove_punctuation(text)

    text = convert_to_lower(text)
    text = remove_white_space(text)
    #text = remove_short_words(text)
    tokens = toknizing(text, stopwords)

    # 2. POS tagging
    pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    pos_tags = pos_tag(tokens)
    #print(pos_tags)
    # 3. Lowercase and lemmatise
    if lang=="ru":
      lemmatiser = pymorphy2.MorphAnalyzer()
      tokens = [lemmatiser.parse(tok)[0].normal_form for tok in tokens]
    else:
      lemmatiser = WordNetLemmatizer()
      tokens = [lemmatiser.lemmatize(t.lower(), pos=pos_map.get(p[0], 'v')) for t, p in pos_tags]
    return tokens

def Preprocess_text_multi_lang(text, stopwords,lang="en"):
    # 1. Tokenise to alphabetic tokens
    text = remove_numbers(text)
    text = remove_http(text)
    text = remove_punctuation(text) #remove_punctuation(text)
    text = convert_to_lower(text)
    text = remove_white_space(text)
    #text = remove_short_words(text)
    tokens = toknizing(text, stopwords)

    # 2. POS tagging
    pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    pos_tags = pos_tag(tokens)
    #print(pos_tags)
    # 3. Lowercase and lemmatise
    if lang=="ru":
      lemmatiser = pymorphy2.MorphAnalyzer()
      tokens = [lemmatiser.parse(tok)[0].normal_form for tok in tokens]
    else:
      lemmatiser = WordNetLemmatizer()
      tokens = [lemmatiser.lemmatize(t.lower(), pos=pos_map.get(p[0], 'v')) for t, p in pos_tags]
    return tokens

#################################3
def convert_to_lower(text):
  return text.lower()
####################################################################################
def remove_numbers(text):
  text = re.sub(r'\d+' , '', text)
  return text

def remove_http(text):
  text = re.sub("https?:\/\/t.co\/[A-Za-z0-9]*", ' ', text)
  return text
#################################################
def remove_short_words(text):
  text = re.sub(r'\b\w{1,2}\b', '', text)
  return text
####################################################################################
def remove_punctuation_without(text):
     punctuations = '''()[]{}:'"\<>`?@№=#$%^+&*_~'''
     no_punct = ""

     for char in text:
        if char not in punctuations:
            no_punct = no_punct + char
     return no_punct
#####################################################
def remove_punctuation(text):
     punctuations = '''!()[]{};«№»:'"\,`<>./?@=#$-(%^)+&[*_]~'''
     no_punct = ""

     for char in text:
        if char not in punctuations:
            no_punct = no_punct + char
     return no_punct


####################################################################################
def remove_white_space(text):
  text = text.strip()
  return text
####################################################################################

def remove_stopwords(tokens, stopwords=None):
    if not stopwords:
        return tokens
    stopwords = set(stopwords)
    tokens = [tok
              for tok in tokens
              if tok not in stopwords]
    return tokens
#---------------------------------------------------------------
def toknizing(text,stopwords=None):
  tokens = word_tokenize(text)
  ## Remove Stopwords from tokens
  if stopwords:
    result = remove_stopwords(tokens,stopwords)
  return result
###################################################################################
def remove_duplication(text):
  return list(set(text))
###################################
## for tfidf vectors
def get_tokened_docs(data_DF):
  doclist= []
  for d in  data_DF['text']:
    doclist.append(d)

  tokened_docs=[]
  for d in doclist:
    doc_as_token = Preprocess_text_multi_lang(d)
    doc_as_text = ' '.join(doc_as_token)
    tokened_docs.append(doc_as_text)
  return tokened_docs
################################################

#استخراج الكلمات ذات وزن اكبر من متوسط اوزان كلمات المستند
def get_important_words_by_tfidf(doc_vectors):
  from statistics import mean
  new_doc_list =[]
  for doc_vect in doc_vectors:
    #get mean value of dict elements tfidf values
    thr_mean= mean(doc_vect[k] for k in doc_vect)
    #print(thr_mean)
    d = dict((k, v) for k, v in doc_vect.items() if v >= thr_mean)
    new_doc_list.append(d)
  return new_doc_list
##########################
# Convert the dictionary for each document into a list of keyword names only without weight
def get_doc_features_list(new_doc_list):
  doc_list_important = []
  for doc in new_doc_list:
    ##we need only the value
    doc_features = list(doc.keys())
    doc_list_important.append(doc_features)
  return doc_list_important

##################################
def Convert_list_of_list_to_list(all_tokens):
  tokens = []
  for words in all_tokens:
    for i in words:
      tokens.append(i)
  return tokens

########################################
# Clean the basic keywords and remove the spaces and noise
import ast
def clear_orginal_kw(orginal_kw,stopword, lango):
  orginal_kw_clean =[]
  for doc_wk in orginal_kw:
    doc_wk = ast.literal_eval(doc_wk)
    temp =[]
    for t in doc_wk:
      tt = ' '.join(Preprocess_text_multi_lang(t,stopword, lang= lango))
      if len(tt.split())>0:
        temp.append(tt)
    orginal_kw_clean.append(temp)
  return orginal_kw_clean
#############################################
#clean list of single document keywords
def clean_resulted_kw(list_of_doc_kw):
  pro_kw=[]
  for kw in list_of_doc_kw:
    text = remove_numbers(kw)
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_white_space(text)
    pro_kw.append(text)
  return pro_kw
#####################
#get max min value of dictionary
def keywithmaxval(d):
     v=list(d.values())

     k=list(d.keys())
     return k[v.index(min(v))], min(v), k[v.index(max(v))], max(v)
###############################################
# get max val for each dictionary and div it by max
def normalize_dictionary(list_of_dict):
  for doc_dic in list_of_dict:
    m,mv, ma, mav = keywithmaxval(doc_dic)

    for k,v in doc_dic.items():
      vv = v/mav
      doc_dic.update({k:vv})
  return list_of_dict
############################################################################
chunk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

######################################chunck all docs
def all_doc2chunk(dtf,chunk_tokenizer):
  document_as_chunk_sent = []
  for doc in dtf:
    cunked_doc = doc2chunk(doc, chunk_tokenizer)
    document_as_chunk_sent.append(cunked_doc)
  return document_as_chunk_sent

##################################################### chunck single document
def doc2chunk(doc , chunk_tokenizer):
  doc_as_chunk = []
  sents = chunk_tokenizer.tokenize(doc) # list of sent

  for sent in sents:
    chunks = re.split('; |, |! ',sent)
    chunk_of_token = []
    for ch in chunks:
      if len(ch)> 0:
        tokenss = ch.split()

        #check tokens purity
        tokens_in_chunk = [] # to fill clean tokens only
        for t in tokenss:
          if t not in my_stopwords and len(t)>2 and not t.isdigit():
            tokens_in_chunk.append(t)

      if len(tokens_in_chunk) > 0:
        chunk_of_token.append(tokens_in_chunk)  # save token in chunk list

    doc_as_chunk.append(chunk_of_token)        # list of sentences chunks

  return doc_as_chunk
#####################################################################################3

##################################################### chunk single document
def doc2chunk_str(doc):
  chunk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
  doc_as_chunk = []
  sents = chunk_tokenizer.tokenize(doc) # list of sent

  for sent in sents:
    chunks = re.split('; |, |! ',sent)
    chunk_of_token = []
    for ch in chunks:
      if len(ch)> 0:
        tokenss = ch.split()

        #check tokens purity
        tokens_in_chunk = [] # to fill clean tokens only
        for t in tokenss:
          if t not in my_stopwords and len(t)>2 and not t.isdigit():
            tokens_in_chunk.append(t)

      if len(tokens_in_chunk) > 0:
        chunk_of_token.append(tokens_in_chunk)  # save token in chunck list

    doc_as_chunk.append(chunk_of_token)        # list of sentences chunks
  new_d = ' '.join(Convert_list_of_list_to_list(Convert_list_of_list_to_list(doc_as_chunk)))
  return doc_as_chunk
###################################################33
def check_kw_chunk(kword, document_as_chunk):
  is_kw = 0
  tokens = kword.split()   # keyword words
  token_len = len(tokens )  #number of words in kw

  for sent in  document_as_chunk:
    for chunks in sent:
      token_in_ch =0  # count the words in one chunk
      for t in tokens:
        if t in chunks:
          token_in_ch = token_in_ch+1
      if token_in_ch == token_len:
        # all token were appeared in one chunk
        is_kw =1
        break;
      else:
        continue;

  return is_kw

#####################################################################################
def from_chunk_to_tokens(doc):
  new_doc= doc.split()
  return new_doc

#################################### get the longest key words
def get_longest_ky_for_doc(orginal_doc_kw):
  longest= orginal_doc_kw[0]
  index =0
  for i, kw in enumerate(orginal_doc_kw):
    if len(kw)> len(longest):
      index = i

  return index

##########################################
def get_longest_ky(orginal_kw):
  longest = 1
  for doc_kw in orginal_kw:
    indx = get_longest_ky_for_doc(doc_kw)
    long  = len(doc_kw[indx].split())
    if long > longest:
      longest = long
  return longest
####################################
############################################################################################
################################################
def check_chunk_tf(doc_sorted_tfidfs, document_as_chunk_sent):
  ranked_chunked_kywords =[]
  for doc_id, ranked_doc in enumerate(doc_sorted_tfidfs):
    tot = 0
    #list_kw= []
    dict_kw= {}
    for t, v in ranked_doc.items():
      x= check_kw_chunk(t, document_as_chunk_sent[doc_id])
      if x == 1:
        tot = tot + 1
        #list_kw.append(t)
        dict_kw.update({t:v})
    ranked_chunked_kywords.append(dict_kw)

  return ranked_chunked_kywords

##################################################################################
##################################  Defining Our Methods Here    ##################################################
# Save the pre_trained model embeddings..
def save_embeddings(model, file_path):
  word_vectors = model.wv
  word_vectors.save(file_path)
###############################################3
### install pre_trained_model
import gensim.downloader
def intall_pre_trained_model(file_path, model_name):
  model = gensim.downloader.load(model_name)
  save_embeddings(model , file_path)
  return model
  print('the models are intalled and saved in the " ' + file_path + '"')

#######################################################33
######################################
### loading the embeddings
def Load_pre_trained_model_embeddings(file_path):
  wv = KeyedVectors.load(file_path, mmap='r')
  return wv

##############################################################################################
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

import math
def score_fastTF(tf_scores,docs_from_fasttext ,B):
  doc_scored_kw = []

  for doc_id in range(len(docs_from_fasttext)):
    Keyords_output = {}
    current_doc = docs_from_fasttext[doc_id] # dictionary token : sim
    current_doc_tf= tf_scores[doc_id]

    for word , sim in current_doc.items():
      #print(word, sim)
      res = (B*B+1)*current_doc_tf[word]*sim/(B*B*current_doc_tf[word]+sim)
      #res = (current_doc_tf[word]+sim)/2
      Keyords_output.update({word:res})

    # store output for each document
    doc_scored_kw.append(Keyords_output)
  return doc_scored_kw
#################################################################
def score_keywords(doc_keywords,doc_tfidfs, Sbert  ,B):
  doc_scored_kw = []

  for doc_id in range(len(doc_keywords)):
    Keyords_output = {}
    current_doc_tfidfs =  doc_tfidfs[doc_id]
    for i, w in enumerate(doc_keywords[doc_id]):
      #mul= (2*B*current_doc_tfidfs[i][1]*Sbert[doc_id][i])/(B*current_doc_tfidfs[i][1]+Sbert[doc_id][i])
      s= Sbert[doc_id][i] * math.exp(Sbert[doc_id][i]/len(i.split()))
      print("s ", s, "old ",  Sbert[doc_id][i])
      mul= (2*B*current_doc_tfidfs[w]*s)/(B*current_doc_tfidfs[w]+s)
      Keyords_output.update({w:mul})

    doc_scored_kw.append(Keyords_output)
  return doc_scored_kw

############################
def rank_kywords(all_doc_kw):
  ranked_kywords = []
  for doc_id in range(len(all_doc_kw)):
    sorted_doc_kw = sorted(all_doc_kw[doc_id].items(), key=lambda x: x[1], reverse=True)
    ranked_kywords.append(sorted_doc_kw)
  return ranked_kywords

################################################
def check_chunk(ranked_kywords, document_as_chunk_sent):
  ranked_chunked_kywords =[]
  for doc_id, ranked_doc in enumerate(ranked_kywords):
    tot = 0
    list_kw= []
    for t, v in ranked_doc:
      x= check_kw_chunk(t, document_as_chunk_sent[doc_id])
      if x == 1:
        tot = tot + 1
        list_kw.append(t)
    ranked_chunked_kywords.append(list_kw)

  return ranked_chunked_kywords
#
########################################################
def get_exact_matching_kws(final_kywords, lem_orginal_kw):
  docs_matched_kw = []
  sum_of_tp = 0
  sum_of_FP = 0
  g_truth= 0
  for doc_id in range(len(final_kywords)):
    len_orgin = len(lem_orginal_kw[doc_id])

    # topN == number of groud truth
    top_15 = final_kywords[doc_id][0:len_orgin]
    obwe = get_exact_intersect(lem_orginal_kw[doc_id], top_15)
    docs_matched_kw.append(obwe)

    right = len(list(set(obwe)))
    wrong = len_orgin - right

    sum_of_tp = sum_of_tp + right
    g_truth = g_truth + len_orgin
    sum_of_FP = sum_of_FP + wrong
  F =0
  if sum_of_tp !=0:
    Precision = sum_of_tp / (sum_of_tp + sum_of_FP)
    Recall = sum_of_tp/ g_truth
    F = 2 * Precision * Recall / (Precision + Recall)
  return docs_matched_kw , F
##################################
def get_matching_kws(final_kywords, lem_orginal_kw):
  docs_matched_kw = []
  sum_of_tp = 0
  sum_of_FP = 0
  g_truth= 0
  for doc_id in range(len(final_kywords)):
    len_orgin = len(lem_orginal_kw[doc_id])

    # topN == number of groud truth
    top_15 = final_kywords[doc_id][0:len_orgin]
    obwe = get_intersect(lem_orginal_kw[doc_id], top_15)
    docs_matched_kw.append(obwe)

    right = len(list(set(obwe)))
    wrong = len_orgin - right

    sum_of_tp = sum_of_tp + right
    g_truth = g_truth + len_orgin
    sum_of_FP = sum_of_FP + wrong
  F =0
  if sum_of_tp !=0:
    Precision = sum_of_tp / (sum_of_tp + sum_of_FP)
    Recall = sum_of_tp/ g_truth
    F = 2 * Precision * Recall / (Precision + Recall)
  return docs_matched_kw , F
##########################################################3
def get_intersect1(doc_orginal_kw, doc_my_kw):
  general = []
  for kw in doc_my_kw:
    for kww in doc_orginal_kw:
      l_my = len(kw.split())
      l_org = len(kww.split())
      if (kw == kww):
        #print("exact matching ========", kw, kww)
        if kww not in general:
          general.append(kww)
      elif (is_subset(kw.split(), kww.split()) == 1):
        #print("result : ",kw, "==is sub === set from :",kww)
        if kww not in general:
          general.append(kww)
      elif (is_subset(kww.split(), kw.split()) == 1) and (l_org >= 2):
        #print(" org: ",kww, "===is sub===== my set from:",kw)
        if kww not in general:
          general.append(kww)
  return general
#####################################################3333
def get_intersect(doc_orginal_kw, doc_my_kw):
  general = []
  for kw in doc_my_kw:
    for kww in doc_orginal_kw:
      l_my = len(kw.split())
      l_org = len(kww.split())
      if (kw == kww):
        #print("exact matching ========", kw, kww)
        if kww not in general:
          general.append(kww)
      elif (is_subset(kw.split(), kww.split()) == 1):
        #print("result : ",kw, "==is sub === set from :",kww)
        if kww not in general:
          general.append(kww)

  return general
#################################
def get_exact_intersect(doc_orginal_kw, doc_my_kw):
  general = []
  for kw in doc_my_kw:
    for kww in doc_orginal_kw:
      l_my = len(kw.split())
      l_org = len(kww.split())
      if (kw == kww):
        #print("exact matching ========", kw, kww)
        if kww not in general:
          general.append(kww)
  return general
########################################################
def is_subset(sub_list, test_list):
  # using all() to check subset of list
  flag = 0
  if(all(x in test_list for x in sub_list)) and len(sub_list)> 1 and len(test_list)> 1:
      flag = 1
  return flag
##############################
sp = spacy.load('en_core_web_sm')
def get_lemmatized_orginal_kw(orginal_kw):
  all_lemmatized_orginal_kw = []
  for doc_id in range(len(orginal_kw)):
    lemmatized_orginal_kw = []
    for t in orginal_kw[doc_id]:
      term_words = t.split()
      #print("befor ",term_words)
      whole_term =[]

      for w in term_words:
        lemmatized_w = [w.lemma_ for w in sp(w)][0]
        #print(lemmatized_w)
        x= lemmatized_w #
        whole_term.append(x)

      if len(whole_term)> 0:
        #print("after ",whole_term)
        lemmatized_orginal_kw.append(' '.join(whole_term))
    all_lemmatized_orginal_kw.append(lemmatized_orginal_kw)
  return all_lemmatized_orginal_kw

#################################################33
#####################33
def filter_kw(doc_kw, N):
  top_N = doc_kw[0:N]
  for kw in doc_kw:
    #print("kw ",kw)
    if len(kw.split()) == 1:
      occ, kw_lens = singl_kw_occur_in_topN(kw, top_N)
      #print("occ = ",occ)
      if (occ >= 2):
        doc_kw.remove(kw)
        filter_kw(doc_kw, N)

    if len(kw.split())  == 2:
      occ, kw_lens = double_kw_occur_in_topN(kw, top_N)
      #print("occ = ",occ)
      if (occ >= 2):
        #print("remove", kw)
        doc_kw.remove(kw)
        #print("after remove ==============", len(doc_kw))
        filter_kw(doc_kw, N)

  return doc_kw


##########################################

# get عدد مرات ظهور كلمة في الكلمات التوب واطوال الكلمات التي ظهرت بها
def singl_kw_occur_in_topN(kw, top_N):
  c=0
  kw_len = []
  for t in top_N:
    if t.split().count(kw) > 0:
      c= c + 1
      kw_len.append(len(t.split()))
  if c>1:
    c=c-1
  return c, kw_len[1:]

################################################333333
def double_kw_occur_in_topN(kw, top_N):
  c=0
  kw_len = []
  for t in top_N:
    t_words = t.split()
    #print("kw ", kw," t_words ",t_words)
    if is_subset(kw.split(),t_words) == 1:
      c= c + 1
      kw_len.append(len(t_words))
  if c>1:
    c=c-1
  return c, kw_len[1:]

  #######################################
def saveList(myList,filename):
    # the filename should mention the extension 'npy'
    np.save(filename,myList)
    print("Saved successfully!")
def loadList(filename):
    # the filename should mention the extension 'npy'
    tempNumpyArray=np.load(filename, allow_pickle=True)
    return tempNumpyArray.tolist()



############ Average p@k and mean avg P@k metric
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])


#اعادة المستند من المقاطع الى شكله كنص متصل
#### select self or pretrained tokens
def from_chunk_to_text(document_as_chunk_sent):
  docs_as_str = []
  for doc in document_as_chunk_sent:
    #print(doc)
    new_d = ' '.join(Convert_list_of_list_to_list(Convert_list_of_list_to_list(doc)))
    docs_as_str.append(new_d)

  return docs_as_str

#########################
# calc the terms freq and mean and std dev
import statistics
from statistics import mean

def term_freq_ngrams(ngram):
  for doc_dic in ngram:
    ngramSum =0
    ngramSum = sum(doc_dic.values())   # ngram sum
    #m = mean(doc_dic.values())
    #st_dev = statistics.pstdev(doc_dic.values())
    for k, v in doc_dic.items():
      doc_dic.update({k:v/ngramSum})
  return ngram


########################
def sort_ngrams(important_ngrams):
  doc_sorted_tfs =[]  # list of doc, features each with tf weight
  #sort each dict of a document
  for dn in important_ngrams:
    newD = sorted(dn.items(), key=lambda x: x[1], reverse=True)
    newD = dict(newD)
    doc_sorted_tfs.append(newD)
  return doc_sorted_tfs


def document_candidate_representation(cleaned_dtf, doc_list_condidates , nr_candidates, bert_model, msg=None):
  document_candidate = []
  doc_list_emb = []
  can_list_emb = []
  for i, doc in enumerate(cleaned_dtf):
    if msg != None:
      print("Documnent №", i, "done ======================================================")
    doc_embedding = bert_model.encode([doc])  # document embedding
    doc_condidates_token = doc_list_condidates[i][:nr_candidates]
    candidate_embeddings = bert_model.encode(doc_condidates_token) #embedding each condidate word

    can_list_emb.append(candidate_embeddings)
    doc_list_emb.append(doc_embedding)
  document_candidate.append(can_list_emb)
  document_candidate.append(doc_list_emb)
  return document_candidate
###################################
#convert list of tuples to a list
def tuple_keywords_to_list(ranked_kywords):
  ranked_list_kywords = []
  for i, ranked_doc in enumerate(ranked_kywords):
    list_kw= []
    for t, v in ranked_doc:
        list_kw.append(t)
    ranked_list_kywords.append(list_kw)

  return ranked_list_kywords
################################################
def norm_Bert_similarities(document_candidate_vectors, doc_list_condidates):
  docs_from_BERT_normalized = []
  #docs_from_BERT = []
  can_list_emb = document_candidate_vectors[0]
  doc_list_emb = document_candidate_vectors[1]

  for i , doc_emb in enumerate(doc_list_emb):

    doc_condidates_token = doc_list_condidates[i][:300]
    candidate_embeddings = can_list_emb[i]
    doc_embedding = doc_list_emb[i]

    #Similarity between each doc and its 300 candidates
    similarities = cosine_similarity(doc_embedding.reshape(1,-1), candidate_embeddings)     # similarity d with each of its words

    # Normalize similarity values by s= s*e^s/|t|
    ss = [s*math.exp(s/len(doc_condidates_token[i].split())) for i,s in enumerate(similarities.tolist()[0])]

    #Normalize values by formula X_new = (X - X_min)/(X_max - X_min)
    X_min = min(ss)
    X_max = max(ss)
    if X_max==X_min:
      print("Max = mean at ",i)
    norm_sim = [(x - X_min)/(X_max - X_min) for x in ss if X_max!=X_min]
    #store similarity values as dictionary  candidates : sim_val
    #dictionary = dict(zip(doc_condidates_token, ss))
    norm_dictionary = dict(zip(doc_condidates_token, norm_sim))


    #docs_from_BERT.append(dictionary)
    docs_from_BERT_normalized.append(norm_dictionary)
  return docs_from_BERT_normalized
#####################################################################333

### Upload dataset

In [10]:
#Read df from csv on drive
df_path = "/content/drive/MyDrive/PHD/Data/Cleaned_DF/wiki20_DF_text_kw"

dtf = pd.read_csv(df_path, sep='\t', encoding='utf-8')
#clean text without removing [.|,|?]
dtf['cleaned_chunked_text'] = dtf.text.apply(lambda x: ' '.join(Preprocess_text_multi_lang_chunk(x,en_stopwords,lang="en")))
#clean text applying all the text preprocessing functions
dtf['finally_cleaned_text'] = dtf.text.apply(lambda x: ' '.join(Preprocess_text_multi_lang(x,en_stopwords,lang="en")))
dtf.head()

Unnamed: 0.1,Unnamed: 0,goldkeys,text,cleaned_chunked_text,finally_cleaned_text
0,0,"['Algorithm', ' Asynchronous learning', ' Clie...",\n The Internet Software Visualization Labo...,internet software visualization laboratory abs...,internet software visualization laboratory abs...
1,1,"['Abstract data type', ' Assertion (computing)...",\n Mutable Object State for Object-Oriented...,mutable object state object-oriented logic pro...,mutable object state objectoriented logic prog...
2,2,"['Algorithm', ' Artificial intelligence', ' Ca...",\n Proceedings of the First International W...,proceeding international workshop multistrateg...,proceeding international workshop multistrateg...
3,3,"['Abstraction (computer science)', ' American ...",\n Observations and Recommendations on the ...,observation recommendation internationalisatio...,observation recommendation internationalisatio...
4,4,"['Algorithm', ' Cluster analysis', ' Data mini...",\n Clustering Full Text Documents\n \n ...,cluster text document abstract topic hierarchy...,cluster text document abstract topic hierarchy...


# **OneClick**

In [35]:
### clean gold kw
orginal_kw  = clear_orginal_kw(dtf['goldkeys'], my_stopwords, "en")

In [18]:
import time
def FBKE(dtf , nr_candidates, bert_model ,chunk_tokenizer,beta):

    print("Start text preprocessing .........")
    start_time = time.time()

    document_as_chunk_sent = all_doc2chunk(dtf['cleaned_chunked_text'] , chunk_tokenizer)
    print("All documents were chunked .... preprocessing done.")
    docs_as_string = from_chunk_to_text(document_as_chunk_sent)

    print("Calculate TF for each ngram type ........ done")
    unigram = []
    bigram = []
    trigram = []

    for n in [1,2,3]:
      vectorizer = CountVectorizer(tokenizer=from_chunk_to_tokens, ngram_range=(n,n))
      vectors = vectorizer.fit_transform(docs_as_string)
      index_value = {i[1]:i[0] for i in vectorizer.vocabulary_.items()}
      if n == 1:
        for row in vectors:
          unigram.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})
      elif n == 2:
        for row in vectors:
          bigram.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})

      else:
        for row in vectors:
          trigram.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})

    print("Merge all n-gram types in one dictionary ngram:tf ..........done")
    #Calculate Frequencies ngram_count/all_ngram_count

    Unigram= term_freq_ngrams(unigram)
    Bigram = term_freq_ngrams(bigram)
    Trigram =term_freq_ngrams(trigram)

    #Store all ngram in one dict.
    for i, dic in enumerate(Unigram):
      dic.update(Bigram[i])
      dic.update(Trigram[i])

    all_ngram = Unigram

    print("Sort n-gram keywords by TF values ........ done")

    ############################################## get ngram term with tf> mean tf
    important_ngrams = get_important_words_by_tfidf(all_ngram)
    doc_sorted_tfs = sort_ngrams(important_ngrams)

    #  اختيار المرشحات التي ضمن نفس الجملة
    print("Filter and select n-gram keywords that belong to one chunk ........done")
    TF_normalized_weights  = check_chunk_tf(doc_sorted_tfs, document_as_chunk_sent)

    # print the names of candidates only
    doc_list_condidates = get_doc_features_list(TF_normalized_weights) # only features names

    print("Representing documents and their candidates using BERT:")
    document_candidate = document_candidate_representation(dtf['finally_cleaned_text'], doc_list_condidates , nr_candidates, bert_model, None)

    print("Representing is Done ........done")

    #saveList(document_candidate, "/content/drive/MyDrive/PHD/Data/emb_bert/new_bert_wiki20.txt" )
    #document_candidate = loadList("/content/drive/MyDrive/PHD/Data/emb_bert/new_bert_wiki20.txt.npy")

    print("Calculating similarities by BERT ........done")
    BERT_normalized_weights = norm_Bert_similarities(document_candidate, doc_list_condidates)

    print("Done with the excution time --- %s seconds ---" % (time.time() - start_time), "\n")
    scored_doc_kw = score_fastTF(TF_normalized_weights, BERT_normalized_weights , beta)
    #rank keywords by scores
    ranked_scrored_kywords = rank_kywords(scored_doc_kw)

    # check for chuncks
    ranked_chunked_kywords  = tuple_keywords_to_list(ranked_scrored_kywords)


    return ranked_chunked_kywords

In [19]:
nr_candidates = 300 # from each doc we neddd only 300
chunk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#old function without beta
#BERT_normalized_weights , TF_normalized_weights  = FBKE(dtf,nr_candidates,bert_model, chunk_tokenizer)
ks  = FBKE(dtf,nr_candidates,bert_model, chunk_tokenizer, 10)

Start text preprocessing .........
All documents were chunked .... preprocessing done.
Calculate TF for each ngram type ........ done




Merge all n-gram types in one dictionary ngram:tf ..........done
Sort n-gram keywords by TF values ........ done
Filter and select n-gram keywords that belong to one chunk ........done
Representing documents and their candidates using BERT:
Representing is Done ........done
Calculating similarities by BERT ........done
Done with the excution time --- 174.87393021583557 seconds --- 



### **Calculate MAP@k For Many Beta values**

In [37]:
Beta = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1 , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 , 1.7, 1.8, 1.9, 2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10,10.5,11,12,13,14,15,16,17,18,19,20,30,40,50, 100,200,300,400,500, 1000]
#Beta=[1]

MAP = []
k= 10
for b in Beta:

  # check for chuncks
  ranked_chunked_kywords  = ks
  mpak= mapk(orginal_kw, ranked_chunked_kywords, k)
  # print(b, "mean average precession  @",k,  '=  {0:.4g}'.format(mpak))
  MAP.append(mpak)


resultsMaP  = dict(zip(Beta, MAP))
print("The best result is ",max(MAP))
print("For b=1 MAP@",k, " = ", resultsMaP.get(1))

The best result is  0.16872023809523812
For b=1 MAP@ 10  =  0.16872023809523812
