In [3]:
# Topic slug -> Russian display label.
# The slugs double as folder names under ../articles/ and the dict order is
# the order in which the topics are processed by the driver cell.
TOPICS = {
    "economic": "Экономика",
    "culture": "Культура",
    "medicine": "Медицина",
    "science": "Наука",
    "tech": "Технологии",
    "tengri-sport": "Спорт",
    "life": "Жизнь",
    "show": "Шоу-бизнес",
    "accidents": "Происшествия",
    "crime": "Преступность",
}

# NOTE(review): not referenced by any visible cell; presumably the base URL of
# the site the articles were collected from - confirm before removing.
URL = "https://tengrinews.kz"


# TF-IDF for articles

In [None]:
!pip install dateparser
!pip install --user -U nltk
!pip install pystemmer
!pip install langdetect
!pip install pymorphy2

In [7]:
import re
import string
import math
from collections import defaultdict
import pymorphy2
import Stemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
# Tokenizer models needed by sent_tokenize / word_tokenize.
# Recent NLTK releases (3.9+) load them from the 'punkt_tab' resource, older
# ones from 'punkt'. NLTK is installed with -U above, so fetch both; an unknown
# resource id only prints an error and returns False.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/aitugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def clean_text(text):
    """Blank out typographic symbols, digits and stray whitespace characters.

    Every occurrence of one of the listed characters is replaced by a single
    space, so the result has the same length as the input.
    """
    unwanted = ['«', '»', '“', '”', '•', '\xa0', '\r','\t', '…', '–','—','№','0','1','2','3','4','5','6','7','8','9','„','‟']
    # One character class matching any single unwanted character.
    char_class = '[' + re.escape(''.join(unwanted)) + ']'
    return re.compile(char_class).sub(' ', text)

def tokenize_words(text):
    """Lowercase `text`, drop ASCII punctuation and split it on single spaces.

    Pieces that are empty or consist only of whitespace are discarded; the
    remaining pieces are stripped of surrounding whitespace.

    NOTE(review): a later cell defines another `tokenize_words` that takes a
    dict of sentence lists. Once that cell has run, this definition is
    shadowed and can no longer be called under this name.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    tokens = []
    for piece in without_punctuation.lower().split(" "):
        trimmed = piece.strip()
        if trimmed:
            tokens.append(trimmed)
    return tokens

def contains_kazakh_letters(text):
    """Return True if `text` contains at least one letter from the pattern.

    The pattern lists Cyrillic letters used in Kazakh (for example ә, қ, ұ) in
    both cases; the search is case-insensitive.
    """
    kazakh_letters_pattern = r'[әіңғүұқөһӘІҢҒҮҰҚӨҺ]'
    # A single hit is enough, so stop at the first match.
    return re.search(kazakh_letters_pattern, text, re.IGNORECASE) is not None


In [9]:
def tokenize_sentences(topic_pages):
    """Split the articles of every topic into sentences.

    Args:
        topic_pages: mapping topic -> list of raw article texts.

    Returns:
        dict mapping every topic to a list with one entry per kept article.
        Each entry is the list of sentence strings returned by
        ``sent_tokenize`` for the cleaned article text. Articles for which
        ``contains_kazakh_letters`` is true are skipped, so a topic whose
        articles are all skipped maps to an empty list.
    """
    sent_tokenized_page = {}
    for topic, pages in topic_pages.items():
        # Sentences are stored exactly as tokenized (case and punctuation
        # intact). The previous version also built a lowercased,
        # punctuation-free copy of each sentence but never stored it; that
        # no-op loop was removed instead of activated so results stay the same.
        # NOTE(review): sent_tokenize runs with its default language although
        # the articles appear to be Russian - consider language='russian'.
        sent_tokenized_page[topic] = [
            sent_tokenize(clean_text(page))
            for page in pages
            if not contains_kazakh_letters(page)
        ]

    return sent_tokenized_page

In [10]:
def tokenize_words(sent_tokenized_page):
    """Word-tokenize every sentence of every page.

    Args:
        sent_tokenized_page: mapping topic -> list of pages, each page being a
            list of sentence strings.

    Returns:
        dict topic -> list of pages, each page being a list of token lists
        (one per sentence, produced by ``word_tokenize``). A topic without
        pages does not appear in the result.

    Note: this redefines the `tokenize_words(text)` helper declared in an
    earlier cell; after this cell runs the name refers to this function.
    """
    tokens_by_topic = {}
    for topic, pages in sent_tokenized_page.items():
        for page in pages:
            page_tokens = [word_tokenize(sentence) for sentence in page]
            # The topic key is created only when its first page is stored.
            tokens_by_topic.setdefault(topic, []).append(page_tokens)

    return tokens_by_topic


In [11]:
def morph_words(topic_articles_dict):
    """Extract adjective + noun phrases from the articles of every topic.

    The texts are sentence- and word-tokenized, every token is analysed with
    pymorphy2 (only the first parse is used), and each noun that directly
    follows a full or short adjective (POS ``ADJF`` / ``ADJS``) yields a phrase.

    Args:
        topic_articles_dict: mapping topic -> list of raw article texts.

    Returns:
        dict topic -> list with one entry per tokenized article. Each entry is
        a list of ``[stemmed_phrase, normalized_phrase]`` pairs:
        ``stemmed_phrase`` joins the stems of the adjective and the noun, and
        ``normalized_phrase`` joins the adjective's normal form (inflected to
        the gender of the noun's normal form when pymorphy2 can do so) with
        the noun's normal form. Topics without any tokenized article are
        absent from the result.
    """
    sent_tokenized_page = tokenize_sentences(topic_articles_dict)
    word_tokenized_page = tokenize_words(sent_tokenized_page)
    morph = pymorphy2.MorphAnalyzer()
    stemmer = Stemmer.Stemmer('russian')

    morphed_words_no_ending = {}
    for topic, pages in word_tokenized_page.items():
        for page in pages:
            page_data = []
            for sentence in page:
                lowered = [token.lower() for token in sentence]
                # Parse every token once: each token is inspected both as a
                # candidate noun and as the candidate adjective of its successor.
                parses = [morph.parse(token)[0] for token in lowered]

                # Start at 1: the first token has no predecessor. Starting at 0
                # would index sentence[-1] and pair a sentence-initial noun
                # with the last token of the sentence.
                for i in range(1, len(lowered)):
                    noun_parse = parses[i]
                    adj_parse = parses[i - 1]
                    if noun_parse.tag.POS != 'NOUN':
                        continue
                    if adj_parse.tag.POS not in ('ADJF', 'ADJS'):
                        continue

                    cut_adj = stemmer.stemWord(lowered[i - 1])
                    cut_noun = stemmer.stemWord(lowered[i])

                    normal_noun = noun_parse.normal_form
                    noun_gender = morph.parse(normal_noun)[0].tag.gender
                    adj_lemma = morph.parse(adj_parse.normal_form)[0]

                    # Default to the adjective's normal form; replace it with
                    # the gender-agreeing form when such a form exists.
                    normal_adj = adj_lemma.word
                    if noun_gender in ('masc', 'femn', 'neut'):
                        agreed = adj_lemma.inflect({noun_gender})
                        if agreed:
                            normal_adj = agreed.word

                    page_data.append([cut_adj + " " + cut_noun, normal_adj + " " + normal_noun])

            morphed_words_no_ending.setdefault(topic, []).append(page_data)
    return morphed_words_no_ending


In [12]:
def count_word_tf_by_roots(roots):
    """Compute term frequencies of phrases page by page, grouped by topic.

    Args:
        roots: mapping topic -> list of pages, each page being a list of
            ``[stem, normalized]`` phrase pairs.

    Returns:
        Nested defaultdict ``topic -> stem -> [tf, normalized]`` where ``tf``
        is the number of occurrences of the ``(stem, normalized)`` pair on a
        page divided by the number of phrases on that page. Entries are keyed
        by stem only, so a later page (or a later pair sharing the stem)
        overwrites the value stored earlier for that stem. A topic whose pages
        contain no phrases gets no entry.
    """
    word_tf = defaultdict(lambda: defaultdict(list))
    for topic, pages in roots.items():
        for page in pages:
            # Occurrences per (stem, normalized) pair, in first-seen order.
            occurrences = {}
            for entry in page:
                pair = (entry[0], entry[1])
                occurrences[pair] = occurrences.get(pair, 0) + 1

            page_size = len(page)
            for (stem, normalized), hits in occurrences.items():
                word_tf[topic][stem] = [hits / page_size, normalized]

    return word_tf

In [13]:
def count_word_idf_by_roots(roots):
    """Compute inverse document frequencies of phrases over all pages.

    Every page of every topic counts as one document. For each
    ``(stem, normalized)`` pair the IDF is ``log(N / df)`` where ``N`` is the
    total number of pages in `roots` and ``df`` is the number of pages that
    contain the pair.

    The previous version recomputed the values inside the topic loop using
    only the current topic's page count against document frequencies
    accumulated over all topics so far. That gave wrong (possibly negative)
    values for more than one topic and ``log(0)`` when the last topic had no
    pages. For a single topic the result is unchanged.

    Args:
        roots: mapping topic -> list of pages, each page being a list of
            ``[stem, normalized]`` phrase pairs.

    Returns:
        defaultdict ``stem -> [idf, normalized]``. Entries are keyed by stem
        only; if several pairs share a stem, the pair first seen last wins.
    """
    word_cnt = defaultdict(int)
    total_pages = 0

    for pages in roots.values():
        total_pages += len(pages)
        for page in pages:
            # dict.fromkeys de-duplicates within the page while keeping
            # first-seen order, which keeps the result deterministic.
            for pair in dict.fromkeys((phrase[0], phrase[1]) for phrase in page):
                word_cnt[pair] += 1

    word_idf = defaultdict(list)
    for (stem, normalized), doc_freq in word_cnt.items():
        word_idf[stem] = [math.log(total_pages / doc_freq), normalized]

    return word_idf


In [14]:
def calculate_tf_idf_by_roots(roots, words_tf, words_idf):
    """Combine TF and IDF values into a ranked phrase list per topic.

    Args:
        roots: mapping topic -> list of pages, each page being a list of
            ``[stem, normalized]`` phrase pairs.
        words_tf: mapping ``topic -> stem -> [tf, normalized]``.
        words_idf: mapping ``stem -> [idf, normalized]``.

    Returns:
        dict topic -> list of ``(stem, tf * idf)`` tuples sorted by score,
        highest first (ties keep first-seen order). One tuple is produced per
        distinct ``(stem, normalized)`` pair, so a stem can occur more than
        once. Topics without pages are omitted.
    """
    word_tf_idf_by_topic = {}

    for topic, pages in roots.items():
        if not pages:
            # Unchanged contract: a topic with no pages gets no entry.
            continue

        # .get avoids inserting an empty entry into a defaultdict passed by
        # the caller and tolerates a topic that has no TF data at all.
        topic_tf = words_tf.get(topic, {})
        word_tf_idf = {}
        for page in pages:
            for phrase in page:
                stem = phrase[0]
                if stem not in topic_tf or stem not in words_idf:
                    # Skip only this phrase. The previous `break` silently
                    # dropped every remaining phrase of the page.
                    continue
                word_tf_idf[(stem, phrase[1])] = topic_tf[stem][0] * words_idf[stem][0]

        # Rank once per topic instead of re-sorting after every page.
        ranked = [(pair[0], score) for pair, score in word_tf_idf.items()]
        ranked.sort(key=lambda item: item[1], reverse=True)
        word_tf_idf_by_topic[topic] = ranked
    return word_tf_idf_by_topic


### Iterate over files

In [15]:
import json
import os
def extract_text_from_file(file_path):
    """Return the full contents of the file at `file_path`, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

json_file_path = '../articles/data.json'

# Start from previously saved results, if any. Every topic in TOPICS is
# recomputed below, so only topics outside TOPICS keep their stored values.
if os.path.exists(json_file_path):
    # Read with the encoding the file is written with at the end of this cell;
    # the platform default may not be UTF-8.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        topic_tf_idf = json.load(json_file)
else:
    topic_tf_idf = {}

for topic in TOPICS:
    folder_path = f'../articles/{topic}'
    # Collect the text of every .txt article of this topic. The listing is
    # sorted because os.listdir returns entries in arbitrary order and the TF
    # step keeps the value of the last page that mentions a phrase.
    articles = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) and filename.endswith('.txt'):
            text = extract_text_from_file(file_path)
            articles.append(text)

    topic_articles = {topic: articles}
    morphed_words = morph_words(topic_articles)
    word_tf = count_word_tf_by_roots(morphed_words)
    word_idf = count_word_idf_by_roots(morphed_words)
    word_tf_idf = calculate_tf_idf_by_roots(morphed_words, word_tf, word_idf)
    # A topic with no usable article is missing from the result; store an
    # empty ranking for it instead of failing with KeyError.
    topic_tf_idf[topic] = word_tf_idf.get(topic, [])

with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(topic_tf_idf, json_file, ensure_ascii=False)


### Bring data to map format

In [16]:
# Load the per-topic rankings saved by the previous section; fall back to an
# empty dict when the file does not exist yet.
json_file_path = "../articles/data.json"
data = {}
if os.path.exists(json_file_path):
    # The file is written as UTF-8 with ensure_ascii=False, so it has to be
    # read as UTF-8 explicitly rather than with the platform default encoding.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

In [22]:
# Invert the per-topic rankings into word -> [best score, topic of that score].
data_map_format = {}
for topic, word_data in data.items():
    for word, score in word_data:
        best = data_map_format.get(word)
        # Strict comparison: on equal scores the topic seen first is kept.
        if best is None or best[0] < score:
            data_map_format[word] = [score, topic]

json_file_path = "../articles/map_data.json"
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data_map_format, json_file, ensure_ascii=False)
