### Load TF-IDF data from addresses and articles

In [1]:
import os
import json
def read_json_data(json_file_path):
    """Load a JSON document from disk.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the path does not exist.
    """
    if not os.path.exists(json_file_path):
        return {}
    # Decode explicitly as UTF-8: save_data_to_json writes UTF-8 with
    # ensure_ascii=False, so relying on the platform default encoding would
    # break on non-ASCII content wherever that default is not UTF-8.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

def save_data_to_json(title, data):
    """Write ``data`` as JSON to ``../addresses/<title>.json``.

    The target directory is created when missing. Double quotes are removed
    from ``title`` and forward slashes are replaced with underscores before it
    is used as the file name. The file is written as UTF-8 with non-ASCII
    characters left unescaped.

    Args:
        title: Base name of the output file (without extension).
        data: JSON-serialisable object to store.
    """
    target_dir = '../addresses'
    os.makedirs(target_dir, exist_ok=True)

    safe_title = title.replace('"', '').replace('/', '_')
    out_path = os.path.join(target_dir, f'{safe_title}.json')

    with open(out_path, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False)

In [2]:
import pymorphy2
import Stemmer
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK 'punkt' tokenizer data used by word_tokenize; the captured
# output shows it reports "already up-to-date" when the package is present.
nltk.download('punkt')
# Shared analyzers, created once at cell level and reused by morph_words.
morph = pymorphy2.MorphAnalyzer()
stemmer = Stemmer.Stemmer('russian')

def morph_words(words):
    """Normalise a phrase into the key form used for topic lookup.

    The phrase is tokenized and lowercased, and its first two tokens are
    inspected. When the second token parses as a noun and the first as a full
    or short adjective, both tokens are stemmed and returned in their original
    order. Otherwise the two lowercased tokens are returned in swapped order.

    Args:
        words: Phrase as a single string; two tokens are expected.

    Returns:
        A space-separated two-token string. For input with fewer than two
        tokens, the lowercased tokens joined by a space (an empty string for
        empty input) instead of raising IndexError.
    """
    tokens = [token.lower() for token in word_tokenize(words)]
    if len(tokens) < 2:
        # Nothing to pair up; hand back what we have rather than crashing.
        return " ".join(tokens)

    first, second = tokens[0], tokens[1]
    noun_parse = morph.parse(second)[0]
    adj_parse = morph.parse(first)[0]

    if noun_parse.tag.POS == 'NOUN' and adj_parse.tag.POS in ('ADJF', 'ADJS'):
        return stemmer.stemWord(first) + " " + stemmer.stemWord(second)

    # NOTE(review): this fallback emits the tokens in swapped order while the
    # stemmed branch keeps the original order -- behaviour preserved as-is;
    # confirm it matches the key format of the topic map.
    return second + " " + first



[nltk_data] Downloading package punkt to /home/aitugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
import openai
import json
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv before ask_gpt3 reads
# OPENAI_API_KEY with os.getenv.
load_dotenv()

def ask_gpt3(addresses):
    """Ask the chat model to keep only the most valuable phrases per entry.

    One chat-completion request is sent for every key of ``addresses``. Each
    reply is parsed as JSON and merged into a single result dict.

    Args:
        addresses: Mapping of a date string to a list of ``[phrase, score]``
            pairs.

    Returns:
        Dict built from the parsed replies (keys and values as returned by the
        model).

    Raises:
        json.JSONDecodeError: If a reply is not valid JSON; the message names
            the entry whose reply could not be parsed.
    """
    openai.api_key = os.getenv('OPENAI_API_KEY')

    res = {}
    for year, props in addresses.items():

        # The payload is serialised with json.dumps (not str()) so the model
        # sees double-quoted JSON, the same format the prompt asks it to
        # return; ensure_ascii=False keeps the Cyrillic phrases readable.
        text = '''
            Here are the top 25 phrases of each year I parsed from news website. 
            You must choose top 10 ones that are valuable in from point of economic, 
            culture, medicine, science, tech, sport, life, show, accidents or crime.

            For example, "мой взгляд", "всю сфера" or "такой ситуация" are not so valuable 
            in terms of any of those topics, while "синтетический наркотик", "январское событие" are valuable

            The data is in following format:
            "dd/mm/yyyy": [
              ["word", score],
              ["word", score],
            ]
            you must return data in the same format as it comes, and no other word! 
            This is crucial so that I can parse result into python dict.
            The score and the word should be correctly matched. Use double quotes, not single quotes

            Data:
        ''' + json.dumps({year: props}, ensure_ascii=False)

        message = [{"role": "system", "content": text}]

        chat = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=message
        )

        reply = chat.choices[0].message.content
        try:
            dict_reply = json.loads(reply)
        except json.JSONDecodeError as exc:
            # Same exception type as before, but say which entry failed.
            raise json.JSONDecodeError(
                f"Model reply for {year!r} is not valid JSON: {exc.msg}",
                exc.doc,
                exc.pos,
            ) from exc

        for k, v in dict_reply.items():
            res[k] = v

    return res

# Sample input for a manual check of ask_gpt3: a single date key mapped to 25
# [phrase, score] pairs (scores are presumably TF-IDF weights -- confirm).
data = {
  "01/09/2022": [
    ["справедливый казахстан", 0.03213905339601955],
    ["уязвимая категория", 0.012052145023507332],
    ["синтетический наркотик", 0.012052145023507332],
    ["электоральный цикл", 0.012052145023507332],
    ["январское событие", 0.009488098732903343],
    ["политическую модернизация", 0.008131485206349601],
    ["равную возможность", 0.008131485206349601],
    ["общенациональный референдум", 0.008034763349004888],
    ["решающая роль", 0.008034763349004888],
    ["успешная нация", 0.008034763349004888],
    ["школьная форма", 0.008034763349004888],
    ["различная мера", 0.008034763349004888],
    ["общенациональный интерес", 0.008034763349004888],
    ["силовому орган", 0.008034763349004888],
    ["семейно-бытового насилие", 0.008034763349004888],
    ["уголовно-процессуальный кодекс", 0.008034763349004888],
    ["один срок", 0.008034763349004888],
    ["справедливое распределение", 0.006325399155268895],
    ["инвестиционная привлекательность", 0.006325399155268895],
    ["базовый фактор", 0.006325399155268895],
    ["указанная мера", 0.006325399155268895],
    ["данный шаг", 0.006325399155268895],
    ["светлое будущее", 0.006325399155268895],
    ["полноценный институт", 0.006325399155268895],
    ["массовый беспорядок", 0.006325399155268895]
  ]
}
# Sends a request through the OpenAI client every time this cell runs.
print(ask_gpt3(data))
    

{"01/09/2022": [["справедливый казахстан", 0.03213905339601955], ["уязвимая категория", 0.012052145023507332], ["синтетический наркотик", 0.012052145023507332], ["январское событие", 0.009488098732903343], ["политическую модернизация", 0.008131485206349601], ["равную возможность", 0.008131485206349601], ["общенациональный референдум", 0.008034763349004888], ["решающая роль", 0.008034763349004888], ["успешная нация", 0.008034763349004888], ["школьная форма", 0.008034763349004888]]}
{'01/09/2022': [['справедливый казахстан', 0.03213905339601955], ['уязвимая категория', 0.012052145023507332], ['синтетический наркотик', 0.012052145023507332], ['январское событие', 0.009488098732903343], ['политическую модернизация', 0.008131485206349601], ['равную возможность', 0.008131485206349601], ['общенациональный референдум', 0.008034763349004888], ['решающая роль', 0.008034763349004888], ['успешная нация', 0.008034763349004888], ['школьная форма', 0.008034763349004888]]}


In [4]:
from difflib import SequenceMatcher
topics = read_json_data("../articles/map_data.json")
addresses = read_json_data("../addresses/tf_idf.json")

# Label every [phrase, score] entry with a topic, extending it in place to
# [phrase, score, topic]. The topic is read from index 1 of the matching
# value in `topics`.
for date, words in addresses.items():
    for word_score in words:
        phrase = word_score[0]
        morphed_word = morph_words(phrase)

        if morphed_word in topics:
            topic = topics[morphed_word][1]
        else:
            # No exact key: fall back to the topic key most similar to the
            # morphed phrase. max() keeps the first key on ties, and
            # default=None covers an empty topic map.
            closest_key = max(
                topics,
                key=lambda key: SequenceMatcher(None, morphed_word, key).ratio(),
                default=None,
            )
            if closest_key is None:
                print(morphed_word)
                topic = "undefined topic"
            else:
                topic = topics[closest_key][1]

        word_score.append(topic)

save_data_to_json("final", addresses)