In [3]:
import nltk
nltk.download('punkt_tab')
import os
import random
nltk.download('punkt')
import pandas as pd
import numpy as np
import torch
from collections import defaultdict
from sklearn.svm import SVC
import json
import re
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Support functions:

In [5]:
def preprocess(s):
    # function for text clearning
    s=s.replace('no title','')
    s=s.replace('no tags','')
    s=s.replace('[]','')
    s=s.split('$$$ БКС Мир инвестиций $$$ Инвестировать легче вместе')[0]
    s=s.replace('$$$','.')
    s=re.sub('\s*"','',s)
    s=s.replace('#','')
    s=s.replace(' • ',', ')
    return s


In [9]:
def words(i):
    # a function for splitting text into words
    regex = re.compile(r'[А-Яа-яA-zёЁ-]+')
    i= " ".join(regex.findall(i))
    tokens=word_tokenize(i.lower())
    return tokens

In [12]:
def year_extraction(s):
    # a function for year extraction from the date column
    if type(s) == int or type(s) == float:
       s=str(s)
    else:
       s=s.split('-')[0]
    return int(s)


def index_change_labels(s):
   # a function that changes the value of labels to indexes that are understandable for the  BertForClassification model
    if s == 1 :
       s=0
    elif s== 0:
       s=1
    elif s== -1:
       s=2
    return s

In [31]:
def evaluation(model,prediction_dataloader,tokenizer):
      # a function that predicts labels for unmarked dataset
      valid_stats=[]
      model.eval()
      Full_test_set=[]

      predictions , true_labels , text = [], [] ,[]

      # Predict
      for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)
        for i in b_input_ids:
            string_seg= tokenizer.decode(i,skip_special_tokens=True)
            text.append(string_seg)


      flat_predictions = np.concatenate(predictions, axis=0)
      flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
      flat_true_labels = np.concatenate(true_labels, axis=0)


      Full_test_set.append(
        {
            'True_labels': flat_true_labels.reshape(-1, 1),
            'Predicted_labels': flat_predictions.reshape(-1, 1),
            'Text': np.transpose(text)


        }
        )

      return Full_test_set

In [34]:
def index_change_labels_reverse(s):
   # a function that changes  indexes to the value of labels  that are understandable to Financial model
    if s == 0 :
       s=1
    elif s== 1:
       s=0
    elif s== 2:
       s=-1
    return s

In [39]:
def month_extraction(s):
    # a function for month  extraction from the date column
    if type(s) == int:
       month=0
    else:
       if len(s.split('-'))>=2:
          month=s.split('-')[1]
       else:
          month=0
    return int(month)


In [40]:
!pip install pymorphy3

Collecting pymorphy3
  Downloading pymorphy3-2.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg-python, pymorphy3
Successfully installed dawg-python-0.7.2 pymorphy3-2.0.2 pymorphy3-dicts-ru-2.4.417150.4580142


In [41]:
import pymorphy3
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

morph = pymorphy3.MorphAnalyzer()

def lemms(text):
    # a function that lemmatize text extractions
    text=text.lower()
    lemms_text=' '.join([morph.parse(x)[0].normal_form for x in text.split(' ')])
    regex = re.compile(r'[А-Яа-яA-zёЁ-]+')
    return ' '.join(regex.findall(lemms_text))


[nltk_data] Downloading package wordnet to /root/nltk_data...


## Initialization  of Financial model :

In [57]:
def parameter_predicted(full_stats,year,month, parameter,filtered_dataset):
    # a function that extracts the tonality of a factor 
    if filtered_dataset.shape[0]==0:
        predicted=-2
    else:
        parameter= '|'.join(parameter)
        Param = filtered_dataset[filtered_dataset['Text_lem'].str.contains(parameter)]['Predicted_labels'].value_counts().reset_index()
        Param.columns=['label','counted_value']
        if Param.empty:
            predicted=-2
        else:
            max_count=[]
            max__value_count=[]
            max =Param['counted_value'].loc[0]
            for index,i in enumerate(Param['counted_value']) :
                if i >= max :
                  max_count.append(max)
                  max__value_count.append(Param.loc[index]['label'])
                  max=i
            if len(max_count)==1:
              predicted = max__value_count[0]
            elif len(max_count)==2:
              predicted = min(max__value_count)
            else:
              predicted = 0
    return predicted

def replace(x):
    # a function that converts labels to words with meaning 
        if x ==0:
           x= 'Neutral'
        elif x ==1:
           x= 'Positive'
        elif x ==-1:
           x='Negative'
        elif x==-2:
           x='No news'
        return x

def full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search):
    # a function the gives final tonality of all factors and generate recommendations
    no_rec=0
    if month==1 :
       month=13
       year=year-1

    filtered_dataset=full_stats[(full_stats['year']==year) &(full_stats['month']>=month-moths_for_search)&(full_stats['month']< month)]
    print('Number of news items :',filtered_dataset.shape[0])
    if filtered_dataset.shape[0]==0:
        print('There are no any news for the period','\n')
        no_rec=1
    predictions=[]
    for i in parameters:
        pred_param=parameter_predicted(full_stats,year,month, i,filtered_dataset)
        predictions.append(pred_param)

    result=[ replace(x) for x in predictions ]

    coupon2=0
    coupon1=0
    period1=0
    period2=0
    company=0



    if predictions[0]==-1 and predictions[1]==-1:
       coupon1='bond with floated coupon rate'
       period1='for half a year'
       coupon2='linker'
       period2='for half a year'

    if predictions[0]==-1 and predictions[1]==0:
       coupon1='bond with floated coupon rate'
       period1='for half a year'
       coupon2='linker'
       period2='until the next CBR meeting'

    if (predictions[0]==-2 and predictions[1]==-1) or(predictions[0]==-1 and predictions[1]==1) :
       coupon1='bond with floated coupon rate '
       period1='for half a year'

    if (predictions[0]==-2 and predictions[1]==0) or (predictions[0]==0 and predictions[1]==-2) or (predictions[0]==-2 and predictions[1]==-2) or (predictions[0]==1 and predictions[1]==-2) or (predictions[0]==-1 and predictions[1]==-2) or (predictions[0]==0 and predictions[1]==1):
       coupon1='bond with fixed coupon rate'
       period1='until the next CBR meeting'

    if (predictions[0]==1 and predictions[1]==0) or (predictions[0]==0 and predictions[1]==0)or (predictions[0]==0 and predictions[1]==-1)or (predictions[0]==1 and predictions[1]==1)or (predictions[0]==1 and predictions[1]==-1)or (predictions[0]==-2 and predictions[1]==1):
       coupon1='bond with fixed coupon rate'
       period1='at least for half a year'




    if risk_apetit =='conservative' :
        company='OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA '
    elif risk_apetit =='neutral' :
        company='corporate bonds  with a credit rating from A to B'
    elif risk_apetit =='agressive':
        company='corporate bonds  with a credit rating C'



    print('Predictions :')
    print('Inflation :'  ,result[0] )
    print('Key Rate :' ,result[1],'\n')

    print('Predictions by industry :')
    print('Gas and oil :'  ,result[2])
    print('Construction industry :'  ,result[3] )
    print('Metallurgy :'  ,result[4] )
    print('Transport :'  ,result[5] )
    print('Energy :'  ,result[6] )
    print('Communication and IT:'  ,result[7] )
    print('Trading:'  ,result[8] )
    print('Banks:'  ,result[9],'\n')

    if no_rec!=1:
        print('Full recomendation:')
        if coupon2 !=0:
          print (f'You can buy {coupon1} with holding period {period1} or {coupon2} with holding period {period2}, companies : {company}')
        else:
          print (f'You can buy  {coupon1} with holding period {period1} , companies: {company}')


 ## Model implementation  :

In [56]:
# loading previously saved model
from transformers import BertTokenizer,BertModel
from torch.utils.data import DataLoader, SequentialSampler,TensorDataset

batch_size = 8
max_length=73
model = BertModel.from_pretrained('./model_save')
tokenizer = BertTokenizer.from_pretrained('./model_save')

In [27]:

import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    model.cuda()
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [66]:
# preparing news_corpus
test = pd.read_excel('news_corpus.xlsx')
test['text_extraction'] = test.title + ' ' + test.body + ' ' + test.tags
test['text_extraction_cleaned']= test.text_extraction.map(preprocess)
sentences_test = test.text_extraction_cleaned.values

test['index_BERT']= test.target.map(index_change_labels)
labels_test= test.index_BERT.values
labels_test



input_ids = []
attention_masks = []

for sent in sentences_test:
    encoded_dict = tokenizer.encode_plus(
                        sent,
                        add_special_tokens = True,
                        max_length = max_length,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels_test)


prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [67]:
dataset_test_with_pred=evaluation(model=model,prediction_dataloader=prediction_dataloader,tokenizer=tokenizer)
df_stats = pd.DataFrame (data=dataset_test_with_pred)
df_stats3 = pd.DataFrame (df_stats['Predicted_labels'][0],columns=['Predicted_labels'])
df_stats2 = pd.DataFrame (df_stats['Text'][0],columns=['Text'])
df_stats=pd.concat([df_stats3,df_stats2],axis=1)
df_stats['Predicted_labels']=df_stats['Predicted_labels'].map(index_change_labels_reverse)

test=test.reset_index()

full_stats=df_stats.join(test)
full_stats

full_stats['year']=full_stats.date.map(year_extraction)
full_stats['month']=full_stats.date.map(month_extraction)
full_stats['Text_lem']=full_stats.Text.map(lemms)
full_stats.head(5)

Unnamed: 0.1,Predicted_labels,Text,index,Unnamed: 0,Unnamed: 1,title,body,date,tags,source,target,text_extraction,text_extraction_cleaned,index_BERT,year,month,Text_lem
0,-1,Пошлины США на российский алюминий не окажут с...,0,2,1,no title,Пошлины США на российский алюминий не окажут с...,2023-02-07,no tags,smart_lab,0,no title Пошлины США на российский алюминий не...,Пошлины США на российский алюминий не окажут ...,1,2023,2,пошлина сша на российский алюминий не оказать ...
1,1,« Газпром » видит потенциал работы с Африкой и...,1,3,2,«Газпром» видит потенциал работы с Африкой и г...,В Африке прогнозируется существенный экономиче...,2023-06-22,['ГАЗПРОМ ао'],finam,0,«Газпром» видит потенциал работы с Африкой и г...,«Газпром» видит потенциал работы с Африкой и г...,1,2023,6,газпром видеть потенциал работа с африка и гот...
2,0,"ЦБ повысил ставку, третий эшелон сдает позиции...",2,4,3,"ЦБ повысил ставку, третий эшелон сдает позиции...",Подводим итоги недели с 11 по 15 сентября на р...,2023-09-15 00:00:00,[],bcs,0,"ЦБ повысил ставку, третий эшелон сдает позиции...","ЦБ повысил ставку, третий эшелон сдает позиции...",1,2023,9,цб повысить ставку третий эшелон сдавать позиц...
3,1,У России хватит средств увеличить расходы по и...,3,7,4,У России хватит средств увеличить расходы по и...,Об этом заявил президент России Владимир Путин,2024-03-13,[],finam,0,У России хватит средств увеличить расходы по и...,У России хватит средств увеличить расходы по и...,1,2024,3,у россия хватить средство увеличить расход по ...
4,0,Ozon назначил руководителем маркетплейса бывше...,4,10,5,no title,Ozon назначил руководителем маркетплейса бывше...,2023-04-12,no tags,smart_lab,0,no title Ozon назначил руководителем маркетпле...,Ozon назначил руководителем маркетплейса бывш...,1,2023,4,ozon назначить руководитель маркетплейс бывший...


In [68]:
# key words setup
inflation=['инфляция','индекс потребительских цен','текущий и ожидаемый уровень инфляции','инфляционные риски']
key_rate=['ключевая ставка','центральный банк','набиулина','банк россии','совет директоров банка россии','ключевой цб']
gazoil=['нефть','газ','газопровод','brent','брент','лукойл','выручка от продажи газа','газотранспортная система','добыча углеводородного сырья','газпром','цена на нефть','urals']
construction=['пик','самолет','ипотека','строительство','девелопмент','уровень обеспеченности заказами','сэтл групп','лср']
metallurgy=['северсталь','металлургический комбинат','металлургический комплекс','нлмк']
transport=['ржд','автодор','транспортная отрасль','грузовые перевозки','автомобильные перевозки',]
energy=['атомэнергопром','русгидро','тгк','россети','моэк','гидроэнергия','декарбонизация','ископаемое топливо','электроэнергия']
communication=['ит-рынок','услуги в сфере ит','информационные технологии','интернет','провайдер','софтлайн','мтс','селектел','позитив текнолоджис','headhunter','каршеринг','мейл','застройщик']
trade_industry=['афк система','x5 retail group','mercury retail group','магнит', 'продажа товаров','ритейлер']
banks=['вэб','газпромбанк','банковский сектор','капитал','промсвязьбанк','т-банк','втб','костин']


# parameters setup
parameters=[inflation,key_rate,gazoil,construction,metallurgy,transport,energy,communication,trade_industry,banks]

year=2023 # year of portfolio rebalancing
month=1 # month of portfolio rebalancing
risk_apetit='conservative' # could be conservative,neutral,agressive
moths_for_search=3 # period for which we want to receive news

full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1157
Predictions :
Inflation : Negative
Key Rate : Positive 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Negative
Transport : Positive
Energy : Negative
Communication and IT: Positive
Trading: Positive
Banks: Positive 

Full recomendation:
You can buy  bond with floated coupon rate  with holding period for half a year , companies: OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [69]:
year=2023
month=4
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1215
Predictions :
Inflation : Negative
Key Rate : Neutral 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Neutral
Transport : Positive
Energy : Positive
Communication and IT: Positive
Trading: Positive
Banks: Positive 

Full recomendation:
You can buy bond with floated coupon rate with holding period for half a year or linker with holding period until the next CBR meeting, companies : OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [70]:
year=2023
month=7
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1180
Predictions :
Inflation : Negative
Key Rate : Positive 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Negative
Transport : Positive
Energy : Positive
Communication and IT: Positive
Trading: Positive
Banks: Neutral 

Full recomendation:
You can buy  bond with floated coupon rate  with holding period for half a year , companies: OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [71]:
year=2023
month=10
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1411
Predictions :
Inflation : Negative
Key Rate : Negative 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Negative
Transport : Positive
Energy : Positive
Communication and IT: Neutral
Trading: Positive
Banks: Neutral 

Full recomendation:
You can buy bond with floated coupon rate with holding period for half a year or linker with holding period for half a year, companies : OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [72]:
year=2024
month=1
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1365
Predictions :
Inflation : Negative
Key Rate : Negative 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Positive
Transport : Positive
Energy : Positive
Communication and IT: Positive
Trading: Positive
Banks: Positive 

Full recomendation:
You can buy bond with floated coupon rate with holding period for half a year or linker with holding period for half a year, companies : OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [73]:
year=2024
month=4
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1333
Predictions :
Inflation : Negative
Key Rate : Neutral 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Positive
Transport : Positive
Energy : Positive
Communication and IT: Positive
Trading: Positive
Banks: Positive 

Full recomendation:
You can buy bond with floated coupon rate with holding period for half a year or linker with holding period until the next CBR meeting, companies : OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [74]:
year=2024
month=7
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1348
Predictions :
Inflation : Negative
Key Rate : Neutral 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Positive
Transport : Negative
Energy : Positive
Communication and IT: Positive
Trading: Positive
Banks: Neutral 

Full recomendation:
You can buy bond with floated coupon rate with holding period for half a year or linker with holding period until the next CBR meeting, companies : OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 


In [75]:
year=2024
month=10
risk_apetit='conservative'
moths_for_search=3
full_prediction (parameters, year,month,full_stats,risk_apetit,moths_for_search)

Number of news items : 1369
Predictions :
Inflation : Negative
Key Rate : Negative 

Predictions by industry :
Gas and oil : Positive
Construction industry : Positive
Metallurgy : Neutral
Transport : Positive
Energy : Positive
Communication and IT: Neutral
Trading: Neutral
Banks: Positive 

Full recomendation:
You can buy bond with floated coupon rate with holding period for half a year or linker with holding period for half a year, companies : OFZ,bonds of the leaders of the gas and oil industry or corporate bonds with a credit rating of at least AA 
