In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score

import os
import re
import gc
import warnings
import pickle
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rc

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
companies = ['ALRS', 'AFLT', 'VTBR', 'GAZP', 'GMKN', 'LSRG', 'DSKY', 'IRAO', 'LKOH', 'MAGN', 'MOEX', 'CBOM', 'MTSS',
             'ROSN', 'NLMK', 'NVTK', 'PIKK', 'PLZL', 'RTKM', 'RUAL', 'HYDR', 'SBER', 'CHMF', 'AFKS', 'SNGS', 'TATN',
             'TCSG', 'TRNFP', 'PHOR', 'FEES', 'YNDX']

## Кросс-валидация: Телеграм

### Три класса

In [None]:
# Load the Telegram dataset used by the three-class section and split it by date.
tg = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps_1,5sigma_.parquet')
# Training period: rows dated strictly before 2021-06-01.
tg_train = tg[tg.date < '2021-06-01'].copy()
# Test period: rows dated strictly after 2021-06-01 and strictly before 2022-01-01.
# NOTE(review): both comparisons against '2021-06-01' are strict, so a row whose
# `date` equals that value lands in neither split - confirm this is intended.
tg_test = tg[(tg.date < '2022-01-01') & (tg.date > '2021-06-01')].copy()

In [None]:
df = pd.DataFrame(index=['acc_test', 'pre_test', 'rec_test', 'auc_test', 'f1_test', 'kappa_test'])

In [None]:
# Three-class Telegram experiment, random forest.
# For every company and every target column of that company: grid-search max_depth
# with 4-fold CV on the training period, score the refitted best model on the test
# period, pickle the fitted search object and append the metrics to `df`.

# Candidate target columns: everything from the fourth column onwards.
targets = tg_train.iloc[:,3:].columns

# One shared TF-IDF vocabulary, fitted on the training period only.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2), max_features=10000)
tfidf.fit(tg_train.message.values)

param_grid = {'max_depth': list(range(5, 14))}

for company in companies:
  # GAZP, SBER and VTBR run the grid search in a single process; all other tickers use every core.
  # NOTE(review): presumably a memory safeguard for the largest subsets - confirm.
  n_jobs = 1 if company in ['GAZP', 'SBER', 'VTBR'] else -1
  print(company)
  os.makedirs(f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}', exist_ok=True)

  # Keep only the messages flagged for this company.
  comp_train = tg_train[tg_train[company] == True].copy()
  comp_test = tg_test[tg_test[company] == True].copy()

  # Dense feature matrices built with the shared vocabulary.
  features_train = tfidf.transform(comp_train['message'].values).toarray()
  features_test = tfidf.transform(comp_test['message'].values).toarray()

  # Target columns are those whose name contains the ticker, minus the mention flag itself.
  tr = [col for col in targets if company in col]
  if company in tr:
    tr.remove(company)

  for t in tr:
    y_train = comp_train[t].values.astype(int)
    y_test = comp_test[t].values.astype(int)

    clf = RandomForestClassifier(random_state=123, n_estimators=200, criterion='entropy', n_jobs=-1, class_weight='balanced')

    # Named `grid_search` rather than `gc` so the imported `gc` module is not shadowed.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=n_jobs, scoring='accuracy', verbose=2)
    grid_search.fit(features_train, y_train)

    preds_test = grid_search.predict(features_test)
    preds_proba_test = grid_search.predict_proba(features_test)

    # Multiclass metrics: class-frequency-weighted averages, one-vs-one ROC AUC.
    acc_test = accuracy_score(y_test, preds_test)
    pre_test = precision_score(y_test, preds_test, average='weighted')
    rec_test = recall_score(y_test, preds_test, average='weighted')
    auc_test = roc_auc_score(y_test, preds_proba_test, average='weighted', multi_class='ovo')
    f1_test = f1_score(y_test, preds_test, average='weighted')
    kappa_test = cohen_kappa_score(y_test, preds_test)

    df[t] = [acc_test, pre_test, rec_test, auc_test, f1_test, kappa_test]

    # Persist the fitted search; the context manager closes the file handle.
    filename = f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}/{t}_neutral_all.sav'
    with open(filename, 'wb') as model_file:
      pickle.dump(grid_search, model_file)

    # Rewrite the metrics table after every target so an interrupted run keeps its progress.
    df.to_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_neutral_all.csv')

In [None]:
df = pd.DataFrame(index=['acc_test', 'pre_test', 'rec_test', 'auc_test', 'f1_test', 'kappa_test'])

In [None]:
# Three-class Telegram experiment, LightGBM.
# For every company and every target column of that company: grid-search max_depth
# with 4-fold CV on the training period, score the refitted best model on the test
# period, pickle the fitted search object and append the metrics to `df`.

# Candidate target columns: everything from the fourth column onwards.
targets = tg_train.iloc[:,3:].columns

# One shared TF-IDF vocabulary, fitted on the training period only.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2), max_features=10000)
tfidf.fit(tg_train.message.values)

param_grid = {'max_depth': list(range(5, 14))}

for company in companies:
  # GAZP, SBER and VTBR run the grid search in a single process; all other tickers use every core.
  # NOTE(review): presumably a memory safeguard for the largest subsets - confirm.
  n_jobs = 1 if company in ['GAZP', 'SBER', 'VTBR'] else -1
  print(company)
  os.makedirs(f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}', exist_ok=True)

  # Keep only the messages flagged for this company.
  comp_train = tg_train[tg_train[company] == True].copy()
  comp_test = tg_test[tg_test[company] == True].copy()

  # Dense feature matrices built with the shared vocabulary.
  features_train = tfidf.transform(comp_train['message'].values).toarray()
  features_test = tfidf.transform(comp_test['message'].values).toarray()

  # Target columns are those whose name contains the ticker, minus the mention flag itself.
  tr = [col for col in targets if company in col]
  if company in tr:
    tr.remove(company)

  for t in tr:
    y_train = comp_train[t].values.astype(int)
    y_test = comp_test[t].values.astype(int)

    clf = LGBMClassifier(random_state=123, n_jobs=-1, class_weight='balanced',  n_estimators=200)

    # Named `grid_search` rather than `gc` so the imported `gc` module is not shadowed.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=n_jobs, scoring='accuracy', verbose=2)
    grid_search.fit(features_train, y_train)

    preds_test = grid_search.predict(features_test)
    preds_proba_test = grid_search.predict_proba(features_test)

    # Multiclass metrics: class-frequency-weighted averages, one-vs-one ROC AUC.
    acc_test = accuracy_score(y_test, preds_test)
    pre_test = precision_score(y_test, preds_test, average='weighted')
    rec_test = recall_score(y_test, preds_test, average='weighted')
    auc_test = roc_auc_score(y_test, preds_proba_test, average='weighted', multi_class='ovo')
    f1_test = f1_score(y_test, preds_test, average='weighted')
    kappa_test = cohen_kappa_score(y_test, preds_test)

    df[t] = [acc_test, pre_test, rec_test, auc_test, f1_test, kappa_test]

    # Persist the fitted search; the context manager closes the file handle.
    filename = f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}/{t}_neutral_all_boosting.sav'
    with open(filename, 'wb') as model_file:
      pickle.dump(grid_search, model_file)

    # Rewrite the metrics table after every target so an interrupted run keeps its progress.
    df.to_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_neutral_all_boosting.csv')

### Два класса

In [None]:
# Load the Telegram dataset used by the two-class section and split it by date.
tg = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps.parquet')
# Training period: rows dated strictly before 2021-06-01.
tg_train = tg[tg.date < '2021-06-01'].copy()
# Test period: rows dated strictly after 2021-06-01 and strictly before 2022-01-01.
# NOTE(review): both comparisons against '2021-06-01' are strict, so a row whose
# `date` equals that value lands in neither split - confirm this is intended.
tg_test = tg[(tg.date < '2022-01-01') & (tg.date > '2021-06-01')].copy()

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_all.csv', index_col=0)

In [None]:
df

In [None]:
df = pd.DataFrame(index=['acc_test', 'pre_test', 'rec_test', 'auc_test', 'f1_test', 'kappa_test'])

In [None]:
# Two-class Telegram experiment, random forest.
# For every company and every target column of that company: grid-search max_depth
# with 4-fold CV on the training period, score the refitted best model on the test
# period, pickle the fitted search object and append the metrics to `df`.

# Candidate target columns: everything from the fourth column onwards.
targets = tg_train.iloc[:,3:].columns

# One shared TF-IDF vocabulary, fitted on the training period only.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2), max_features=10000)
tfidf.fit(tg_train.message.values)

param_grid = {'max_depth': list(range(5, 14))}

for company in companies:
  # GAZP, SBER and VTBR run the grid search in a single process; all other tickers use every core.
  # NOTE(review): presumably a memory safeguard for the largest subsets - confirm.
  n_jobs = 1 if company in ['GAZP', 'SBER', 'VTBR'] else -1
  print(company)
  os.makedirs(f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}', exist_ok=True)

  # Keep only the messages flagged for this company.
  comp_train = tg_train[tg_train[company] == True].copy()
  comp_test = tg_test[tg_test[company] == True].copy()

  # Dense feature matrices built with the shared vocabulary.
  features_train = tfidf.transform(comp_train['message'].values).toarray()
  features_test = tfidf.transform(comp_test['message'].values).toarray()

  # Target columns are those whose name contains the ticker, minus the mention flag itself.
  tr = [col for col in targets if company in col]
  if company in tr:
    tr.remove(company)

  for t in tr:
    y_train = comp_train[t].values.astype(int)
    y_test = comp_test[t].values.astype(int)

    clf = RandomForestClassifier(random_state=123, n_estimators=200, criterion='entropy', n_jobs=-1, class_weight='balanced')

    # Named `grid_search` rather than `gc` so the imported `gc` module is not shadowed.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=n_jobs, scoring='accuracy', verbose=2)
    grid_search.fit(features_train, y_train)

    preds_test = grid_search.predict(features_test)
    preds_proba_test = grid_search.predict_proba(features_test)

    # Binary metrics; ROC AUC is computed from the second probability column.
    acc_test = accuracy_score(y_test, preds_test)
    pre_test = precision_score(y_test, preds_test)
    rec_test = recall_score(y_test, preds_test)
    auc_test = roc_auc_score(y_test, preds_proba_test[:,1])
    f1_test = f1_score(y_test, preds_test)
    kappa_test = cohen_kappa_score(y_test, preds_test)

    df[t] = [acc_test, pre_test, rec_test, auc_test, f1_test, kappa_test]

    # Persist the fitted search; the context manager closes the file handle.
    filename = f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}/{t}_all.sav'
    with open(filename, 'wb') as model_file:
      pickle.dump(grid_search, model_file)

    # Rewrite the metrics table after every target so an interrupted run keeps its progress.
    df.to_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_all.csv')

In [None]:
df = pd.DataFrame(index=['acc_test', 'pre_test', 'rec_test', 'auc_test', 'f1_test', 'kappa_test'])

In [None]:
# Two-class Telegram experiment, LightGBM.
# For every company and every target column of that company: grid-search max_depth
# with 4-fold CV on the training period, score the refitted best model on the test
# period, pickle the fitted search object and append the metrics to `df`.

# Candidate target columns: everything from the fourth column onwards.
targets = tg_train.iloc[:,3:].columns

# One shared TF-IDF vocabulary, fitted on the training period only.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2), max_features=10000)
tfidf.fit(tg_train.message.values)

param_grid = {'max_depth': list(range(5, 14))}

for company in companies:
  # GAZP, SBER and VTBR run the grid search in a single process; all other tickers use every core.
  # NOTE(review): presumably a memory safeguard for the largest subsets - confirm.
  n_jobs = 1 if company in ['GAZP', 'SBER', 'VTBR'] else -1
  print(company)
  os.makedirs(f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}', exist_ok=True)

  # Keep only the messages flagged for this company.
  comp_train = tg_train[tg_train[company] == True].copy()
  comp_test = tg_test[tg_test[company] == True].copy()

  # Dense feature matrices built with the shared vocabulary.
  features_train = tfidf.transform(comp_train['message'].values).toarray()
  features_test = tfidf.transform(comp_test['message'].values).toarray()

  # Target columns are those whose name contains the ticker, minus the mention flag itself.
  tr = [col for col in targets if company in col]
  if company in tr:
    tr.remove(company)

  for t in tr:
    y_train = comp_train[t].values.astype(int)
    y_test = comp_test[t].values.astype(int)

    clf = LGBMClassifier(random_state=123, n_jobs=-1, class_weight='balanced',  n_estimators=200)

    # Named `grid_search` rather than `gc` so the imported `gc` module is not shadowed.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=n_jobs, scoring='accuracy', verbose=2)
    grid_search.fit(features_train, y_train)

    preds_test = grid_search.predict(features_test)
    preds_proba_test = grid_search.predict_proba(features_test)

    # Binary metrics, matching the two-class random-forest cell so both result
    # tables are comparable. ROC AUC needs the positive-class scores only: with
    # binary labels the full (n, 2) probability matrix is rejected by roc_auc_score.
    acc_test = accuracy_score(y_test, preds_test)
    pre_test = precision_score(y_test, preds_test)
    rec_test = recall_score(y_test, preds_test)
    auc_test = roc_auc_score(y_test, preds_proba_test[:,1])
    f1_test = f1_score(y_test, preds_test)
    kappa_test = cohen_kappa_score(y_test, preds_test)

    df[t] = [acc_test, pre_test, rec_test, auc_test, f1_test, kappa_test]

    # Persist the fitted search; the context manager closes the file handle.
    filename = f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}/{t}_all_boosting.sav'
    with open(filename, 'wb') as model_file:
      pickle.dump(grid_search, model_file)

    # Rewrite the metrics table after every target so an interrupted run keeps its progress.
    df.to_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_all_boosting.csv')

## Кросс-валидация: традиционные новости

In [None]:
# Load the news dataset, keep rows dated before 2022-01-01 and reduce it to the
# columns the models need.
news = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news.parquet')
news = news.loc[news.date < '2022-01-01'].copy()

# NOTE(review): `sectors` is not defined anywhere in the visible notebook; it must
# already exist in the kernel for this cell to run - confirm where it comes from.
news = news.drop(columns=sectors)

# Model input text: title and announce joined by a single space.
news['message'] = news['title'] + ' ' + news['announce']

# The raw text columns are no longer needed once `message` exists.
news = news.drop(columns=['link', 'title', 'announce', 'text'])

In [None]:
# News experiment, random forest.
# For every company: chronological 80/20 split of its news, TF-IDF features,
# max_depth grid search per target column, then train/test accuracy written to
# per-company text logs.

# Candidate target columns: from the fourth column up to, but excluding, the last one.
# NOTE(review): the excluded trailing column is presumably `message` - confirm.
targets = news.iloc[:,3:-1].columns

# First available trading date per ticker. The file is indexed by `company`, as in
# the other cells of this notebook, so that `first_dates.loc[company]` resolves;
# it is read once instead of on every iteration.
first_dates = pd.read_csv(f'/content/drive/MyDrive/Диссертация/Парсеры сайтов/Стоимость акций/1 мин/first_dates.csv', index_col='company')

param_grid = {
    'max_depth' : list(range(3,13))}

for company in companies:
  print(company)
  os.makedirs(f'/content/drive/MyDrive/Диссертация/news_loggs/{company}', exist_ok=True)
  first_day = pd.to_datetime(first_dates.loc[company,:].values[0])

  # News flagged for this company, dated on or after its first date, without missing values.
  comp = news[news[company] == True].copy()
  comp = comp[comp.date >= first_day].copy()
  comp.dropna(inplace=True)

  # First 80% of rows for training, the rest for testing.
  # NOTE(review): this is a chronological split only if `news` is sorted by date - confirm.
  split_at = int(0.8*len(comp))
  train, test = comp.iloc[:split_at, :], comp.iloc[split_at:, :]

  # Fit the vocabulary and IDF weights on the training part only, so that no
  # test-set statistics leak into the features.
  tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2))
  tfidf.fit(train['message'])

  X_train = tfidf.transform(train['message']).toarray()
  X_test = tfidf.transform(test['message']).toarray()

  tr = [col for col in targets if company in col]

  test_logg = []
  train_logg = []

  for t in tr:
    y_train = train[t].values.astype(int)
    y_test = test[t].values.astype(int)

    clf = RandomForestClassifier(random_state=123, criterion='entropy', n_jobs=-1)
    # Named `grid_search` rather than `gc` so the imported `gc` module is not shadowed.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=1, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)

    preds_test = grid_search.predict(X_test)
    preds_train = grid_search.predict(X_train)

    # Plain accuracy on each split.
    test_logg.append(np.mean(y_test==preds_test))
    train_logg.append(np.mean(y_train==preds_train))

  # One accuracy per line, in the order of `tr`.
  with open(f'/content/drive/MyDrive/Диссертация/news_loggs/{company}/{company}_cv_rf_train_logg.txt', 'w') as fp:
    for item in train_logg:
        fp.write("%s\n" % item)
  with open(f'/content/drive/MyDrive/Диссертация/news_loggs/{company}/{company}_cv_rf_test_logg.txt', 'w') as fp:
    for item in test_logg:
        fp.write("%s\n" % item)

  print(f'{company} done')

In [None]:
# News experiment, LightGBM.
# For every company: chronological 80/20 split of its news, TF-IDF features,
# max_depth grid search per target column, then train/test accuracy written to
# per-company text logs.

# Candidate target columns: from the fourth column up to, but excluding, the last one.
# NOTE(review): the excluded trailing column is presumably `message` - confirm.
targets = news.iloc[:,3:-1].columns

# First available trading date per ticker. The file is indexed by `company`, as in
# the other cells of this notebook, so that `first_dates.loc[company]` resolves;
# it is read once instead of on every iteration.
first_dates = pd.read_csv(f'/content/drive/MyDrive/Диссертация/Парсеры сайтов/Стоимость акций/1 мин/first_dates.csv', index_col='company')

param_grid = {
    'max_depth' : list(range(3,13))}

for company in companies:
  print(company)
  os.makedirs(f'/content/drive/MyDrive/Диссертация/news_loggs/{company}', exist_ok=True)
  first_day = pd.to_datetime(first_dates.loc[company,:].values[0])

  # News flagged for this company, dated on or after its first date, without missing values.
  comp = news[news[company] == True].copy()
  comp = comp[comp.date >= first_day].copy()
  comp.dropna(inplace=True)

  # First 80% of rows for training, the rest for testing.
  # NOTE(review): this is a chronological split only if `news` is sorted by date - confirm.
  split_at = int(0.8*len(comp))
  train, test = comp.iloc[:split_at, :], comp.iloc[split_at:, :]

  # Fit the vocabulary and IDF weights on the training part only, so that no
  # test-set statistics leak into the features.
  tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2))
  tfidf.fit(train['message'])

  X_train = tfidf.transform(train['message']).toarray()
  X_test = tfidf.transform(test['message']).toarray()

  tr = [col for col in targets if company in col]

  test_logg = []
  train_logg = []

  for t in tr:
    y_train = train[t].values.astype(int)
    y_test = test[t].values.astype(int)

    clf = LGBMClassifier(random_state=123, n_jobs=-1)
    # Named `grid_search` rather than `gc` so the imported `gc` module is not shadowed.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=-1, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)

    preds_test = grid_search.predict(X_test)
    preds_train = grid_search.predict(X_train)

    # Plain accuracy on each split.
    test_logg.append(np.mean(y_test==preds_test))
    train_logg.append(np.mean(y_train==preds_train))

  # One accuracy per line, in the order of `tr`.
  with open(f'/content/drive/MyDrive/Диссертация/news_loggs/{company}/{company}_cv_boosting_train_logg.txt', 'w') as fp:
    for item in train_logg:
        fp.write("%s\n" % item)
  with open(f'/content/drive/MyDrive/Диссертация/news_loggs/{company}/{company}_cv_boosting_test_logg.txt', 'w') as fp:
    for item in test_logg:
        fp.write("%s\n" % item)

  print(f'{company} done')

## Тесты на значимость полученных результатов

In [None]:
def sample_test(prob1, test_values, n_times=1000, random_state=None):
  """Accuracy level of a random 0/1 guesser, used as a baseline to beat.

  Draws `n_times` random label vectors in which each label is 1 with
  probability `prob1` (0 otherwise), measures the accuracy of each vector
  against `test_values`, and returns the mean accuracy plus one standard
  deviation of those accuracies.

  Args:
    prob1: probability of drawing label 1.
    test_values: array-like of true 0/1 labels.
    n_times: number of random label vectors to draw.
    random_state: optional seed. When given, draws come from a dedicated
      `np.random.RandomState(random_state)` and the result is reproducible.
      When None (default), the global NumPy generator is used, as before.

  Returns:
    float: mean(accuracies) + std(accuracies).
  """
  test_values = np.asarray(test_values)
  n_obs = len(test_values)
  # The legacy global generator and RandomState expose the same `choice` API.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  accs = []
  for _ in range(n_times):
    rand_test = rng.choice([0, 1], size=n_obs, replace=True, p=[1-prob1, prob1])
    accs.append((test_values == rand_test).sum() / n_obs)
  return np.mean(accs) + np.std(accs)

In [None]:
def _color_red_or_green(val):
    color = 'red' if val < 0 else 'green'
    return 'color: %s' % color

In [None]:
def sample_test_neutral(probs, test_values, n_times=1000, random_state=None):
  """Accuracy level of a random three-class guesser, used as a baseline to beat.

  Draws `n_times` random label vectors over the classes -1, 0 and 1 with
  probabilities `probs`, measures the accuracy of each vector against
  `test_values`, and returns the mean accuracy plus one standard deviation
  of those accuracies.

  Args:
    probs: three probabilities, in the order of the classes [-1, 0, 1].
    test_values: array-like of true labels.
    n_times: number of random label vectors to draw.
    random_state: optional seed. When given, draws come from a dedicated
      `np.random.RandomState(random_state)` and the result is reproducible.
      When None (default), the global NumPy generator is used, as before.

  Returns:
    float: mean(accuracies) + std(accuracies).
  """
  test_values = np.asarray(test_values)
  n_obs = len(test_values)
  # The legacy global generator and RandomState expose the same `choice` API.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  accs = []
  for _ in range(n_times):
    rand_test = rng.choice([-1, 0, 1], size=n_obs, replace=True, p=probs)
    accs.append((test_values == rand_test).sum() / n_obs)
  return np.mean(accs) + np.std(accs)

In [None]:
def test_const(test_values):
  """Weighted F1 score obtained by predicting class 0 for every observation.

  Args:
    test_values: true labels.

  Returns:
    The value of `f1_score(test_values, all_zero_predictions, average='weighted')`.
  """
  constant_prediction = np.zeros(len(test_values))
  return f1_score(test_values, constant_prediction, average='weighted')

In [None]:
def get_effects_bin(df_train, df_test, loggs, companies=companies, shortened=False):
  """Logged test accuracy minus a random-guess baseline, per company and horizon (binary targets).

  For each company the target columns are those whose name contains the ticker
  (the mention flag column itself is excluded). For each target the baseline is
  `sample_test` called with the mean of the target over the company's training
  rows; the result is `(logged accuracy - baseline) * 100`, rounded to 2 decimals.

  Args:
    df_train: training frame with a `date` column, one flag column per ticker
      and the target columns.
    df_test: test frame with the same columns.
    loggs: metrics frame with an 'acc_test' row and one column per target.
    companies: tickers to evaluate; also the index of the result.
    shortened: when True the '1 min' column is omitted and the first target
      column of every company is skipped.

  Returns:
    DataFrame indexed by company with one column per horizon.
  """
  horizons = ['1 min', '5 min', '10 min', '15 min', '30 min', '1 hour', '1 day']
  if shortened:
    horizons = horizons[1:]
  effects = pd.DataFrame(columns=horizons, index=companies)

  all_columns = df_train.columns
  first_dates = pd.read_csv(f'/content/drive/MyDrive/Диссертация/Парсеры сайтов/Стоимость акций/1 мин/first_dates.csv', index_col='company')

  for company in companies:
    first_day = pd.to_datetime(first_dates.loc[company,:].values[0])

    # Rows mentioning the company; only the training rows are restricted to
    # dates on or after the company's first date.
    train_rows = df_train[df_train[company] == True].copy()
    test_rows = df_test[df_test[company] == True].copy()
    train_rows = train_rows[train_rows.date >= first_day].copy()

    test_accs = loggs.loc['acc_test',:]

    target_cols = [col for col in all_columns if company in col]
    if company in target_cols:
      target_cols.remove(company)
    if shortened:
      target_cols = target_cols[1:]

    row = []
    for target in target_cols:
      share_of_ones = train_rows[target].mean()
      logged_acc = test_accs[target]
      baseline = sample_test(share_of_ones, test_rows[target].values.astype(int), n_times=1000)
      row.append(round((logged_acc-baseline)*100, 2))
    effects.loc[company,:] = row

  return effects

In [None]:
def get_effects_neutral(df_train, df_test, loggs, companies=companies):
  """Model quality versus two naive baselines, per company and horizon (three-class targets).

  For each company the target columns are those whose name contains the ticker
  (the mention flag column itself is excluded). Two tables are produced:

  * accuracy effect: `(logged accuracy - sample_test_neutral baseline) * 100`,
    rounded to 2 decimals, where the baseline draws classes -1/0/1 with the
    class frequencies observed in the company's training rows;
  * F1 effect: logged F1 minus `test_const` of the test labels.

  Args:
    df_train: training frame with a `date` column, one flag column per ticker
      and the target columns.
    df_test: test frame with the same columns.
    loggs: metrics frame with 'acc_test' and 'f1_test' rows and one column per target.
    companies: tickers to evaluate; also the index of both results.

  Returns:
    Tuple `(effects_rand, effects_zero)` of DataFrames indexed by company.
  """
  horizons = ['5 min', '10 min', '15 min', '30 min', '1 hour', '1 day']
  effects_rand = pd.DataFrame(columns=horizons, index=companies)
  effects_zero = pd.DataFrame(columns=horizons, index=companies)

  all_columns = df_train.columns
  first_dates = pd.read_csv(f'/content/drive/MyDrive/Диссертация/Парсеры сайтов/Стоимость акций/1 мин/first_dates.csv', index_col='company')

  for company in companies:
    first_day = pd.to_datetime(first_dates.loc[company,:].values[0])

    # Rows mentioning the company; only the training rows are restricted to
    # dates on or after the company's first date.
    train_rows = df_train[df_train[company] == True].copy()
    test_rows = df_test[df_test[company] == True].copy()
    train_rows = train_rows[train_rows.date >= first_day].copy()

    target_cols = [col for col in all_columns if company in col]
    if company in target_cols:
      target_cols.remove(company)

    # Logged metrics restricted to this company's columns.
    company_loggs = loggs[[col for col in loggs.columns if company in col]]
    accs_test = company_loggs.loc['acc_test', :]
    f1_test = company_loggs.loc['f1_test', :]

    acc_row = []
    f1_row = []
    for target in target_cols:
      train_target = train_rows[target]
      # Class frequencies in the order expected by sample_test_neutral: -1, 0, 1.
      class_probs = [(train_target == label).sum() / len(train_target) for label in (-1, 0, 1)]

      baseline = sample_test_neutral(class_probs, test_rows[target].values, n_times=1000)
      acc_row.append(round((accs_test[target]-baseline)*100, 2))
      f1_row.append(f1_test[target] - test_const(test_rows[target].values))

    effects_rand.loc[company,:] = acc_row
    effects_zero.loc[company,:] = f1_row

  return effects_rand, effects_zero

### Кросс-валидация: Телеграм

In [None]:
companies = ['ALRS', 'AFLT', 'VTBR', 'GAZP', 'GMKN', 'LSRG', 'DSKY', 'IRAO', 'LKOH', 'MAGN', 'MOEX', 'CBOM', 'MTSS',
             'ROSN', 'NLMK', 'NVTK', 'PIKK', 'PLZL', 'RTKM', 'RUAL', 'HYDR', 'SBER', 'CHMF', 'AFKS', 'SNGS', 'TATN',
             'TCSG', 'TRNFP', 'PHOR', 'FEES', 'YNDX']

#### 2 класса

In [None]:
# Reload the two-class Telegram dataset and rebuild the train/test date split
# for the significance tests.
tg = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps.parquet')
# Training period: rows dated strictly before 2021-06-01.
tg_train = tg[tg.date < '2021-06-01'].copy()
# Test period: rows dated strictly after 2021-06-01 and strictly before 2022-01-01.
# NOTE(review): both comparisons against '2021-06-01' are strict, so a row whose
# `date` equals that value lands in neither split - confirm this is intended.
tg_test = tg[(tg.date < '2022-01-01') & (tg.date > '2021-06-01')].copy()

In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/tg_loggs/loggs_tg_rf_bin.csv', index_col=0)

In [None]:
effects_tg_rf_cv = get_effects_bin(tg_train, tg_test, loggs)

In [None]:
effects_tg_rf_cv.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,1 min,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.28,-1.2,-0.22,0.26,-0.93,-1.14,-4.23
AFLT,-1.05,-1.04,-4.61,-3.59,-1.66,-1.29,-2.17
ALRS,-3.19,-5.67,-1.4,-2.44,-3.81,-0.74,-5.02
CBOM,-3.4,-6.13,-2.8,-5.48,-4.78,-0.5,-0.41
CHMF,8.47,3.9,3.22,0.87,2.1,2.14,-0.2
DSKY,5.58,3.69,4.34,7.18,7.19,5.41,-4.74
FEES,-2.27,-0.06,-1.82,-0.97,-2.96,-1.39,-5.66
GAZP,-0.3,0.72,-0.3,-0.07,-0.24,1.1,-0.32
GMKN,-0.26,-1.52,-0.88,-2.28,-0.44,-1.28,0.75
HYDR,-6.5,0.23,2.36,-4.83,-4.97,-3.09,0.51


In [None]:
effects_tg_rf_cv.mean()

1 min    -0.620000
5 min    -0.976452
10 min   -0.352581
15 min   -1.544516
30 min   -0.751290
1 hour   -0.452258
1 day    -1.445484
dtype: float64

In [None]:
effects_tg_rf_cv.filter(['MAGN', 'CHMF', 'DSKY', 'SNGS', 'NVTK'], axis=0).sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,1 min,5 min,10 min,15 min,30 min,1 hour,1 day
CHMF,8.47,3.9,3.22,0.87,2.1,2.14,-0.2
DSKY,5.58,3.69,4.34,7.18,7.19,5.41,-4.74
MAGN,5.12,2.65,0.46,2.02,7.04,3.03,2.14
NVTK,2.89,0.34,-0.04,-2.18,1.1,0.25,1.41
SNGS,3.77,3.33,3.73,-0.07,0.58,0.61,3.57


In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/tg_loggs/loggs_tg_boosting_bin.csv', index_col=0)

In [None]:
effects_tg_boosting_cv = get_effects_bin(tg_train, tg_test, loggs)

In [None]:
effects_tg_boosting_cv.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,1 min,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.16,-1.8,-0.95,-1.77,1.04,-0.66,-4.25
AFLT,-0.09,-4.34,-5.56,-3.81,-2.3,1.36,-1.63
ALRS,-0.29,-5.79,0.76,-2.95,-2.24,0.71,-2.66
CBOM,-3.05,-8.17,-6.25,-1.76,-2.55,-6.36,2.36
CHMF,1.53,1.82,0.98,2.49,-0.41,3.2,2.29
DSKY,1.15,2.08,2.83,6.51,5.22,3.41,-7.31
FEES,-2.03,-3.0,-1.64,-3.08,0.51,-2.42,-0.44
GAZP,-0.27,0.85,0.55,-0.25,0.3,0.35,-0.45
GMKN,-1.74,-5.57,-5.3,-2.29,-1.21,-1.48,-1.35
HYDR,-3.69,0.16,-2.06,-4.7,-0.03,-1.49,3.82


In [None]:
effects_tg_boosting_cv.mean()

1 min    -0.690645
5 min    -1.972258
10 min   -1.899355
15 min   -1.848065
30 min   -0.462581
1 hour   -0.832258
1 day    -1.188065
dtype: float64

#### 3 класса

In [None]:
# Reload the three-class Telegram dataset and rebuild the train/test date split
# for the significance tests.
tg = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps_1,5sigma_.parquet')
# Training period: rows dated strictly before 2021-06-01.
tg_train = tg[tg.date < '2021-06-01'].copy()
# Test period: rows dated strictly after 2021-06-01 and strictly before 2022-01-01.
# NOTE(review): both comparisons against '2021-06-01' are strict, so a row whose
# `date` equals that value lands in neither split - confirm this is intended.
tg_test = tg[(tg.date < '2022-01-01') & (tg.date > '2021-06-01')].copy()

In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/tg_loggs/loggs_tg_rf_neutral.csv', index_col=0)

In [None]:
effects_neutral_tg_rand_rf, effects_neutral_tg_zero_rf = get_effects_neutral(tg_train, tg_test, loggs)

In [None]:
effects_neutral_tg_rand_rf.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,1.47,4.63,1.27,0.89,0.11,-0.13
AFLT,-0.77,1.91,2.53,1.44,1.56,2.88
ALRS,0.5,3.32,1.84,-0.35,2.93,2.82
CBOM,-2.4,-3.7,-5.56,4.57,1.13,-0.74
CHMF,-0.52,7.04,7.72,5.15,7.47,0.05
DSKY,6.32,4.01,9.69,14.72,9.26,-0.2
FEES,-1.84,-2.32,-3.71,1.09,0.58,2.55
GAZP,1.94,1.0,1.97,2.37,2.45,-0.45
GMKN,-2.11,3.38,1.31,1.85,2.96,-6.13
HYDR,4.5,4.27,6.03,-1.58,1.63,4.17


In [None]:
effects_neutral_tg_rand_rf.mean()

5 min     0.740968
10 min    1.600000
15 min    1.801290
30 min    1.856129
1 hour    1.946129
1 day     0.149355
dtype: float64

In [None]:
effects_neutral_tg_rand_rf.filter(['ALRS', 'AFLT', 'GAZP', 'LSRG', 'DSKY', 'MAGN', 'MOEX', 'MTSS', 'ROSN', 'NVTK', 'PIKK', 'HYDR', 'CHMF', 'AFKS', 'SNGS', 'PHOR'], axis=0).sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,1.47,4.63,1.27,0.89,0.11,-0.13
AFLT,-0.77,1.91,2.53,1.44,1.56,2.88
ALRS,0.5,3.32,1.84,-0.35,2.93,2.82
CHMF,-0.52,7.04,7.72,5.15,7.47,0.05
DSKY,6.32,4.01,9.69,14.72,9.26,-0.2
GAZP,1.94,1.0,1.97,2.37,2.45,-0.45
HYDR,4.5,4.27,6.03,-1.58,1.63,4.17
LSRG,5.13,3.22,0.48,3.63,0.27,-1.48
MAGN,-2.17,2.37,4.53,2.86,5.24,-0.05
MOEX,1.16,1.47,1.8,1.74,2.43,1.26


In [None]:
effects_neutral_tg_zero_rf.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.15,0.19,0.18,0.18,0.17,0.11
AFLT,0.13,0.14,0.16,0.17,0.17,0.14
ALRS,0.11,0.24,0.19,0.2,0.26,0.22
CBOM,0.08,0.11,0.1,0.24,0.21,0.16
CHMF,0.07,0.29,0.31,0.26,0.27,0.16
DSKY,0.28,0.33,0.36,0.46,0.39,0.19
FEES,0.08,0.1,0.07,0.14,0.18,0.14
GAZP,0.15,0.18,0.17,0.2,0.15,0.15
GMKN,0.1,0.18,0.15,0.18,0.21,0.02
HYDR,0.1,0.11,0.16,0.11,0.15,0.0


In [None]:
effects_neutral_tg_zero_rf.filter(['ALRS', 'AFLT', 'GAZP', 'LSRG', 'DSKY', 'MAGN', 'MOEX', 'MTSS', 'ROSN', 'NVTK', 'PIKK', 'HYDR', 'CHMF', 'AFKS', 'SNGS', 'PHOR'], axis=0).sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.15,0.19,0.18,0.18,0.17,0.11
AFLT,0.13,0.14,0.16,0.17,0.17,0.14
ALRS,0.11,0.24,0.19,0.2,0.26,0.22
CHMF,0.07,0.29,0.31,0.26,0.27,0.16
DSKY,0.28,0.33,0.36,0.46,0.39,0.19
GAZP,0.15,0.18,0.17,0.2,0.15,0.15
HYDR,0.1,0.11,0.16,0.11,0.15,0.0
LSRG,0.04,0.06,0.09,0.06,0.11,0.03
MAGN,0.12,0.17,0.24,0.23,0.25,0.15
MOEX,0.12,0.15,0.19,0.2,0.17,0.1


In [None]:
effects_neutral_tg_zero_rf.mean()

5 min     0.122517
10 min    0.167812
15 min    0.173382
30 min    0.187880
1 hour    0.192530
1 day     0.104955
dtype: float64

In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/tg_loggs/loggs_tg_boosting_neutral.csv', index_col=0)

In [None]:
effects_neutral_tg_rand_boosting, effects_neutral_tg_zero_boosting = get_effects_neutral(tg_train, tg_test, loggs)

In [None]:
effects_neutral_tg_rand_boosting.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-0.69,-0.77,0.39,0.17,0.68,-0.39
AFLT,1.13,-1.34,1.51,2.26,1.75,3.89
ALRS,-5.26,-0.34,0.69,1.44,-0.73,-0.11
CBOM,-2.1,-2.85,1.51,1.92,5.47,-2.95
CHMF,-1.21,4.05,6.72,5.25,5.63,-2.42
DSKY,3.45,3.6,8.56,9.94,6.41,0.37
FEES,-1.27,-4.07,1.34,-1.63,0.08,-0.1
GAZP,0.71,1.42,0.94,2.67,2.12,0.19
GMKN,-2.39,0.61,-0.59,1.02,2.25,-1.74
HYDR,-1.6,0.27,2.65,-3.77,-0.31,0.76


In [None]:
effects_neutral_tg_rand_boosting.mean()

5 min    -0.357742
10 min   -0.073226
15 min    0.417097
30 min    0.838710
1 hour    0.499032
1 day    -0.670645
dtype: float64

In [None]:
effects_neutral_tg_zero_boosting.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.14,0.15,0.17,0.18,0.17,0.1
AFLT,0.15,0.13,0.17,0.19,0.18,0.16
ALRS,0.06,0.21,0.2,0.24,0.24,0.19
CBOM,0.11,0.15,0.2,0.22,0.27,0.15
CHMF,0.06,0.26,0.29,0.26,0.26,0.13
DSKY,0.26,0.33,0.35,0.41,0.38,0.2
FEES,0.09,0.1,0.15,0.12,0.18,0.11
GAZP,0.14,0.19,0.18,0.22,0.19,0.16
GMKN,0.1,0.16,0.13,0.18,0.21,0.07
HYDR,0.06,0.12,0.17,0.11,0.17,-0.02


In [None]:
effects_neutral_tg_zero_boosting.mean()

5 min     0.123813
10 min    0.166759
15 min    0.175258
30 min    0.189400
1 hour    0.193876
1 day     0.106521
dtype: float64

### Кросс-валидация: традиционные новости

In [None]:
companies = ['ALRS', 'AFLT', 'VTBR', 'GAZP', 'GMKN', 'LSRG', 'DSKY', 'IRAO', 'LKOH', 'MAGN', 'MOEX', 'CBOM', 'MTSS',
             'ROSN', 'NVTK', 'PIKK', 'RTKM', 'RUAL', 'HYDR', 'SBER', 'CHMF', 'AFKS', 'SNGS', 'TATN',
             'TCSG', 'TRNFP', 'PHOR', 'YNDX']

#### 2 класса

In [None]:
# Load the two-class news dataset and build the train/test date split for the
# significance tests.
news = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news_comps.parquet')
# Training period: rows dated strictly before 2021-06-01.
news_train = news[news.date < '2021-06-01'].copy()
# Test period: rows dated strictly after 2021-06-01 and strictly before 2022-01-01.
# NOTE(review): both comparisons against '2021-06-01' are strict, so a row whose
# `date` equals that value lands in neither split - confirm this is intended.
news_test = news[(news.date < '2022-01-01') & (news.date > '2021-06-01')].copy()

In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/news_loggs/loggs_news_rf_bin.csv', index_col=0)

In [None]:
effects_news_rf_cv = get_effects_bin(news_train, news_test, loggs, companies=companies)

In [None]:
effects_news_rf_cv.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,1 min,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-12.54,-9.56,-4.19,-8.09,-4.24,-14.59,-19.86
AFLT,3.09,-1.82,-4.04,-10.29,-4.56,-4.59,-1.03
ALRS,4.15,-1.33,-3.6,1.56,-4.61,-4.72,3.77
CBOM,-0.14,-5.0,-0.83,1.57,-7.5,-2.83,-7.5
CHMF,-3.56,3.63,6.38,-1.32,-1.36,2.28,-10.07
DSKY,-28.0,-32.07,-15.05,3.69,-1.17,-7.9,5.87
GAZP,1.66,4.49,3.71,4.82,3.88,3.35,3.11
GMKN,-5.11,-2.22,-5.65,-14.01,-10.35,-5.26,-4.47
HYDR,-0.48,-4.68,-14.3,-10.35,-12.84,-13.08,-17.36
IRAO,-9.69,-4.77,-1.53,-9.57,4.22,2.05,-9.88


In [None]:
effects_news_rf_cv.filter(['GAZP', 'TCSG', 'VTBR'], axis=0).sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,1 min,5 min,10 min,15 min,30 min,1 hour,1 day
GAZP,1.66,4.49,3.71,4.82,3.88,3.35,3.11
TCSG,4.8,8.93,7.44,0.71,-3.88,1.22,-2.19
VTBR,5.74,0.52,1.68,1.07,0.99,1.74,-0.27


In [None]:
effects_news_rf_cv.mean()

1 min    -4.558571
5 min    -3.799643
10 min   -4.447857
15 min   -4.815000
30 min   -3.686429
1 hour   -5.552857
1 day    -7.297500
dtype: float64

In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/news_loggs/loggs_news_boosting_bin.csv', index_col=0)

In [None]:
effects_news_boosting_cv = get_effects_bin(news_train, news_test, loggs, companies=companies)

In [None]:
effects_news_boosting_cv.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,1 min,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-1.89,-6.81,-2.09,-2.91,-10.34,-7.86,-10.26
AFLT,2.65,-1.38,-4.42,-3.61,-10.37,-6.07,0.93
ALRS,-13.24,-18.27,-18.05,-21.09,-25.05,1.06,1.35
CBOM,-4.07,-8.44,-1.25,-4.82,-5.25,-10.21,3.93
CHMF,-1.74,3.72,4.06,-2.42,-5.8,-4.47,2.13
DSKY,2.3,-8.53,-7.75,-3.52,-1.49,-23.87,-25.55
GAZP,1.71,3.9,4.38,5.42,-5.37,4.0,5.62
GMKN,-5.29,-9.06,-3.38,-13.53,-9.14,-9.49,-3.86
HYDR,-10.2,-2.61,-8.83,-15.67,-18.76,-20.12,-24.5
IRAO,-11.49,-1.45,-6.67,-14.66,4.14,8.96,-14.72


In [None]:
effects_news_boosting_cv.mean()

1 min    -5.684286
5 min    -5.639643
10 min   -6.119643
15 min   -6.228214
30 min   -4.814286
1 hour   -5.713571
1 day    -5.465000
dtype: float64

#### 3 класса

In [None]:
companies = ['ALRS', 'AFLT', 'VTBR', 'GAZP', 'GMKN', 'DSKY', 'IRAO', 'LKOH', 'MAGN', 'MOEX', 'CBOM', 'MTSS',
             'ROSN', 'NVTK', 'PIKK', 'RTKM', 'RUAL', 'HYDR', 'SBER', 'CHMF', 'AFKS', 'SNGS', 'TATN',
             'TCSG', 'TRNFP', 'PHOR', 'YNDX']

In [None]:
# Load the three-class news dataset and build the train/test date split for the
# significance tests.
news = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news_comps_1,5sigma_.parquet')
# Training period: rows dated strictly before 2021-06-01.
news_train = news[news.date < '2021-06-01'].copy()
# Test period: rows dated strictly after 2021-06-01 and strictly before 2022-01-01.
# NOTE(review): both comparisons against '2021-06-01' are strict, so a row whose
# `date` equals that value lands in neither split - confirm this is intended.
news_test = news[(news.date < '2022-01-01') & (news.date > '2021-06-01')].copy()

In [None]:
loggs = pd.read_csv('/content/drive/MyDrive/Диссертация/news_loggs/loggs_news_rf_neutral.csv', index_col=0)

In [None]:
effects_neutral_news_rand_rf, effects_neutral_news_zero_rf = get_effects_neutral(news_train, news_test, loggs, companies=companies)

In [None]:
effects_neutral_news_rand_rf.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-16.27,-3.04,-12.33,-11.56,-4.21,-10.45
AFLT,-6.1,-10.36,-2.97,2.04,1.31,-3.85
ALRS,4.75,-13.27,-14.51,2.57,-0.0,-0.79
CBOM,2.15,-2.37,-5.03,-9.32,-11.78,-5.64
CHMF,-6.39,-0.34,1.34,-4.73,-3.02,-4.85
DSKY,-19.69,-34.2,-9.42,6.39,-0.24,-23.96
GAZP,2.58,6.15,6.3,6.36,-6.33,0.0
GMKN,2.41,-10.69,-3.67,-2.41,-0.34,-4.52
HYDR,-12.2,-11.44,-6.26,-13.79,-1.86,-15.87
IRAO,-10.73,-6.0,-11.81,-1.56,-4.7,1.0


In [None]:
effects_neutral_news_rand_rf.mean()

5 min    -4.429259
10 min   -3.124815
15 min   -1.812222
30 min   -1.809259
1 hour   -1.610000
1 day    -6.300000
dtype: float64

In [None]:
effects_neutral_news_rand_rf.filter(['GAZP', 'MTSS', 'RTKM', 'SBER', 'TCSG', 'VTBR', 'YNDX'], axis=0).sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
GAZP,2.58,6.15,6.3,6.36,-6.33,0.0
MTSS,5.83,0.59,13.75,1.42,5.89,-13.76
RTKM,-11.37,6.78,4.21,0.13,3.83,-11.84
SBER,1.07,2.16,2.42,3.2,3.17,6.72
TCSG,1.47,0.2,1.07,9.47,3.59,1.51
VTBR,2.77,9.7,10.51,10.47,10.26,-10.56
YNDX,4.3,6.99,7.84,4.12,2.27,-1.97


In [None]:
effects_neutral_news_zero_rf.sort_index().style.applymap(_color_red_or_green).format(precision=2)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-0.01,0.14,0.08,0.01,0.12,0.05
AFLT,0.06,0.0,0.07,0.16,0.15,0.05
ALRS,0.25,0.09,0.11,0.35,0.3,0.16
CBOM,0.05,0.1,0.06,0.08,0.06,0.06
CHMF,0.15,0.2,0.24,0.15,0.1,0.12
DSKY,-0.0,-0.09,0.05,0.3,0.0,-0.03
GAZP,0.1,0.13,0.12,0.15,0.06,0.1
GMKN,0.14,0.06,0.16,0.16,0.14,0.16
HYDR,-0.16,0.09,0.17,0.14,0.23,-0.12
IRAO,0.11,0.17,0.03,0.14,0.09,0.13


In [None]:
# Column-wise average: one value per horizon, taken over all tickers in the frame.
effects_neutral_news_zero_rf.mean(axis=0)

5 min     0.092041
10 min    0.146940
15 min    0.149434
30 min    0.167547
1 hour    0.149813
1 day     0.055965
dtype: float64

In [None]:
# Same table restricted to a fixed subset of tickers (row labels), sorted by ticker.
(
    effects_neutral_news_zero_rf
    .filter(items=['GAZP', 'MTSS', 'RTKM', 'SBER', 'TCSG', 'VTBR', 'YNDX'], axis=0)
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
GAZP,0.1,0.13,0.12,0.15,0.06,0.1
MTSS,0.22,0.24,0.31,0.27,0.28,-0.01
RTKM,0.01,0.17,0.21,0.2,0.2,0.04
SBER,0.11,0.18,0.2,0.24,0.23,0.17
TCSG,0.15,0.21,0.16,0.28,0.16,0.21
VTBR,0.15,0.27,0.3,0.28,0.29,-0.1
YNDX,0.22,0.24,0.24,0.18,0.14,-0.06


In [None]:
# Log file for the news "boosting_neutral" run (per the file name); the first CSV column
# becomes the index. This rebinds `loggs`, replacing the frame loaded earlier.
loggs = pd.read_csv(
    '/content/drive/MyDrive/Диссертация/news_loggs/loggs_news_boosting_neutral.csv',
    index_col=0,
)

In [None]:
# Same evaluation as before, now driven by the boosting log loaded in the previous cell.
# NOTE(review): the `_rand` / `_zero` suffixes presumably name two comparison
# baselines — confirm against the definition of get_effects_neutral.
(
    effects_neutral_news_rand_boosting,
    effects_neutral_news_zero_boosting,
) = get_effects_neutral(news_train, news_test, loggs, companies=companies)

In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_neutral_news_rand_boosting
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-10.01,-10.76,-6.45,-7.41,-1.08,-2.23
AFLT,-4.63,-10.61,-1.58,-5.7,6.32,0.38
ALRS,-18.21,-15.79,-8.65,-15.11,-0.29,10.67
CBOM,-15.49,-9.55,1.43,-0.74,-10.55,-11.13
CHMF,-9.57,-2.58,2.09,-2.98,0.74,2.26
DSKY,17.84,-18.73,13.12,7.09,-8.9,-24.65
GAZP,2.96,1.25,4.8,3.22,6.74,3.96
GMKN,-0.8,2.6,-3.07,-2.42,1.16,-13.76
HYDR,-8.99,-10.22,-10.76,-11.75,-9.14,-15.83
IRAO,-2.99,-12.58,-16.21,-5.0,-8.03,-10.19


In [None]:
# Column-wise average: one value per horizon, taken over all tickers in the frame.
effects_neutral_news_rand_boosting.mean(axis=0)

5 min    -5.471852
10 min   -5.353333
15 min   -2.913333
30 min   -2.143333
1 hour   -2.712222
1 day    -5.787037
dtype: float64

In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_neutral_news_zero_boosting
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.05,0.09,0.12,0.11,0.19,0.16
AFLT,0.07,-0.01,0.08,0.08,0.21,0.1
ALRS,0.08,0.13,0.18,0.18,0.29,0.28
CBOM,-0.02,0.07,0.16,0.18,0.07,0.03
CHMF,0.12,0.2,0.28,0.2,0.17,0.21
DSKY,0.53,0.12,0.23,0.32,-0.0,0.02
GAZP,0.12,0.11,0.12,0.13,0.16,0.12
GMKN,0.13,0.2,0.17,0.15,0.15,0.07
HYDR,-0.07,0.12,0.14,0.16,0.19,-0.12
IRAO,0.19,0.09,-0.02,0.1,0.08,0.09


In [None]:
# Column-wise average: one value per horizon, taken over all tickers in the frame.
effects_neutral_news_zero_boosting.mean(axis=0)

5 min     0.105112
10 min    0.144713
15 min    0.151958
30 min    0.177827
1 hour    0.153655
1 day     0.083656
dtype: float64

### BERT — Телеграм

#### 2 класса

In [None]:
# Raw Telegram posts: keep only rows that have a message and drop four metadata columns.
# Built as one chain (no in-place mutation) so re-running the cell is idempotent.
tg = (
    pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps_raw.parquet')
    .dropna(subset=['message'])
    .drop(columns=['views', 'forwards', 'fwd_from', 'source'])
)

# Chronological train / validation / test split on half-open intervals [start, end).
# `>=` on each lower bound keeps rows stamped exactly at a boundary instant
# (2021-01-01 00:00:00, 2021-06-01 00:00:00); a strict `>` would drop them from every split.
tg_train = tg[tg.date < '2021-01-01'].copy()
tg_val = tg[(tg.date >= '2021-01-01') & (tg.date < '2021-06-01')].copy()
tg_test = tg[(tg.date >= '2021-06-01') & (tg.date < '2022-01-01')].copy()

In [None]:
# Log file for the Telegram "dist_bert ... bin" run (per the file name); the first CSV
# column becomes the index. The frame is handed to get_effects_bin further down.
loggs = pd.read_csv(
    '/content/drive/MyDrive/Диссертация/tg_loggs/dist_bert_tg_loggs_bin.csv',
    index_col=0,
)

In [None]:
# Tickers evaluated in this section, kept in alphabetical order.
# Note: this rebinds the notebook-wide `companies` list.
companies = ['MAGN', 'CHMF', 'DSKY', 'SNGS', 'NVTK']
companies.sort()

In [None]:
# Evaluate the binary Telegram run for the selected tickers.
# NOTE(review): the meaning of `shortened=True` is defined by get_effects_bin — confirm there.
effects_bert_tg_bin = get_effects_bin(
    tg_train,
    tg_test,
    loggs,
    companies=companies,
    shortened=True,
)

In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_bert_tg_bin
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
CHMF,6.52,3.32,5.93,-3.15,4.91,-2.98
DSKY,7.84,4.52,14.88,8.06,4.09,-4.04
MAGN,-0.72,-3.4,0.29,-0.05,-0.11,3.05
NVTK,0.12,1.04,0.82,1.0,0.0,-4.23
SNGS,-1.03,2.69,-2.68,0.36,3.08,2.01


#### 3 класса

In [None]:
# Raw Telegram posts (the file name suggests 1.5-sigma labelling — confirm): keep only rows
# that have a message and drop four metadata columns.
# Built as one chain (no in-place mutation) so re-running the cell is idempotent.
tg = (
    pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/tg_comps_raw_1,5sigma_.parquet')
    .dropna(subset=['message'])
    .drop(columns=['views', 'forwards', 'fwd_from', 'source'])
)

# Chronological train / validation / test split on half-open intervals [start, end).
# `>=` on each lower bound keeps rows stamped exactly at a boundary instant
# (2021-01-01 00:00:00, 2021-06-01 00:00:00); a strict `>` would drop them from every split.
tg_train = tg[tg.date < '2021-01-01'].copy()
tg_val = tg[(tg.date >= '2021-01-01') & (tg.date < '2021-06-01')].copy()
tg_test = tg[(tg.date >= '2021-06-01') & (tg.date < '2022-01-01')].copy()

In [None]:
# Log file for the Telegram "dist_bert ... neutral" run (per the file name); the first CSV
# column becomes the index. The frame is handed to get_effects_neutral further down.
loggs = pd.read_csv(
    '/content/drive/MyDrive/Диссертация/tg_loggs/dist_bert_tg_loggs_neutral.csv',
    index_col=0,
)

In [None]:
# Tickers evaluated in this section, kept in alphabetical order.
# Note: this rebinds the notebook-wide `companies` list.
companies = [
    'ALRS', 'AFLT', 'GAZP', 'LSRG', 'DSKY', 'MAGN', 'MOEX', 'MTSS',
    'ROSN', 'NVTK', 'PIKK', 'HYDR', 'CHMF', 'AFKS', 'SNGS', 'PHOR',
]
companies.sort()

In [None]:
# Evaluate the three-class Telegram run for the selected tickers; two frames come back.
# NOTE(review): the `_rand` / `_zero` suffixes presumably name two comparison
# baselines — confirm against the definition of get_effects_neutral.
(
    effects_bert_tg_rand,
    effects_bert_tg_zero,
) = get_effects_neutral(tg_train, tg_test, loggs, companies=companies)

In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_bert_tg_rand
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,-3.59,1.96,-5.6,-2.38,-2.12,0.09
AFLT,2.06,2.28,1.56,0.89,-4.61,4.86
ALRS,-3.16,1.35,1.17,-1.98,-3.79,-0.09
CHMF,-2.18,-8.67,-7.05,-4.76,-7.31,-2.99
DSKY,-9.19,-19.53,-17.59,-17.72,-13.51,-8.28
GAZP,4.06,3.79,2.97,3.3,1.93,2.77
HYDR,4.37,4.93,-0.7,-0.45,-2.5,10.21
LSRG,-11.36,-10.61,-10.04,-7.43,-9.05,-6.43
MAGN,3.15,-0.67,2.87,6.46,-0.23,-0.8
MOEX,-2.91,-4.84,-5.44,-2.58,-4.0,-4.62


In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_bert_tg_zero
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
AFKS,0.05,0.04,0.04,0.08,0.07,0.05
AFLT,-0.0,0.09,0.16,0.18,0.12,0.12
ALRS,0.08,0.15,0.12,0.16,0.19,0.13
CHMF,0.01,-0.03,-0.02,0.07,0.04,0.04
DSKY,0.02,0.04,0.04,0.08,0.11,0.04
GAZP,0.11,0.15,0.09,0.2,0.19,0.09
HYDR,0.05,0.06,0.07,0.15,0.07,0.01
LSRG,-0.08,-0.07,-0.03,-0.03,-0.07,-0.04
MAGN,0.15,0.1,0.16,0.2,0.2,0.01
MOEX,0.03,0.04,-0.04,-0.0,0.07,-0.05


### BERT — традиционные новости

#### 2 класса

In [None]:
news = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news_comps_raw.parquet')

# Remove the literal markers 'no title' / 'no announce' (presumably placeholders for
# missing fields — confirm). regex=False makes the literal match explicit, so the result
# does not depend on the pandas version's default for `regex`.
news['title'] = news['title'].str.replace('no title', '', regex=False)
news['announce'] = news['announce'].str.replace('no announce', '', regex=False)

# The text column is "<title> <announce>"; a lone space means both parts were empty.
news['message'] = news['title'] + ' ' + news['announce']
news = (
    news[news['message'] != ' ']
    .drop(columns=['link', 'title', 'announce', 'text'])
    .dropna(subset=['message'])  # NaN in either part propagates into `message`
)

# Chronological train / validation / test split on half-open intervals [start, end).
# `>=` on each lower bound keeps rows stamped exactly at a boundary instant
# (2021-01-01 00:00:00, 2021-06-01 00:00:00); a strict `>` would drop them from every split.
news_train = news[news.date < '2021-01-01'].copy()
news_val = news[(news.date >= '2021-01-01') & (news.date < '2021-06-01')].copy()
news_test = news[(news.date >= '2021-06-01') & (news.date < '2022-01-01')].copy()

In [None]:
# Log file for the news "dist_bert ... bin" run (per the file name); the first CSV
# column becomes the index. The frame is handed to get_effects_bin further down.
loggs = pd.read_csv(
    '/content/drive/MyDrive/Диссертация/news_loggs/dist_bert_news_loggs_bin.csv',
    index_col=0,
)

In [None]:
# Tickers evaluated in this section, kept in alphabetical order.
# Note: this rebinds the notebook-wide `companies` list.
companies = ['TCSG', 'GAZP', 'VTBR']
companies.sort()

In [None]:
# Evaluate the binary news run for the selected tickers.
# NOTE(review): the meaning of `shortened=True` is defined by get_effects_bin — confirm there.
effects_news_bert_bin = get_effects_bin(
    news_train,
    news_test,
    loggs,
    companies=companies,
    shortened=True,
)

In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_news_bert_bin
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
GAZP,4.52,5.04,-7.83,-1.54,-8.07,5.06
TCSG,-17.54,-11.33,-8.51,-4.88,-2.73,-2.83
VTBR,6.11,8.49,8.94,6.65,4.41,0.22


#### 3 класса

In [None]:
# Raw news dataset (the file name suggests 1.5-sigma labelling — confirm).
news = pd.read_parquet('/content/drive/MyDrive/Диссертация/Парсеры сайтов/news_comps_raw_1,5sigma_.parquet')

# Remove the literal markers 'no title' / 'no announce' (presumably placeholders for
# missing fields — confirm). regex=False makes the literal match explicit, so the result
# does not depend on the pandas version's default for `regex`.
news['title'] = news['title'].str.replace('no title', '', regex=False)
news['announce'] = news['announce'].str.replace('no announce', '', regex=False)

# The text column is "<title> <announce>"; a lone space means both parts were empty.
news['message'] = news['title'] + ' ' + news['announce']
news = (
    news[news['message'] != ' ']
    .drop(columns=['link', 'title', 'announce', 'text'])
    .dropna(subset=['message'])  # NaN in either part propagates into `message`
)

# Chronological train / validation / test split on half-open intervals [start, end).
# `>=` on each lower bound keeps rows stamped exactly at a boundary instant
# (2021-01-01 00:00:00, 2021-06-01 00:00:00); a strict `>` would drop them from every split.
news_train = news[news.date < '2021-01-01'].copy()
news_val = news[(news.date >= '2021-01-01') & (news.date < '2021-06-01')].copy()
news_test = news[(news.date >= '2021-06-01') & (news.date < '2022-01-01')].copy()

In [None]:
# Log file for the news "dist_bert ... neutral" run (per the file name); the first CSV
# column becomes the index.
# NOTE(review): unlike the other log paths in this notebook, this one has no `.csv`
# extension — confirm that the file on Drive really is stored under this exact name.
loggs = pd.read_csv(
    '/content/drive/MyDrive/Диссертация/news_loggs/dist_bert_news_loggs_neutral',
    index_col=0,
)

In [None]:
# Tickers evaluated in this section, kept in alphabetical order.
# Note: this rebinds the notebook-wide `companies` list.
companies = ['MTSS', 'TCSG', 'YNDX', 'RTKM', 'SBER', 'GAZP', 'VTBR']
companies.sort()

In [None]:
# Evaluate the three-class news run for the selected tickers; two frames come back.
# NOTE(review): the `_rand` / `_zero` suffixes presumably name two comparison
# baselines — confirm against the definition of get_effects_neutral.
(
    effects_news_bert_neutral_rand,
    effects_news_bert_neutral_zero,
) = get_effects_neutral(news_train, news_test, loggs, companies=companies)

In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_news_bert_neutral_rand
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
GAZP,2.06,2.38,3.77,4.58,1.4,-0.94
MTSS,-5.42,-0.51,-0.65,-1.29,-6.83,-10.19
RTKM,-19.4,-8.49,-8.13,-6.77,-7.01,-5.62
SBER,0.59,-0.61,-0.21,-4.26,-4.57,-9.07
TCSG,-17.46,-17.81,-10.06,-23.44,-6.89,-5.0
VTBR,4.47,5.58,6.5,9.21,9.51,12.66
YNDX,-2.09,-6.79,-6.22,1.73,1.62,-9.23


In [None]:
# Per-ticker table, rows ordered by ticker, two-decimal formatting,
# cell styling via _color_red_or_green.
(
    effects_news_bert_neutral_zero
    .sort_index()
    .style
    .applymap(_color_red_or_green)
    .format(precision=2)
)

Unnamed: 0,5 min,10 min,15 min,30 min,1 hour,1 day
GAZP,0.07,0.14,0.14,0.21,0.19,-0.0
MTSS,-0.01,0.02,0.01,-0.0,0.0,-0.05
RTKM,-0.17,-0.04,0.09,0.08,0.08,0.13
SBER,0.09,0.09,0.1,0.1,0.1,0.02
TCSG,-0.14,-0.06,-0.01,0.0,-0.02,0.03
VTBR,0.04,0.21,0.23,0.27,0.25,0.0
YNDX,0.09,0.05,0.05,0.01,0.09,-0.12


## Перезапуск обученных моделей

In [None]:
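# NOTE: this entire cell is disabled (commented out). If it is ever re-enabled:
#   * `gc = pickle.load(...)` below rebinds `gc`, shadowing the `gc` module imported at the top of the notebook;
#   * the file passed to `pickle.load` is never closed — prefer `with open(filename, 'rb') as f:`;
#   * only unpickle model files from a trusted location.
# df = pd.read_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_neutral_boosting.csv')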
# targets = tg.iloc[:,3:].columns
# for company in companies:
#   n_jobs=-1
#   print(company)
#   os.makedirs(f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}', exist_ok=True)
#   first_dates = pd.read_csv(f'/content/drive/MyDrive/Диссертация/Парсеры сайтов/Стоимость акций/1 мин/first_dates.csv', index_col='company')
#   first_day = pd.to_datetime(first_dates.loc[company,:].values[0])
#   comp = tg[tg[company] == True].copy()
#   comp = comp[comp.date >= first_day].copy()
#   comp.dropna(inplace=True)

#   tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1,2))

#   train, test = comp.iloc[:int(0.8*len(comp)), :], comp.iloc[int(0.8*len(comp)):, :]
#   tfidf.fit(comp['message'])

#   features_train = tfidf.transform(train['message'])
#   features_test = tfidf.transform(test['message'])

#   features_train = np.asarray(features_train.todense())
#   features_test = np.asarray(features_test.todense())

#   tr = [col for col in targets if company in col]
#   if company in tr:
#     tr.remove(company)

#   for t in tr:
#     y_train = train[t].values.astype(int)
#     y_test = test[t].values.astype(int)

#     filename = f'/content/drive/MyDrive/Диссертация/tg_loggs/{company}/{t}_neutral_boosting.sav'
#     gc = pickle.load(open(filename, 'rb'))

#     preds_test = gc.predict(features_test)
#     preds_train = gc.predict(features_train)

#     preds_proba_test = gc.predict_proba(features_test)
#     preds_proba_train = gc.predict_proba(features_train)

#     acc_test = accuracy_score(y_test, preds_test)
#     acc_train = accuracy_score(y_train, preds_train)

#     pre_test = precision_score(y_test, preds_test, average='micro')
#     pre_train = precision_score(y_train, preds_train, average='micro')

#     rec_test = recall_score(y_test, preds_test, average='micro')
#     rec_train = recall_score(y_train, preds_train, average='micro')

#     auc_test = roc_auc_score(y_test, preds_proba_test, average='weighted', multi_class='ovo')
#     auc_train = roc_auc_score(y_train, preds_proba_train, average='weighted', multi_class='ovo')

#     f1_test = f1_score(y_test, preds_test, average='micro')
#     f1_train = f1_score(y_train, preds_train, average='micro')

#     df[t] = [acc_test, acc_train, pre_test, pre_train, rec_test, rec_train, auc_test, auc_train, f1_test, f1_train]

#     df.to_csv('/content/drive/MyDrive/Диссертация/tg_loggs/all_loggs_neutral_boosting.csv', index=False)