In [None]:
!pip install boto3
import copy
import os
import boto3
import traceback
import io
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, date

from botocore.exceptions import ClientError, NoCredentialsError
from traceback import format_exc

# NOTE(review): `<access_key>` and `<secret_access_key>` look like redacted
# placeholders and are not valid Python as written; they have to be replaced
# with real string values before this cell can run. Presumably the real keys
# should come from a secrets store rather than being typed into the notebook
# -- confirm with the owner.
os.environ['AWS_ACCESS_KEY_ID'] = <access_key>
os.environ['AWS_SECRET_ACCESS_KEY'] = <secret_access_key>

# Bucket name used by the S3 helper functions in this notebook; they read this
# module-level name at call time, so reassigning it redirects them.
BUCKET = 'russian-stocks-quotes'

# Read the credentials back from the process environment.
access_key = os.getenv('AWS_ACCESS_KEY_ID')
secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
# Custom endpoint handed to boto3 instead of its default one.
endpoint_url = 'https://storage.yandexcloud.net'

# Create the S3 client
s3_client = boto3.client('s3',
                         region_name='ru-central1',
                         aws_access_key_id=access_key,
                         aws_secret_access_key=secret_key,
                         endpoint_url=endpoint_url)

def upload_object_to_s3(key, body):
    """Store ``body`` under ``key`` in the bucket named by the global ``BUCKET``.

    Prints a success message when the response reports HTTP status 200,
    otherwise prints the status code that came back. Returns ``None``.
    """
    result = s3_client.put_object(Bucket=BUCKET, Key=key, Body=body)
    status_code = result['ResponseMetadata']['HTTPStatusCode']
    if status_code != 200:
        print(f"Ошибка при сохранении: {status_code}")
        return
    print(f"Успешно сохранен в {BUCKET}/{key}")

def upload_data_frame_to_s3(secid, data_frame, dir):
    """Pickle ``data_frame`` into memory and upload it for ``secid``.

    The object key is ``{dir}secids/{secid}/{secid}_data_frame.pkl``.
    """
    target_key = f'{dir}secids/{secid}/{secid}_data_frame.pkl'
    stream = io.BytesIO()
    data_frame.to_pickle(stream)
    # Rewind so the upload reads the buffer from its first byte.
    stream.seek(0)
    upload_object_to_s3(target_key, stream)

def upload_info_to_s3(secid, info, dir):
    """Upload the metadata dict of ``secid`` as a JSON document.

    ``info['miss_index']`` is converted with ``.to_numpy().tolist()`` so that
    it can be JSON-encoded. The conversion is done on a shallow copy, so the
    caller's ``info`` dict keeps its original ``miss_index`` object.

    The object key is ``{dir}secids/{secid}/{secid}_info.pkl``; note that the
    payload is JSON even though the key ends in ``.pkl``.

    Raises:
        KeyError: if ``info`` has no ``'miss_index'`` entry.
    """
    # Copy first: replacing the entry on `info` itself would silently change
    # the caller's data as a side effect of uploading it.
    payload = dict(info)
    payload['miss_index'] = payload['miss_index'].to_numpy().tolist()
    info_file = f'{dir}secids/{secid}/{secid}_info.pkl'
    upload_object_to_s3(info_file, json.dumps(payload))

def upload_secid_names(dict_data, dir):
    """Upload the keys of ``dict_data`` as a JSON list to ``{dir}secid_names.pkl``."""
    names = [name for name in dict_data.keys()]
    upload_object_to_s3(f'{dir}secid_names.pkl', json.dumps(names))

def upload_data_to_s3(dict_data, dir):
    """Upload every entry of ``dict_data`` under the ``dir`` key prefix.

    The list of keys is uploaded first. Then, for each ``secid``, the value's
    ``'data_frame'`` entry is uploaded as a pickle and all remaining entries
    are uploaded as the info document.

    Errors are reported with ``print`` and not re-raised; the first failure
    stops the remaining uploads.
    """
    try:
        upload_secid_names(dict_data, dir)
        for secid, data in dict_data.items():
            upload_data_frame_to_s3(secid, data['data_frame'], dir)
            # A shallow dict without the frame is enough to keep `data`
            # untouched; deep-copying the whole entry (frame included) just to
            # drop one key was wasted work.
            info = {name: value for name, value in data.items()
                    if name != 'data_frame'}
            upload_info_to_s3(secid, info, dir)
    except ClientError as e:
        print(f"Произошла ошибка: {e.response['Error']['Message']}")
    except Exception as e:
        error_message = f"Неизвестная ошибка: {str(e)}"
        error_context = traceback.format_exc()
        print(f"{error_message}\nКонтекст ошибки:\n{error_context}")

def list_directories(s3_client):
    """Return the set of ``/``-delimited common prefixes in the ``BUCKET`` bucket.

    Uses the ``list_objects_v2`` paginator of the given client. Errors are
    printed rather than raised; whatever was collected before the error is
    still returned.
    """
    found = set()
    try:
        pages = s3_client.get_paginator('list_objects_v2').paginate(
            Bucket=BUCKET, Delimiter='/')
        for page in pages:
            found.update(entry.get('Prefix')
                         for entry in page.get('CommonPrefixes', []))
    except NoCredentialsError:
        print("Ошибка: Неверные учетные данные.")
    except Exception as e:
        print(f"Произошла ошибка: {e}")
    return found

def download_object_from_s3(key):
    """Fetch ``key`` from the bucket named by the global ``BUCKET``.

    Prints a success message for HTTP status 200, otherwise prints the status
    code, and in both cases returns the result of reading the response body.
    """
    result = s3_client.get_object(Bucket=BUCKET, Key=key)
    status_code = result['ResponseMetadata']['HTTPStatusCode']
    message = (
        f"Успешно получен из {BUCKET}/{key}"
        if status_code == 200
        else f"Ошибка при получении: {status_code}"
    )
    print(message)
    return result['Body'].read()

def download_info_from_s3(dir, secid):
    """Download and decode the JSON info document of ``secid``.

    The ``'miss_index'`` entry of the decoded dict is turned into a
    ``pd.Index`` built from a NumPy array before the dict is returned.
    """
    raw = download_object_from_s3(f'{dir}secids/{secid}/{secid}_info.pkl')
    info = json.loads(raw)
    miss_values = np.array(info['miss_index'])
    info['miss_index'] = pd.Index(miss_values)
    return info

def download_data_frame_from_s3(dir, secid):
    """Download the pickled frame of ``secid`` and return it.

    The ``'TRADEDATE'`` column is passed through ``pd.to_datetime`` before the
    frame is returned.
    """
    payload = download_object_from_s3(f'{dir}secids/{secid}/{secid}_data_frame.pkl')
    # NOTE(review): unpickling can execute arbitrary code, so this is only safe
    # while the bucket contents are fully trusted.
    frame = pd.read_pickle(io.BytesIO(payload))
    frame['TRADEDATE'] = pd.to_datetime(frame['TRADEDATE'])
    return frame

def download_secid_names(dir):
    """Download ``{dir}secid_names.pkl`` and return its JSON-decoded content."""
    raw_names = download_object_from_s3(f'{dir}secid_names.pkl')
    return json.loads(raw_names)

def download_data_from_s3(dir, secids=None):
    """Download info and data frame for the securities stored under ``dir``.

    Args:
        dir: key prefix under which the data was uploaded.
        secids: optional iterable of ids to restrict the download to; ``None``
            downloads every id listed in the stored names document. The
            argument itself is never modified.

    Returns:
        Dict mapping each downloaded id to its info dict, with the frame added
        under the ``'data_frame'`` key. Errors are printed, not raised, so the
        dict may be partial (or empty) when something failed.
    """
    data = {}
    # Track what is still missing on a private list: removing items from the
    # caller's own collection would empty it as a side effect and would fail
    # outright for tuples.
    pending = None if secids is None else list(secids)
    try:
        for secid in download_secid_names(dir):
            if pending is not None:
                if secid not in pending:
                    continue
                pending.remove(secid)
            data[secid] = download_info_from_s3(dir, secid)
            data[secid]['data_frame'] = download_data_frame_from_s3(dir, secid)
    except Exception as e:
        error_message = f"Неизвестная ошибка: {str(e)}"
        error_context = traceback.format_exc()
        print(f"{error_message}\nКонтекст ошибки:\n{error_context}")
    # Anything left was requested but never matched a stored name.
    if pending:
        print(f'Не нашли {pending}')
    return data

# data_frames = download_data_from_s3('preprocessed_data/')

Collecting boto3
  Downloading boto3-1.38.13-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.39.0,>=1.38.13 (from boto3)
  Downloading botocore-1.38.13-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.13.0,>=0.12.0 (from boto3)
  Downloading s3transfer-0.12.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.38.13-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.38.13-py3-none-any.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.12.0-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.8/84.8 kB[0m [31m4.1 MB/s[0m eta [36m0:0

In [6]:
# Edit the dates with care: the loop walks backwards one day at a time from
# `date` down to `start_date` (both inclusive) and rewrites the stored
# per-security news summaries for every day it visits.
# Another range that was used here: start_date 2024-01-11, date 2024-02-14.
date = datetime.strptime('2022-04-25', '%Y-%m-%d').date()
delta = timedelta(days=1)
start_date = datetime.strptime('2022-04-19', '%Y-%m-%d').date()
while date >= start_date:
    print(f"День: {date}")
    # The S3 helpers read the module-level BUCKET, so point it at the news
    # bucket before fetching the day's rated news.
    BUCKET = 'rated-russian-news'
    data = json.loads(download_object_from_s3(f'ria/{date.year}/{date}.pkl'))
    for secid, info in data.items():
        # 'count' is skipped -- presumably an aggregate entry rather than a
        # security id; confirm against the producer of these files.
        if secid == 'count':
            continue
        print(secid)
        assessments = info['assessments']
        importances = sorted(assessments, key=lambda x: x['importance'])
        if not importances:
            # No assessments for this security on this day: there is no
            # min/max/mean to record, and indexing below would fail.
            continue

        # Switch to the quotes bucket, where the per-security summary lives.
        BUCKET = 'russian-stocks-quotes'
        key = f'preprocessed_data/secids/{secid}/news_info.pkl'
        try:
            news_file_info = json.loads(download_object_from_s3(key))
        except ClientError as error:
            # Only a missing object means "start a new history". Any other
            # failure must propagate: falling back to {} here would make the
            # upload below overwrite the existing history with a single day.
            if error.response.get('Error', {}).get('Code') not in ('NoSuchKey', '404'):
                raise
            news_file_info = {}

        lowest = importances[0]
        highest = importances[-1]
        news_file_info[str(date)] = {
            # Extremes are clamped to [-1, 1] on their own side only.
            'min_importance': lowest['importance'] if float(lowest['importance']) > -1 else -1,
            # The upper clamp has to test the largest item, not the smallest.
            'max_importance': highest['importance'] if float(highest['importance']) < 1 else 1,
            'min_importance_time': lowest['time'],
            'max_importance_time': highest['time'],
            'count_news': len(importances),
            'mean_importance': sum(np.clip([i['importance'] for i in importances], -1, 1)) / len(importances)
        }
        upload_object_to_s3(key, json.dumps(news_file_info))

    date -= delta

День: 2022-04-25
Успешно получен из rated-russian-news/ria/2022/2022-04-25.pkl
GAZP
Успешно получен из russian-stocks-quotes/preprocessed_data/secids/GAZP/news_info.pkl
Успешно сохранен в russian-stocks-quotes/preprocessed_data/secids/GAZP/news_info.pkl
LKOH
Успешно получен из russian-stocks-quotes/preprocessed_data/secids/LKOH/news_info.pkl
Успешно сохранен в russian-stocks-quotes/preprocessed_data/secids/LKOH/news_info.pkl
ROSN
Успешно получен из russian-stocks-quotes/preprocessed_data/secids/ROSN/news_info.pkl
Успешно сохранен в russian-stocks-quotes/preprocessed_data/secids/ROSN/news_info.pkl
SBER
Успешно получен из russian-stocks-quotes/preprocessed_data/secids/SBER/news_info.pkl
Успешно сохранен в russian-stocks-quotes/preprocessed_data/secids/SBER/news_info.pkl
VTBR
Успешно получен из russian-stocks-quotes/preprocessed_data/secids/VTBR/news_info.pkl
Успешно сохранен в russian-stocks-quotes/preprocessed_data/secids/VTBR/news_info.pkl
AFLT
Успешно получен из russian-stocks-quotes/