**Вычисление основных метрик для Индексов и Компаний**

In [25]:
import pandas as pd
import numpy as np


In [13]:
# CSV file for each market index, keyed by the label attached to its rows.
_index_files = {
    "IMOEX": "imoex_data_new.csv",
    "MOEXFN": "moexfn_data_2017_2025.csv",
    "OilGas": "oilgas_data_2017_2025.csv",
    "RTSI": "rtsi_data_2017_2025.csv",
}

# CSV file for each company, keyed the same way.
_company_files = {
    "Gazprom": "gazp_data_2017_2025.csv",
    "Lukoil": "lukoyl_data_2017_2025.csv",
    "Magnit": "magnit_data_2017_2025.csv",
    "Sberbank": "sber_data_2017_2025.csv",
    "Yandex": "yndx_data_2017_2025.csv",
}

# Combined lookup label -> CSV path (indices first, then companies).
file_paths = {**_index_files, **_company_files}

# Label lists, in the same order as the mappings above.
index_names = list(_index_files)
company_names = list(_company_files)

In [14]:
# Per-instrument preprocessing
def process_data(path, label):
    """Load one price CSV and derive daily return and rolling volatility.

    Args:
        path: CSV file with at least the columns ``begin`` (timestamp),
            ``close`` and ``value``.
        label: Name stored in the ``label`` column of every returned row.

    Returns:
        DataFrame indexed by ``date`` (parsed from ``begin``, ascending) with
        the columns ``return`` (log return of ``close`` versus the previous
        row), ``volatility_5d`` (5-row rolling standard deviation of
        ``return``), ``value`` and ``label``.  Rows with a missing value in
        any of these columns are dropped, which removes the warm-up rows of
        the rolling window.
    """
    raw = pd.read_csv(path)
    # Parse the timestamps, index by them and put the rows in time order
    frame = raw.assign(date=pd.to_datetime(raw['begin'])).set_index('date').sort_index()

    previous_close = frame['close'].shift(1)
    frame['return'] = np.log(frame['close'] / previous_close)  # log return
    frame['volatility_5d'] = frame['return'].rolling(window=5).std()  # 5-row rolling std
    frame['label'] = label

    wanted = ['return', 'volatility_5d', 'value', 'label']
    return frame[wanted].dropna()

In [15]:
# Build one processed frame per index and per company, in the order of the
# name lists; each frame carries its own name in the `label` column.
all_indices = [process_data(file_paths[name], name) for name in index_names]
all_companies = [process_data(file_paths[name], name) for name in company_names]



In [16]:
# Stack the per-instrument frames row-wise into one long table per group
combined_indices = pd.concat(all_indices, axis=0)
combined_companies = pd.concat(all_companies, axis=0)


In [17]:
# Save both combined tables; the date index is written out as a regular column
combined_indices.to_csv("combined_indices.csv")
combined_companies.to_csv("combined_companies.csv")

In [18]:
df=pd.read_csv("combined_indices.csv")

In [19]:
df.head()

Unnamed: 0,date,return,volatility_5d,value,label
0,2017-01-10,0.011797,0.011468,36020860000.0,IMOEX
1,2017-01-11,-0.008474,0.011363,36461560000.0,IMOEX
2,2017-01-12,-0.002984,0.007531,38818420000.0,IMOEX
3,2017-01-13,-0.007628,0.008141,42466050000.0,IMOEX
4,2017-01-16,-0.002792,0.008148,23936260000.0,IMOEX


In [21]:
df['label'].unique()

array(['IMOEX', 'MOEXFN', 'OilGas', 'RTSI'], dtype=object)

**Категоризация и удаление дубликатов**

In [40]:
df = pd.read_csv('bbc_news_2017_2025_filtered.csv')

In [42]:
df['link'].isnull().unique()

array([False])

In [44]:
import pandas as pd
import re
from urllib.parse import urlparse

df = pd.read_csv('bbc_news_2017_2025_filtered.csv')


def _link_keywords(url):
    """Return the purely alphabetic pieces of the URL path, joined by spaces.

    The path is split on '/' and '-'; pieces that are not made only of
    letters (numeric article ids, empty pieces) are discarded.
    """
    pieces = re.split(r'[/\-]', urlparse(url).path)
    return ' '.join(piece for piece in pieces if piece.isalpha())


df['link_keywords'] = df['link'].apply(_link_keywords)

print(df[['link', 'link_keywords']].head(10))


                                                link             link_keywords
0       http://www.bbc.co.uk/sport/football/38696547            sport football
1       http://www.bbc.co.uk/sport/football/38632703            sport football
2  http://www.bbc.co.uk/news/world-latin-america-...  news world latin america
3         http://www.bbc.co.uk/sport/tennis/38704836              sport tennis
4    http://www.bbc.co.uk/news/world-europe-38703914         news world europe
5   http://www.bbc.co.uk/sport/horse-racing/38694316        sport horse racing
6       http://www.bbc.co.uk/sport/football/38707859            sport football
7              http://www.bbc.co.uk/news/uk-38704598                   news uk
8  http://www.bbc.co.uk/news/uk-northern-ireland-...  news uk northern ireland
9         http://www.bbc.co.uk/sport/tennis/38702928              sport tennis


In [45]:
len(df['link_keywords'].unique())

497

In [30]:
# Save the DataFrame, including the new link_keywords column, to a new CSV
df.to_csv('bbc_news_with_keywords.csv', index=False)

In [46]:
import pandas as pd

# Load the news with link keywords and keep only rows that actually have
# keywords (presumably rows whose keyword string was empty when saved come
# back as missing values — TODO confirm).
df = pd.read_csv("bbc_news_with_keywords.csv").dropna(subset=["link_keywords"])

# Keywords that assign a news item to a category.  Categories are checked in
# the order listed here and the first category with a matching keyword wins.
category_keywords = {
    "Спортивные": ["sport", "football", "tennis", "cricket", "rugby", "golf"],
    "Политические": ["politics", "election", "government", "parliament"],
    "Бизнес": ["business", "economy", "company", "market"],
    "Финансовые": ["finance", "bank", "currency", "stock", "trade"],
    "Технологические": ["tech", "technology", "internet", "digital", "ai"],
    "Мировые новости": ["world", "europe", "asia", "africa", "latin-america", "middle-east"],
    "Локальные UK-новости": ["uk", "england", "wales", "scotland", "northern ireland"],
    "Культура и искусство": ["entertainment", "arts", "magazine"],
    "Наука и природа": ["science", "environment"],
    "Здравоохранение": ["health"],
    "Образование": ["education"],
    "Погода и окружающая среда": ["weather"],
    "Медиа и СМИ": ["articles", "videos", "blogs", "newsbeat", "stories"]
}


def _as_phrase(keyword):
    """Return *keyword* with hyphens and runs of whitespace turned into single spaces."""
    return " ".join(keyword.replace("-", " ").split())


# Space-padded search phrases per category, built once from category_keywords.
# The padding makes a substring test match whole words only.
_category_phrases = {
    category: [f" {_as_phrase(keyword)} " for keyword in keywords]
    for category, keywords in category_keywords.items()
}


def categorize(link_keywords):
    """Return the first category whose keyword occurs in *link_keywords*.

    Args:
        link_keywords: Space-separated words (case-insensitive).

    Returns:
        The matching category name, or "Прочее" when no keyword matches.

    Keywords are matched as whole words.  Multi-word keywords such as
    "northern ireland", and hyphenated ones such as "latin-america" (treated
    as "latin america"), are matched as a run of consecutive words.  Comparing
    single tokens against the keyword list, as was done previously, could
    never match these entries.
    """
    padded = " " + " ".join(link_keywords.lower().split()) + " "
    for category, phrases in _category_phrases.items():
        if any(phrase in padded for phrase in phrases):
            return category
    return "Прочее"

# Assign a category to every row from its link keywords
df["category"] = df["link_keywords"].apply(categorize)



In [47]:
df['category'].unique()

array(['Спортивные', 'Мировые новости', 'Локальные UK-новости',
       'Технологические', 'Образование', 'Прочее', 'Бизнес',
       'Медиа и СМИ', 'Культура и искусство', 'Наука и природа',
       'Политические', 'Здравоохранение', 'Погода и окружающая среда'],
      dtype=object)

In [48]:
df[df['category'] == 'Образование']

Unnamed: 0,title,published_date,description,link,link_keywords,category
18,Meet the mum with quadruplet toddlers - BBC News,2017-01-21,Meet the mum to quadruplets who went viral aft...,http://www.bbc.co.uk/news/education-38690621,news education,Образование
55,"Could tuition fees really cost £54,000? - BBC ...",2017-01-21,The headline cost of increased fees might be £...,http://www.bbc.co.uk/news/education-38651059,news education,Образование
408,"Could tuition fees really cost £54,000? - BBC ...",2017-01-22,The headline cost of increased fees might be £...,http://www.bbc.co.uk/news/education-38651059,news education,Образование
517,How schools promote pupils' mental wellbeing -...,2017-01-10,Schools have long been are at the front line w...,http://www.bbc.co.uk/news/education-38571628,news education,Образование
876,'Pink girly toys don't deter women from engine...,2017-01-27,Meet the Sellafield engineer who says playing ...,http://www.bbc.co.uk/news/education-38760602,news education,Образование
...,...,...,...,...,...,...
122414,"Rise in school sexism down to phones, says uni...",2024-04-04,Teachers fear sexism is on the rise in schools...,http://www.bbc.co.uk/news/education-68731795,news education,Образование
122532,"Rise in school sexism down to phones, says uni...",2024-04-05,Teachers fear sexism is on the rise in schools...,http://www.bbc.co.uk/news/education-68731795,news education,Образование
122954,Children living near Sure Start centres did be...,2024-04-09,Children who grew up close to a centre achieve...,http://www.bbc.co.uk/news/education-68763942,news education,Образование
123495,EHCP: Councils missing education plan deadline...,2024-05-17,Many councils in England are failing to meet l...,http://www.bbc.co.uk/news/education-68668602,news education,Образование


In [49]:
len(df)

149169

In [50]:
df.drop_duplicates()

Unnamed: 0,title,published_date,description,link,link_keywords,category
0,Saido Berahino: Stoke complete deal to sign We...,2017-01-21,Stoke sign West Brom striker Saido Berahino fo...,http://www.bbc.co.uk/sport/football/38696547,sport football,Спортивные
1,Lawro's Premier League predictions v Split sta...,2017-01-21,BBC football expert Mark Lawrenson takes on ac...,http://www.bbc.co.uk/sport/football/38632703,sport football,Спортивные
2,Chapecoense plane: Footballer Neto dreamt of c...,2017-01-21,Chapecoense football club player Neto is one o...,http://www.bbc.co.uk/news/world-latin-america-...,news world latin america,Мировые новости
3,Australian Open: Johanna Konta praises support...,2017-01-21,Great Britain's Johanna Konta says her family ...,http://www.bbc.co.uk/sport/tennis/38704836,sport tennis,Спортивные
4,Friends' 30-year-search for Celtic treasure tr...,2017-01-21,Two metal detector enthusiasts found a huge ho...,http://www.bbc.co.uk/news/world-europe-38703914,news world europe,Мировые новости
...,...,...,...,...,...,...
149173,What will anger at sight of gaunt hostages mea...,2025-02-09,Concerns were raised over the condition of fre...,http://www.bbc.co.uk/news/articles/clyz7124dppo,news articles,Медиа и СМИ
149174,Kevin Holding: Family of man killed by pop-up ...,2025-02-09,The family of Kevin Holding fear they may neve...,http://www.bbc.co.uk/news/articles/cp8q5mpqrvqo,news articles,Медиа и СМИ
149175,Parents suing TikTok over children's deaths sa...,2025-02-09,Four families who say their children died afte...,http://www.bbc.co.uk/news/articles/c20pyn55v79o,news articles,Медиа и СМИ
149176,Trump trade war: How Japan shaped Trump's pers...,2025-02-09,"As a young real estate developer in New York, ...",http://www.bbc.co.uk/news/articles/c4gp5pw654lo,news articles,Медиа и СМИ


In [54]:
# Parse the publication dates, then order rows chronologically (ascending)
# and renumber them from 0.
df["published_date"] = pd.to_datetime(df["published_date"])
df = df.sort_values(by="published_date").reset_index(drop=True)

In [56]:
df.head()

Unnamed: 0,title,published_date,description,link,link_keywords,category
0,Manu Tuilagi out of England training camp afte...,2017-01-01,Manu Tuilagi is withdrawn from England's two-d...,http://www.bbc.co.uk/sport/rugby-union/38485970,sport rugby union,Спортивные
1,'Hollywood' sign changed to 'Hollyweed' in new...,2017-01-01,A prankster changes the world-famous Hollywood...,http://www.bbc.co.uk/news/world-us-canada-3848...,news world us canada,Мировые новости
2,How are Australia's Syrian refugees coping? - ...,2017-01-01,Australia has so far resettled about half of t...,http://www.bbc.co.uk/news/world-australia-3830...,news world australia,Мировые новости
3,How a dead gorilla became the meme of 2016 - B...,2017-01-01,After Harambe was shot in a sad incident in Ci...,http://www.bbc.co.uk/news/blogs-trending-38383126,news blogs trending,Медиа и СМИ
4,Reflections on Africa - BBC News,2017-01-01,The BBC's Southern Africa correspondent looks ...,http://www.bbc.co.uk/news/world-africa-38393661,news world africa,Мировые новости


In [60]:
df[df['category'] == 'Образование']

Unnamed: 0,title,published_date,description,link,link_keywords,category
543,How schools promote pupils' mental wellbeing -...,2017-01-10,Schools have long been are at the front line w...,http://www.bbc.co.uk/news/education-38571628,news education,Образование
642,Oxford academics warning of Brexit 'disaster' ...,2017-01-11,"A ""hard Brexit"" would be the ""biggest disaster...",http://www.bbc.co.uk/news/education-38587765,news education,Образование
650,How schools promote pupils' mental wellbeing -...,2017-01-11,Schools have long been are at the front line w...,http://www.bbc.co.uk/news/education-38571628,news education,Образование
721,What does post-truth mean for a philosopher? -...,2017-01-12,How are philosophers meant to make sense of th...,http://www.bbc.co.uk/news/education-38557838,news education,Образование
1229,Meet the mum with quadruplet toddlers - BBC News,2017-01-20,Meet the mum to quadruplets who went viral aft...,http://www.bbc.co.uk/news/education-38690621,news education,Образование
...,...,...,...,...,...,...
122302,Childcare shortage worsens as costs rise – rep...,2024-04-19,Fewer than a third of councils in all areas ha...,http://www.bbc.co.uk/news/education-68580918,news education,Образование
122306,"DfE says 85,000 more free childcare places nee...",2024-04-19,The Department for Education estimates about 4...,http://www.bbc.co.uk/news/education-68848936,news education,Образование
122796,Childcare shortage worsens as costs rise – rep...,2024-04-24,Fewer than a third of councils in all areas ha...,http://www.bbc.co.uk/news/education-68580918,news education,Образование
124043,EHCP: Councils missing education plan deadline...,2024-05-17,Many councils in England are failing to meet l...,http://www.bbc.co.uk/news/education-68668602,news education,Образование


In [70]:
# Keep a single row per title (the same article appears on several dates) and
# renumber the rows; with rows sorted by date, the first occurrence kept by
# drop_duplicates is the earliest one.
df_drop = df.drop_duplicates(subset=["title"]).reset_index(drop=True)

In [72]:
df_drop

Unnamed: 0,title,published_date,description,link,link_keywords,category
0,Manu Tuilagi out of England training camp afte...,2017-01-01,Manu Tuilagi is withdrawn from England's two-d...,http://www.bbc.co.uk/sport/rugby-union/38485970,sport rugby union,Спортивные
1,'Hollywood' sign changed to 'Hollyweed' in new...,2017-01-01,A prankster changes the world-famous Hollywood...,http://www.bbc.co.uk/news/world-us-canada-3848...,news world us canada,Мировые новости
2,How are Australia's Syrian refugees coping? - ...,2017-01-01,Australia has so far resettled about half of t...,http://www.bbc.co.uk/news/world-australia-3830...,news world australia,Мировые новости
3,How a dead gorilla became the meme of 2016 - B...,2017-01-01,After Harambe was shot in a sad incident in Ci...,http://www.bbc.co.uk/news/blogs-trending-38383126,news blogs trending,Медиа и СМИ
4,Reflections on Africa - BBC News,2017-01-01,The BBC's Southern Africa correspondent looks ...,http://www.bbc.co.uk/news/world-africa-38393661,news world africa,Мировые новости
...,...,...,...,...,...,...
104718,Manchester United: Andre Onana analysis after ...,2025-02-27,Manchester United's Andre Onana is in the spot...,http://www.bbc.co.uk/sport/football/articles/c...,sport football articles,Спортивные
104719,Emma Raducanu will play Indian Wells after Dub...,2025-02-27,Britain's Emma Raducanu will travel to the WTA...,http://www.bbc.co.uk/sport/tennis/articles/cy5...,sport tennis articles,Спортивные
104720,The Today Podcast - What will Starmer’s defenc...,2025-02-27,Plus: Amol & Nick talk tactics over PM-Preside...,http://www.bbc.co.uk/sounds/play/m0028bpw,sounds play,Прочее
104721,Andrew Tate and brother Tristan arrive in US a...,2025-02-27,"The pair, facing trial for human trafficking, ...",http://www.bbc.co.uk/news/articles/cpq222rqv4po,news articles,Медиа и СМИ


In [73]:
len(df_drop)

104723

In [75]:
# Save the deduplicated, categorised news to a new CSV
# (the sentiment section below loads this file)
df_drop.to_csv("bbc_news_categorized.csv", index=False)

**Сентимент**

In [80]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon (nltk reports when it is already up to date)
nltk.download("vader_lexicon")

# Load the categorised news
df = pd.read_csv("bbc_news_categorized.csv")

# Combine title + description into one text per article.
# Each column is filled *before* concatenating: a string plus a missing value
# is itself missing, so filling afterwards would blank the whole text (title
# included) for every row where only one of the two fields is absent.
df["text"] = (df["title"].fillna("") + " " + df["description"].fillna("")).str.strip()

# Initialise the analyser
sid = SentimentIntensityAnalyzer()

# Helper returning all VADER scores for a text (not only the compound one)
def get_sentiment_scores(text):
    """Return what ``sid.polarity_scores`` produces for *text*.

    Later cells unpack the result into one column per score and use the
    ``compound`` entry for labelling.
    """
    scores = sid.polarity_scores(text)
    return scores

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/narciss/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [81]:
# Score every text with VADER
df["sentiment_scores"] = df["text"].apply(get_sentiment_scores)

# Expand the score dicts into one column per score.  Building the frame from
# the list of dicts in a single call yields the same columns and index as
# .apply(pd.Series) without creating a Series object for every row, which is
# very slow on a table of this size.
score_columns = pd.DataFrame(df["sentiment_scores"].tolist(), index=df.index)
df = pd.concat([df, score_columns], axis=1)


In [84]:
# Sentiment label derived from the compound score
def categorize_sentiment(compound):
    """Map a compound score to "positive", "negative" or "neutral".

    Scores of 0.05 and above are positive, scores of -0.05 and below are
    negative, everything else (including NaN) is neutral.  The original
    author notes these are VADER's standard cut-offs.
    """
    if compound <= -0.05:
        return "negative"
    if compound >= 0.05:
        return "positive"
    return "neutral"

# Label every article from its compound score
df["Sentiment"] = df["compound"].apply(categorize_sentiment)

In [83]:
# Save the result
df.to_csv("bbc_news_with_sentiment_and_categorized.csv", index=False)

**Объединяем данные для удобства дальнейшего исследования**

In [98]:
import pandas as pd

# Load the per-article table with sentiment and category
df = pd.read_csv("bbc_news_with_sentiment_and_categorized.csv")

# Dates arrive as text from the CSV; parse them so grouping is by real dates
df["published_date"] = pd.to_datetime(df["published_date"])

# Articles per day and sentiment label: one row per day, one count_<label>
# column per label, 0 where a label did not occur that day
sentiment_sizes = df.groupby(["published_date", "Sentiment"]).size()
sentiment_counts = sentiment_sizes.unstack(fill_value=0)
sentiment_counts.columns = [f"count_{label}" for label in sentiment_counts.columns]

# Mean compound score per day
compound_mean = df.groupby("published_date")["compound"].mean()
compound_mean.name = "compound_mean"

# Prevailing sentiment of a day, decided from its mean compound score
def get_dominant(compound):
    """Return "positive" for scores >= 0.05, "negative" for <= -0.05, else "neutral"."""
    if compound >= 0.05:
        return "positive"
    return "negative" if compound <= -0.05 else "neutral"

# Label each day from its mean compound score
dominant_sentiment = compound_mean.apply(get_dominant)
dominant_sentiment.name = "dominant_sentiment"

# Articles per day and thematic category: one count_<category> column per
# category, 0 where a category did not occur that day
category_sizes = df.groupby(["published_date", "category"]).size()
category_counts = category_sizes.unstack(fill_value=0)
category_counts.columns = [f"count_{topic}" for topic in category_counts.columns]

# Total number of articles per day
total_counts = df.groupby("published_date").size()
total_counts.name = "total_news"

# Align all daily pieces on the date index, then turn the date into a column
daily_parts = [sentiment_counts, dominant_sentiment, category_counts, total_counts]
df_summary = pd.concat(daily_parts, axis=1).reset_index()

In [99]:
df_summary.head()

Unnamed: 0,published_date,count_negative,count_neutral,count_positive,dominant_sentiment,count_Бизнес,count_Здравоохранение,count_Культура и искусство,count_Локальные UK-новости,count_Медиа и СМИ,count_Мировые новости,count_Наука и природа,count_Образование,count_Погода и окружающая среда,count_Политические,count_Прочее,count_Спортивные,count_Технологические,total_news
0,2017-01-01,12,10,22,positive,1,1,6,8,3,8,0,0,0,0,2,15,0,44
1,2017-01-02,10,9,18,positive,1,0,4,4,2,8,0,0,0,4,0,13,1,37
2,2017-01-03,16,16,14,neutral,5,0,6,7,1,6,1,0,0,1,1,17,1,46
3,2017-01-04,21,4,13,negative,3,1,2,4,2,11,0,0,0,1,1,12,1,38
4,2017-01-05,21,17,16,negative,3,0,1,9,2,13,0,0,0,2,2,15,7,54


In [103]:
# Save the daily summary to a file
# NOTE(review): the file name is spelled "dayly"; any reader of this file must use the same spelling
df_summary.to_csv("news_summary_dayly.csv", index=False)

In [101]:
len(df_summary)

2609

In [93]:
df_summary.tail()

Unnamed: 0,published_date,count_negative,count_neutral,count_positive,count_Бизнес,count_Здравоохранение,count_Культура и искусство,count_Локальные UK-новости,count_Медиа и СМИ,count_Мировые новости,count_Наука и природа,count_Образование,count_Погода и окружающая среда,count_Политические,count_Прочее,count_Спортивные,count_Технологические,total_news
2604,2025-02-23,22,6,25,0,0,0,0,41,0,0,0,0,0,1,11,0,53
2605,2025-02-24,37,12,18,0,0,0,0,52,0,0,0,0,0,6,9,0,67
2606,2025-02-25,37,7,21,0,0,0,0,49,0,0,0,0,0,3,13,0,65
2607,2025-02-26,38,11,21,0,0,0,0,51,0,0,0,0,0,3,16,0,70
2608,2025-02-27,28,10,22,0,0,0,0,42,0,0,0,0,0,6,12,0,60


In [102]:
df_summary

Unnamed: 0,published_date,count_negative,count_neutral,count_positive,dominant_sentiment,count_Бизнес,count_Здравоохранение,count_Культура и искусство,count_Локальные UK-новости,count_Медиа и СМИ,count_Мировые новости,count_Наука и природа,count_Образование,count_Погода и окружающая среда,count_Политические,count_Прочее,count_Спортивные,count_Технологические,total_news
0,2017-01-01,12,10,22,positive,1,1,6,8,3,8,0,0,0,0,2,15,0,44
1,2017-01-02,10,9,18,positive,1,0,4,4,2,8,0,0,0,4,0,13,1,37
2,2017-01-03,16,16,14,neutral,5,0,6,7,1,6,1,0,0,1,1,17,1,46
3,2017-01-04,21,4,13,negative,3,1,2,4,2,11,0,0,0,1,1,12,1,38
4,2017-01-05,21,17,16,negative,3,0,1,9,2,13,0,0,0,2,2,15,7,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,2025-02-23,22,6,25,neutral,0,0,0,0,41,0,0,0,0,0,1,11,0,53
2605,2025-02-24,37,12,18,negative,0,0,0,0,52,0,0,0,0,0,6,9,0,67
2606,2025-02-25,37,7,21,negative,0,0,0,0,49,0,0,0,0,0,3,13,0,65
2607,2025-02-26,38,11,21,negative,0,0,0,0,51,0,0,0,0,0,3,16,0,70
