In [1]:
import warnings

# Silence every warning category for the rest of the session. This also hides
# deprecation notices raised by the libraries used in the cells below.
warnings.filterwarnings("ignore")

In [2]:
from pathlib import Path

# Relative folder the raw Excel workbook is read from.
raw_data_folder: Path = Path("00_raw/")
# Relative folder that holds dataset.csv (read below) and receives sentiment.csv.
prod_data_folder: Path = Path("30_prod/")

In [3]:
import pandas as pd

# Workbook and sheet holding the daily equal-weighted industry returns.
excel_dataset_name: str = "12_Industry_Portfolios_Daily.xlsx"
equally_weighted_returns_sheet_name: str = "Average Equal Weighted Returns"

df: pd.DataFrame = pd.read_excel(
    raw_data_folder / excel_dataset_name,
    sheet_name=equally_weighted_returns_sheet_name,
)

# The first column arrives unnamed and holds YYYYMMDD dates.
df = df.rename(columns={"Unnamed: 0": "Date"})
df["Date"] = pd.to_datetime(df["Date"], format="%Y%m%d")

# Every column except "Date" is an industry. Note that Index.difference
# returns the labels sorted, so `sectors` is in sorted order, not file order.
raw_sector_columns = df.columns.difference(["Date"])
sectors: list[str] = [label.strip() for label in raw_sector_columns]

# Strip stray whitespace from the headers and tag them as 1-day returns.
df = df.rename(
    columns={label: f"{label.strip()}_Returns_1" for label in raw_sector_columns}
)

# Divide every return column by 100 (the "Date" column is left untouched).
is_return_column = df.columns != "Date"
df.loc[:, is_return_column] = df.loc[:, is_return_column] / 100

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Tokenizer and classifier are loaded from the same checkpoint id so the
# vocabulary always matches the model weights.
finbert_checkpoint: str = 'ProsusAI/finbert'

tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
model = BertForSequenceClassification.from_pretrained(finbert_checkpoint)

def analyze_sentiment(text: str) -> dict:
    """Score a single piece of text with the module-level FinBERT classifier.

    Args:
        text: Text to classify. Input longer than 512 tokens is truncated.

    Returns:
        A dict with the keys "Positive", "Negative" and "Neutral", each
        mapped to the softmax probability of that class as a Python float.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)

    # Inference only: disable autograd so no computation graph is built and
    # retained for every call (this function is invoked once per news item).
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # NOTE(review): the index -> label order (0 positive, 1 negative, 2 neutral)
    # is hard-coded; confirm it against model.config.id2label.
    return {"Positive": probabilities[0, 0].item(),
            "Negative": probabilities[0, 1].item(),
            "Neutral": probabilities[0, 2].item()}

In [5]:
from datetime import date

# Inclusive date range of the study.
starting_date: date = date(2010, 1, 1)
ending_date: date = date(2020, 12, 31)

# Length of the news look-back window; also the refresh interval (in dataset
# rows) used when the sentiment matrix is filled.
backward_days: int = 10

# News search query -> short industry code. The insertion order of this dict
# fixes the column order of the sentiment matrix.
industries: dict[str, str] = {
    "Consumer Non-Durables": "NoDur",
    "Consumer Durables": "Durbl",
    "Manufacturing": "Manuf",
    "Energy": "Enrgy",
    "Chemicals": "Chems",
    "Business Equipment": "BusEq",
    "Telecommunications": "Telcm",
    "Electric, Gas, and Sanitary Services": "Utils",
    "Wholesale, Retail services": "Shops",
    "Healthcare, Medical Equipment and Drugs": "Hlth",
    "Banks, financial institutions": "Money",
    "Mines, Construction, Building Materials, and Transportation": "Other",
}

In [6]:
import pandas as pd

# Load the prepared dataset and parse its ISO-formatted date column.
dataset: pd.DataFrame = pd.read_csv(
    prod_data_folder / "dataset.csv",
    sep=",",
    encoding="UTF-8",
)
dataset["Date"] = pd.to_datetime(dataset["Date"], format="%Y-%m-%d")

# Keep only rows inside the study period (both bounds inclusive) and renumber
# the rows from 0 so that positional lookups via `.at[i, ...]` work later on.
inside_study_period = dataset["Date"].between(
    pd.to_datetime(starting_date), pd.to_datetime(ending_date)
)
dataset = dataset.loc[inside_study_period].reset_index(drop=True)

In [7]:
from gnews import GNews
from datetime import date
from statistics import mean
from numpy.typing import NDArray

import numpy as np

# Sentiment matrix: one row per dataset row, one column per entry of
# `industries`. The width is taken from `industries` because that is the
# mapping the columns are filled (and later named) from.
industries_sentiment: NDArray = np.zeros((dataset.shape[0], len(industries)))

# First news window: the `backward_days` calendar days ending on the first
# date of the dataset.
b: date = dataset.at[0, "Date"]
a: date = b - pd.Timedelta(days=backward_days)

google_news_client = GNews(language='en',
                           country='US',
                           start_date=(a.year, a.month, a.day),
                           end_date=(b.year, b.month, b.day),
                           max_results=35)

# Fill row 0: for each industry, query the news and average the "Positive"
# probability over the items that are not predominantly neutral.
for j, industry_query in enumerate(industries):
    list_news: list = [news.get("description", "") for news in google_news_client.get_news(key=industry_query)]

    positive_average: list = []

    for n in list_news:
        probs: dict = analyze_sentiment(n)
        if probs.get("Neutral", 1) < 0.5:
            positive_average.append(probs.get("Positive", 0.0))

    # When no item passes the neutrality filter the score falls back to 0.0.
    industries_sentiment[0, j] = mean(positive_average) if positive_average else 0.0

In [None]:
# Latest computed sentiment row, carried forward between refreshes. Copied so
# it never aliases the matrix it is written back into.
last_sentiment_vector = industries_sentiment[0, :].copy()

for i in range(1, dataset.shape[0]):
    # Between refresh points, repeat the most recent sentiment row.
    if i % backward_days != 0:
        industries_sentiment[i, :] = last_sentiment_vector
        continue

    # Refresh point. The window ends on the previous row's date and starts
    # where the previous window ended (row 0 for the first refresh). Both
    # bounds are read from `dataset` instead of being chained through the
    # variable `b` left over from an earlier cell, so re-running this cell
    # always produces the same windows.
    a = dataset.at[max(i - backward_days - 1, 0), "Date"]
    b = dataset.at[i - 1, "Date"]

    print(a, b)

    # NOTE(review): the first window is fetched with max_results=35 while the
    # refreshes use 20; confirm the difference is intended.
    google_news_client = GNews(language='en',
                               country='US',
                               start_date=(a.year, a.month, a.day),
                               end_date=(b.year, b.month, b.day),
                               max_results=20)

    for j, industry_query in enumerate(industries):
        list_news: list = [news.get("description", "") for news in google_news_client.get_news(key=industry_query)]

        positive_average: list = []

        for n in list_news:
            probs: dict = analyze_sentiment(n)
            if probs.get("Neutral", 1) < 0.5:
                positive_average.append(probs.get("Positive", 0.0))

        # When no item passes the neutrality filter the score falls back to 0.0.
        industries_sentiment[i, j] = mean(positive_average) if positive_average else 0.0

    last_sentiment_vector = industries_sentiment[i, :].copy()

In [19]:
# Column headers follow the industry codes, in the same order as the matrix columns.
industries_sentiment_columns: list[str] = [
    f"{code}_Positive_Sentiment" for code in industries.values()
]

# Wrap the matrix in a frame and replace the positional labels 0..N-1 with
# the headers built above.
dataframe_sentiment_analysis = pd.DataFrame(industries_sentiment).rename(
    columns=dict(enumerate(industries_sentiment_columns))
)

# Put the dataset's dates in front as the first column.
dataframe_sentiment_analysis.insert(0, "Date", dataset["Date"])

# Persist next to the other production files, without the row index.
dataframe_sentiment_analysis.to_csv(
    path_or_buf=prod_data_folder / "sentiment.csv",
    sep=",",
    encoding="UTF-8",
    index=False,
)