# **Importing all the required libraries.**

In [1]:
import pandas as pd

from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert")

import yfinance as yf

import warnings
warnings.simplefilter("ignore", FutureWarning)

Device set to use cpu


# **Loading the news data and cleaning it.**

In [100]:
#reading the news and cleaning the data
news = pd.read_csv("data/news.csv")
news['date'] = pd.to_datetime(news['date'], errors='coerce', utc=True)
news = news.dropna(subset=['date'])
news['date'] = news['date'].dt.normalize()
news['date'] = pd.to_datetime(news['date']).dt.tz_localize(None)


#finding all unique tickers of stocks
tickers = news['stock'].unique()

# **Loading the financial data and mapping the news dates.**

In [3]:
#dates
start_date = '2009-01-01'
end_date = '2020-07-31'

# news data for a particular stock.
def particularNews(ticker):
    news_data = news[news['stock'] == ticker]
    news_data = news_data.iloc[::-1]
    return news_data

# stocks data for a particular stock.
def particularStock(ticker):
    stock_data = yf.download(tickers=ticker, start= start_date, end=end_date, interval='1d')
    return stock_data

# mapping the news dates to the next trading day as the effect will be on the next trading day.
def mapDates(news_data, stock_data):
    trading_days = pd.Series(stock_data.index)
    calendar_days = pd.date_range(start=trading_days.min(), end=trading_days.max(), freq='D')

    date_to_next_trading = {}

    for date in calendar_days:
        idx = trading_days.searchsorted(date, side='right')  # get strictly *next* trading day to test the returns for
        if idx < len(trading_days):
            date_to_next_trading[date] = trading_days.iloc[idx]
        else:
            date_to_next_trading[date] = pd.NaT

    mapping_series = pd.Series(date_to_next_trading)

    news_data['date'] = news_data['date'].map(mapping_series)

    return news_data


# **Caculating the sentiment of every news.**

In [36]:
def mapSentiment(result):
    if result['label'] == 'positive':
            return 1
    elif result['label'] == 'negative':
        return -1
    return 0

def sentimentAnalysis(news_data):
    results = pipe(news_data['title'].tolist())
    news_data['sentiment'] = [mapSentiment(r) for r in results]

    return news_data


def totalNews(ticker, stock_data = 'nothing'):
    news_data = particularNews(ticker)
    # news_data = mapDates(news_data, particularStock(ticker))
    news_data = mapDates(news_data, stock_data)
    news_data = sentimentAnalysis(news_data)
    return news_data

# news_data = totalNews('AVGO')

# **Function for calculating the total CAGR.**

In [97]:
def calculate(data, ticker):
    money = 100
    # trade_data = pd.DataFrame({})

    # standing_trade = False
    # amount = 0

    # row[f'Open_{ticker}'] < row['EMA_']

    i = 0
    invested = True
    last_price = data.iloc[0][f'Open_{ticker}']

    for _, row in data.iterrows():
        if i == 3:
            last_price = row[f'Open_{ticker}']
            invested = True

        if row['sentiment'] == -1 and row[f'5MA_'] < row['20MA_'] and invested:
            # money *= row[f'Open_{ticker}']/last_price
            # last_price = row[f'Close_{ticker}']
            if invested:
                invested = False
                money *= row[f'Open_{ticker}']/last_price
                i = 0
            else:
                i = 0

        i += 1

    if invested:
        money *= data.iloc[-1][f'Close_{ticker}']/last_price


    # data analysis
    number_of_years = (data['Date'].max() - data['Date'].min()).days / 365
    cagr = (((money/100)**(1/number_of_years)) -1 ) * 100
    simple_cagr = (((data.iloc[-1][f'Close_{ticker}']/data.iloc[0][f'Close_{ticker}'])**(1/number_of_years)) -1) * 100

    rtrn = pd.DataFrame({"CAGR using strategy": cagr, "CAGR without strategy": simple_cagr, "Company": ticker, "Number of years": number_of_years}, index=[0])

    return rtrn


# configuring data
def config(data, ticker):
    # Data date and time configurations
    data['Date'] = pd.to_datetime(data['Date'], errors='coerce', utc=True)
    data = data.dropna(subset=['Date'])
    data['Date'] = data['Date'].dt.normalize()
    data['Date'] = pd.to_datetime(data['Date']).dt.tz_localize(None)


    data['20MA_'] = data[f'Close_{ticker}'].ewm(span = 20, adjust=False, min_periods=1).mean()
    data['20MA_'] = data['20MA_'].shift(-1)
    data['5MA_'] = data[f'Close_{ticker}'].ewm(span = 5, adjust=False, min_periods=1).mean()
    data['5MA_'] = data['5MA_'].shift(-1)

    return data


# this means that sentiment is should be really negative
def classify(score):
    if score >= 2: return 1
    elif score <= -2: return -1
    else: return 0


def CAGR(ticker):
    stock_data = particularStock(ticker)
    news_data = totalNews(ticker, stock_data)

    # if no stock data on yfinance
    if stock_data.empty: return "why"

    # aggregate sentiment for a particular date
    date_sentiment = news_data.groupby('date')['sentiment'].sum()
    date_sentiment = date_sentiment.apply(classify).reset_index()

    # rolling average
    stock_data['return'] = ((stock_data['Open']-stock_data['Close'])/stock_data['Open'])
    stock_data['EMA'] = stock_data['Close'].rolling(window=20).mean()

    # just flattening the data
    stock_data.columns = ['_'.join(col) for col in stock_data.columns.to_flat_index()]
    date_sentiment.rename(columns={'date':'Date'}, inplace = True)

    # union of news and stock data
    data = pd.merge(date_sentiment, stock_data, on='Date', how='outer')

    # data = pd.read_csv(f'data/{ticker}_data.csv')

    data = config(data, ticker)

    return calculate(data, ticker)


# companies with the largest market cap and at least 10 years of news data
companies = pd.read_csv('data/companies.csv')
returns = pd.DataFrame()

for _,company in companies.iterrows():
    rtrn = CAGR(company['Symbol'])
    returns = pd.concat([returns, rtrn], ignore_index=True)

print(returns)

   CAGR using strategy  CAGR without strategy Company  Number of years
0            39.678748              33.639455    AVGO        10.989041
1            27.640487              26.269882     TSM        11.580822
2            15.123622              16.240333     LLY        11.580822
3            32.865049              30.520303      MA        11.580822
4            23.746840              26.132495      HD        11.580822
5            33.992310              33.398024    ASML        11.580822
6            19.753022              19.672491     NVO        11.580822
7            10.530753               9.883191      KO        11.580822
8             9.398106               9.792891     NVS        11.580822
9             5.771306              11.471818      MS        11.580822


# **Testing Code.**

In [99]:
# starting_amount *= row[f'Open_{ticker}']/amount
            # new_row = {'Date': row['Date'], 'returns': starting_amount}
            # trade_data = pd.concat([trade_data, pd.DataFrame([new_row])], ignore_index=True)
        # if row['sentiment'] == 1 and row[f'Open_{ticker}'] > row['EMA_']:
        #     starting_amount *= (1 + row['return_'])
            # standing_trade = True
            # amount = row[f'Open_{ticker}']
            # new_row = {'Date': row['Date'], 'returns': starting_amount}
            # trade_data = pd.concat([trade_data, pd.DataFrame([new_row])], ignore_index=True)

    #closing any standing trades
    # if standing_trade:
    #     starting_amount *= data.iloc[-1][f'Close_{ticker}']/amount
# returns.to_csv("returns.csv", index=False)