# **Importing all the required libraries.**

In [1]:
import pandas as pd

from transformers import pipeline

pipe = pipeline("text-classification", model="ProsusAI/finbert")

import yfinance as yf

Device set to use cpu


# **Loading the news data and cleaning it.**

In [5]:
#reading the news and cleaning the data
news = pd.read_csv("news.csv")
news['date'] = pd.to_datetime(news['date'], errors='coerce', utc=True)
news = news.dropna(subset=['date'])
news['date'] = news['date'].dt.normalize()
news['date'] = pd.to_datetime(news['date']).dt.tz_localize(None)


#finding all unique tickers of stocks
tickers = news['stock'].unique()

# **Loading the financial data and mapping the news dates.**

In [23]:
# news data for a particular stock.
def particularNews(ticker):
    news_data = news[news['stock'] == ticker]
    news_data = news_data.iloc[::-1]
    return news_data

# stocks data for a particular stock.
def particularStock(ticker):
    stock_data = yf.download(tickers=ticker, start='2009-01-01', end='2020-07-31', interval='1d')
    return stock_data

# mapping the news dates to the next trading day as the effect will be on the next trading day.
def mapDates(news_data, stock_data):
    trading_days = pd.Series(stock_data.index)
    calendar_days = pd.date_range(start=trading_days.min(), end=trading_days.max(), freq='D')

    date_to_next_trading = {}

    for date in calendar_days:
        idx = trading_days.searchsorted(date, side='right')  # get strictly *next* trading day to test the returns for
        if idx < len(trading_days):
            date_to_next_trading[date] = trading_days.iloc[idx]
        else:
            date_to_next_trading[date] = pd.NaT

    mapping_series = pd.Series(date_to_next_trading)

    news_data['date'] = news_data['date'].map(mapping_series)

    return news_data


# **Caculating the sentiment of every news.**

In [56]:
def mapSentiment(result):
    if result['label'] == 'positive':
            return 1
    elif result['label'] == 'negative':
        return -1
    return 0

def sentimentAnalysis(news_data):
    results = pipe(news_data['title'].tolist())
    news_data['sentiment'] = [mapSentiment(r) for r in results]

    return news_data

news_data = particularNews('GOOGL')
news_data = mapDates(news_data, particularStock('GOOGL'))
news_data = sentimentAnalysis(news_data)


  stock_data = yf.download(tickers=ticker, start='2009-01-01', end='2020-07-31', interval='1d')
[*********************100%***********************]  1 of 1 completed


# **Function for calculating the total CAGR.**

In [57]:
def classify(score):
    if score >= 2: return 1
    elif score <= -2: return -1
    else: return 0

def CAGR(ticker):
    # news_data = particularNews(ticker)
    stock_data = particularStock(ticker)
    if stock_data.empty:
        return "why"
    # news_data = mapDates(news_data, stock_data)
    # news_data = sentimentAnalysis(news_data)

    date_sentiment = news_data.groupby('date')['sentiment'].sum()
    date_sentiment = date_sentiment.apply(classify).reset_index()
    # print((date_sentiment[0]))

    starting_amount = 100

    stock_data['return'] = ((stock_data['Open']-stock_data['Close'])/stock_data['Open'])

    stock_data.columns = ['_'.join(col) for col in stock_data.columns.to_flat_index()]
    date_sentiment.rename(columns={'date':'Date'}, inplace = True)
    print(date_sentiment)
    print(stock_data)
    data = pd.merge(date_sentiment, stock_data, on='Date')

    # print(data)
    trade_data = pd.DataFrame({})

    for _, row in data.iterrows():
        if row['sentiment'] == -1:
            starting_amount *= -1*(-1 + row['return_'])
            new_row = {'Date': row['Date'], 'returns': starting_amount}
            trade_data = pd.concat([trade_data, pd.DataFrame([new_row])], ignore_index=True)
        elif row['sentiment'] == 1 :
            starting_amount *= (1 + row['return_'])
            new_row = {'Date': row['Date'], 'returns': starting_amount}
            trade_data = pd.concat([trade_data, pd.DataFrame([new_row])], ignore_index=True)


    print(trade_data)
    return starting_amount

# everything = pd.DataFrame({})
#
# tickers = {"NVDA", "MSFT", "AAPL", "AMZN", "META","	AVGO","GOOGL","	GOOG","NFLX","	COST"}
#
# for ticker in tickers:
#     returns = CAGR(ticker)
#     if returns == 'why':
#         new_row = {"stock": ticker, "return": 'no data'}
#         everything = pd.concat([everything, pd.DataFrame([new_row])], ignore_index=True)
#         continue
#     new_row = {"stock": ticker, "return": returns}
#     everything = pd.concat([everything, pd.DataFrame([new_row])], ignore_index=True)
#
# everything
CAGR('GOOGL')


  stock_data = yf.download(tickers=ticker, start='2009-01-01', end='2020-07-31', interval='1d')
[*********************100%***********************]  1 of 1 completed

          Date  sentiment
0   2018-07-26          0
1   2018-07-27          0
2   2018-07-30          0
3   2018-07-31          0
4   2018-08-01          1
..         ...        ...
423 2020-06-05          0
424 2020-06-08          0
425 2020-06-09          0
426 2020-06-10          0
427 2020-06-11          0

[428 rows x 2 columns]
            Close_GOOGL  High_GOOGL  Low_GOOGL  Open_GOOGL  Volume_GOOGL  \
Date                                                                       
2009-01-02     7.993104    8.005542   7.599568    7.676683     144275580   
2009-01-05     8.160518    8.239872   7.835888    7.985144     195364440   
2009-01-06     8.310021    8.477685   8.119223    8.283155     256750992   
2009-01-07     8.010266    8.231661   7.929172    8.167233     179600220   
2009-01-08     8.089374    8.089374   7.894098    7.917481     143883972   
...                 ...         ...        ...         ...           ...   
2020-07-24    74.960938   75.319785  73.936086   74.5126




108.09682176135465