In [2]:
from datetime import datetime, timedelta
from src.collect_price_data import collect_price_data
from src.format_price_data import format_price_data
from src.collect_sentiment_data import collect_sentiment_data
from src.preprocess_news_data import preprocess_news_data
from src.sentiment_analysis import perform_sentiment_analysis
from src.sentiment_summary import create_sentiment_summary
from src.calculate_technical_indicators import calculate_technical_indicators

# The retrieval window is anchored on the moment this cell runs, so re-running
# the notebook on a different day requests a different date range.
current_date = datetime.now()

# The window ends yesterday
yesterday_date = current_date - timedelta(days=1)

# The window starts `years_ago` years back. A year is approximated as 365 days,
# so leap days are not accounted for.
years_ago = 4
window_start_date = current_date - timedelta(days=365 * years_ago)

# Format both bounds as 'YYYY-MM-DD' strings for the data retrieval
start_date = window_start_date.strftime('%Y-%m-%d')
end_date = yesterday_date.strftime('%Y-%m-%d')

# Time period for historical data: (start date, end date)
time_period = (start_date, end_date)

# List of stock tickers for analysis
tickers = ["AAPL", "META", "JPM", "JNJ", "AMT"]

# Load historical price data for the specified tickers and time period
price_data = collect_price_data(tickers, time_period)

# Format the raw price data, then drop every row that has a NaN in any column
formatted_price_data = format_price_data(price_data).dropna(axis=0, how='any')
formatted_price_data.head()


Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap,symbol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-10-14 08:00:00+00:00,236.4,236.65,236.02,53,236.65,3161,236.395463,AAPL
2019-10-14 09:00:00+00:00,234.37,236.13,233.8,277,236.13,30631,234.741837,AAPL
2019-10-14 10:00:00+00:00,235.0,235.0,234.16,78,234.41,9244,234.648556,AAPL
2019-10-14 11:00:00+00:00,236.22,236.22,234.72,514,234.72,73252,235.446287,AAPL
2019-10-14 12:00:00+00:00,235.85,236.3,234.9,813,235.818,113045,235.890449,AAPL


### Load News Data

In [3]:
# Load the news data for the specified tickers and time period.
# Note: this step will take approximately 39 minutes.
# NOTE(review): consider caching the result to disk so that a full
# Restart & Run All does not have to repeat the download.
news_data = collect_sentiment_data(tickers, time_period)
news_data.head()

Unnamed: 0,Ticker,Date,Title
0,AAPL,2019-10-13 14:42:18+00:00,Can Netflix Deliver A Hit After Q2 Subscriber ...
1,AAPL,2019-10-14 16:46:22+00:00,A Look At Benzinga Pro's Most-Searched Tickers...
2,AAPL,2019-10-14 15:03:19+00:00,Benzinga Pro's Top 10 Most-Searched Tickers Fo...
3,AAPL,2019-10-14 14:47:35+00:00,Stocks That Hit 52-Week Highs On Monday
4,AAPL,2019-10-14 14:13:59+00:00,UBS On Apple Also Notes 'A stronger sell-throu...


### Preprocess Data

In [17]:
# Preprocess news data for sentiment analysis
preprocessed_data = preprocess_news_data(news_data)

# Display the first rows of the preprocessed data as the cell output
preprocessed_data.head()

#### Perform Sentiment Analysis

In [20]:
# Perform sentiment analysis
# Note: this process will take approximately 34 minutes
sentiment_df = perform_sentiment_analysis(preprocessed_data)

# Show the first rows as the cell's rich output instead of a plain-text print,
# which wraps wide frames across several column groups
sentiment_df.head()

      Ticker                      Date  \
0       AAPL 2019-10-13 14:42:18+00:00   
1       AAPL 2019-10-14 16:46:22+00:00   
2       AAPL 2019-10-14 15:03:19+00:00   
3       AAPL 2019-10-14 14:47:35+00:00   
4       AAPL 2019-10-14 14:13:59+00:00   
...      ...                       ...   
11366    AMT 2023-08-07 10:46:30+00:00   
11367    AMT 2023-08-22 11:13:20+00:00   
11368    AMT 2023-09-14 13:33:00+00:00   
11369    AMT 2023-09-27 14:31:14+00:00   
11370    AMT 2023-10-04 16:01:17+00:00   

                                                   Title  Sentiment_Score  \
0        netflix deliver hit q subscriber disappointment         0.999713   
1          look benzinga pro mostsearched ticker october         0.949207   
2      benzinga pro top mostsearched ticker morning m...         0.686997   
3                             stock hit week high monday         0.998139   
4      ubs apple also note stronger sellthrough combi...         0.995910   
...                              

#### Exploring Sentiment Label as a Feature

In [24]:
# Tally how many rows carry each value of the sentiment label column
unique_values = sentiment_df.loc[:, 'Sentiment_Label'].value_counts()
print(unique_values)

NEGATIVE    8663
POSITIVE    2708
Name: Sentiment_Label, dtype: int64


##### **Feature Selection Consideration:**
At this stage of the project, we've made a conscious decision not to include the sentiment label (positive or negative) as a feature in our machine learning model. There are several logical reasons for this choice:

**1. Class Imbalance:**

The dataset exhibits a significant class imbalance, with a notably higher count of negative sentiment compared to positive sentiment. Imbalanced data can impact the model's performance and lead to potential biases, which should be addressed. 

**2. Sentiment Label Accuracy:**

The sentiment labels are derived from the analysis of headlines alone, so they may not be entirely accurate. Incorporating potentially inaccurate sentiment labels as features can introduce noise into the model, reducing its reliability. This is also why we do not address the class imbalance described in reason 1 with resampling techniques: resampling would only amplify labels we do not yet trust.

**3. Model Iteration:**

Machine learning projects often involve multiple stages and iterations. We've chosen to prioritize other aspects of model development first and consider refining the inclusion of sentiment features at a later stage when we have more accurate and reliable data.

**4. Feature Engineering:**

Additionally, in the future, we plan to explore performing sentiment analysis on the summaries of articles, which may provide more context and accuracy in sentiment assessment. This aligns with our goal to continuously refine feature engineering for improved model performance.


#### Exploring Sentiment Score as a Feature for Analysis

In [129]:
# Create the sentiment summary used to evaluate statistics of the sentiment
# scores (min, max, mean, sum) as potential features for the model.
# The result is indexed by ticker symbol in the plotting cell that follows
# (e.g. sentiment_summary['META']).
sentiment_summary = create_sentiment_summary(tickers, sentiment_df, formatted_price_data)

In [188]:
import hvplot.pandas
from bokeh.plotting import show

# Interactive line chart of the META sentiment summary, legend pinned top-left
(
    sentiment_summary['META']
    .hvplot.line(
        xlabel='Time',
        ylabel='Statistics',
        title="Sentiment Score Analysis",
        line_width=2,
        alpha=0.7,
        hover_line_color='red',
        width=1000,
        height=500,
    )
    .opts(legend_position='top_left')
)

##### **Exploratory Analysis:**
During our initial feature selection process, we considered various sentiment score statistics (mean, min, max, sum) to understand their potential influence on predicting the closing price. Although the sentiment scores themselves are relatively small values, we noticed an interesting behavior.

**Observation:**

The "sum of the sentiment score," while having small values, exhibits more pronounced ups and downs on a daily basis. This feature, which reflects the aggregate sentiment for the day, demonstrates higher variability, even if the absolute values are modest.

**Logical Relevance:**

We also chose the "sum of the sentiment score" as a feature because of its logical relevance. Aggregating daily sentiment scores into a sum provides a meaningful representation of overall sentiment for each day, which aligns with our goal of capturing sentiment trends that might influence stock prices.

It's worth noting that machine learning models, especially more complex ones, have the capacity to learn from features that may not have a strong linear correlation with the target variable. Hence, we believe this feature is promising for our model.


##### **Missing Sentiment Scores for "META" Ticker:**

During the exploration of sentiment scores, it was observed that sentiment scores for the "META" ticker were missing for the initial two years, despite the availability of stock price data. Several factors could contribute to this issue:

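1. **Data Retrieval Issues:** Data retrieval methods for sentiment analysis may not have been comprehensive or accurate in collecting data for the "META" ticker. Data collection methods can vary in terms of coverage and accuracy. In particular, Meta Platforms traded under the ticker "FB" until June 2022, so headlines published before the change are likely tagged with "FB" and would not be returned by a query for "META".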

2. **Data Quality:** Ensuring the quality and consistency of data sources is crucial. Inaccurate or incomplete news data can result in missing sentiment scores.

**Possible Solutions:**

To address the missing sentiment scores for the "META" ticker, consider the following:

1. **Refining Data Sources:** Review and expand news data sources to cover a wider range of topics and keywords, including "META."

2. **Exploring Additional Data Sources:** Consider integrating other news data sources such as tweets or web scraping. These sources can offer a broader range of news data and help fill gaps in sentiment score coverage.

**Project Approach:**

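For the initial stage of the project and to avoid excessive complexity, a pragmatic approach was taken. In the `calculate_technical_indicators` module, backward filling (`bfill`) was applied to address the missing sentiment scores for the "META" ticker. This allowed for the inclusion of available sentiment data while keeping the project manageable. Note that backward filling copies the earliest available sentiment value into earlier timestamps, so those rows carry information from the future (look-ahead bias); this should be kept in mind when evaluating the model on that period. Further enhancements can be explored to improve sentiment score coverage in future stages.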



#### Calculate Technical Indicators (SMA-fast, SMA-slow, EMA, RSI)

In [261]:
# Look-back windows handed to the indicator calculation; tune as needed.
# Per the original notes: fast SMA, slow SMA and EMA, and RSI respectively.
fast_window, slow_window, rsi_window = 4, 50, 14

# Build the combined frame from the price data, the three windows and the
# sentiment summary
technical_indicators_df = calculate_technical_indicators(
    formatted_price_data,
    fast_window,
    slow_window,
    rsi_window,
    sentiment_summary,
)

# Features: an independent copy of every column whose label contains '_'
features = technical_indicators_df.filter(like='_').copy()


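The `features` frame keeps only the columns whose names contain an underscore, i.e. the per-ticker indicator and sentiment columns; the columns labelled with the bare ticker symbols are excluded.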


In [263]:
# Preview the first rows of the combined indicator frame
technical_indicators_df.head()

symbol,AAPL,AMT,JNJ,JPM,META,SMA_Slow_AAPL,SMA_Fast_AAPL,EMA_AAPL,RSI_AAPL,Sentiment_Score_Sum_AAPL,...,SMA_Slow_JPM,SMA_Fast_JPM,EMA_JPM,RSI_JPM,Sentiment_Score_Sum_JPM,SMA_Slow_META,SMA_Fast_META,EMA_META,RSI_META,Sentiment_Score_Sum_META
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-22 13:00:00,241.025,230.69,128.8023,123.44,190.3,236.44143,240.686275,237.219781,88.563798,9.017142,...,120.26234,123.475,120.528019,81.801481,0.995456,188.017778,189.7525,187.788755,87.301007,0.953524
2019-10-22 14:00:00,241.56,228.48,130.28,123.61,188.95,236.52943,240.90375,237.389986,89.160141,9.017142,...,120.40974,123.5025,120.648881,81.912776,0.995456,188.125978,189.6775,187.834294,74.281564,0.953524
2019-10-22 15:00:00,241.19,227.77,129.92,123.98,184.17,236.59533,241.06875,237.539006,83.611369,9.017142,...,120.55904,123.6075,120.779513,83.044121,0.995456,188.125176,188.2825,187.690596,41.530359,0.953524
2019-10-22 16:00:00,241.9378,228.32,129.78,124.47,183.47,236.694086,241.4282,237.711508,83.894293,9.017142,...,120.71204,123.875,120.924238,85.744298,0.995456,188.120376,186.7225,187.525083,38.548776,0.953524
2019-10-22 17:00:00,241.94,227.86,129.99,124.92,183.16,236.805886,241.65695,237.877331,88.881262,9.017142,...,120.87884,124.245,121.080935,91.877653,0.995456,188.113872,184.9375,187.353903,38.762952,0.953524


In [273]:
# Targets are the columns labelled with the ticker symbols. Select them by
# label rather than by position so the result does not depend on
# calculate_technical_indicators emitting those columns first, nor on
# `tickers` being free of duplicates. Column order of the frame is preserved.
targets = technical_indicators_df.loc[:, technical_indicators_df.columns.isin(tickers)]
targets.head()

symbol,AAPL,AMT,JNJ,JPM,META
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-10-22 13:00:00,241.025,230.69,128.8023,123.44,190.3
2019-10-22 14:00:00,241.56,228.48,130.28,123.61,188.95
2019-10-22 15:00:00,241.19,227.77,129.92,123.98,184.17
2019-10-22 16:00:00,241.9378,228.32,129.78,124.47,183.47
2019-10-22 17:00:00,241.94,227.86,129.99,124.92,183.16
