In [1]:
import os
import re
import pandas as pd
import numpy as np
import json
import alpaca_trade_api as tradeapi
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression

In [17]:
from datetime import datetime

In [2]:
# Move the working directory to the project root (the parent of the notebook folder).
# The root is computed only once per kernel session: a bare os.chdir('../') is relative,
# so re-running the cell would climb one more directory every time.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
os.chdir(PROJECT_ROOT)
print(os.getcwd())

c:\Users\pehbo\projects\algo


In [3]:
# Credentials live in secrets/secrets.json under the keys "KEY" and "SECRET";
# put your own Alpaca key pair in that file
with open('secrets/secrets.json') as secrets_file:
    secrets = json.load(secrets_file)
API_KEY, API_SECRET = secrets['KEY'], secrets['SECRET']
BASE_URL = 'https://paper-api.alpaca.markets'  # paper-trading endpoint

# Build the Alpaca REST client used by the rest of the notebook
api = tradeapi.REST(API_KEY, API_SECRET, base_url=BASE_URL, api_version='v2')

In [4]:
symbol = 'TSLA'
start_date = '2010-01-01'
end_date = '2023-03-25'

# Daily bars for the symbol; the bar index is renamed to 'date' and moved into a regular column
historical_data = (
    api.get_bars(symbol, tradeapi.rest.TimeFrame.Day, start=start_date, end=end_date)
    .df
    .rename_axis('date')
    .reset_index()
)

In [9]:
# Pull the TSLA news articles from Alpaca for the configured date window
tsla_news = api.get_news(symbol, start_date, end_date)

# Collect each article's summary text and the calendar date of its last update
news_summaries = []
news_dates = []
for article in tsla_news:
    news_summaries.append(article.summary)
    news_dates.append(article.updated_at.date())

# Bag-of-words counts limited to the 50 most frequent terms
vectorizer = CountVectorizer(max_features=50)
summary_term_counts = vectorizer.fit_transform(news_summaries)
X = summary_term_counts.toarray()

In [66]:
import feedparser
from dateutil.parser import parse

# Fetch news headlines from the Yahoo News RSS feed
# NOTE(review): the sample entry printed further down in this notebook is about a
# chocolate factory explosion, so this URL appears to serve general news rather than
# Tesla-specific headlines — confirm the feed before trusting the features.
yahoo_news_rss_url = 'https://news.yahoo.com/rss/tesla'
parsed_rss = feedparser.parse(yahoo_news_rss_url)

# Fail here with a clear message when the feed yields nothing; otherwise the problem
# only shows up later as an obscure vectorizer error on an empty document list
if not parsed_rss.entries:
    raise ValueError(f"No entries returned by RSS feed {yahoo_news_rss_url}")

# Filter once so titles and dates stay index-aligned, skipping entries that lack a
# title or a publication timestamp instead of raising AttributeError on them
dated_entries = [
    entry for entry in parsed_rss.entries
    if entry.get('title') and entry.get('published')
]
news_titles = [entry.title for entry in dated_entries]
news_dates = [parse(entry.published).date() for entry in dated_entries]

In [77]:
# Preprocess the news data
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise a piece of news text for bag-of-words vectorisation.

    Every character that is not an ASCII letter is replaced by a space, the text is
    lower-cased and tokenised with NLTK's ``word_tokenize``, and English stop words
    are dropped.

    Args:
        text: Raw text (e.g. a headline).

    Returns:
        The remaining tokens joined by single spaces (an empty string if none remain).
    """
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Convert text to lowercase and tokenize
    words = word_tokenize(text.lower())

    # Remove stopwords; the set is cached so it is not rebuilt for every headline
    stop_words = _english_stop_words()
    return ' '.join(word for word in words if word not in stop_words)

# Run every headline through the text preprocessing step
cleaned_titles = list(map(preprocess_text, news_titles))

# Bag-of-words counts limited to the 50 most frequent terms across the cleaned headlines
vectorizer = CountVectorizer(max_features=50)
title_term_counts = vectorizer.fit_transform(cleaned_titles)
X = title_term_counts.toarray()

In [78]:
# One row per headline, one column per vocabulary term, values are term counts
feature_names = vectorizer.get_feature_names_out()
significant_words = pd.DataFrame(X, columns=feature_names)

# Reduce the bar timestamps to plain calendar dates so they can serve as a join key
bar_dates = pd.to_datetime(historical_data['date']).dt.date
historical_data['date'] = bar_dates


In [79]:

# Attach each headline's publication date so it can be joined to that day's bar
significant_words['date'] = news_dates
merged_data = pd.merge(historical_data, significant_words, on='date', how='inner')

# The inner join keeps only dates that have both a bar and a headline; stop here with
# a clear message instead of letting the regression fail on zero samples
if merged_data.empty:
    raise ValueError(
        "No overlapping dates between TSLA price history and news headlines; cannot train the model."
    )

# Train a linear regression model: features are every merged column except the
# join key and the target ('close')
X_train = merged_data.drop(['date', 'close'], axis=1)
y_train = merged_data['close']

# NOTE(review): the target is the close of the SAME row the features come from, and the
# row predicted below is the last training row, so the printed value is an in-sample
# fit rather than a forecast of the next session. A real next-close prediction would
# need the target shifted by one trading day — confirm the intended design.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Select the last row with double brackets so it stays a one-row DataFrame; passing a
# bare NumPy array would drop the feature names the model was fitted with
X_test = X_train.iloc[[-1]]
next_closing_price = lr_model.predict(X_test)

print("Predicted next closing price for TSLA:", next_closing_price[0])

Predicted next closing price for TSLA: 190.40999999999985




In [83]:
# Inspect the first parsed RSS entry to see which fields the feed provides
parsed_rss.entries[0]

{'title': 'Search on for missing in deadly chocolate factory explosion',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'https://news.yahoo.com/rss/tesla',
  'value': 'Search on for missing in deadly chocolate factory explosion'},
 'links': [{'rel': 'alternate',
   'type': 'text/html',
   'href': 'https://news.yahoo.com/six-injured-blast-pennsylvania-chocolate-013258990.html'}],
 'link': 'https://news.yahoo.com/six-injured-blast-pennsylvania-chocolate-013258990.html',
 'published': '2023-03-25T01:32:58Z',
 'published_parsed': time.struct_time(tm_year=2023, tm_mon=3, tm_mday=25, tm_hour=1, tm_min=32, tm_sec=58, tm_wday=5, tm_yday=84, tm_isdst=0),
 'source': {'href': 'http://www.ap.org/', 'title': 'Associated Press'},
 'id': 'six-injured-blast-pennsylvania-chocolate-013258990.html',
 'guidislink': False,
 'media_content': [{'height': '86',
   'url': 'https://media.zenfs.com/en/ap.org/4cd97a6c150225c1b45daadb58131e24',
   'width': '130'}],
 'media_credit': [{'role':

In [114]:
# Take the newest headline from the already-parsed Yahoo News feed
if not parsed_rss.entries:
    raise ValueError("RSS feed has no entries; cannot build a prediction from the latest news.")
latest_news_title = parsed_rss.entries[0].title

# Preprocess the latest news title the same way the training headlines were cleaned
cleaned_latest_title = preprocess_text(latest_news_title)

# Transform the preprocessed title into a feature vector using the fitted vocabulary
latest_title_vector = vectorizer.transform([cleaned_latest_title]).toarray()

# Create a DataFrame for the latest news title
latest_significant_words = pd.DataFrame(latest_title_vector, columns=vectorizer.get_feature_names_out())

# Give it exactly the columns (and order) of significant_words; columns it lacks are filled with 0
latest_significant_words = latest_significant_words.reindex(
    columns=significant_words.columns, fill_value=0
)

# Start from the last merged row so the non-word columns carry the most recent bar,
# then overwrite the word-count columns with the counts from the latest headline
latest_merged_data = merged_data[X_train.columns].tail(1).reset_index(drop=True).copy()
word_columns = [col for col in X_train.columns if col in latest_significant_words.columns]
if word_columns:
    latest_merged_data[word_columns] = latest_significant_words[word_columns].to_numpy()

# Keep a numeric one-row DataFrame with the training column names, so predict()
# receives the same feature names (and a numeric dtype) as fit() did
latest_merged_data = latest_merged_data.astype(float)

# Predict the closing price based on the current latest news
X_test_latest = latest_merged_data
next_closing_price_latest = lr_model.predict(X_test_latest)

print("Predicted next closing price for TSLA based on current latest news:", next_closing_price_latest[0])



Predicted next closing price for TSLA based on current latest news: 190.40999999999985




In [None]:
import schedule
import time

def update_model():
    """Append the newest headline/bar pair to the training data and refit the model.

    Re-parses the RSS feed, vectorises the newest entry's title with the fitted
    ``vectorizer``, fetches one daily TSLA bar starting at the entry's publication
    date, and inner-joins the two on date. When the join yields a row it is appended
    to the global ``merged_data``; ``X_train``/``y_train`` are rebuilt from it and
    the global ``lr_model`` is refitted. When the feed is empty, the entry has no
    publication date, no bar is returned, or the dates do not match, a message is
    printed and nothing is changed.

    Returns:
        None.
    """
    global merged_data, X_train, y_train, lr_model

    # Fetch the latest TSLA news data
    parsed_rss = feedparser.parse(yahoo_news_rss_url)
    if not parsed_rss.entries:
        print("No news entries returned; model not updated.")
        return
    latest_entry = parsed_rss.entries[0]

    published = latest_entry.get('published')
    if not published:
        print("Latest news entry has no publication date; model not updated.")
        return
    # A real datetime.date, matching the dtype of merged_data['date'] (a raw
    # (year, month, day) tuple can neither be assigned to one row nor joined on)
    latest_news_date = parse(published).date()

    # The vectorizer vocabulary was fitted on cleaned headlines, so encode the title here too
    cleaned_latest_title = preprocess_text(latest_entry.title)
    latest_title_vector = vectorizer.transform([cleaned_latest_title]).toarray()

    # Create a DataFrame for the latest headline
    latest_significant_words = pd.DataFrame(latest_title_vector, columns=vectorizer.get_feature_names_out())
    latest_significant_words['date'] = latest_news_date

    # Fetch the bar through the same get_bars call used for the training history so the
    # columns line up; limit=1 returns the first daily bar on or after the news date
    latest_bars = api.get_bars(
        'TSLA', tradeapi.rest.TimeFrame.Day, start=latest_news_date.isoformat(), limit=1
    ).df
    if latest_bars.empty:
        print("No TSLA bar available for the latest news date; model not updated.")
        return
    tsla_data = latest_bars.rename_axis('date').reset_index()
    tsla_data['date'] = pd.to_datetime(tsla_data['date']).dt.date

    # Merge the latest TSLA bar and headline features; empty when the dates differ
    latest_merged_data = pd.merge(tsla_data, latest_significant_words, on='date', how='inner')
    if latest_merged_data.empty:
        print("Latest bar and latest news fall on different dates; model not updated.")
        return

    # Update the training data and retrain the linear regression model
    # (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x)
    merged_data = pd.concat([merged_data, latest_merged_data], ignore_index=True)
    X_train = merged_data.drop(['date', 'close'], axis=1)
    y_train = merged_data['close']
    lr_model.fit(X_train, y_train)
    print("Model updated with latest data.")

# Schedule the update_model function to run daily.
# Jobs are tagged and cleared first so re-running this cell does not stack duplicate
# jobs that would each retrain the model at 18:00.
schedule.clear('update_model')
schedule.every().day.at("18:00").do(update_model).tag('update_model')

# Keep the script running until the kernel/process is interrupted
try:
    while True:
        schedule.run_pending()
        time.sleep(60)
except KeyboardInterrupt:
    # Interrupting the kernel lands here: drop the job and exit without a traceback
    schedule.clear('update_model')
    print("Scheduler stopped.")
