In [None]:
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from dateutil.parser import parse
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
# Fetch the NLTK data this notebook relies on: the English stop-word list and
# the Punkt sentence-tokenizer models that `word_tokenize` loads. Recent NLTK
# releases read those models from the 'punkt_tab' package rather than 'punkt',
# so both are requested to keep the notebook runnable across NLTK versions.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Load the Tesla price history, convert the 'Date' column to datetimes and
# order the rows chronologically with a fresh 0..n-1 index.
tesla = pd.read_csv('data/tsla.csv')
tesla = (
    tesla
    .assign(Date=pd.to_datetime(tesla['Date']))
    .sort_values('Date')
    .reset_index(drop=True)
)

In [None]:
# Read the three headline exports. Reuters and Guardian get a parsed 'Date'
# column straight away (unparseable values become NaT); the CNBC 'Time' column
# is parsed separately further down.
reuters = pd.read_csv('data/reuters_headlines.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Time'], errors='coerce')
)
guardian = pd.read_csv('data/guardian_headlines.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Time'], format='%d-%b-%y', errors='coerce')
)
cnbc = pd.read_csv('data/cnbc_headlines.csv')

In [None]:
def parse_cnbc_time(time_str):
    """Parse one CNBC 'Time' value into a datetime, best effort.

    The literal ``'ET'`` is removed from the string before it is handed to
    ``dateutil``'s fuzzy parser.

    Parameters
    ----------
    time_str : str
        Raw timestamp text. Non-string values (e.g. ``None`` or NaN) are
        treated as unparseable.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The parsed datetime, or ``pd.NaT`` when the value is not a string or
        the parser rejects it.
    """
    if not isinstance(time_str, str):
        return pd.NaT
    cleaned = time_str.strip().replace('ET', '').strip()
    try:
        return parse(cleaned, fuzzy=True)
    # Only parser failures are treated as "no date". A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide genuine bugs.
    except (ValueError, OverflowError, TypeError):
        return pd.NaT
# Keep only CNBC rows that have both a headline and a timestamp, then parse the
# free-form 'Time' strings. `.assign` on the filtered frame avoids mutating the
# loaded data in place.
cnbc = (
    cnbc
    .dropna(subset=['Headlines', 'Time'])
    .assign(Date=lambda frame: frame['Time'].apply(parse_cnbc_time))
)

# Give every source the same two-column layout: Date, Headline.
reuters_news = reuters[['Date', 'Headlines']].rename(columns={'Headlines': 'Headline'})
guardian_news = guardian[['Date', 'Headlines']].rename(columns={'Headlines': 'Headline'})
cnbc_news = cnbc[['Date', 'Headlines']].rename(columns={'Headlines': 'Headline'})
news = pd.concat([reuters_news, guardian_news, cnbc_news], ignore_index=True)

# Parsed timestamps may carry a time of day (the CNBC strings are parsed as
# full datetimes). Truncate everything to midnight so that headlines group per
# calendar day and line up with the one-row-per-day price data.
news['Date'] = pd.to_datetime(news['Date'], errors='coerce').dt.normalize()

# Rows without a usable date cannot be assigned to a day, and rows without a
# headline would break the string-based text processing that follows.
news = news.dropna(subset=['Date', 'Headline'])

In [None]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def clean(text):
    """Normalise a headline for bag-of-words features.

    Lower-cases ``text``, deletes every character other than ``a``-``z`` and
    whitespace, tokenizes what is left with NLTK's ``word_tokenize`` and
    discards tokens found in ``stop_words``.

    Returns the surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words,
                         word_tokenize(letters_only))
    return ' '.join(kept_tokens)


news['Clean_Headline'] = news['Headline'].map(clean)
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Sentiment is scored on the raw headline, not the cleaned one.
news['Sentiment'] = news['Headline'].map(get_sentiment)

# One row per 'Date': all cleaned headlines joined into a single string, plus
# the mean sentiment of that date's headlines.
daily_news = news.groupby('Date', as_index=False).agg(
    Clean_Headline=('Clean_Headline', ' '.join),
    Sentiment=('Sentiment', 'mean'),
)

In [None]:
# Most Common Words
# Flatten every cleaned headline into one token list and keep the 20 most
# frequent tokens.
all_words = [token
             for headline in news['Clean_Headline']
             for token in headline.split()]
word_freq = pd.Series(all_words).value_counts().iloc[:20]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=word_freq.values, y=word_freq.index, palette='viridis', ax=ax)
ax.set_title('Top 20 Most Common Words in Headlines')
ax.set_xlabel('Frequency')
ax.set_ylabel('Words')
plt.show()

In [None]:
# Top Words That Are Proper Nouns
def extract_proper_nouns(text):
    """Return the title-cased tokens of ``text`` as a list.

    ``text`` is tokenized with NLTK's ``word_tokenize`` and a token is kept
    when ``str.istitle()`` is true for it.

    NOTE(review): this is a capitalisation heuristic, not part-of-speech
    tagging, so capitalised ordinary words are kept as well.
    """
    return list(filter(str.istitle, word_tokenize(text)))
# Store the per-headline token lists, then count the 20 most frequent tokens
# across all headlines.
news['Proper_Nouns'] = news['Headline'].map(extract_proper_nouns)
all_proper_nouns = news['Proper_Nouns'].explode()
proper_noun_freq = all_proper_nouns.value_counts().iloc[:20]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=proper_noun_freq.values, y=proper_noun_freq.index, palette='magma', ax=ax)
ax.set_title('Top 20 Proper Nouns in Headlines')
ax.set_xlabel('Frequency')
ax.set_ylabel('Proper Nouns')
plt.show()

In [None]:
# Tesla Stock Data Aggregates
def _plot_tesla_series(column, label, title, color=None):
    """Draw one line chart of ``tesla[column]`` against ``tesla['Date']``.

    ``label`` is used for both the legend entry and the y-axis label;
    ``color=None`` leaves matplotlib's default line colour in place.
    """
    plt.figure(figsize=(12,6))
    plt.plot(tesla['Date'], tesla[column], label=label, color=color)
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(label)
    plt.legend()
    plt.show()


_plot_tesla_series('Close', 'Close Price', 'Tesla Close Price Over Time')
_plot_tesla_series('Volume', 'Volume', 'Tesla Trading Volume Over Time', color='orange')

In [None]:
# Build the label and the lag features on the full price history *before*
# joining with the news. The inner join keeps only days that have headlines, so
# shifting afterwards would compare against the next/previous row that happens
# to have news rather than the adjacent trading day.
price_features = tesla.copy()
next_close = price_features['Close'].shift(-1)

# Target = 1 when the next row's close is higher than this row's close.
price_features['Target'] = (next_close > price_features['Close']).astype(int)
for lag in [1, 2, 3]:
    price_features[f'Close_Lag{lag}'] = price_features['Close'].shift(lag)
    price_features[f'Volume_Lag{lag}'] = price_features['Volume'].shift(lag)

# The final row has no next close. `NaN > x` evaluates to False, which would
# silently label that row 0, so it is removed explicitly (dropna cannot catch
# it because the integer label is never NaN).
price_features = price_features[next_close.notna()]

merged = pd.merge(price_features, daily_news, on='Date', how='inner')
# Drops the leading rows whose lag columns are NaN and any row with other
# missing values.
merged = merged.dropna().reset_index(drop=True)

In [None]:
# Numeric inputs: same-day prices and volume, the three lagged closes and
# volumes, and the mean headline sentiment.
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                   'Close_Lag1', 'Close_Lag2', 'Close_Lag3',
                   'Volume_Lag1', 'Volume_Lag2', 'Volume_Lag3', 'Sentiment']
X_num = merged[numeric_columns]
X_text = merged['Clean_Headline']
y = merged['Target']

# Split first (unshuffled: the last 20% of rows form the test set) so that the
# scaler statistics and the TF-IDF vocabulary/IDF weights are learned from
# training rows only and no test information leaks into the features.
(X_num_train, X_num_test,
 X_text_train, X_text_test,
 y_train, y_test) = train_test_split(X_num, X_text, y, test_size=0.2, shuffle=False)

scaler = StandardScaler()
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

vectorizer = TfidfVectorizer(max_features=500)
X_text_train_vect = vectorizer.fit_transform(X_text_train).toarray()
X_text_test_vect = vectorizer.transform(X_text_test).toarray()

# The scaled numeric block is what goes into the model matrix; previously the
# scaled array was computed and then ignored in favour of the raw values.
X_train = np.hstack((X_num_train_scaled, X_text_train_vect))
X_test = np.hstack((X_num_test_scaled, X_text_test_vect))

# Full-length arrays kept under their original names for any later cell that
# refers to them; rows are in the same order as `merged`.
X_num_scaled = np.vstack((X_num_train_scaled, X_num_test_scaled))
X_text_vect = np.vstack((X_text_train_vect, X_text_test_vect))
X = np.vstack((X_train, X_test))

In [None]:
# Baseline XGBoost configuration. The five tunable entries are overridden by
# whichever candidate the randomized search evaluates.
# NOTE(review): 'gpu_hist', 'gpu_id' and 'use_label_encoder' are deprecated in
# XGBoost 2.x (replaced by tree_method='hist' with device='cuda'). They are
# left unchanged because the installed XGBoost version is not visible here.
xgb_params = {
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'n_estimators': 100,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 158,
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'eval_metric': 'logloss'
}
model = xgb.XGBClassifier(**xgb_params)

# Discrete search space; 20 random combinations are sampled from it.
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# The training rows are time-ordered (the train/test split is unshuffled), so
# validate each candidate on a block that comes strictly after the rows it was
# fitted on. An integer `cv` uses order-agnostic k-fold splitting, which scores
# candidates on folds that precede their training data.
time_series_cv = TimeSeriesSplit(n_splits=3)

# NOTE(review): n_jobs=-1 runs candidate fits in parallel while every fit
# targets gpu_id 0; consider n_jobs=1 if GPU memory becomes a problem.
rand_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                 n_iter=20, scoring='accuracy',
                                 cv=time_series_cv, verbose=1,
                                 random_state=158, n_jobs=-1)
rand_search.fit(X_train, y_train)
best_model = rand_search.best_estimator_

In [None]:
# Score the tuned model on the held-out rows and print, in order: accuracy,
# the per-class report and the raw confusion matrix.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

report_sections = [
    f'Accuracy: {accuracy:.2f}',
    'Classification Report:',
    classification_report(y_test, y_pred),
    'Confusion Matrix:',
    confusion_matrix(y_test, y_pred),
]
for section in report_sections:
    print(section)

In [None]:
# Heatmap of the test-set confusion matrix; class 0 is shown as 'Down' and
# class 1 as 'Up' on both axes.
class_labels = ['Down', 'Up']
test_confusion = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(test_confusion, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()