In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE


# Download the NLTK resources this notebook uses. The value returned by the
# last call is what the cell displays as its output.
# Stop-word corpus read by stopwords.words('english').
nltk.download('stopwords')
# Presumably the WordNet data behind WordNetLemmatizer — TODO confirm.
nltk.download('wordnet')
# NOTE(review): omw-1.4 looks like a companion resource for WordNet; confirm it is still required.
nltk.download('omw-1.4')
# Presumably the lexicon loaded by SentimentIntensityAnalyzer — TODO confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Read the article/price table, parse 'Date' as datetimes and order the rows
# chronologically.
file_path = 'Apple_data.csv'
data = (
    pd.read_csv(file_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by='Date', ascending=True)
    # Label-based drop: sorting keeps the labels assigned when the CSV was
    # read, so this removes the row labelled 220, not the 221st sorted row.
    # NOTE(review): the reason this row is excluded is not recorded — confirm.
    .drop(index=220)
)
def fill_missing_prices(data):
    """Fill gaps in 'Open Price' / 'Close Price' from the adjacent row.

    The frame is first re-indexed to 0..n-1 (a new frame is returned; the
    frame passed in is not modified). Then, in row order:

    * a missing 'Open Price' takes the previous row's 'Close Price';
    * a missing 'Close Price' takes the next row's 'Open Price'.

    Only the values present before filling are used as fill sources, so a gap
    whose neighbour is missing as well stays NaN. The first row's open and the
    last row's close have no neighbour and are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Open Price' and 'Close Price' columns, already in the row
        order the fill should follow.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with a fresh 0..n-1 index and the gaps filled.
    """
    data = data.reset_index(drop=True)
    open_price = data['Open Price']
    close_price = data['Close Price']

    # Compute both replacements before writing either one back, so each fill
    # reads the untouched neighbour column (same result as a row-by-row pass,
    # without a Python-level loop of scalar .loc lookups).
    filled_open = open_price.fillna(close_price.shift(1))
    filled_close = close_price.fillna(open_price.shift(-1))

    data['Open Price'] = filled_open
    data['Close Price'] = filled_close
    return data
# Patch price gaps from neighbouring rows, discard every row that still has a
# missing value in any column, then preview the first rows.
data = fill_missing_prices(data).dropna()
data.head()

Unnamed: 0,Date,company,headline,abstract,url,section,Open Price,Close Price
0,2022-12-20,Apple,Bans on TikTok Gain Momentum in Washington and...,At least 14 states have passed bans on the ser...,https://www.nytimes.com/2022/12/20/technology/...,Technology,130.02,130.92
1,2022-12-20,Apple,Clean Energy Quest Pits Google Against Utilities,Google says its goals for carbon-free power ar...,https://www.nytimes.com/2022/12/20/business/go...,Business Day,130.02,130.92
2,2022-12-20,Apple,Amazon and E.U. Reach Deal to End Antitrust In...,The online retail giant avoided a major fine b...,https://www.nytimes.com/2022/12/20/technology/...,Technology,130.02,130.92
3,2022-12-21,Apple,YouTube in Advanced Talks for N.F.L. Sunday Ti...,"Tech’s biggest companies — Apple, Amazon and Y...",https://www.nytimes.com/2022/12/20/business/nf...,Business Day,131.6,134.04
4,2022-12-21,Apple,A New Chat Bot Is a ‘Code Red’ for Google’s Se...,A new wave of chat bots like ChatGPT use artif...,https://www.nytimes.com/2022/12/21/technology/...,Technology,131.6,134.04


In [None]:
# Shared resources for clean_text: the lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a string into a space-joined sequence of lemmatized tokens.

    Steps, in order: lower-case the text, delete every character that is
    neither a word character nor whitespace, delete runs of digits, split on
    whitespace, drop tokens found in ``stop_words`` and lemmatize the rest.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    kept_tokens = []
    for token in without_digits.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Add a cleaned counterpart for each raw text column.
for source_column, target_column in (
    ('headline', 'cleaned_headline'),
    ('abstract', 'cleaned_abstract'),
):
    data[target_column] = data[source_column].apply(clean_text)

def get_sentiment(text):
    """Return the TextBlob polarity score computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# TextBlob polarity of the cleaned text. Stored under dedicated column names so
# that the VADER scores written to 'headline_sentiment' / 'abstract_sentiment'
# below do not overwrite it.
data['headline_textblob_polarity'] = data['cleaned_headline'].apply(get_sentiment)
data['abstract_textblob_polarity'] = data['cleaned_abstract'].apply(get_sentiment)

sia = SentimentIntensityAnalyzer()

# VADER compound score of the cleaned text; these two columns are the
# sentiment features used by the models in this notebook.
data['headline_sentiment'] = data['cleaned_headline'].apply(lambda x: sia.polarity_scores(x)['compound'])
data['abstract_sentiment'] = data['cleaned_abstract'].apply(lambda x: sia.polarity_scores(x)['compound'])

# One vectorizer per text column: calling fit_transform again on the same
# TfidfVectorizer re-learns its vocabulary, after which the columns of the
# earlier matrix can no longer be mapped back to terms.
headline_vectorizer = TfidfVectorizer(max_features=1000)
abstract_vectorizer = TfidfVectorizer(max_features=1000)

headline_features = headline_vectorizer.fit_transform(data['cleaned_headline'])
abstract_features = abstract_vectorizer.fit_transform(data['cleaned_abstract'])

# Backward-compatible alias: `vectorizer` refers to the vectorizer fitted on
# the abstracts.
vectorizer = abstract_vectorizer

# Open-to-close move expressed as a fraction of the opening price.
data['price_change'] = (data['Close Price'] - data['Open Price']) / data['Open Price']

# Features: the two per-article sentiment scores. Target: the closing price.
feature_columns = ['headline_sentiment', 'abstract_sentiment']
X = data[feature_columns]
y = data['Close Price']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Zero targets are replaced by 1 in the denominator so the percentage error
# never divides by zero.
relative_error = (y_test - predictions) / y_test.replace(0, 1)
mape = relative_error.abs().mean() * 100

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")
print(f"MAPE: {mape:.2f}%")

MAE: 23.177875072407804
MSE: 820.5922195630922
R²: 0.00029425255934123484
MAPE: 12.64%


In [None]:
# Analyzer instance used by get_vader_sentiment.
sia = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Return the 'compound' entry of the VADER polarity scores for ``text``."""
    scores = sia.polarity_scores(text)
    return scores['compound']

# Score the raw (uncleaned) text with VADER, replacing the sentiment columns.
for text_column, sentiment_column in (
    ('headline', 'headline_sentiment'),
    ('abstract', 'abstract_sentiment'),
):
    data[sentiment_column] = data[text_column].apply(get_vader_sentiment)

# 1 when the close is strictly above the open, otherwise 0.
data['price_increase'] = data['Close Price'].gt(data['Open Price']).astype(int)

# Collapse the per-article rows into one row per date.
sentiment_stats = ['mean', 'max', 'count']
aggregation_spec = {
    'headline_sentiment': sentiment_stats,
    'abstract_sentiment': sentiment_stats,
    'Open Price': 'first',
    'Close Price': 'last',
    'price_increase': 'last',
}
daily_data = data.groupby('Date').agg(aggregation_spec)

# Flatten the (column, statistic) pairs into names such as 'Close Price_last'.
daily_data.columns = ['_'.join(name_parts).strip() for name_parts in daily_data.columns]
daily_data = daily_data.reset_index()

sentiment_feature_columns = [
    'headline_sentiment_mean', 'headline_sentiment_max', 'headline_sentiment_count',
    'abstract_sentiment_mean', 'abstract_sentiment_max', 'abstract_sentiment_count',
]
X = daily_data[sentiment_feature_columns]
y = daily_data['price_increase_last']
# Hold out 20% of the daily rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.5568181818181818
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.29      0.36        38
           1       0.58      0.76      0.66        50

    accuracy                           0.56        88
   macro avg       0.53      0.52      0.51        88
weighted avg       0.54      0.56      0.53        88

Confusion Matrix:
 [[11 27]
 [12 38]]


In [None]:
# Order the daily rows by date so that each shift() below refers to the
# previous date present in the data.
daily_data = daily_data.sort_values('Date')

# Change of the close versus the previous row (absolute and relative).
daily_data['Close_Price_lag1'] = daily_data['Close Price_last'].shift(1)
daily_data['Close_Price_diff'] = daily_data['Close Price_last'] - daily_data['Close_Price_lag1']
daily_data['Close_Price_diff_pct'] = daily_data['Close_Price_diff'] / daily_data['Close_Price_lag1']

# The label 'price_increase_last' is derived from the same day's close, and so
# are the two diff columns above. Feeding them to the model would leak the
# outcome into the features, so only their one-row-lagged versions (the
# previous row's move) are used as predictors.
daily_data['Close_Price_diff_lag1'] = daily_data['Close_Price_diff'].shift(1)
daily_data['Close_Price_diff_pct_lag1'] = daily_data['Close_Price_diff_pct'].shift(1)

# The lagged columns are undefined for the earliest rows; drop those rows.
daily_data = daily_data.dropna()

X = daily_data[['headline_sentiment_mean', 'headline_sentiment_max', 'abstract_sentiment_mean', 'abstract_sentiment_max',
                'headline_sentiment_count', 'abstract_sentiment_count',
                'Close_Price_diff_lag1', 'Close_Price_diff_pct_lag1']]
y = daily_data['price_increase_last']

# Chronological hold-out: with shuffle=False the last 20% of the dates form
# the test set, so the model is never trained on days later than the ones it
# is evaluated on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# NOTE(review): any previously recorded scores for this cell were produced with
# the same-day diff features and a shuffled split; re-run to refresh them.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8409090909090909
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        36
           1       0.90      0.83      0.86        52

    accuracy                           0.84        88
   macro avg       0.84      0.84      0.84        88
weighted avg       0.85      0.84      0.84        88

Confusion Matrix:
 [[31  5]
 [ 9 43]]
