**Mount Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Libraries**

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, time, timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

**load sentiment data**

In [75]:
# Load the CSV from your Google Drive
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Finance Projects/Sentiment-Analysis-on-Financial-News-and-Its-Impact-on-Stock-Prices/data/apple_sentiment_dataset.csv")

**if article is after 4PM or on weekend, shift to next weekday**

In [78]:
# Function to adjust date: if article is after 4PM or on weekend, shift to next weekday
def assign_adjusted_date(row):
    dt = row['date'] # Changed from 'timestamp' to 'date'

    # Convert to datetime if it's not already
    if not isinstance(dt, datetime):
        dt = pd.to_datetime(dt)

    # If article time is 4:00 PM or later, assume impact is next trading day
    if dt.time() >= time(16, 0):
        dt += timedelta(days=1)

    # If it's Saturday → shift to Monday
    if dt.weekday() == 5:
        dt += timedelta(days=2)
    # If it's Sunday → shift to Monday
    elif dt.weekday() == 6:
        dt += timedelta(days=1)

    return dt.date()  # return only the date (not timestamp)

In [79]:
# Apply adjusted date logic
df['adjusted_date'] = df.apply(assign_adjusted_date, axis=1)

In [80]:
# Save the result to a new CSV
output_path = "/content/drive/MyDrive/Colab Notebooks/Finance Projects/Sentiment-Analysis-on-Financial-News-and-Its-Impact-on-Stock-Prices/data/apple_sentiment_time_adjusted.csv"
df.to_csv(output_path, index=False)

print(f" Adjusted dataset saved to: {output_path}")

 Adjusted dataset saved to: /content/drive/MyDrive/Colab Notebooks/Finance Projects/Sentiment-Analysis-on-Financial-News-and-Its-Impact-on-Stock-Prices/data/apple_sentiment_time_adjusted.csv


In [82]:
# Preview the date logic effect
df[['date', 'adjusted_date', 'headline']].sample(10)

Unnamed: 0,date,adjusted_date,headline
251,2025-06-10 05:50:56,2025-06-10,Evercore ISI maintains Apple stock outperform ...
487,2025-04-30 16:06:36,2025-05-01,"Seaport starts coverage of chip stocks, says s..."
437,2025-05-23 13:47:26,2025-05-23,Bitcoin price today: pulls back to $109k on pr...
71,2025-07-09 06:16:15,2025-07-09,How Trump’s trade war is upending the global e...
153,2025-06-27 13:40:42,2025-06-27,Class Action Filed Against Apple Inc. (AAPL) -...
1240,2025-02-21 01:01:02,2025-02-21,Apple Inc receives Investment Bank Analyst Rat...
353,2025-06-01 10:00:09,2025-06-02,5 big analyst AI moves: Apple may benefit from...
213,2025-06-16 12:03:33,2025-06-16,908 Devices Appoints Christopher D. Brown to i...
1211,2025-02-25 02:15:33,2025-02-25,Taiwan economy ministry has received no inform...
662,2025-05-16 10:12:39,2025-05-16,"Weeks after Amazon’s Alexa+ AI launch, a myste..."


**price data**

In [83]:
price_path = "/content/drive/MyDrive/Colab Notebooks/Finance Projects/Sentiment-Analysis-on-Financial-News-and-Its-Impact-on-Stock-Prices/data/apple_prices.csv"
df_prices = pd.read_csv(price_path)

# Ensure 'Date' column is datetime with utc=True
df_prices['Date'] = pd.to_datetime(df_prices['Date'], utc=True)

**check date range**

In [84]:
df['adjusted_date'] = pd.to_datetime(df['date']).dt.date
print(" News data date range:", df['adjusted_date'].min(), "→", df['adjusted_date'].max())


 News data date range: 2024-11-30 → 2025-07-21


In [85]:
df_prices['Date'] = pd.to_datetime(df_prices['Date']).dt.tz_localize(None).dt.date
print(" Price data date range:", df_prices['Date'].min(), "→", df_prices['Date'].max())


 Price data date range: 2024-10-01 → 2025-07-18


**match date range**

In [86]:
latest_price_date = df_prices['Date'].max()
df_sentiment = df[df['adjusted_date'] <= latest_price_date]


**rolling window to get nearest price**

In [87]:
# Create a dictionary for faster price lookups
price_dict = df_prices.set_index('Date')['Close'].to_dict()

# Function to get nearest price within ±2 days
def get_nearest_price(date, window=2):
    for delta in range(window + 1):
        for offset in [-delta, delta]:
            try_date = date + timedelta(days=offset)
            # Ensure try_date is a date object for lookup
            try_date_obj = try_date.date() if isinstance(try_date, datetime) else try_date
            if try_date_obj in price_dict:
                return price_dict[try_date_obj], try_date_obj
    return None, None

# Apply function to a copy of the DataFrame to avoid SettingWithCopyWarning
df_sentiment = df_sentiment.copy()
df_sentiment[['matched_close', 'matched_price_date']] = df_sentiment['adjusted_date'].apply(
    lambda d: pd.Series(get_nearest_price(d, window=2))
)

**merge data**

In [88]:
# Drop if no price found even within ±2 days
df_merged = df_sentiment[df_sentiment['matched_close'].notna()].copy()
print(f" Matched {len(df_merged)} news articles with prices.")

 Matched 1498 news articles with prices.


In [89]:
df_merged['date_diff'] = (pd.to_datetime(df_merged['matched_price_date']) - pd.to_datetime(df_merged['adjusted_date'])).dt.days.abs()

In [90]:
print(df_merged['date_diff'].describe())

count    1498.000000
mean        0.061415
std         0.248375
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         2.000000
Name: date_diff, dtype: float64


**save merged data**

In [92]:
# Save merged dataset
output_path = "/content/drive/MyDrive/Colab Notebooks/Finance Projects/Sentiment-Analysis-on-Financial-News-and-Its-Impact-on-Stock-Prices/data/apple_sentiment_price_merged.csv"
df_merged.to_csv(output_path, index=False)
print(f" Final dataset saved to: {output_path}")

 Final dataset saved to: /content/drive/MyDrive/Colab Notebooks/Finance Projects/Sentiment-Analysis-on-Financial-News-and-Its-Impact-on-Stock-Prices/data/apple_sentiment_price_merged.csv


In [104]:
# Sort by date to ensure correct shift
df_merged = df_merged.sort_values("matched_price_date")

# Compute next day's close price
df_merged['next_close'] = df_merged['matched_close'].shift(-1)

# Calculate % change
df_merged['pct_change'] = ((df_merged['next_close'] - df_merged['matched_close']) / df_merged['matched_close']) * 100

# Define labels
def label_movement(pct):
    if pct > 0.5:
        return 1   # Up
    elif pct < -0.5:
        return -1  # Down
    else:
        return 0   # Flat

df_merged['movement'] = df_merged['pct_change'].apply(label_movement)

# Drop last row (NaN next_close)
df_labeled = df_merged.dropna(subset=['next_close'])

print(df_labeled[['matched_price_date', 'matched_close', 'next_close', 'pct_change', 'movement']].head())


     matched_price_date  matched_close  next_close  pct_change  movement
1502         2024-11-29     236.758881  239.013428    0.952255         1
1493         2024-12-02     239.013428  239.013428    0.000000         0
1494         2024-12-02     239.013428  239.013428    0.000000         0
1495         2024-12-02     239.013428  239.013428    0.000000         0
1496         2024-12-02     239.013428  239.013428    0.000000         0


In [105]:
# Count the movement classes
movement_counts = df_labeled['movement'].value_counts().sort_index()

# Print nicely
print("📊 Movement Class Distribution:")
for label, count in movement_counts.items():
    if label == 1:
        direction = "↑ Up ( > +0.5% )"
    elif label == -1:
        direction = "↓ Down ( < -0.5% )"
    else:
        direction = "→ Flat ( -0.5% to +0.5% )"
    print(f"  {label}: {count} articles → {direction}")


📊 Movement Class Distribution:
  -1: 43 articles → ↓ Down ( < -0.5% )
  0: 1407 articles → → Flat ( -0.5% to +0.5% )
  1: 47 articles → ↑ Up ( > +0.5% )


In [106]:
from sklearn.utils import resample

# Separate classes
df_down = df_labeled[df_labeled['movement'] == -1]
df_flat = df_labeled[df_labeled['movement'] == 0]
df_up = df_labeled[df_labeled['movement'] == 1]

# Downsample '0' class
df_flat_downsampled = resample(df_flat, replace=False, n_samples=90, random_state=42)

# Combine
df_balanced = pd.concat([df_down, df_flat_downsampled, df_up])


In [108]:
# Count the movement classes
movement_counts = df_balanced['movement'].value_counts().sort_index()

# Print nicely
print("📊 Movement Class Distribution:")
for label, count in movement_counts.items():
    if label == 1:
        direction = "↑ Up ( > +0.5% )"
    elif label == -1:
        direction = "↓ Down ( < -0.5% )"
    else:
        direction = "→ Flat ( -0.5% to +0.5% )"
    print(f"  {label}: {count} articles → {direction}")


📊 Movement Class Distribution:
  -1: 43 articles → ↓ Down ( < -0.5% )
  0: 90 articles → → Flat ( -0.5% to +0.5% )
  1: 47 articles → ↑ Up ( > +0.5% )


In [113]:
df_balanced.sample()

Unnamed: 0,date,headline,cleaned_text,sentiment,sentiment_score,adjusted_date,matched_close,matched_price_date,date_diff,next_close,pct_change,movement
1204,2025-02-25 16:44:06,Cook: Apple Continues To Plan For Annual Divid...,cook apple continues plan annual dividend incr...,Positive,0.988128,2025-02-25,246.716461,2025-02-25,0,240.045212,-2.704015,-1
