In [2]:
import pandas as pd
import os

# === Merge Nasdaq 100 prices with daily sentiment scores ===
# Reads both CSVs, strips timezone info from their "Date" columns, inner-joins
# them on "Date", writes the result to disk and prints a short summary.
# Names defined here for later cells: nasdaq_path, nasdaq_df, sentiment_path,
# sentiment_df, merged_df, output_path.

# Single place to change where the input/output CSVs live.
DATA_DIR = "/home/u762545/Thesis/Data"

# === Load trimmed Nasdaq 100 data ===
nasdaq_path = f"{DATA_DIR}/nasdaq100_trimmed.csv"
nasdaq_df = pd.read_csv(nasdaq_path, parse_dates=["Date"])

# === Load daily sentiment scores ===
sentiment_path = f"{DATA_DIR}/daily_sentiment_aggregated.csv"
sentiment_df = pd.read_csv(sentiment_path, parse_dates=["Date"])

# === Remove timezone info to avoid merge dtype conflict ===
# tz_localize(None) drops the timezone while keeping the wall-clock timestamp,
# so both key columns end up tz-naive and can be compared by the merge.
# NOTE(review): the original comment says the timestamps are UTC; not
# verifiable from this cell.
nasdaq_df["Date"] = nasdaq_df["Date"].dt.tz_localize(None)
sentiment_df["Date"] = sentiment_df["Date"].dt.tz_localize(None)

# === Merge on 'Date' ===
# Inner join: only timestamps present in BOTH frames are kept.
merged_df = pd.merge(nasdaq_df, sentiment_df, on="Date", how="inner")

# Fail early on an empty join. Without this check the cell would overwrite the
# output CSV with zero rows, report success, and only then crash when the
# summary calls .date() on the NaT returned by min() on an empty column.
if merged_df.empty:
    raise ValueError(
        "Merge on 'Date' produced no rows; check that both files cover "
        "overlapping dates and that their timestamps match exactly."
    )

# === Save merged dataset ===
output_path = f"{DATA_DIR}/merged_stock_sentiment.csv"
merged_df.to_csv(output_path, index=False)

# === Confirm merge ===
print("âœ… Merge completed successfully.")
print(f"ðŸ“… Date range: {merged_df['Date'].min().date()} to {merged_df['Date'].max().date()}")
print(f"ðŸ§¾ Shape: {merged_df.shape}")
print(f"âœ… Merged dataset saved to: {output_path}")

# === Preview ===
print("\nðŸ“„ Sample of merged dataset:")
print(merged_df.head())


âœ… Merge completed successfully.
ðŸ“… Date range: 2019-01-02 to 2023-12-15
ðŸ§¾ Shape: (1249, 13)
âœ… Merged dataset saved to: /home/u762545/Thesis/Data/merged_stock_sentiment.csv

ðŸ“„ Sample of merged dataset:
        Date       Close        High         Low        Open      Volume  \
0 2019-01-02  148.757462  149.593065  144.915585  145.021237  58576700.0   
1 2019-01-03  143.897461  147.201462  143.580504  146.567563  74820200.0   
2 2019-01-04  150.054062  150.793627  145.741566  146.317838  74709300.0   
3 2019-01-07  151.840500  152.580065  149.938776  150.428610  52059300.0   
4 2019-01-08  153.214020  153.780693  150.985725  153.233219  49388700.0   

   Daily Return  sentiment_article_positive  sentiment_article_neutral  \
0           NaN                    0.339598                   0.413057   
1     -0.032671                    0.360568                   0.376129   
2      0.042785                    0.478895                   0.318257   
3      0.011905                   

In [7]:
import pandas as pd

# === Sentiment feature engineering + regression target ===
# Adds five sentiment features and the next-row Close target to merged_df,
# then removes every row that contains a NaN.
#
# NOTE: this cell reassigns merged_df. Re-running it on an already processed
# frame recomputes the rolling means on the trimmed data and drops further
# rows at both ends; re-run the merge cell first to start from the full frame.

# Positive-minus-negative balance for article bodies and for titles.
article_index = (
    merged_df["sentiment_article_positive"] - merged_df["sentiment_article_negative"]
)
title_index = (
    merged_df["sentiment_title_positive"] - merged_df["sentiment_title_negative"]
)

merged_df = merged_df.assign(
    # === Feature 1: Article Sentiment Index (Positive - Negative) ===
    article_sentiment_index=article_index,
    # === Feature 2: Title Sentiment Index (Positive - Negative) ===
    title_sentiment_index=title_index,
    # === Feature 3: Compound Article Sentiment Score ===
    # positive - negative + 0.5 * neutral
    article_sentiment_score=article_index + 0.5 * merged_df["sentiment_article_neutral"],
    # === Features 4 and 5: 3-row moving averages of the two indices ===
    # The window counts rows of merged_df, not calendar days, so it spans
    # whatever dates are present in the frame.
    article_sentiment_ma3=article_index.rolling(window=3).mean(),
    title_sentiment_ma3=title_index.rolling(window=3).mean(),
    # === Regression target: the Close value of the next row ===
    # Computed BEFORE any rows are dropped so each row is paired with the row
    # that immediately follows it in the merged data. Shifting after dropna()
    # would silently pair a row with a later one whenever an interior row had
    # been removed.
    target_close=merged_df["Close"].shift(-1),
)

# Drop incomplete rows in a single pass. dropna() without `subset` removes a
# row when ANY column is NaN: the first rows of the rolling means, the last
# row (no next Close), and rows with NaNs in pre-existing columns.
merged_df = merged_df.dropna()


# === Preview ===
print("âœ… Sentiment feature engineering completed.\n")
print(merged_df[[
    "Date", "article_sentiment_index", "title_sentiment_index",
    "article_sentiment_score", "article_sentiment_ma3", "title_sentiment_ma3"
]].head())



âœ… Sentiment feature engineering completed.

         Date  article_sentiment_index  title_sentiment_index  \
10 2019-01-16                 0.194804               0.099603   
11 2019-01-17                 0.220157               0.124218   
12 2019-01-18                 0.364737               0.224372   
13 2019-01-22                 0.091479               0.099257   
14 2019-01-23                 0.272804               0.226597   

    article_sentiment_score  article_sentiment_ma3  title_sentiment_ma3  
10                 0.378856               0.155633             0.082549  
11                 0.398102               0.196671             0.104878  
12                 0.520226               0.259899             0.149398  
13                 0.290738               0.225458             0.149282  
14                 0.454915               0.243006             0.183409  


In [8]:
# === Step 2: Define target variable ===
# Target: the Close value of the next row (next day present in merged_df).
#
# Build the target only if it is missing. The feature-engineering cell already
# creates "target_close" and drops the final row; repeating shift(-1) followed
# by dropna() here would throw away one more valid row from the end of the
# frame every time this cell is executed.
if "target_close" not in merged_df.columns:
    merged_df = merged_df.assign(target_close=merged_df["Close"].shift(-1))

    # The last row has no following Close, so its target is NaN after the
    # shift. dropna() also removes any other row that contains a NaN.
    merged_df = merged_df.dropna()

# === Preview the result ===
print("âœ… Target column 'target_close' created.")
print(merged_df[[
    "Date", "Close", "target_close",
    "article_sentiment_index", "article_sentiment_ma3"
]].head())


âœ… Target column 'target_close' created.
         Date       Close  target_close  article_sentiment_index  \
10 2019-01-16  155.932175    157.161514                 0.194804   
11 2019-01-17  157.161514    158.717468                 0.220157   
12 2019-01-18  158.717468    155.538361                 0.364737   
13 2019-01-22  155.538361    155.740051                 0.091479   
14 2019-01-23  155.740051    156.748489                 0.272804   

    article_sentiment_ma3  
10               0.155633  
11               0.196671  
12               0.259899  
13               0.225458  
14               0.243006  
