In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

# Load your original sentiment dataset
df_sentiment = pd.read_csv("/content/drive/MyDrive/MRP/news_textrank_with_sentiment.csv", parse_dates=['Date'])

# Keep only necessary columns
df_sentiment = df_sentiment[['Date', 'Stock_symbol', 'sentiment_numeric', 'sentiment_score', 'has_article']]

# Rename for consistency
df_sentiment.rename(columns={
    'Date': 'date',
    'Stock_symbol': 'symbol'
}, inplace=True)

# Aggregate by symbol and date
agg_funcs = {
    'sentiment_numeric': 'mean',
    'sentiment_score': 'mean',
    'has_article': 'sum'
}

sentiment_agg = df_sentiment.groupby(['symbol', 'date'], as_index=False).agg(agg_funcs)

# Rename columns for clarity
sentiment_agg.rename(columns={
    'sentiment_numeric': 'avg_sentiment',
    'sentiment_score': 'avg_sentiment_confidence',
    'has_article': 'news_count'
}, inplace=True)

# Add had_news (1 if news was present that day)
sentiment_agg['had_news'] = (sentiment_agg['news_count'] > 0).astype(int)

# Add day of week (0 = Monday, 6 = Sunday)
sentiment_agg['day_of_week'] = sentiment_agg['date'].dt.dayofweek

# Calculate 7-day rolling std of avg_sentiment (per symbol)
sentiment_agg.sort_values(by=['symbol', 'date'], inplace=True)
sentiment_agg['sentiment_std_7'] = sentiment_agg.groupby('symbol')['avg_sentiment'].transform(lambda x: x.rolling(7, min_periods=1).std().fillna(0))

# Cap news count to avoid large outliers
sentiment_agg['news_count_capped'] = sentiment_agg['news_count'].clip(upper=10)

# Final columns
final_columns = ['date', 'symbol', 'day_of_week', 'had_news', 'avg_sentiment', 'avg_sentiment_confidence', 'sentiment_std_7', 'news_count', 'news_count_capped']
sentiment_final = sentiment_agg[final_columns]

# Preview
sentiment_final.head()

Unnamed: 0,date,symbol,day_of_week,had_news,avg_sentiment,avg_sentiment_confidence,sentiment_std_7,news_count,news_count_capped
0,2016-01-06 00:00:00+00:00,A,2,1,0.0,0.999295,0.0,2,2
1,2016-01-07 00:00:00+00:00,A,3,1,0.5,0.992147,0.353553,2,2
2,2016-02-05 00:00:00+00:00,A,4,1,0.0,0.997771,0.288675,1,1
3,2016-02-10 00:00:00+00:00,A,2,1,0.0,0.999134,0.25,1,1
4,2016-02-15 00:00:00+00:00,A,0,1,0.0,0.999934,0.223607,1,1


In [3]:
# Drop weekend rows: Saturday (5) and Sunday (6)
sentiment_final = sentiment_final[sentiment_final['date'].dt.dayofweek < 5]

# Confirm drop
print(f"Remaining rows after dropping weekends: {len(sentiment_final):,}")

Remaining rows after dropping weekends: 1,725,196


In [4]:
# Basic sanity checks before saving

print(" Null / Missing Value Check ")
print(sentiment_final.isnull().sum())

print("\n Duplicate Check ")
print(f"Number of duplicate rows: {sentiment_final.duplicated().sum()}")

print("\n Data Types and Non-Null Counts ")
print(sentiment_final.info())

print("\n Summary Statistics ")
print(sentiment_final.describe(include='all'))

print("\n Sample Rows ")
display(sentiment_final.head())

 Null / Missing Value Check 
date                        0
symbol                      0
day_of_week                 0
had_news                    0
avg_sentiment               0
avg_sentiment_confidence    0
sentiment_std_7             0
news_count                  0
news_count_capped           0
dtype: int64

 Duplicate Check 
Number of duplicate rows: 0

 Data Types and Non-Null Counts 
<class 'pandas.core.frame.DataFrame'>
Index: 1725196 entries, 0 to 1873205
Data columns (total 9 columns):
 #   Column                    Dtype              
---  ------                    -----              
 0   date                      datetime64[ns, UTC]
 1   symbol                    object             
 2   day_of_week               int32              
 3   had_news                  int64              
 4   avg_sentiment             float64            
 5   avg_sentiment_confidence  float64            
 6   sentiment_std_7           float64            
 7   news_count                int64     

Unnamed: 0,date,symbol,day_of_week,had_news,avg_sentiment,avg_sentiment_confidence,sentiment_std_7,news_count,news_count_capped
0,2016-01-06 00:00:00+00:00,A,2,1,0.0,0.999295,0.0,2,2
1,2016-01-07 00:00:00+00:00,A,3,1,0.5,0.992147,0.353553,2,2
2,2016-02-05 00:00:00+00:00,A,4,1,0.0,0.997771,0.288675,1,1
3,2016-02-10 00:00:00+00:00,A,2,1,0.0,0.999134,0.25,1,1
4,2016-02-15 00:00:00+00:00,A,0,1,0.0,0.999934,0.223607,1,1


In [5]:
# Save the final sentiment features to a new CSV
sentiment_final.to_csv("/content/drive/MyDrive/MRP/news_preprocessed_features_engineered.csv", index=False)