# Market-Neutral News Sentiment Strategy - Data Preparation
## Step 1: Load and Clean News Sentiment Data

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
from sentiment_signal import SentimentSignal
from feature_engineering import FeatureEngineer

In [None]:
# Mock news-sentiment feed (swap in the real data feed here).
# One row per news item, with columns:
#   ticker, timestamp, p_positive, p_negative, p_neutral, relevance_score, topic
news_df = pd.DataFrame(
    [
        ('AAPL', '2023-01-01 10:00', 0.7, 0.1, 0.2, 0.9, 'earnings'),
        ('AAPL', '2023-01-01 14:00', 0.3, 0.5, 0.2, 0.8, 'regulation'),
        ('MSFT', '2023-01-01 11:00', 0.6, 0.2, 0.2, 0.85, 'product'),
        ('GOOGL', '2023-01-01 15:00', 0.2, 0.6, 0.2, 0.75, 'legal'),
    ],
    columns=[
        'ticker',
        'timestamp',
        'p_positive',
        'p_negative',
        'p_neutral',
        'relevance_score',
        'topic',
    ],
)

# Parse the timestamp strings, then derive the calendar day of each item as a
# plain datetime.date (time of day dropped).
news_df['timestamp'] = pd.to_datetime(news_df['timestamp'])
news_df['date'] = news_df['timestamp'].dt.date
news_df.head()

In [None]:
# Calculate sentiment signals
# SentimentSignal is imported from the project's sentiment_signal module; its
# implementation is not visible in this notebook.
# NOTE(review): relevance_threshold=0.7 presumably filters out news rows whose
# relevance_score is below 0.7 - confirm against SentimentSignal.
signal_calculator = SentimentSignal(relevance_threshold=0.7)
# Two-stage pipeline: aggregate the raw news rows first, then pass that result
# through cross_sectional_rank. sentiment_scores is rebound to the ranked output.
# NOTE(review): presumably one row per (ticker, date) after aggregation - the
# merge step later in this notebook joins this frame on ['ticker', 'date'], so
# both columns must be present in the output.
sentiment_scores = signal_calculator.aggregate_stock_sentiment(news_df)
sentiment_scores = signal_calculator.cross_sectional_rank(sentiment_scores)
sentiment_scores.head()

## Step 2: Load Price and Sector Data

In [None]:
# Mock daily returns (swap in the real price data here): one row per ticker.
returns_df = pd.DataFrame(
    [('AAPL', 0.02), ('MSFT', -0.01), ('GOOGL', 0.015)],
    columns=['ticker', 'return'],
)
# Every mock return is dated 2023-01-02. '.date' converts the parsed timestamps
# to plain datetime.date objects; the column sits between 'ticker' and 'return'.
returns_df.insert(1, 'date', pd.to_datetime(['2023-01-02'] * len(returns_df)).date)

# Ticker -> GICS sector lookup table.
sector_df = pd.DataFrame(
    [
        ('AAPL', 'Technology'),
        ('MSFT', 'Technology'),
        ('GOOGL', 'Communication Services'),
    ],
    columns=['ticker', 'gics_sector'],
)

returns_df.head()

In [None]:
# Calculate sector-relative returns
# FeatureEngineer is imported from the project's feature_engineering module; its
# implementation is not visible in this notebook.
feature_engineer = FeatureEngineer()
# Takes the per-ticker returns and the ticker -> gics_sector table.
# NOTE(review): presumably joins the two on 'ticker' and adds a return measured
# against the sector's average - confirm the output column names in
# FeatureEngineer. The merge step later in this notebook requires the result to
# still carry 'ticker' and 'date'.
returns_with_sector = feature_engineer.calculate_sector_relative_returns(returns_df, sector_df)
returns_with_sector.head()

## Step 3: Merge and Save Processed Data

In [None]:
from pathlib import Path

# Merge sentiment with returns.
# Inner join on (ticker, date): only keys present in BOTH frames survive.
final_df = sentiment_scores.merge(returns_with_sector, on=['ticker', 'date'], how='inner')

# An inner join with no matching keys silently yields an empty frame, so make
# that visible instead of quietly writing a header-only CSV.
# NOTE(review): the mock news in this notebook is dated 2023-01-01 and the mock
# returns 2023-01-02; unless SentimentSignal / FeatureEngineer realign dates,
# this join matches nothing - confirm the intended date alignment.
if final_df.empty:
    print("Warning: merge on ['ticker', 'date'] produced 0 rows - check that dates align")

# 'date' holds datetime.date objects, so convert before using the .dt accessor.
final_df['year'] = pd.to_datetime(final_df['date']).dt.year

# Save processed data
# to_csv does not create missing parent directories; create ../data up front
# so a fresh checkout does not fail with OSError.
output_path = Path('../data/processed_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)
print(f"Saved {len(final_df)} records")