In [14]:
import zlib
from datetime import datetime

import numpy as np
import pandas as pd
import yfinance as yf

# Tickers matching your Excel data
tickers = ["MSFT", "AAPL", "GOOGL", "META", "NVDA", "TSLA", "AMZN", "NFLX"]

# Your date range
start_date = "2024-10-23"
end_date = "2025-10-23"

# Columns written to the CSV, in output order
OUTPUT_COLUMNS = ['ticker', 'date', 'Open', 'High', 'Low', 'Close', 'Volume',
                  'price_change_pct', 'sentiment_score', 'headline_count',
                  'sentiment_label']


def stable_seed(ticker):
    """Return a deterministic 32-bit seed derived from *ticker*.

    The built-in ``hash()`` of a string is salted per interpreter process,
    so seeding from it produces different "random" data on every run.
    CRC32 of the ticker bytes is stable across runs and machines.
    """
    return zlib.crc32(ticker.encode("utf-8"))


def add_sentiment_features(df, ticker):
    """Return a copy of *df* with synthetic sentiment columns added.

    Args:
        df: Price history containing at least ``Open`` and ``Close`` columns.
        ticker: Symbol the history belongs to; also selects the random seed,
            so the same ticker and prices always yield the same output.

    Returns:
        A new DataFrame with the original columns plus ``price_change_pct``,
        ``sentiment_score``, ``headline_count``, ``sentiment_label`` and
        ``ticker``. The input frame is not modified.
    """
    out = df.copy()

    # Intraday move, open -> close, in percent
    out['price_change_pct'] = ((out['Close'] - out['Open']) / out['Open']) * 100

    # A local generator keeps the run reproducible without touching NumPy's
    # global random state.
    rng = np.random.default_rng(stable_seed(ticker))

    # Sentiment: correlated with the price move, plus Gaussian noise,
    # smoothed over two days. The first row has no predecessor and is set to 0.
    noisy = (out['price_change_pct'] / 10) + rng.normal(0, 0.15, len(out))
    smoothed = noisy.rolling(window=2).mean().fillna(0)
    out['sentiment_score'] = smoothed.clip(-1, 1).round(4)

    # Headline count: integers in [5, 19]
    out['headline_count'] = rng.integers(5, 20, size=len(out))

    # include_lowest=True keeps a score clipped to exactly -1 inside the
    # 'Negative' bin; otherwise the right-closed bin (-1, -0.3] excludes it
    # and the label comes out as NaN.
    out['sentiment_label'] = pd.cut(out['sentiment_score'],
                                    bins=[-1, -0.3, 0.3, 1],
                                    labels=['Negative', 'Neutral', 'Positive'],
                                    include_lowest=True)

    out['ticker'] = ticker
    return out


# The download/save part only runs when executed as a script or notebook cell
# (``__name__`` is "__main__" in both), not when the helpers are imported.
if __name__ == "__main__":
    print(f"Generating stock + sentiment data: {start_date} to {end_date}")
    print("="*70)

    all_data = []

    for ticker in tickers:
        print(f"Processing {ticker}...")

        # Get stock data
        stock = yf.Ticker(ticker)
        df = stock.history(start=start_date, end=end_date)

        # A failed or empty download would otherwise fail later on df['Date']
        if df.empty:
            print("  ✗ no data returned, skipping")
            continue

        # Calculate metrics and synthetic sentiment
        df = add_sentiment_features(df, ticker)

        # Format
        df = df.reset_index()
        df['date'] = df['Date'].dt.strftime('%Y-%m-%d')

        # Select columns
        df_clean = df[OUTPUT_COLUMNS]

        all_data.append(df_clean)
        print(f"  ✓ {len(df_clean)} days")

    if not all_data:
        raise ValueError("No price data was downloaded for any ticker; nothing to save.")

    # Combine
    final_df = pd.concat(all_data, ignore_index=True)
    final_df = final_df.sort_values(['date', 'ticker'])

    # Save
    final_df.to_csv('stock_sentiment_dataset.csv', index=False)

    # Count only tickers that actually produced rows so the per-ticker
    # average stays meaningful when some downloads were skipped.
    fetched = final_df['ticker'].nunique()

    print(f"\n✓ SUCCESS! Saved: stock_sentiment_dataset.csv")
    print(f"Rows: {len(final_df)} | Tickers: {fetched} | Days: ~{len(final_df)//fetched}")
    print("\nFirst 10 rows:")
    print(final_df.head(10))

Generating stock + sentiment data: 2024-10-23 to 2025-10-23
Processing MSFT...
  ✓ 250 days
Processing AAPL...
  ✓ 250 days
Processing GOOGL...
  ✓ 250 days
Processing META...
  ✓ 250 days
Processing NVDA...
  ✓ 250 days
Processing TSLA...
  ✓ 250 days
Processing AMZN...
  ✓ 250 days
Processing NFLX...
  ✓ 250 days

✓ SUCCESS! Saved: stock_sentiment_dataset.csv
Rows: 2000 | Tickers: 8 | Days: ~250

First 10 rows:
     ticker        date        Open        High         Low       Close  \
250    AAPL  2024-10-23  232.995637  234.050724  226.704907  229.691010   
1500   AMZN  2024-10-23  188.850006  189.160004  183.690002  184.710007   
500   GOOGL  2024-10-23  164.036595  165.091953  161.219018  162.065292   
750    META  2024-10-23  578.152034  583.166296  560.736823  561.923096   
0      MSFT  2024-10-23  427.649398  427.867760  419.381483  421.436066   
1750   NFLX  2024-10-23  762.830017  763.789978  744.260010  749.289978   
1000   NVDA  2024-10-23  141.989282  142.389161  137.42060