In [1]:
import os
import pandas as pd
import yfinance as yf
from kaggle.api.kaggle_api_extended import KaggleApi
from datetime import datetime, timedelta

In [2]:

# Pipeline settings
# Every downloaded and derived CSV is written under data/raw, so make sure the
# folder exists up front (a no-op when it is already there).
os.makedirs(name="data/raw", exist_ok=True)

# Ticker used to filter the tweets and to request prices (TSLA = Tesla).
STOCK_SYMBOL = "TSLA"

In [3]:
# --------------------------------------------------------------------------
# STEP 1: Download and Process Kaggle Dataset (Tweets)
# --------------------------------------------------------------------------
def download_tweets():
    """Download the Kaggle stock-tweets dataset and save the STOCK_SYMBOL subset.

    Downloads ``equinxx/stock-tweets-for-sentiment-analysis-and-prediction``
    into ``data/raw``, keeps the rows whose ``Stock Name`` equals
    ``STOCK_SYMBOL``, and writes their ``date``/``text`` columns to
    ``data/raw/{STOCK_SYMBOL}_tweets.csv``.

    If the download or processing fails, an empty CSV with the same two
    columns is written instead so the output path always exists.

    Returns:
        tuple: ``(first_date, last_date)`` of the saved tweets as parsed by
        ``pd.to_datetime``, or ``(None, None)`` when the download/processing
        failed, no tweets matched the symbol, or the dates could not be
        parsed.
    """
    try:
        api = KaggleApi()
        api.authenticate()
        api.dataset_download_files(
            "equinxx/stock-tweets-for-sentiment-analysis-and-prediction",
            path="data/raw",
            unzip=True
        )
        print("Kaggle dataset downloaded and unzipped")

        # Load the full dataset and keep only the configured symbol's tweets
        tweets_df = pd.read_csv("data/raw/stock_tweets.csv")
        tsla_tweets = tweets_df[tweets_df['Stock Name'] == STOCK_SYMBOL].copy()

        # Standardize column names
        tsla_tweets = tsla_tweets.rename(columns={
            'Date': 'date',
            'Tweet': 'text'
        })[['date', 'text']]  # Keep only needed columns

        # Save processed tweets (dates are written exactly as they were read)
        tsla_tweets.to_csv(f"data/raw/{STOCK_SYMBOL}_tweets.csv", index=False)
        print(f"Saved {len(tsla_tweets)} {STOCK_SYMBOL} tweets")

    except Exception as e:
        print(f"Error processing tweets: {e}")
        # Create empty file
        pd.DataFrame(columns=['date', 'text']).to_csv(
            f"data/raw/{STOCK_SYMBOL}_tweets.csv",
            index=False
        )
        print(f"Created empty {STOCK_SYMBOL}_tweets.csv as fallback")
        return None, None

    # The date range is computed outside the try-block above on purpose: the
    # tweets file is already saved at this point, and a date-parsing problem
    # must not replace it with the empty fallback file.
    try:
        tweet_dates = pd.to_datetime(tsla_tweets['date'])
        first_date, last_date = tweet_dates.min(), tweet_dates.max()
    except (ValueError, TypeError) as e:
        print(f"Could not determine tweet date range: {e}")
        return None, None

    # min()/max() of an empty (or all-missing) column is NaT, which is not a
    # usable bound for the price download; report "no range" instead.
    if pd.isna(first_date) or pd.isna(last_date):
        print(f"No dated {STOCK_SYMBOL} tweets found")
        return None, None

    return first_date, last_date

In [4]:
# --------------------------------------------------------------------------
# STEP 2: Fetch Stock Prices (Aligned with Tweets)
# --------------------------------------------------------------------------
def download_stock_prices(start_date=None, end_date=None):
    """Download daily prices for STOCK_SYMBOL and save them as a CSV.

    The result is written to ``data/raw/{STOCK_SYMBOL}_prices.csv`` with the
    date index turned into a regular column. If the download raises or
    returns no rows, an empty CSV with the columns
    ``Date, Open, High, Low, Close, Volume`` is written instead.

    Args:
        start_date: First day to fetch (anything ``pd.Timestamp`` accepts).
            When either bound is missing (``None``/``NaT``/falsy), the last
            year of data is requested instead.
        end_date: Last day to fetch, inclusive.

    Returns:
        None. The function communicates through the CSV file and printed
        status messages.
    """
    try:
        # A bound is usable only if it is truthy and not NaT/NaN.
        has_range = all(
            bound and not pd.isna(bound) for bound in (start_date, end_date)
        )

        if has_range:
            range_start = pd.Timestamp(start_date)
            range_end = pd.Timestamp(end_date)
            print(f"Fetching stock data for {range_start.date()} to {range_end.date()}")
            stock_data = yf.download(
                STOCK_SYMBOL,
                start=range_start,
                # One extra day so that end_date itself is covered
                # (yfinance documents `end` as exclusive).
                end=range_end + timedelta(days=1),
                progress=False
            )
        else:
            print("Using default 1-year period for stock data")
            stock_data = yf.download(
                STOCK_SYMBOL,
                period="1y",
                progress=False
            )

        # yfinance usually reports a failed download by returning an empty
        # frame rather than raising; send that case to the fallback below
        # instead of saving a header-only file and reporting success.
        if stock_data is None or stock_data.empty:
            raise ValueError(f"no price data returned for {STOCK_SYMBOL}")

        # Recent yfinance versions return two-level columns (field, ticker)
        # even for a single symbol, which would produce a two-row CSV header.
        # Keep only the first level, which holds the field names (Open,
        # Close, ...) under yfinance's default column grouping.
        if isinstance(stock_data.columns, pd.MultiIndex):
            stock_data.columns = stock_data.columns.get_level_values(0)

        stock_data = stock_data.reset_index()
        stock_data.to_csv(
            f"data/raw/{STOCK_SYMBOL}_prices.csv",
            index=False
        )
        print(f"Saved {len(stock_data)} days of stock data")

    except Exception as e:
        print(f"Yahoo Finance failed: {e}")
        pd.DataFrame(columns=['Date', 'Open', 'High', 'Low', 'Close', 'Volume']).to_csv(
            f"data/raw/{STOCK_SYMBOL}_prices.csv",
            index=False
        )
        print(f"Created empty {STOCK_SYMBOL}_prices.csv as fallback")

In [5]:
# --------------------------------------------------------------------------
# RUN PIPELINE
# --------------------------------------------------------------------------
print("🔍 Starting data collection...")

# Tweets come first: their (first, last) dates decide which price window to fetch.
date_range = download_tweets()

if not all(date_range):
    # At least one bound is missing, so let the price download pick its default window.
    print("⚠️ Proceeding with default stock data download")
    download_stock_prices()
else:
    download_stock_prices(*date_range)

# Summary of where the two output files were written.
print("\n✅ Data collection complete!")
print(f"- Tweets: data/raw/{STOCK_SYMBOL}_tweets.csv")
print(f"- Prices: data/raw/{STOCK_SYMBOL}_prices.csv")

🔍 Starting data collection...
Dataset URL: https://www.kaggle.com/datasets/equinxx/stock-tweets-for-sentiment-analysis-and-prediction
Kaggle dataset downloaded and unzipped
Saved 37422 TSLA tweets
Fetching stock data for 2021-09-30 to 2022-09-29
YF.download() has changed argument auto_adjust default to True
Saved 253 days of stock data

✅ Data collection complete!
- Tweets: data/raw/TSLA_tweets.csv
- Prices: data/raw/TSLA_prices.csv
