In [None]:
# # Task 1: Preprocess and Explore the Data

# ## Executive Summary
# **Objective:** Prepare and analyze historical financial data for TSLA, BND, and SPY to inform forecasting models.

# **Scope:**
# 1.  **Data Extraction:** Jan 1, 2015 – Jan 15, 2026 (YFinance)
# 2.  **Cleaning:** Handling missing values and ensuring correct data types.
# 3.  **EDA:** Visualizing trends, volatility, and rolling statistics.
# 4.  **Analysis:** Testing for stationarity (ADF) and calculating risk metrics (VaR, Sharpe).
# 5.  **Scaling:** Normalizing data for potential deep learning models.

In [7]:
# 1. Setup and Imports
import sys
import os

# Add the parent of the current working directory (the project root when the
# notebook is run from its own folder) to sys.path so that the `src` package
# can be imported. The membership check keeps the cell idempotent: re-running
# it does not append a duplicate entry each time.
project_root = os.path.abspath(os.path.join('..'))
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import custom modules from src/
from src.data_loader import fetch_financial_data, clean_data
from src.analysis import calculate_daily_returns, check_stationarity, calculate_risk_metrics, detect_outliers

# Notebook-wide plot appearance: seaborn's white grid theme, then the default
# figure size and font size applied in a single rcParams update.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'font.size': 12,
})

ModuleNotFoundError: No module named 'pandas'

In [None]:
## 2. Data Extraction & Cleaning
We fetch data for **Tesla (TSLA)** (High Growth), **Vanguard Total Bond Market ETF (BND)** (Safety), and **SPDR S&P 500 ETF (SPY)** (Benchmark).

In [None]:
# Universe and date range for the analysis
tickers = ['TSLA', 'BND', 'SPY']
start_date = '2015-01-01'
end_date = '2026-01-15'

# Fetch Data
raw_data = fetch_financial_data(tickers, start_date, end_date)

# Clean Data (Handle Missing Values, Fix Index)
data = clean_data(raw_data)

# Extract Adjusted Close Prices (Best for analysis as it accounts for splits/dividends)
# Note: Depending on yfinance version, the structure might be different.
# First attempt assumes MultiIndex columns laid out as (Ticker, Price Type).
try:
    close_df = data.xs('Adj Close', level=1, axis=1)
except (KeyError, TypeError, IndexError):
    # Only layout mismatches are handled here, so that unrelated failures
    # (typos, interrupts, a missing `data` variable) still surface:
    #   KeyError   - 'Adj Close' is not a label on column level 1
    #   TypeError  - the columns are not a MultiIndex at all (flat frame)
    #   IndexError - the column index has no level 1
    # Fallback: select by top-level column label, preferring 'Adj Close'
    # and using 'Close' when no adjusted column is present.
    close_df = data['Adj Close'] if 'Adj Close' in data else data['Close']

# Ensure we have the specific tickers (raises KeyError if any is missing)
close_df = close_df[tickers]

# Display first 5 rows to verify structure
print("Data Shape:", close_df.shape)
close_df.head()

In [None]:
## 3. Exploratory Data Analysis (EDA)

### Visualization A: Closing Price Over Time
Comparing the growth trajectory of the three assets.

In [None]:
# One line per column of close_df; DataFrame.plot returns the Axes it drew on,
# so titles and labels are set on that object directly.
price_ax = close_df.plot(linewidth=1.5)
price_ax.set_title('Historical Closing Prices (2015-2026)')
price_ax.set_ylabel('Price (USD)')
price_ax.set_xlabel('Date')
price_ax.legend(loc='upper left')
plt.show()

In [None]:
### Visualization B: Daily Returns (Volatility Analysis)
We calculate daily percentage changes to observe volatility clusters.

In [None]:
# Daily returns for every ticker, computed by the project helper
returns_df = calculate_daily_returns(close_df)

# Overlay only TSLA and BND so the contrast in volatility is easy to see
returns_fig, returns_ax = plt.subplots(figsize=(14, 6))
for symbol in ('TSLA', 'BND'):
    returns_ax.plot(returns_df.index, returns_df[symbol], label=symbol, alpha=0.6)
returns_ax.set_title('Daily Returns: Volatility Comparison')
returns_ax.set_ylabel('Daily % Change')
returns_ax.legend()
plt.show()

In [None]:
### Visualization C: Rolling Mean & Standard Deviation
**Rubric Requirement:** Analyze short-term trends and fluctuations.

In [None]:
# Rolling window of 30 rows. With one row per trading day that is roughly six
# calendar weeks (a trading month is closer to 21 rows).
rolling_window = 30
tsla_rolling = close_df['TSLA'].rolling(window=rolling_window)
rolling_mean = tsla_rolling.mean()
rolling_std = tsla_rolling.std()

fig, ax1 = plt.subplots(figsize=(14, 7))

# Left axis: raw price (faded) with the rolling mean drawn on top
color = 'tab:blue'
ax1.set_xlabel('Date')
ax1.set_ylabel('Price', color=color)
ax1.plot(close_df.index, close_df['TSLA'], label='TSLA Price', color=color, alpha=0.3)
ax1.plot(rolling_mean.index, rolling_mean, label=f'{rolling_window}-Day Rolling Mean', color='orange', linewidth=2)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # second axes sharing the same x-axis, with its own y scale

# Right axis: rolling standard deviation as the volatility proxy
color = 'tab:green'
ax2.set_ylabel('Volatility (Std Dev)', color=color)
ax2.plot(rolling_std.index, rolling_std, label=f'{rolling_window}-Day Rolling Std Dev', color=color, linestyle='--')
ax2.tick_params(axis='y', labelcolor=color)

# The lines live on two different axes, so collect the handles from both and
# draw a single legend; without this the labels set above are never shown.
price_handles, price_labels = ax1.get_legend_handles_labels()
vol_handles, vol_labels = ax2.get_legend_handles_labels()
ax2.legend(price_handles + vol_handles, price_labels + vol_labels, loc='upper left')

plt.title('TSLA: Price Trend vs. Rolling Volatility')
fig.tight_layout()
plt.show()

In [None]:
## 4. Outlier Detection
Identifying days with extreme returns (greater than 3 standard deviations).

In [None]:
# Flag extreme TSLA return days using the project helper
tsla_returns = returns_df['TSLA']
tsla_outliers = detect_outliers(tsla_returns)

# All returns in the background, flagged days highlighted in red on top
outlier_fig, outlier_ax = plt.subplots(figsize=(14, 6))
outlier_ax.scatter(tsla_returns.index, tsla_returns, alpha=0.3, label='Normal Returns')
outlier_ax.scatter(tsla_outliers.index, tsla_outliers, color='red', label='Outliers (>3 Std Dev)')
outlier_ax.set_title('TSLA Return Anomalies (Outlier Detection)')
outlier_ax.legend()
plt.show()

print(f"Detected {len(tsla_outliers)} outlier days for TSLA.")

In [None]:
## 5. Stationarity Test (ADF Test)
We use the Augmented Dickey-Fuller test to check if the data is stationary (a requirement for ARIMA models).

*   **H0:** The series is non-stationary.
*   **H1:** The series is stationary.
*   **Rule:** If p-value < 0.05, we reject H0 (Stationary).

In [None]:
# Run the stationarity check on price levels first, then on daily returns,
# printing a banner before each result.
stationarity_inputs = (
    ("--- Stationarity: TSLA Closing Prices ---", close_df['TSLA']),
    ("\n--- Stationarity: TSLA Daily Returns ---", returns_df['TSLA']),
)
for banner, series in stationarity_inputs:
    print(banner)
    print(check_stationarity(series))

In [None]:
**Interpretation:** 
*   **Prices:** Usually have a high p-value (Non-stationary). This confirms we must use differencing ('d' parameter) in ARIMA.
*   **Returns:** Usually have a low p-value (Stationary). This confirms returns are safe to model directly.

In [None]:
## 6. Risk Metrics Analysis
Calculating Value at Risk (VaR) and Sharpe Ratio to quantify risk/reward.

In [None]:
# Compute risk metrics for the daily returns via the project helper from
# src.analysis and keep the result for inspection.
# NOTE(review): the return type and the exact metrics (presumably VaR and
# Sharpe ratio, per the section heading) are defined in src/analysis - confirm there.
risk_metrics = calculate_risk_metrics(returns_df)
# Plain-text dump of whatever the helper returned
print(risk_metrics)

In [None]:
## 7. Data Normalization/Scaling
For Machine Learning models (like LSTM in Task 2), data often needs to be scaled between 0 and 1.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler needs a 2-D array: selecting with a list keeps TSLA as a
# one-column frame, which converts to an (n_rows, 1) array.
tsla_values = close_df[['TSLA']].to_numpy()

# Default MinMaxScaler maps the observed min/max of the column onto [0, 1]
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(tsla_values)
# NOTE(review): the scaler is fitted on the full price history here. If it is
# reused for model training, fit it on the training split only so test-period
# prices do not leak into the scaling.

print(f"Scaled Data Sample (First 5):\n {scaled_data[:5]}")

In [None]:
## 8. Export Processed Data
Saving the cleaned data for use in the modeling notebook.

In [None]:
# Output folder, relative to the notebook's working directory; created if absent
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Write each frame to its own CSV inside the processed folder
exports = (
    (close_df, 'cleaned_close_prices.csv'),
    (returns_df, 'daily_returns.csv'),
)
for frame, filename in exports:
    frame.to_csv(f"{processed_dir}/{filename}")

print("Data successfully saved to data/processed/")