# Jupyter Notebook: Docker Setup and Financial Metrics Exploration
This notebook provides a step-by-step guide for setting up Docker to containerize the project and explores financial metrics computation using PySpark and Delta Lake.

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file when one is available.
# load_dotenv() does not raise for a missing file; it returns False when it did
# not set any variable, so the "not found" case is detected from the return value.
# Any error while reading the file is reported instead of being silently
# swallowed, and the notebook continues with the default paths.
try:
    dotenv_loaded = load_dotenv()
except Exception as exc:
    dotenv_loaded = True  # the failure is reported below, skip the "not found" message
    print(f"Could not load .env file ({exc}), using default paths")

if not dotenv_loaded:
    print("No .env file found, using default paths")

# Data locations: taken from the environment when set, otherwise relative defaults.
DATA_DIR = os.getenv("DATA_DIR", "data")
CLEANED_DELTA_TABLE_PATH = os.getenv(
    "CLEANED_DELTA_TABLE_PATH",
    os.path.join(DATA_DIR, "delta_tables/cleaned_tech_stocks"),
)

# Ensure the table directory exists (no error if it is already there).
# NOTE(review): this creates an empty directory at the Delta table path when the
# pipeline has not produced the table yet — confirm that is intended.
os.makedirs(CLEANED_DELTA_TABLE_PATH, exist_ok=True)

# Setup Dockerfile
The following Dockerfile containerizes the project, including its dependencies and environment setup. Save it as `Dockerfile` in the project root.

```dockerfile
# Use the official slim Python 3.12 image as the base image.
# NOTE(review): this project uses PySpark, which needs a Java runtime, and no JRE
# is installed in this image — confirm whether the dashboard starts Spark.
FROM python:3.12-slim

# Set the working directory; later relative paths (COPY targets, the script
# path in CMD) resolve under /app
WORKDIR /app

# Copy only the requirements file first, so the dependency layer below can be
# reused from the build cache when just the source code changes
COPY requirements.txt ./

# Install dependencies; --no-cache-dir keeps pip's download cache out of the image
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire project (the build context) into the container
COPY . .

# Declare the port for the Streamlit app; it still has to be published at
# run time, e.g. `docker run -p 8501:8501 ...`
EXPOSE 8501

# Default command: run the Streamlit dashboard script
CMD ["streamlit", "run", "src/streamlit_dashboard.py"]
```

# Build Docker Image
Use the `docker build` command to create a Docker image for the project.

In [None]:
# Build the Docker image from the Dockerfile in the current directory and
# report the real outcome: the success message is printed only when
# `docker build` exits with status 0.
import os
import subprocess

try:
    build_result = subprocess.run(
        ["docker", "build", "-t", "stock-market-analysis", "."],
        check=False,  # the exit status is inspected below instead of raising
    )
except FileNotFoundError:
    # Raised when the `docker` executable itself cannot be found.
    print("❌ Docker CLI not found. Install Docker and make sure `docker` is on PATH.")
else:
    if build_result.returncode == 0:
        print("✅ Docker image built successfully.")
    else:
        print(f"❌ Docker build failed with exit code {build_result.returncode}.")

# Run Docker Container
Run the Docker container using the `docker run` command, exposing necessary ports.

In [None]:
# Start the container in detached mode (-d) so this cell returns as soon as the
# container is started; a foreground `docker run` would block the cell until
# the container exits, and the "is running" message would only appear afterwards.
# Stop the container later with `docker stop <container id>`.
import subprocess

try:
    run_result = subprocess.run(
        ["docker", "run", "-d", "-p", "8501:8501", "stock-market-analysis"],
        check=False,  # the exit status is inspected below instead of raising
    )
except FileNotFoundError:
    # Raised when the `docker` executable itself cannot be found.
    print("❌ Docker CLI not found. Install Docker and make sure `docker` is on PATH.")
else:
    if run_result.returncode == 0:
        print("✅ Docker container is running. Access the dashboard at http://localhost:8501.")
    else:
        # Typical causes: the image has not been built, or port 8501 is already in use.
        print(f"❌ docker run failed with exit code {run_result.returncode}.")

# Access the Dashboard
Once the container is running, follow these steps to open the Streamlit dashboard in a web browser:

1. Open your web browser.
2. Navigate to `http://localhost:8501`.
3. Explore the Streamlit dashboard for stock market analysis.

# Import Required Libraries
Import necessary libraries such as PySpark, Delta Lake, and visualization tools.

In [2]:
# Import libraries
import pyspark
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
import pandas as pd
import plotly.express as px

# Load Data from Delta Lake
Load cleaned data from Delta Lake for financial metrics computation.

In [None]:
# Build (or reuse) a Spark session with the Delta Lake SQL extension and
# catalog enabled; the Delta jar is resolved through spark.jars.packages.
# NOTE(review): delta-core_2.12:2.4.0 is the Delta build for Spark 3.4.x /
# Scala 2.12 — confirm it matches the installed pyspark version.
spark = (SparkSession.builder
    .appName("FinancialMetrics")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .getOrCreate())

# Print the path we're trying to load
print(f"Loading data from {CLEANED_DELTA_TABLE_PATH}")

# Load the cleaned table. This fails when the Delta table has not been written
# yet; in that case print guidance and re-raise, so the following cells do not
# run against an undefined `df` (or a stale one left over from an earlier run).
try:
    df = spark.read.format("delta").load(CLEANED_DELTA_TABLE_PATH)
    df.show(5)
    print(f"Successfully loaded data with {df.count()} rows")
except Exception as e:
    print(f"Error loading delta table: {e}")
    print("You may need to run the data processing pipeline first.")
    raise

# Compute Financial Metrics
Calculate metrics like RSI, Moving Averages, and Sharpe Ratio using PySpark.

In [None]:
from pyspark.sql.functions import col, avg, stddev, lag, when
from pyspark.sql.window import Window

# Look-back lengths, in rows per ticker, for each rolling metric.
MA_WINDOW = 20
RSI_WINDOW = 14
SHARPE_WINDOW = 20

# Rows are processed per ticker in date order.
window_spec = Window.partitionBy("Ticker").orderBy("Date")


def _trailing_window(n_rows):
    """Return the per-ticker frame covering the current row and the n_rows - 1 rows before it."""
    return window_spec.rowsBetween(-(n_rows - 1), 0)


# Previous row's close within the same ticker (NULL on each ticker's first row).
prev_close = lag("Close", 1).over(window_spec)

# Moving average of the close over the trailing MA_WINDOW rows.
df = df.withColumn("MA_20", avg("Close").over(_trailing_window(MA_WINDOW)))

# RSI (Relative Strength Index) from simple rolling averages of gains and losses.
df = df.withColumn("Change", col("Close") - prev_close)
# Gain/Loss stay NULL where Change is NULL (first row of a ticker), so that row
# is ignored by avg() instead of being counted as a zero gain and a zero loss.
df = df.withColumn("Gain", when(col("Change") > 0, col("Change")).when(col("Change").isNotNull(), 0))
df = df.withColumn("Loss", when(col("Change") < 0, -col("Change")).when(col("Change").isNotNull(), 0))
df = df.withColumn("Avg_Gain", avg("Gain").over(_trailing_window(RSI_WINDOW)))
df = df.withColumn("Avg_Loss", avg("Loss").over(_trailing_window(RSI_WINDOW)))
# RS is left NULL when there were no losses; the explicit guard avoids a
# division by zero, which raises an error when Spark's ANSI mode is enabled.
df = df.withColumn("RS", when(col("Avg_Loss") != 0, col("Avg_Gain") / col("Avg_Loss")))
# RSI: usual formula when there were losses; 100 when the window had gains but
# no losses; NULL when there is no price movement or no data to base it on.
df = df.withColumn(
    "RSI",
    when(col("Avg_Loss") > 0, 100 - (100 / (1 + col("RS"))))
    .when(col("Avg_Gain") > 0, 100.0),
)

# Rolling Sharpe-style ratio: mean daily return divided by its standard
# deviation over the trailing SHARPE_WINDOW rows (no risk-free rate, not annualized).
df = df.withColumn("Daily_Return", when(prev_close != 0, col("Change") / prev_close))
df = df.withColumn("Mean_Return", avg("Daily_Return").over(_trailing_window(SHARPE_WINDOW)))
df = df.withColumn("Std_Dev_Return", stddev("Daily_Return").over(_trailing_window(SHARPE_WINDOW)))
# NULL when the standard deviation is zero or undefined (fewer than two returns).
df = df.withColumn(
    "Sharpe_Ratio",
    when(col("Std_Dev_Return") > 0, col("Mean_Return") / col("Std_Dev_Return")),
)

df.show(5)

# Visualize Financial Metrics
Visualize the computed metrics as interactive line charts with Plotly Express.

In [None]:
# Collect the columns needed for plotting into a pandas DataFrame, sorted by
# ticker and date: px.line connects points in row order, and Spark does not
# guarantee the order of collected rows without an explicit sort.
pandas_df = (
    df.select("Date", "Ticker", "Close", "MA_20", "RSI", "Sharpe_Ratio")
    .orderBy("Ticker", "Date")
    .toPandas()
)


def plot_metric(frame, metric, title):
    """Return a line chart of `metric` against Date with one line per ticker."""
    return px.line(frame, x="Date", y=metric, color="Ticker", title=title)


# Plot Moving Average
fig_ma = plot_metric(pandas_df, "MA_20", "20-Day Moving Average")
fig_ma.show()

# Plot RSI
fig_rsi = plot_metric(pandas_df, "RSI", "RSI (Relative Strength Index)")
fig_rsi.show()

# Plot Sharpe Ratio
fig_sharpe = plot_metric(pandas_df, "Sharpe_Ratio", "Sharpe Ratio")
fig_sharpe.show()

# Insights and Observations
Key points for interpreting the charts above:

- **Moving Average**: The 20-day moving average smooths out short-term fluctuations and highlights longer-term trends.
- **RSI**: An RSI above 70 is conventionally read as a sign that a stock is overbought, and an RSI below 30 as a sign that it is oversold.
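- **Sharpe Ratio**: A higher Sharpe Ratio indicates better risk-adjusted returns. In this notebook it is computed as the rolling 20-day mean of daily returns divided by their standard deviation, with no risk-free rate subtracted and no annualization.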