# Notebook 4: Baseline Model - Logistic Regression
**Author:** [Your Name]  
**Date:** December 2024

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Make sure the output folders for saved models and figures exist before
# anything is written to them; existing folders are left untouched.
for output_dir in ('../models', '../visuals/model_performance'):
    os.makedirs(output_dir, exist_ok=True)
print("Setup complete")

In [None]:
# Read the input CSV, parsing the 'date' column as datetimes on load
csv_path = '../data/raw/stock_prices_with_indicators.csv'
data = pd.read_csv(csv_path, parse_dates=['date'])
print(f"Data loaded: {data.shape}")

## 1. Feature Selection

In [None]:
# Columns used as model inputs
features = [
    'momentum_10', 'momentum_20',
    'sma_20', 'sma_50',
    'rsi_14', 'macd', 'atr_14',
    'volatility_20', 'volume_ratio', 'bb_width',
]

# Keep only the inputs plus the target and identifier columns, then drop
# every row that has a missing value in any of them
keep_cols = features + ['trend_label', 'ticker', 'date']
data = data.loc[:, keep_cols].dropna()

print(f"Clean data: {data.shape}")
print(f"Features: {len(features)}")

## 2. Train-Test Split

In [None]:
# Sort by date so the split is chronological: the model is fit on the earlier
# rows and evaluated on the later ones.
data = data.sort_values('date')

if data.empty:
    raise ValueError("No rows available after cleaning; cannot build a train/test split")

# Split ~70-30 on a date boundary. The frame may hold several rows per date
# (e.g. one per ticker), so slicing purely by row position could place rows
# from the same date on both sides of the split. Instead, look up the date at
# the 70% position and send every row from that date onward to the test set.
split_idx = int(len(data) * 0.7)
split_date = data['date'].iloc[split_idx]
is_train = data['date'] < split_date
train = data[is_train]
test = data[~is_train]

if train.empty:
    # Happens when the 70% position still falls on the earliest date.
    raise ValueError("Temporal split produced an empty training set; need more distinct dates")

X_train = train[features]
y_train = train['trend_label']
X_test = test[features]
y_test = test['trend_label']

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

## 3. Scale Features

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both partitions so the test set never influences
# the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled")

## 4. Train Model

In [None]:
# Baseline classifier settings: fixed seed, up to 1000 solver iterations,
# and 'balanced' class weighting
logreg_params = {
    'max_iter': 1000,
    'random_state': 42,
    'class_weight': 'balanced',
}
model = LogisticRegression(**logreg_params).fit(X_train_scaled, y_train)

print("Model trained")

## 5. Evaluate Model

In [None]:
# Predict on the held-out (scaled) test rows
y_pred = model.predict(X_test_scaled)

# Overall accuracy plus the per-class report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

## 6. Confusion Matrix

In [None]:
# Fix the class order explicitly (sorted union of true and predicted labels)
# so the matrix rows/columns and the heatmap tick labels are guaranteed to
# line up. Without tick labels the heatmap shows positional indices
# (0, 1, ...) instead of the actual trend_label values.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix - Logistic Regression')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('../visuals/model_performance/confusion_matrix_logreg.png', dpi=300)
print("Saved: confusion_matrix_logreg.png")

## 7. Save Model

In [None]:
# Persist the fitted classifier and the scaler it was trained with
artifacts = (
    (model, '../models/logistic_regression_model.pkl'),
    (scaler, '../models/scaler.pkl'),
)
for artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)

# Save metrics: a one-row table with the baseline score and partition sizes
metrics = pd.DataFrame({
    'model': ['Logistic Regression'],
    'accuracy': [accuracy],
    'train_size': [len(X_train)],
    'test_size': [len(X_test)],
})
metrics.to_csv('../models/baseline_metrics.csv', index=False)

print("Model saved")
print(f"\nBaseline Accuracy: {accuracy:.2%}")