# File 3A: Logistic Regression Model
## Stock Price Volatility & Sentiment ML Project

**Purpose:** Train and evaluate Logistic Regression models (one on the full feature set, one on the PCA components)

**What is Logistic Regression?**
- Simple, interpretable classification model
- Good baseline for binary classification (Up/Down)
- Fast to train
- Works well with linearly separable data

**We'll train:**
- Model with full features (23 features)
- Model with PCA features (7 components)

---

## Step 1: Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# fall back to the pre-rename name so older installs do not fail with OSError.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')

# Must run after style.use(): applying a style sheet can reset the colour cycle.
sns.set_palette('husl')

print('✅ Libraries loaded successfully!')

## Step 2: Load Preprocessed Data

In [None]:
print('='*70)
print('LOADING PREPROCESSED DATA')
print('='*70)

# UPDATE THIS PATH: directory that holds the preprocessed .npy arrays
processed_dir = '/Users/aryan/Desktop/Stock-Price-Volatility-Sentiment-ML/data/processed'

# Full-feature matrices (file names indicate they are already scaled)
X_train = np.load(f'{processed_dir}/X_train_scaled.npy')
X_test = np.load(f'{processed_dir}/X_test_scaled.npy')
# PCA-reduced versions of the same train/test splits
X_train_pca = np.load(f'{processed_dir}/X_train_pca.npy')
X_test_pca = np.load(f'{processed_dir}/X_test_pca.npy')
# Targets; the summary below treats 1 as "Up"
y_train = np.load(f'{processed_dir}/y_train.npy')
y_test = np.load(f'{processed_dir}/y_test.npy')

print('\n✅ Data loaded successfully!')
print('\nData shapes:')
print(f'   X_train (full): {X_train.shape}')
print(f'   X_test (full): {X_test.shape}')
print(f'   X_train (PCA): {X_train_pca.shape}')
print(f'   X_test (PCA): {X_test_pca.shape}')
print(f'   y_train: {y_train.shape}')
print(f'   y_test: {y_test.shape}')

# sum() counts the positive labels and mean() gives their share
# (assumes labels are encoded as 0/1 -- TODO confirm against the preprocessing notebook)
print('\nTarget distribution:')
print(f'   Train - Up: {y_train.sum()}/{len(y_train)} ({y_train.mean()*100:.1f}%)')
print(f'   Test - Up: {y_test.sum()}/{len(y_test)} ({y_test.mean()*100:.1f}%)')

## Step 3: Train Logistic Regression with Full Features

In [None]:
print('\n' + '='*70)
print('TRAINING LOGISTIC REGRESSION - FULL FEATURES')
print('='*70)

# Build the estimator first so the summary below reports the values it will
# actually use instead of separately maintained literals.
# The solver is passed explicitly so the printed solver is true on every
# scikit-learn version, not only those where 'lbfgs' is the default.
lr_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Report the real feature count from the data rather than a hard-coded number
print(f'\nTraining with {X_train.shape[1]} features...')
print('Hyperparameters:')
print(f'   - Solver: {lr_model.solver}')
print(f'   - Max iterations: {lr_model.max_iter}')
print(f'   - Random state: {lr_model.random_state}')

# Train model
lr_model.fit(X_train, y_train)

# This notebook silences warnings globally, so a ConvergenceWarning would be
# invisible; surface the "ran out of iterations" case explicitly.
if lr_model.n_iter_.max() >= lr_model.max_iter:
    print(f'\n⚠️  Solver reached the {lr_model.max_iter}-iteration limit; '
          'the fit may not have converged')

print('\n✅ Model trained successfully!')

## Step 4: Evaluate Full Features Model

In [None]:
# Predict on both splits so training and testing accuracy can be compared
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

# Calculate accuracy
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

print('='*70)
print('RESULTS - FULL FEATURES MODEL')
print('='*70)
print(f'\n📊 Training Accuracy: {train_acc*100:.2f}%')
print(f'📊 Testing Accuracy: {test_acc*100:.2f}%')

# Compare the two accuracies; a gap above 5 percentage points is flagged
diff = train_acc - test_acc
if diff > 0.05:
    print(f'\n⚠️  Overfitting detected! Difference: {diff*100:.2f}%')
elif diff < -0.05:
    # Test accuracy well above train accuracy is NOT underfitting (that would
    # show as poor accuracy on both splits). It usually points to a small or
    # unrepresentative test split, so report it as what it is.
    print(f'\n⚠️  Test accuracy exceeds training accuracy! Difference: {diff*100:.2f}%')
else:
    print(f'\n✅ Good fit! Difference: {diff*100:.2f}%')

# Check target achievement; the threshold is kept in one place so the
# comparison and the printed percentage cannot drift apart
target_acc = 0.55
if test_acc >= target_acc:
    print(f'✅ Target accuracy ({target_acc*100:.0f}%) ACHIEVED!')
else:
    print(f'⚠️  Below target accuracy ({target_acc*100:.0f}%)')

In [None]:
# Per-class breakdown of the full-features model on the test split
banner = '=' * 70
print('\n' + banner)
print('DETAILED CLASSIFICATION REPORT')
print(banner)

# Display names for the two classes, in label order (0 first, then 1)
class_labels = ['Down (0)', 'Up (1)']
report_text = classification_report(
    y_test,
    y_pred_test,
    target_names=class_labels,
    digits=4,
)
print(report_text)

In [None]:
# Confusion Matrix
import os

cm = confusion_matrix(y_test, y_pred_test)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Down (0)', 'Up (1)'],
            yticklabels=['Down (0)', 'Up (1)'],
            annot_kws={'size': 16, 'weight': 'bold'})

plt.title(f'Logistic Regression - Confusion Matrix\nAccuracy: {test_acc*100:.2f}%',
         fontsize=14, fontweight='bold', pad=15)
plt.ylabel('True Label', fontsize=12, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')

plt.tight_layout()

# Derive the output folder from processed_dir (<project>/data/processed) so the
# single "UPDATE THIS PATH" setting also controls where figures are written,
# and create it up front because savefig() does not create missing directories.
project_root = os.path.dirname(os.path.dirname(os.path.normpath(processed_dir)))
visualizations_dir = os.path.join(project_root, 'visualizations')
os.makedirs(visualizations_dir, exist_ok=True)

plt.savefig(os.path.join(visualizations_dir, '03A_lr_confusion_matrix.png'),
           dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: 03A_lr_confusion_matrix.png')

## Step 5: Train Logistic Regression with PCA Features

In [None]:
print('\n' + '='*70)
print('TRAINING LOGISTIC REGRESSION - PCA FEATURES')
print('='*70)

# Report the real component count from the data rather than a hard-coded number
print(f'\nTraining with {X_train_pca.shape[1]} PCA components...')

# Create and train model with PCA features (same hyperparameters as the
# full-features model so the two are directly comparable)
lr_model_pca = LogisticRegression(max_iter=1000, random_state=42)
lr_model_pca.fit(X_train_pca, y_train)

# This notebook silences warnings globally, so a ConvergenceWarning would be
# invisible; surface the "ran out of iterations" case explicitly.
if lr_model_pca.n_iter_.max() >= lr_model_pca.max_iter:
    print(f'\n⚠️  Solver reached the {lr_model_pca.max_iter}-iteration limit; '
          'the fit may not have converged')

print('\n✅ PCA model trained successfully!')

In [None]:
# Evaluate PCA model on the PCA-transformed test split
y_pred_pca_test = lr_model_pca.predict(X_test_pca)
test_acc_pca = accuracy_score(y_test, y_pred_pca_test)

print('='*70)
print('RESULTS - PCA MODEL')
print('='*70)
print(f'\n📊 Testing Accuracy (PCA): {test_acc_pca*100:.2f}%')

# Compare with full features; dimensionalities are read from the arrays so the
# labels stay correct if the preprocessing step changes the feature count
print('\n🔄 Comparison:')
print(f'   Full features ({X_train.shape[1]}): {test_acc*100:.2f}%')
print(f'   PCA features ({X_train_pca.shape[1]}): {test_acc_pca*100:.2f}%')
print(f'   Difference: {(test_acc - test_acc_pca)*100:.2f}%')

# PCA counts as "good" if it keeps at least 95% of the full-feature accuracy
if test_acc_pca >= test_acc * 0.95:
    print('\n✅ PCA maintains good performance with fewer features!')
else:
    print('\n⚠️  PCA significantly reduces performance')

## Step 6: Feature Coefficients Analysis

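Let's see which features carry the most weight in the model's predictions. Coefficient magnitudes are only comparable across features because the model is trained on the scaled inputs (`X_train_scaled.npy`).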

In [None]:
# Get feature coefficients (row 0 is the only row of coef_ for a binary model)
coefficients = lr_model.coef_[0]

# Load feature names, one per line. The encoding is pinned so the result does
# not depend on the platform locale, and blank lines are skipped so a stray
# empty line cannot shift or pad the name list.
with open(f'{processed_dir}/feature_names.txt', 'r', encoding='utf-8') as f:
    feature_names = [name for name in (line.strip() for line in f) if name]

# Fail with an actionable message instead of pandas' generic length error
if len(feature_names) != len(coefficients):
    raise ValueError(
        f'feature_names.txt lists {len(feature_names)} names but the model has '
        f'{len(coefficients)} coefficients; regenerate the preprocessed data'
    )

# Create DataFrame
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
})

# Sort by absolute value so strong negative and strong positive weights rank together
coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

print('='*70)
print('TOP 10 MOST IMPORTANT FEATURES')
print('='*70)
print(coef_df[['Feature', 'Coefficient']].head(10).to_string(index=False))

In [None]:
# Visualize feature importance
import os

plt.figure(figsize=(12, 8))

# coef_df is already sorted by absolute coefficient, so head(10) is the top 10
top_10 = coef_df.head(10)
# Colour encodes the sign of the coefficient
colors = ['green' if x > 0 else 'red' for x in top_10['Coefficient']]

plt.barh(range(len(top_10)), top_10['Coefficient'], color=colors, alpha=0.7, edgecolor='black')
plt.yticks(range(len(top_10)), top_10['Feature'])
plt.xlabel('Coefficient Value', fontsize=12, fontweight='bold')
plt.title('Top 10 Most Important Features\n(Green = Positive influence, Red = Negative influence)',
         fontsize=14, fontweight='bold', pad=15)
plt.axvline(x=0, color='black', linestyle='-', linewidth=1)
plt.grid(alpha=0.3, axis='x')

plt.tight_layout()

# Derive the output folder from processed_dir (<project>/data/processed) so the
# single "UPDATE THIS PATH" setting also controls where figures are written,
# and create it up front because savefig() does not create missing directories.
project_root = os.path.dirname(os.path.dirname(os.path.normpath(processed_dir)))
visualizations_dir = os.path.join(project_root, 'visualizations')
os.makedirs(visualizations_dir, exist_ok=True)

plt.savefig(os.path.join(visualizations_dir, '03A_lr_feature_importance.png'),
           dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: 03A_lr_feature_importance.png')

## Step 7: ROC Curve

In [None]:
# Calculate ROC curve
import os

# Column 1 of predict_proba is the probability of the second class in
# lr_model.classes_ (the "Up" class, assuming labels are 0/1)
y_pred_proba = lr_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the diagonal of a random classifier
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='blue', linewidth=2.5, label=f'Logistic Regression (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier', alpha=0.5)

plt.xlabel('False Positive Rate', fontsize=13, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=13, fontweight='bold')
plt.title('ROC Curve - Logistic Regression', fontsize=16, fontweight='bold', pad=15)
plt.legend(loc='lower right', fontsize=12)
plt.grid(alpha=0.3)

plt.tight_layout()

# Derive the output folder from processed_dir (<project>/data/processed) so the
# single "UPDATE THIS PATH" setting also controls where figures are written,
# and create it up front because savefig() does not create missing directories.
project_root = os.path.dirname(os.path.dirname(os.path.normpath(processed_dir)))
visualizations_dir = os.path.join(project_root, 'visualizations')
os.makedirs(visualizations_dir, exist_ok=True)

plt.savefig(os.path.join(visualizations_dir, '03A_lr_roc_curve.png'),
           dpi=300, bbox_inches='tight')
plt.show()

print('✅ Saved: 03A_lr_roc_curve.png')
print(f'\n📊 AUC Score: {roc_auc:.4f}')

## Step 8: Save Models

In [None]:
import os

print('='*70)
print('SAVING MODELS')
print('='*70)

# Derive the models folder from processed_dir (<project>/data/processed) so the
# single "UPDATE THIS PATH" setting also controls where models are written.
project_root = os.path.dirname(os.path.dirname(os.path.normpath(processed_dir)))
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)

# Save both fitted estimators (full-features and PCA variants)
joblib.dump(lr_model, os.path.join(models_dir, 'logistic_regression.pkl'))
joblib.dump(lr_model_pca, os.path.join(models_dir, 'logistic_regression_pca.pkl'))

print('\n✅ Models saved successfully!')
print('   - logistic_regression.pkl')
print('   - logistic_regression_pca.pkl')
print(f'\nLocation: {models_dir}')