# Company Growth Analyzer - End-to-End Demo

This notebook demonstrates the complete pipeline:
1. Data collection from multiple sources
2. Preprocessing and feature engineering
3. Model training with multiple algorithms
4. Explainable predictions
5. Visualization of results

In [None]:
import sys
sys.path.append('..')  # Add parent directory to path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_ingestion.data_aggregator import DataAggregator
from src.preprocessing.preprocessor import DataPreprocessor
from src.preprocessing.feature_engineer import FeatureEngineer
from src.models.model_trainer import ModelTrainer
from src.evaluation.evaluator import ModelEvaluator
from src.explainability.explanation_generator import ExplanationGenerator
from src.visualization.visualizer import ModelVisualizer
from src.utils.helpers import save_dataframe

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) in inches for every figure created after this cell.
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Data Collection

Collect data for a set of tech companies. This includes:
- Financial metrics from Yahoo Finance
- News sentiment analysis
- Competitive positioning within sector

In [None]:
# Tickers of the companies included in this demo
tickers = ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'META', 'NVDA', 'AMD', 'INTC', 'AMZN', 'NFLX']

# Aggregator used for the collection call below
aggregator = DataAggregator()

# Collection settings kept together so they are easy to adjust
collection_options = dict(
    period='2y',
    include_sentiment=True,
    include_market=True,
    save_raw=False,
)

print(f"Collecting data for {len(tickers)} companies...")
df_raw = aggregator.collect_all_data(tickers=tickers, **collection_options)

print(f"\nCollected data shape: {df_raw.shape}")
print(f"Columns: {df_raw.columns[:10].tolist()}...")  # Preview only the first 10 column names

# Leave the first rows as the cell's displayed output
df_raw.head()

## 2. Data Exploration

Take a quick look at the collected data: summary statistics and per-column missing-value counts.

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
summary_stats = df_raw.describe()
print("\nData Summary:", summary_stats, sep="\n")

# NaN count per column; only columns with at least one NaN are printed
missing = df_raw.isna().sum()
columns_with_gaps = missing.loc[missing.gt(0)]
print("\nMissing Values:", columns_with_gaps, sep="\n")

## 3. Preprocessing & Feature Engineering

Create derived features, define the binary target label, then clean and scale the data.

In [None]:
# Column the target label is derived from (the 1-year return)
LABEL_SOURCE_COL = 'returns_1y'

# Engineer features
feature_engineer = FeatureEngineer()
df_features = feature_engineer.engineer_features(df_raw)

print(f"Features after engineering: {df_features.shape[1]}")
print(f"New features added: {df_features.shape[1] - df_raw.shape[1]}")

# Target variable (example): 1 when the 1-year return is above the median over all rows
median_return = df_features[LABEL_SOURCE_COL].median()

# NaN > median evaluates to False, so rows without a return end up with label 0.
# Report how many rows that affects instead of letting it pass silently.
n_missing_returns = int(df_features[LABEL_SOURCE_COL].isna().sum())
if n_missing_returns:
    print(f"Warning: {n_missing_returns} row(s) have no {LABEL_SOURCE_COL} and are labelled 0")

# Add the label and drop the column it was computed from: keeping returns_1y as a
# model input would leak the target, since the label is a direct function of it.
# NOTE(review): other engineered features derived from returns may leak as well — verify
# against FeatureEngineer.
df_engineered = (
    df_features
    .assign(success_label=(df_features[LABEL_SOURCE_COL] > median_return).astype(int))
    .drop(columns=[LABEL_SOURCE_COL])
)

print("\nTarget distribution:")
print(df_engineered['success_label'].value_counts())

In [None]:
# Fit the preprocessor (configured with the 'robust' scaler type) and transform in one call
preprocessor = DataPreprocessor(scaler_type='robust')
df_processed = preprocessor.fit_transform(df_engineered, target_col='success_label')

# Total NaN count across the whole processed frame
remaining_nans = df_processed.isna().sum().sum()

print(f"\nProcessed data shape: {df_processed.shape}")
print(f"Missing values after preprocessing: {remaining_nans}")

## 4. Model Training

Train multiple models and compare performance

In [None]:
# Trainer that owns the train/test split and the fitted models
trainer = ModelTrainer()

# prepare_data returns the four pieces unpacked below
split = trainer.prepare_data(df_processed, target_col='success_label')
X_train, X_test, y_train, y_test = split

# Report the size of each partition and the number of model inputs
for partition_label, partition in (("Training set", X_train), ("Test set", X_test)):
    print(f"{partition_label}: {len(partition)} samples")
print(f"Features: {len(trainer.feature_names)}")

In [None]:
# Algorithms to fit in this run
MODEL_NAMES = ['logistic_regression', 'random_forest', 'xgboost']

print("Training models...")
results = trainer.train_models(model_names=MODEL_NAMES, use_cv=True)

# (printed label, key in the per-model result dict)
score_rows = (("Train Score", 'train_score'), ("Test Score", 'test_score'))

# Print the train/test scores and the cross-validation summary for each model
for model_name, scores in results.items():
    print(f"\n{model_name}:")
    for score_label, score_key in score_rows:
        print(f"  {score_label}: {scores[score_key]:.4f}")
    print(f"  CV Mean: {scores['cv_mean']:.4f} (+/- {scores['cv_std']:.4f})")

## 5. Model Evaluation

Detailed evaluation of model performance

In [None]:
# Classification metrics per trained model, keyed by model name
evaluations = {}

# (printed label, key in the evaluation dict) for the metrics that are always printed
headline_metrics = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1_score'),
)

for model_name in results:
    model_preds = trainer.get_predictions(model_name)

    # 'test_proba' is optional: .get passes None when it is absent
    metrics = ModelEvaluator.evaluate_classification(
        y_true=y_test.values,
        y_pred=model_preds['test_pred'],
        y_proba=model_preds.get('test_proba'),
    )
    evaluations[model_name] = metrics

    print(f"\n{model_name} Evaluation:")
    for metric_label, metric_key in headline_metrics:
        print(f"  {metric_label}: {metrics[metric_key]:.4f}")
    # ROC-AUC is printed only when the evaluator reported it
    if 'roc_auc' in metrics:
        print(f"  ROC-AUC: {metrics['roc_auc']:.4f}")

In [None]:
# Combine the per-model evaluations into a single comparison object
comparison = ModelEvaluator.compare_models(evaluations)
print("\nModel Comparison:", comparison, sep="\n")

# Plot the models against each other on F1 score
ModelVisualizer.plot_model_comparison(comparison, metric='f1_score')

## 6. Explainability

Generate explanations for predictions using SHAP and LIME

In [None]:
# Ask the trainer for the model ranked first by 'test_score'.
# NOTE(review): choosing the model on the test score means the test set is no longer
# an unbiased estimate for the chosen model — confirm this is acceptable for the demo.
best = trainer.get_best_model(metric='test_score')
best_name, best_model = best
print("Best model: {}".format(best_name))

# Explainer built from the chosen model and the training data
explainer = ExplanationGenerator(best_model, X_train, y_train)

In [None]:
# Build the explanation report on the test split
report = explainer.generate_report(X_test, y_test)

# Key under which the report stores the SHAP feature ranking, when present
SHAP_KEY = 'top_features_shap'

if SHAP_KEY in report:
    print("\nTop Features (SHAP):")
    top_features_df = pd.DataFrame(report[SHAP_KEY])
    print(top_features_df.head(10))

    # Bar chart of the ranking (the plot shows up to 15 features, the table above 10)
    plot_options = dict(title="Top Features (SHAP Values)", top_n=15)
    ModelVisualizer.plot_feature_importance(top_features_df, **plot_options)

In [None]:
# Explain individual predictions
# Number of test samples to explain (fewer if the test set is smaller)
N_EXPLAINED = 3

print("\nExplaining individual company predictions...\n")

# X_test.index holds row labels, not positions, so the matching df_processed row
# must be fetched with .loc; .iloc would only be right for a default 0..n-1 index.
# NOTE(review): presumably X_test keeps the labels of df_processed after the split —
# confirm in ModelTrainer.prepare_data.
for i, row_label in enumerate(X_test.index[:N_EXPLAINED]):
    # Fall back to a generic name when the processed frame has no 'company_name' entry
    company_name = df_processed.loc[row_label].get('company_name', f'Company_{i}')

    # explain_prediction is given the position i within X_test
    explanation = explainer.explain_prediction(
        X_test,
        index=i,
        company_name=company_name,
        top_n_features=5
    )

    print("="*80)
    print(explanation['narrative'])
    print()

## 7. Predictions & Rankings

Generate predictions and rank companies by success probability

In [None]:
# Make predictions on all data
# NOTE(review): X_full contains the rows the model was trained on, so this ranking is
# partly in-sample and should not be read as out-of-sample performance.
X_full = df_processed[trainer.feature_names]
predictions = best_model.predict(X_full)
probabilities = best_model.predict_proba(X_full)

# The ranking below pairs model outputs with df_raw columns purely by position.
# Fail early with a clear message if preprocessing changed the number of rows.
assert len(X_full) == len(df_raw), (
    f"Row count differs between df_raw ({len(df_raw)}) and df_processed ({len(X_full)}); "
    "predictions cannot be paired with company names by position"
)

# Create ranking DataFrame, highest success probability first.
# The index keeps each row's original position, which later cells use to refer back to X_full.
ranking_df = pd.DataFrame({
    'company': df_raw['company_name'].values,
    'ticker': df_raw['ticker'].values,
    'prediction': predictions,
    'success_probability': probabilities[:, 1],  # second column of predict_proba
    'actual_return_1y': df_raw['returns_1y'].values
}).sort_values('success_probability', ascending=False)

print("\nCompany Success Ranking:")
print(ranking_df)

# Visualize ranking
ModelVisualizer.plot_company_ranking(
    ranking_df,
    score_col='success_probability',
    name_col='company',
    title="Company Success Probability Ranking"
)

## 8. Comparative Analysis

Compare specific companies head-to-head

In [None]:
# Compare the top and bottom companies of the ranking
N_PER_SIDE = 3  # how many companies to take from each end

top_indices = ranking_df.head(N_PER_SIDE).index.tolist()
bottom_indices = ranking_df.tail(N_PER_SIDE).index.tolist()

# With fewer than 2 * N_PER_SIDE rows, head and tail overlap; dict.fromkeys drops the
# repeated indices while keeping the top-then-bottom order.
compare_indices = list(dict.fromkeys(top_indices + bottom_indices))

company_names = ranking_df.loc[compare_indices, 'company'].tolist()

# Stored under its own name so it does not shadow the model-comparison table
# that an earlier cell keeps in `comparison`.
company_comparison = explainer.explain_comparison(
    X_full,
    indices=compare_indices,
    company_names=company_names
)

print("\nComparative Analysis:")
print(company_comparison['summary'])

## 9. Insights & Conclusions

Key takeaways from the analysis

In [None]:
# Relationship between predicted probability and the actual 1-year return
has_actual_returns = 'actual_return_1y' in ranking_df.columns
if has_actual_returns:
    predicted = ranking_df['success_probability']
    actual = ranking_df['actual_return_1y']

    correlation = predicted.corr(actual)
    print(f"\nCorrelation between predicted probability and actual 1-year return: {correlation:.4f}")

    # Scatter plot of prediction against outcome
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(predicted, actual)
    ax.set_xlabel('Predicted Success Probability')
    ax.set_ylabel('Actual 1-Year Return')
    ax.set_title('Prediction vs Actual Performance')

    # Label every point with its ticker
    for ticker, x_pos, y_pos in zip(ranking_df['ticker'], predicted, actual):
        ax.annotate(ticker, (x_pos, y_pos), fontsize=8)

    ax.grid(True, alpha=0.3)
    plt.show()

# Closing summary, printed line by line
closing_lines = (
    "\nKey Insights:",
    "- Models successfully identify patterns in company success",
    "- XGBoost typically provides best performance with proper explainability",
    "- SHAP values reveal which features truly matter for each prediction",
    "- Multi-source data (financial + sentiment + competitive) improves accuracy",
    "\nLimitations:",
    "- Historical patterns may not predict unprecedented events",
    "- Model requires regular retraining with fresh data",
    "- Predictions are probabilistic, not deterministic",
)
for line in closing_lines:
    print(line)

## 10. Save Results

Export models and reports

In [None]:
# Save the best model through the trainer
trainer.save_model(best_name)
print(f"Saved {best_name} model")

# Save rankings as CSV (save_dataframe comes from the notebook's import cell)
save_dataframe(ranking_df, '../outputs/company_rankings.csv', format='csv')
print("Saved company rankings")

print("\nDemo complete! Check the outputs/ directory for saved files.")