# EMNIST Letter OCR – Run Pipelines and App

This notebook lets you:

- Run several `OCRPipeline` configurations (baseline, augmentation, PCA).
- Save artifacts for the GUI.
- Plot a confusion matrix for the Random Forest model.
- Optionally launch the Streamlit app.



In [None]:
from pathlib import Path

from src.ocr_project.pipeline import OCRPipeline
from src.ocr_project.features.pca_features import PCAFeatures
from src.ocr_project import config


def run_pipeline_config(name: str, **kwargs):
    """Run one ``OCRPipeline`` configuration and print each model's accuracy.

    Args:
        name: Label shown in the banner line printed before the run.
        **kwargs: Forwarded unchanged to the ``OCRPipeline`` constructor.

    Returns:
        The object returned by ``OCRPipeline.run()``. It is iterated with
        ``.items()`` and every value is expected to expose ``.accuracy``.
    """
    banner = f"\n=== Running pipeline: {name} ==="
    print(banner)

    outcome = OCRPipeline(**kwargs).run()
    for label, model_result in outcome.items():
        line = f"  {label}: accuracy = {model_result.accuracy:.4f}"
        print(line)

    return outcome



In [None]:
# The three experiments below run in this order.
# NOTE(review): presumably every run writes its artifacts to the same
# location, so a later run replaces the files of an earlier one - confirm.

# Experiment 1 - baseline: 10k train / 2k test samples, raw pixels,
# augmentation switched off.
baseline_results = run_pipeline_config(
    name="baseline_small",
    train_limit=10_000,
    test_limit=2_000,
    use_augmentation=False,
    feature_extractor=None,
)

# Experiment 2 - identical subset sizes and raw pixels, augmentation on.
aug_results = run_pipeline_config(
    name="augmented_small",
    train_limit=10_000,
    test_limit=2_000,
    use_augmentation=True,
    feature_extractor=None,
)

# Experiment 3 - larger subset (20k train / 5k test), augmentation on,
# features produced by a 50-component PCAFeatures extractor.
pca_results = run_pipeline_config(
    name="augmented_pca",
    train_limit=20_000,
    test_limit=5_000,
    use_augmentation=True,
    feature_extractor=PCAFeatures(n_components=50),
)



In [None]:
# Plot a confusion matrix for the Random Forest model

from src.ocr_project.io.dataset_loader import EmnistLoader
from src.ocr_project.io.persistence import load_model_artifact
from src.ocr_project.preprocess.transformer import Transformer
from src.ocr_project.evaluation.visualizer import Visualizer

# Rebuild the evaluation inputs with limit=5000, the same test_limit that the
# PCA run passes to the pipeline.
loader = EmnistLoader(config.EMNIST_LETTERS_TRAIN, config.EMNIST_LETTERS_TEST)
test_split = loader.load_test(limit=5000)
y_test = test_split.labels
# Images are flattened first, then normalized.
X_test = Transformer.normalize(Transformer.flatten(test_split.images))

# Read the Random Forest artifact from disk; the fitted estimator is stored
# under the "model" key.
# NOTE(review): the last pipeline run above used PCAFeatures(n_components=50).
# If the saved model was fitted on PCA-transformed features, X_test (raw
# normalized pixels) would need the same transform before predict() - confirm
# what the artifact contains.
rf_artifact = load_model_artifact(Path("artifacts", "random_forest.pkl"))
rf_model = rf_artifact["model"]

# Predict on the test inputs and draw the confusion matrix of raw counts
# (normalize=False).
y_pred = rf_model.predict(X_test)
Visualizer.plot_confusion_matrix(y_test, y_pred, normalize=False)



In [None]:
# Optional: launch the Streamlit app from the notebook.
# The `!` shell escape runs the command in the foreground, so this cell keeps
# the kernel busy until the Streamlit process exits (interrupt the kernel to
# stop it). Skip this cell when using "Run All".
# You can also run this from a terminal instead:
#   streamlit run app.py

!streamlit run app.py


# EMNIST Letter OCR – Pipeline and App

This notebook lets you:

- Train the Decision Tree and Random Forest models via the `OCRPipeline` class.
- Save model artifacts for the Streamlit GUI.
- (Optionally) launch the Streamlit app from within the notebook.



In [1]:
from pathlib import Path

from src.ocr_project.pipeline import OCRPipeline, run_default_pipeline
from src.ocr_project import config

# Option 1: train through the convenience helper, which relies on the
# default config.
results = run_default_pipeline()

# Print one summary line per model, accuracy shown with four decimals.
for model_name, model_result in results.items():
    print(f"{model_name}: accuracy = {model_result.accuracy:.4f}")

# Option 2: build the pipeline yourself with explicit paths (example only;
# uncomment to use instead of Option 1).
# pipeline = OCRPipeline(
#     train_csv=config.EMNIST_LETTERS_TRAIN,
#     test_csv=config.EMNIST_LETTERS_TEST,
#     artifacts_dir=Path("artifacts"),
# )
# results = pipeline.run()
# results


decision_tree: accuracy = 0.5574
random_forest: accuracy = 0.8354


In [None]:
# Optional: launch the Streamlit app from the notebook.
# The `!` shell escape runs the command in the foreground, so this cell keeps
# the kernel busy until the Streamlit process exits (interrupt the kernel to
# stop it). Skip this cell when using "Run All".
# You can also run this from a terminal instead:
#   streamlit run app.py

!streamlit run app.py
