# Demo: Softmax Regression on 20 Newsgroups
This notebook walks through the full pipeline: data loading, TF‑IDF vectorization, model training, loss visualization, and evaluation.

In [2]:
import os, sys
# Add project root to sys.path.
# The root is taken to be the parent of the current working directory.
# Guarded so that re-running this cell does not pile up duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

import numpy as np, matplotlib.pyplot as plt
from src.utils import load_data, vectorize
from src.softmax import SoftmaxRegression
# Fetch both splits; each load_data call is unpacked into a (texts, labels) pair.
(train_texts, train_labels), (test_texts, test_labels) = (
    load_data('train'),
    load_data('test'),
)


ModuleNotFoundError: No module named 'numpy'

In [None]:
# Turn the raw texts into TF-IDF feature matrices, capped at 20,000 features.
# The third value returned by vectorize is not needed in this notebook.
X_train, X_test, _ = vectorize(train_texts, test_texts, max_features=20000)

# Model dimensions: feature count is the matrix width, class count is the
# largest label plus one.
# NOTE(review): this assumes labels are integers starting at 0 — confirm.
n_features, n_classes = X_train.shape[1], np.max(train_labels) + 1


In [None]:
# Initialize model
# Seed NumPy's global RNG first so that repeated runs are comparable.
# NOTE(review): this only has an effect if SoftmaxRegression draws its
# randomness (e.g. initial weights, batch order) from np.random — confirm.
np.random.seed(42)
model = SoftmaxRegression(
    n_features=n_features,
    n_classes=n_classes,
    learning_rate=0.05,
    reg_lambda=0.001,
)


In [None]:
# Fit the model for 100 epochs with batches of 128 samples, logging progress.
# The value returned by fit is kept as loss_history.
loss_history = model.fit(
    X_train,
    train_labels,
    epochs=100,
    batch_size=128,
    verbose=True,
)


Epoch 1/100 - Loss: 2.9907
Epoch 10/100 - Loss: 2.9573
Epoch 20/100 - Loss: 2.9243
Epoch 30/100 - Loss: 2.8918
Epoch 40/100 - Loss: 2.8597
Epoch 50/100 - Loss: 2.8279
Epoch 60/100 - Loss: 2.7965
Epoch 70/100 - Loss: 2.7655
Epoch 80/100 - Loss: 2.7349
Epoch 90/100 - Loss: 2.7047
Epoch 100/100 - Loss: 2.6750


In [None]:
# Plot loss curve
# The figure is written to results/loss_curve.png and also rendered in the
# notebook; previously it was closed before the cell finished, so nothing
# was ever displayed.
os.makedirs('results', exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(1, len(loss_history) + 1), loss_history, marker='o')
ax.set_title('Training Loss Curve')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)

# Save before showing: some backends release the figure once it is shown.
fig.savefig(os.path.join('results', 'loss_curve.png'))
plt.show()
plt.close(fig)


In [None]:
# Evaluate on test set
# Coerce both sides to arrays so `==` is always element-wise; comparing two
# plain lists would yield a single bool and a meaningless accuracy.
y_pred = np.asarray(model.predict(X_test))
y_true = np.asarray(test_labels)

# Fail loudly on a shape mismatch instead of letting NumPy broadcast the
# comparison into something that is no longer one result per sample.
if y_pred.shape != y_true.shape:
    raise ValueError(
        f'Prediction shape {y_pred.shape} does not match label shape {y_true.shape}'
    )

accuracy = np.mean(y_pred == y_true)
print(f'Test Accuracy: {accuracy:.4f}')

# Create the output directory here so this cell does not depend on the
# plotting cell having been run first.
os.makedirs('results', exist_ok=True)
with open(os.path.join('results', 'accuracy.txt'), 'w', encoding='utf-8') as f:
    f.write(f'Test Accuracy: {accuracy:.4f}\n')


Test Accuracy: 0.6148
