# 02 AI Pipeline: Embeddings + Classifier

In [None]:

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sentence_transformers import SentenceTransformer


In [None]:

# Load AG News and draw a fixed 5,000-row sample from its training split.
dataset = load_dataset("ag_news")

# Use the datasets-native conversion instead of feeding the split through the
# generic DataFrame constructor; random_state pins which rows are sampled.
df = dataset["train"].to_pandas()
df = df.sample(n=5000, random_state=42)

# Lowercase the raw text before it is embedded; labels stay as a NumPy array.
texts = df["text"].str.lower().tolist()
labels = df["label"].values

# Hold out 20% for evaluation. Stratifying on the labels keeps the class
# proportions the same in both sets, so per-class and macro-averaged metrics
# are not distorted by an unlucky draw from this small sample.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


In [None]:

# Load the sentence-embedding model once and reuse it for both splits.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the training texts first, then the test texts, each with a progress bar.
X_train_emb, X_test_emb = (
    embedder.encode(split_texts, show_progress_bar=True)
    for split_texts in (X_train, X_test)
)


In [None]:

# Train a logistic-regression classifier on the precomputed embeddings
# (fit() returns the estimator itself, so the call can be chained).
clf = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42).fit(
    X_train_emb, y_train
)

# Predict on the held-out embeddings.
preds = clf.predict(X_test_emb)

# Overall accuracy plus macro-averaged precision and recall, i.e. the
# unweighted mean of the per-class scores.
acc = accuracy_score(y_test, preds)
prec, rec = (
    metric(y_test, preds, average="macro")
    for metric in (precision_score, recall_score)
)

# Summary lines followed by the per-class breakdown.
print(
    f"Accuracy : {acc:.4f}\n"
    f"Precision: {prec:.4f}\n"
    f"Recall   : {rec:.4f}"
)
print(classification_report(y_test, preds))
