In [4]:
# Smoke test: if this line shows up in the cell output, the kernel is
# executing code and stdout is being captured.
status_message = "✅ Jupyter is running code"
print(status_message)


✅ Jupyter is running code


In [2]:
import sklearn
# Report which scikit-learn release the kernel imported.
installed_version = sklearn.__version__
print("✅ scikit-learn version:", installed_version)


✅ scikit-learn version: 1.6.1


In [3]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the iris features/targets as arrays and hold out 20% of the rows for
# testing; the fixed random_state makes the split repeatable across runs.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training split, then score it
# on the held-out rows.
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print("✅ Test Accuracy:", test_accuracy)


✅ Test Accuracy: 1.0


In [5]:

# -----------------------
# 1. Training Data
# -----------------------
# Each headline is stored next to its category so the two cannot drift out of
# alignment; the parallel lists used by the rest of the cell are derived below.
labeled_headlines = [
    ("Stocks rally as market shows growth", "business"),
    ("Company profits increase in Q4 earnings", "business"),
    ("Football team wins championship final", "sports"),
    ("Olympic games bring record attendance", "sports"),
    ("New discovery in space telescope images", "science"),
    ("Scientists develop cure for rare disease", "science"),
    ("Government passes new education reform", "politics"),
    ("President gives speech at summit", "politics"),
]

train_texts = [headline for headline, _ in labeled_headlines]
train_labels = [category for _, category in labeled_headlines]

# -----------------------
# 2. Preprocess (Bag-of-Words)
# -----------------------
def tokenize(text):
    """Split `text` into lowercase word tokens.

    The text is lowercased and split on whitespace, then ASCII punctuation is
    stripped from both ends of every piece so that "crash!" and "crash" map to
    the same token. Pieces that consist solely of punctuation are dropped.
    Punctuation inside a word (e.g. "world-cup") is left untouched.

    Args:
        text: The headline (or any string) to tokenize.

    Returns:
        A list of lowercase tokens in their original order; empty if `text`
        contains no word characters.
    """
    # Imported here so the function does not depend on another cell's imports.
    import string

    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [token for token in stripped if token]

# Vocabulary = every distinct token seen in the training headlines, sorted so
# that each word is assigned a stable column index.
vocab = sorted({token for headline in train_texts for token in tokenize(headline)})
word2idx = dict(zip(vocab, range(len(vocab))))

def vectorize(text):
    """Convert `text` into a bag-of-words count vector.

    Args:
        text: The string to encode.

    Returns:
        A list with one integer per entry of `vocab`; position `word2idx[w]`
        holds how many times token `w` occurs in `text`. Tokens that are not
        in `word2idx` are ignored.
    """
    counts = [0 for _ in vocab]
    for token in tokenize(text):
        position = word2idx.get(token)
        if position is None:
            # Out-of-vocabulary token: contributes nothing to the vector.
            continue
        counts[position] += 1
    return counts

# Encode every training headline as a count vector; `y` aliases the label
# list, and `classes` is the sorted list of distinct labels.
X = list(map(vectorize, train_texts))
y = train_labels
classes = sorted(set(train_labels))

# -----------------------
# 3. Train a Simple Model (multinomial Naive Bayes with Laplace smoothing)
# -----------------------
from collections import defaultdict
import math

# Tally, for every class, how many training documents it has and how often
# each vocabulary word occurs in those documents.
word_counts = {}
for label in classes:
    word_counts[label] = [0] * len(vocab)
class_counts = defaultdict(int)

for doc_vector, label in zip(X, y):
    class_counts[label] += 1
    running_totals = word_counts[label]
    for position, occurrences in enumerate(doc_vector):
        running_totals[position] += occurrences

# Turn the tallies into log probabilities so prediction can add scores
# instead of multiplying many small numbers.
n_documents = len(y)
class_priors = {}
word_log_probs = {}

for label in classes:
    # Log of the fraction of training documents that belong to this class.
    class_priors[label] = math.log(class_counts[label] / n_documents)

    # Laplace (add-one) smoothing: every word gets one extra count, so the
    # denominator grows by the vocabulary size and no probability is zero.
    denominator = sum(word_counts[label]) + len(vocab)
    word_log_probs[label] = [
        math.log((count + 1) / denominator) for count in word_counts[label]
    ]

# -----------------------
# 4. Prediction Function
# -----------------------
def predict(text):
    """Return the most likely class label for `text`.

    The score of a class is its log prior plus, for every vocabulary word
    present in `text`, the word's count times its log probability under that
    class. Words outside the vocabulary do not appear in the count vector and
    therefore do not affect the score.

    Args:
        text: The headline to classify.

    Returns:
        The label from `classes` with the highest score. On a tie, the label
        that comes first in `classes` is returned.
    """
    counts = vectorize(text)

    def log_score(label):
        # Accumulate in vocabulary order, starting from the class prior.
        total = class_priors[label]
        for position, count in enumerate(counts):
            if count <= 0:
                continue
            total += count * word_log_probs[label][position]
        return total

    scores = {label: log_score(label) for label in classes}
    return max(scores, key=scores.get)

# -----------------------
# 5. Test Predictions
# -----------------------
# Headlines that were not part of the training data.
test_texts = [
    "NASA launches new satellite",
    "Team wins the world cup",
    "Stock prices fall in market crash",
    "Prime minister announces new law",
]

# Classify each headline and print it with its predicted category. Only words
# shared with the training vocabulary influence a prediction, so a headline
# with little overlap is decided by very few words.
for headline in test_texts:
    category = predict(headline)
    print(f"Headline: {headline}\n Predicted Category: {category}\n")


Headline: NASA launches new satellite
 Predicted Category: politics

Headline: Team wins the world cup
 Predicted Category: sports

Headline: Stock prices fall in market crash
 Predicted Category: business

Headline: Prime minister announces new law
 Predicted Category: politics

