# Import Required Libraries

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from stop_words import get_stop_words
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Load and Explore Dataset

In [17]:
df = pd.read_csv('./Scrapper/polsatnews_articles_clean.csv')
print(df.head())
print(df.info())
print(df['category'].value_counts())
print(f"Total records: {len(df)}")
print("2k records are generally sufficient for a small neural network with balanced categories, but more data can improve performance.")

  category                                              title  \
0   Polska  "Prezydent jest na końcu łańcucha". Siemoniak ...   
1   Polska  Jarosław Sosnowski jak Tomasz Komenda? Siedzi ...   
2   Polska  ZUS przesuwa terminy 800 plus. Sprawdź, kto do...   
3   Polska  Dentysta wyłudził ponad 1,2 mln zł od NFZ. Zap...   
4   Polska  Zderzenie tramwajów w Krakowie. Wielu poszkodo...   

                                                 url  \
0  https://www.polsatnews.pl/wiadomosc/2025-12-03...   
1  https://www.polsatnews.pl/wiadomosc/2025-12-03...   
2  https://www.polsatnews.pl/wiadomosc/2025-12-03...   
3  https://www.polsatnews.pl/wiadomosc/2025-12-03...   
4  https://www.polsatnews.pl/wiadomosc/2025-12-03...   

                         published  \
0  Wed, 03 Dec 2025 20:15:00 +0100   
1  Wed, 03 Dec 2025 20:09:00 +0100   
2  Wed, 03 Dec 2025 19:46:00 +0100   
3  Wed, 03 Dec 2025 19:43:00 +0100   
4  Wed, 03 Dec 2025 18:36:00 +0100   

                                           

# Preprocess Text Data

Zainstaluj wymagane biblioteki: pip install -r requirements.txt. NLTK punkt zostanie pobrany automatycznie.

**Dlaczego nie spaCy?** SpaCy ma znane problemy z kompilacją na macOS z procesorami M1/M2, powodując błędy takie jak '__reduce_cython__'. NLTK jest prostszą alternatywą, stabilną na tym sprzęcie, choć bez zaawansowanych funkcji jak lematyzacja. Dla polskiego tekstu używa podstawowych stopwords z biblioteki stop-words i tokenizacji z NLTK.

In [18]:
nltk.download('punkt')

stop_words = set(get_stop_words('polish'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_text'])

[nltk_data] Downloading package punkt to /Users/mptb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Prepare Training Data

In [19]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_test = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Build Neural Network Model

In [20]:
num_classes = len(label_encoder.classes_)
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

class TextClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

model = TextClassifier(X_train.shape[1], num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the Model

In [21]:
X_train = X_train.to(device)
y_train = y_train.to(device)
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

epochs = 10
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')

Epoch 1/10, Loss: 1.7896
Epoch 2/10, Loss: 1.7220
Epoch 3/10, Loss: 1.6324
Epoch 4/10, Loss: 1.3838
Epoch 5/10, Loss: 0.8426
Epoch 6/10, Loss: 0.4035
Epoch 7/10, Loss: 0.2291
Epoch 8/10, Loss: 0.0930
Epoch 9/10, Loss: 0.0676
Epoch 10/10, Loss: 0.0281


# Evaluate Model Performance

In [22]:
X_test = X_test.to(device)
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, y_pred_classes = torch.max(outputs, 1)

print(f"Accuracy: {accuracy_score(y_test.cpu().numpy(), y_pred_classes.cpu().numpy())}")
print(classification_report(y_test.cpu().numpy(), y_pred_classes.cpu().numpy(), target_names=label_encoder.classes_))

Accuracy: 0.7833333333333333
              precision    recall  f1-score   support

      Biznes       1.00      0.82      0.90        11
        Moto       0.64      1.00      0.78         7
      Polska       0.55      0.60      0.57        10
       Sport       1.00      0.86      0.92         7
 Technologie       0.90      0.75      0.82        12
       Świat       0.77      0.77      0.77        13

    accuracy                           0.78        60
   macro avg       0.81      0.80      0.79        60
weighted avg       0.81      0.78      0.79        60



# Predict Categories on New Articles

In [23]:
def predict_category(text):
    processed = preprocess_text(text)
    vectorized = vectorizer.transform([processed])
    input_tensor = torch.tensor(vectorized.toarray(), dtype=torch.float32).to(device)
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted_class = torch.max(output, 1)
    return label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]

new_article = "Nowy prezydent został wybrany w wyborach."
predicted_category = predict_category(new_article)
print(f"Predicted category: {predicted_category}")

Predicted category: Polska


# Simple Web Application with Flask

In [24]:
# Simple Web Application with Flask

Aby uruchomić aplikację webową, uruchom plik app.py w terminalu:

```bash
python app.py
```

Aplikacja będzie dostępna na http://127.0.0.1:5000

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/mptb/Documents/Studia/Data_Science/2_sem/EDT/Project/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/mptb/Documents/Studia/Data_Science/2_sem/EDT/Project/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/Users/mptb/Documents/Studia/Data_Science/2_sem/EDT/Project/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mptb/Documents/Studia/Data_Science/2_sem/EDT/Project/.venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 711, in initialize
    self.ini

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
