In [4]:
# INSTALL REQUIRED LIBRARIES (if needed)
# Install necessary libraries for machine learning, NLP, and deep learning
!pip install torch torchvision torchaudio scikit-learn pandas numpy



In [5]:
# IMPORT LIBRARIES
import pandas as pd # For handling tabular data (loading, filtering, etc.)
# From scikit-learn:
from sklearn.model_selection import train_test_split # To split dataset into training and testing subsets
from sklearn.preprocessing import LabelEncoder # To convert text labels into numeric classes
from sklearn.feature_extraction.text import TfidfVectorizer # To transform text into TF-IDF vectors
# From PyTorch:
import torch # Core library for deep learning
from torch.utils.data import DataLoader, TensorDataset # For batching and organizing data for training

# LOAD AND EXPLORE THE DATASET
file_path = 'news.tsv' # Define the path to the dataset file (TSV = Tab-Separated Values)

df_news = pd.read_csv(file_path, sep='\t', header=None) # Load the TSV file into a pandas DataFrame, with no header row specified
# Assign column names based on dataset documentation 
df_news.columns = [
    'NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 
    'URL', 'TitleEntities', 'AbstractEntities'
]
# Display the first few rows of the dataset to inspect structure and content
print("Primeras filas del dataset:")
print(df_news.head())

# SELECT RELEVANT COLUMNS FOR THE TASK
# Keep only the 'Title' and 'Category' columns for classification
# Drop rows with missing values (NaN)
df = df_news[['Title', 'Category']].dropna()

# Display all unique categories in the dataset
print("\nCategorías disponibles:")
print(df['Category'].unique())

# Show a random sample of 5 entries for inspection
print("\nEjemplo de datos utilizados para el entrenamiento:")
print(df.sample(5))

# ENCODE CATEGORIES INTO NUMERIC LABELS

# Initialize the label encoder
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category']) # Convert text categories to integer labels and update the 'Category' column

# SPLIT DATA INTO TRAINING AND TESTING SETS
# Use 80% of the data for training and 20% for testing
# X = news titles (input), y = encoded category labels (target)
X_train, X_test, y_train, y_test = train_test_split(df['Title'], df['Category'], test_size=0.2, random_state=42)

# TRANSFORM TEXT INTO TF-IDF FEATURE VECTORS

vectorizer = TfidfVectorizer(max_features=5000) # Initialize TF-IDF vectorizer (max 5000 most frequent words)
X_train_tfidf = vectorizer.fit_transform(X_train).toarray() # Fit the vectorizer on the training text and convert it to dense vectors
X_test_tfidf = vectorizer.transform(X_test).toarray() # Transform the test text using the already-fitted vectorizer

# CONVERT DATA INTO PYTORCH TENSORS
# Convert TF-IDF feature matrices and labels into PyTorch tensors
X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32) 
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# CREATE DATASETS AND DATALOADERS
# Combine features and labels into PyTorch TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Create DataLoaders to feed data in batches during training
# Shuffle training data to improve learning
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Test loader does not shuffle to preserve data order
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Primeras filas del dataset:
   NewsID   Category               SubCategory  \
0  N88753  lifestyle           lifestyleroyals   
1  N45436       news  newsscienceandtechnology   
2  N23144     health                weightloss   
3  N86255     health                   medical   
4  N93187       news                 newsworld   

                                               Title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1    Walmart Slashes Prices on Last-Generation iPads   
2                      50 Worst Habits For Belly Fat   
3  Dispose of unwanted prescription drugs during ...   
4  The Cost of Trump's Aid Freeze in the Trenches...   

                                            Abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  Apple's new iPad releases bring big deals on l...   
2  These seemingly harmless habits are holding yo...   
3                                                NaN   
4  Lt. Ivan Molchanets peeked over a parapet of s...  

In [6]:
# IMPORT MODULES FOR BUILDING NEURAL NETWORKS
import torch.nn as nn # Provides base classes for building neural network layers
import torch.nn.functional as F # Provides common activation functions like ReLU, etc.

# DEFINE A SIMPLE FEEDFORWARD NEURAL NETWORK
# This class defines the architecture of the neural network for text classification
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        # First fully connected layer:
        # Input: TF-IDF vector of length 'input_dim'
        # Output: a hidden representation of size 'hidden_dim'
        self.fc1 = nn.Linear(input_dim, hidden_dim)
         # Second fully connected layer:
        # Input: hidden representation
        # Output: scores for each of the 'output_dim' classes
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    # Define the forward pass through the network
    def forward(self, x):
        # Apply ReLU activation to the output of the first layer
        x = F.relu(self.fc1(x))
        # Output raw scores (logits) from the second layer
        # (CrossEntropyLoss will handle softmax internally)
        x = self.fc2(x)
        return x

# SET MODEL HYPERPARAMETERS
input_dim = X_train_tfidf.shape[1] # Number of input features = number of TF-IDF terms (columns)
hidden_dim = 100 # Number of hidden units in the middle layer (can be tuned)
output_dim = len(le.classes_) # Number of output classes (equal to the number of unique categories)

# INSTANTIATE THE MODEL
model = TextClassifier(input_dim, hidden_dim, output_dim) # Create an instance of the neural network with the specified dimensions

In [7]:
# IMPORT OPTIMIZER MODULE
import torch.optim as optim # Contains optimization algorithms like SGD, Adam, etc.

# DEFINE LOSS FUNCTION AND OPTIMIZER
# CrossEntropyLoss is commonly used for multi-class classification problems.
# It combines LogSoftmax and Negative Log Likelihood Loss in one function.
criterion = nn.CrossEntropyLoss()
# Adam optimizer is used to update the model weights based on the gradients.
# It is an adaptive optimizer that often works well out-of-the-box.
# lr=0.001 sets the learning rate (step size for weight updates).
optimizer = optim.Adam(model.parameters(), lr=0.001)

# DEFINE THE TRAINING FUNCTION
# This function trains the model over multiple epochs using the provided DataLoader.
def train_model(model, train_loader, criterion, optimizer, epochs=50):
    # Set the model to training mode (activates dropout, etc. if used)
    model.train()
    # Loop through the specified number of epochs
    for epoch in range(epochs):
        total_loss = 0 # Accumulate total loss for reporting
        # Loop through each mini-batch in the training DataLoader
        for inputs, labels in train_loader:
            optimizer.zero_grad() # Reset gradients to zero
            outputs = model(inputs) # Forward pass: compute predictions
            loss = criterion(outputs, labels) # Compute the loss
            loss.backward() # Backward pass: compute gradients
            optimizer.step() # Update model parameters
            total_loss += loss.item() # Accumulate loss for this batch
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}') # Print average loss for the current epoch

# TRAIN THE MODEL
train_model(model, train_loader, criterion, optimizer, epochs=10) # Call the training function for 10 epochs

Epoch 1/10, Loss: 1.2121
Epoch 2/10, Loss: 0.8165
Epoch 3/10, Loss: 0.7180
Epoch 4/10, Loss: 0.6536
Epoch 5/10, Loss: 0.6004
Epoch 6/10, Loss: 0.5490
Epoch 7/10, Loss: 0.4987
Epoch 8/10, Loss: 0.4498
Epoch 9/10, Loss: 0.4008
Epoch 10/10, Loss: 0.3525


In [8]:
# IMPORT METRICS FOR MODEL EVALUATION
from sklearn.metrics import accuracy_score, classification_report
# accuracy_score: computes the percentage of correctly predicted labels
# classification_report: provides precision, recall, f1-score per class

# DEFINE MODEL EVALUATION FUNCTION
def evaluate_model(model, test_loader): # This function evaluates the trained model on the test dataset
    model.eval() # Set the model to evaluation mode (disables dropout, etc.)
    all_preds = [] # To store all predicted labels
    all_labels = [] # To store all true labels
    # Disable gradient calculations for faster and more memory-efficient inference
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs) # Forward pass to get predictions
            _, preds = torch.max(outputs, 1) # Select class with highest predicted score
            all_preds.extend(preds.numpy()) # Store predictions
            all_labels.extend(labels.numpy()) # Store true labels
    
    # Generate a detailed classification report:
    # Includes precision, recall, f1-score and support for each class
    report = classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(le.classes_))), # Specify the range of class indices   
        target_names=le.classes_              # Use original class names for readability
    )
    # Calculate overall accuracy: percentage of correct predictions
    accuracy = accuracy_score(all_labels, all_preds)
    return accuracy, report # Return both metrics for further use

# EVALUATE THE MODEL ON TEST DATA
# Call the evaluation function using the test data loader
accuracy, report = evaluate_model(model, test_loader)
# Print overall test accuracy
print(f'\nAccuracy en test: {accuracy:.4f}')
# Print detailed classification report per class
print('Reporte de clasificación:\n')
print(report)


Accuracy en test: 0.6854
Reporte de clasificación:

               precision    recall  f1-score   support

        autos       0.53      0.55      0.54       612
entertainment       0.43      0.34      0.38       153
      finance       0.53      0.48      0.51      1227
 foodanddrink       0.62      0.64      0.63       881
        games       0.00      0.00      0.00         0
       health       0.60      0.56      0.58       608
         kids       0.00      0.00      0.00        15
    lifestyle       0.46      0.43      0.45       897
   middleeast       0.00      0.00      0.00         0
       movies       0.59      0.48      0.53       179
        music       0.57      0.48      0.52       249
         news       0.68      0.72      0.70      6140
 northamerica       0.00      0.00      0.00         0
       sports       0.89      0.90      0.90      6368
       travel       0.41      0.39      0.40       958
           tv       0.31      0.34      0.32       243
        vid

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:

# FUNCTION TO PREDICT CATEGORY FOR NEW TEXT

# This function takes a raw news title and returns its predicted category.
# Inputs:
# - text: a string (news title)
# - model: the trained classification model
# - vectorizer: the fitted TF-IDF vectorizer
# - label_encoder: the LabelEncoder used to encode original category labels
def predict_category(text, model, vectorizer, label_encoder):
    model.eval() # Set the model to evaluation mode (disables dropout, etc.)
    text_tfidf = vectorizer.transform([text]).toarray() # Convert the input text into a TF-IDF vector using the same vectorizer as during training
    text_tensor = torch.tensor(text_tfidf, dtype=torch.float32) # Convert the TF-IDF vector into a PyTorch tensor
    # Turn off gradient tracking for inference
    with torch.no_grad():
        outputs = model(text_tensor) # Get model predictions (logits)
        _, preds = torch.max(outputs, 1) # Select the index of the class with the highest predicted score
    category = label_encoder.inverse_transform(preds.numpy()) # Convert the predicted class index back into the original category label
    return category[0] # Return the predicted label as a string

# EXAMPLE: PREDICT CATEGORY FOR A SAMPLE NEWS TITLE
# Example input: a new news headline in Spanish 
new_text = "Doctors go strike"
predicted_category = predict_category(new_text, model, vectorizer, le) # Predict the category of the example text using the trained model

# Print the input text and its predicted category
print(f'\nTexto de ejemplo: "{new_text}"')
print(f'Categoría predicha: "{predicted_category}"')


Texto de ejemplo: "Doctors go strike"
Categoría predicha: "health"
