# News Topic Classification

The project involves a large dataset of news articles collected over several years. These articles cover a wide range of topics such as world events, sports, business, and science/technology. Each article headline is labeled with a number from 0 to 3, indicating its category, as described below. 

| Value | Topic        |
|:------|:-------------|
| 0     | World        |
| 1     | Sports       |
| 2     | Business     |
| 3     | Sci/Tech     |


Our goal is to create a model that, given an unknown article headline, can classify it into one of these 4 topics.

This notebook focuses on fine-tuning a transformer to solve the problem at hand.

# Importing the Data Set
Our dataset consists of only two columns, *text* and *label*, as shown below:

In [1]:
import pandas as pd
# Load the labelled training headlines and preview the first ten rows.
df = pd.read_csv('training_data.csv')
df.head(10)

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2
5,"Stocks End Up, But Near Year Lows (Reuters) Re...",2
6,Money Funds Fell in Latest Week (AP) AP - Asse...,2
7,Fed minutes show dissent over inflation (USATO...,2
8,Safety Net (Forbes.com) Forbes.com - After ear...,2
9,Wall St. Bears Claw Back Into the Black NEW Y...,2


## Creating the train functions
###### Evaluating pretrained models as-is to see which one performs best before any fine-tuning.

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

def load_model_and_tokenizer(model_name):
    """
    Build the tokenizer and sequence-classification model for ``model_name``.

    The two AG News BERT checkpoints listed below are loaded explicitly through
    ``BertForSequenceClassification``; every other name is resolved by
    ``AutoModelForSequenceClassification``.

    Returns a ``(tokenizer, model)`` tuple -- note that the tokenizer comes first.
    """
    bert_checkpoints = (
        "lucasresck/bert-base-cased-ag-news",
        "fabriceyhc/bert-base-uncased-ag_news",
    )

    # The tokenizer is created the same way for every checkpoint; only the
    # model class depends on the name.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_cls = (
        BertForSequenceClassification
        if model_name in bert_checkpoints
        else AutoModelForSequenceClassification
    )
    return tokenizer, model_cls.from_pretrained(model_name)

def train_data(training_data, test_data):
    """
    Evaluate each pretrained transformer on ``test_data`` and print its metrics.

    Despite the name, no training happens here: every checkpoint listed in
    ``transformer_models`` is loaded as published, run over the test headlines
    through a ``text-classification`` pipeline, and scored against the integer
    labels.

    Args:
        training_data: Accepted for interface compatibility; not read by this
            function.
        test_data: Table with a ``text`` column (headlines) and a ``label``
            column (integer topic ids).

    Returns:
        None. Accuracy, confusion matrix and classification report are printed
        for each model.

    Raises:
        KeyError: If the pipeline emits a label name that is missing from
            ``label_mapping``.
    """
    X_val = test_data['text']
    y_val = test_data['label']

    transformer_models = [
        "mrm8488/bert-mini-finetuned-age_news-classification"
    ]

    # Translates the label names emitted by the pipeline into the dataset's
    # integer ids. It does not depend on the model, so it is built once.
    label_mapping = {'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}

    # The pipeline is fed a plain list of strings; convert once for all models.
    texts = X_val.tolist()

    for model_name in transformer_models:
        tokenizer, model = load_model_and_tokenizer(model_name)
        transformer_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

        transformer_predictions = transformer_pipeline(texts)
        y_pred_transformer = [label_mapping[pred['label']] for pred in transformer_predictions]

        print('----------------------------------------------------------------')
        print(f"Transformer Model ({model_name}) Accuracy:", accuracy_score(y_val, y_pred_transformer))
        print(f"Transformer Model ({model_name}) Confusion Matrix:")
        print(confusion_matrix(y_val, y_pred_transformer))
        print(f"Transformer Model ({model_name}) Classification Report:")
        print(classification_report(y_val, y_pred_transformer))

# Read both CSV splits from the working directory.
training_data = pd.read_csv('training_data.csv')
test_data = pd.read_csv('test_data.csv')


# Score the pretrained checkpoint(s) on the test split; metrics are printed, nothing is returned.
train_data(training_data, test_data)

----------------------------------------------------------------
Transformer Model (mrm8488/bert-mini-finetuned-age_news-classification) Accuracy: 0.8
Transformer Model (mrm8488/bert-mini-finetuned-age_news-classification) Confusion Matrix:
[[4 0 1 0]
 [1 4 0 0]
 [1 0 4 0]
 [1 0 0 4]]
Transformer Model (mrm8488/bert-mini-finetuned-age_news-classification) Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.80      0.67         5
           1       1.00      0.80      0.89         5
           2       0.80      0.80      0.80         5
           3       1.00      0.80      0.89         5

    accuracy                           0.80        20
   macro avg       0.84      0.80      0.81        20
weighted avg       0.84      0.80      0.81        20



# Finetuning the model

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Fine-tune on a reproducible random subset of the training data
# (presumably to keep training time manageable -- confirm with the author).
train_df = pd.read_csv('training_data.csv')
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and replace=False.
train_df_subset = train_df.sample(n=min(2000, len(train_df)), random_state=42)

test_df = pd.read_csv('test_data.csv')

# Wrap both splits as Hugging Face datasets so they can be tokenized with .map().
train_dataset = Dataset.from_pandas(train_df_subset)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# Start from the same checkpoint that was evaluated without fine-tuning.
model_name = "mrm8488/bert-mini-finetuned-age_news-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=4 matches the four topics (World, Sports, Business, Sci/Tech).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

def tokenize_function(examples, max_length=512):
    """
    Tokenize a batch of headlines to one fixed sequence length.

    Args:
        examples: Batch mapping whose ``"text"`` entry is a list of strings.
        max_length: Number of tokens every example is truncated/padded to.
            Defaults to 512, the upper bound the previous implementation used.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    # The budget is a constant number of tokens. Deriving it from len(text)
    # measures characters rather than tokens, and recomputing it for every
    # batch lets different batches be padded to different lengths (and fails
    # on an empty batch), which breaks stacking examples into tensors later.
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

# Tokenize both splits; batched=True hands tokenize_function a batch of rows per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Rename the target column to "labels" (presumably the name the model/Trainer
# expects -- confirm) and expose only the tensor columns needed for training.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Training configuration: 3 epochs, batch size 8, evaluation after every epoch;
# Trainer output is written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# NOTE(review): the test split is also used as the per-epoch evaluation set,
# so the eval loss reported during training is not from a separate validation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

# Persist model and tokenizer side by side so the directory can be reloaded
# with from_pretrained().
trainer.save_model("./fine-tuned-mrm8488")
tokenizer.save_pretrained("./fine-tuned-mrm8488")


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.26416516304016113, 'eval_runtime': 0.5111, 'eval_samples_per_second': 39.128, 'eval_steps_per_second': 5.869, 'epoch': 1.0}
{'loss': 0.1694, 'grad_norm': 0.19779416918754578, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.21113094687461853, 'eval_runtime': 0.4213, 'eval_samples_per_second': 47.476, 'eval_steps_per_second': 7.121, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.17189940810203552, 'eval_runtime': 0.4064, 'eval_samples_per_second': 49.211, 'eval_steps_per_second': 7.382, 'epoch': 3.0}
{'train_runtime': 1285.4408, 'train_samples_per_second': 4.668, 'train_steps_per_second': 0.583, 'train_loss': 0.15423658752441408, 'epoch': 3.0}


('./fine-tuned-mrm8488\\tokenizer_config.json',
 './fine-tuned-mrm8488\\special_tokens_map.json',
 './fine-tuned-mrm8488\\vocab.txt',
 './fine-tuned-mrm8488\\added_tokens.json',
 './fine-tuned-mrm8488\\tokenizer.json')

In [4]:
# Reload the fine-tuned model from the directory it was saved to after training,
# so the evaluation below runs against the saved artifact.
model = AutoModelForSequenceClassification.from_pretrained("./fine-tuned-mrm8488")

def get_predictions(model, tokenizer, dataset, batch_size=8):
    """
    Run ``model`` over ``dataset`` and collect predicted and true class ids.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``logits`` tensor with one row per example.
        tokenizer: Unused; kept so existing call sites keep working.
        dataset: Dataset whose items provide ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors of uniform shape
            (they are stacked by the default ``DataLoader`` collation).
        batch_size: Number of examples per forward pass.

    Returns:
        ``(predictions, labels)`` -- two lists of ints in dataset order.

    Note:
        The model is moved to the selected device and left in eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Disable dropout and similar train-only layers; without this a model that
    # was just trained (still in train mode) yields non-deterministic predictions.
    model.eval()

    predictions = []
    labels = []

    for batch in torch.utils.data.DataLoader(dataset, batch_size=batch_size):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # The highest-scoring class per row is the prediction.
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
        labels.extend(batch["labels"].tolist())

    return predictions, labels

# Predict on the tokenized test split; returns predicted and true class ids in dataset order.
test_predictions, test_labels = get_predictions(model, tokenizer, tokenized_datasets["test"])

# Per-class precision/recall/F1 for the fine-tuned model.
from sklearn.metrics import classification_report
print(classification_report(test_labels, test_predictions))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       1.00      1.00      1.00         5
           2       0.80      0.80      0.80         5
           3       1.00      1.00      1.00         5

    accuracy                           0.90        20
   macro avg       0.90      0.90      0.90        20
weighted avg       0.90      0.90      0.90        20

