Load and Explore the Dataset

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('MN-DS-news-classification.csv')

# Display the first few rows to understand its structure
df.head()

# Check for any missing data
df.isnull().sum()


data_id                0
id                     0
date                   0
source                 0
title                  0
content                0
author              3312
url                    0
published              0
published_utc          0
collection_utc         0
category_level_1       0
category_level_2       0
dtype: int64

Preprocessing the Text Data

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download stopwords if not already available
nltk.download('punkt')
nltk.download('stopwords')

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and numbers
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenize and remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # Rejoin the tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the 'content' column
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Display the first few rows of the cleaned content
df[['content', 'cleaned_content']].head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahpre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahpre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,content,cleaned_content
0,The Virginia woman whose 2-year-old son was fo...,virginia woman whose 2yearold son found trash ...
1,Authorities are trying to determine if anyone ...,authorities trying determine anyone helped two...
2,A 13-year-old suspect in a double homicide who...,13yearold suspect double homicide escaped cust...
3,The mother of two young children found hanging...,mother two young children found hanging pennsy...
4,"""One family member said Derek “can be violent ...",one family member said derek “ violent attacke...


Text Vectorization

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned content
X = tfidf.fit_transform(df['cleaned_content'])

# Check the shape of the transformed data
X.shape


(10917, 5000)

Split the Data into Training and Test Sets

In [4]:
from sklearn.model_selection import train_test_split

# Split data into features (X) and target (y)
X = tfidf.transform(df['cleaned_content'])
y = df['category_level_2']  # We are using 'category_level_2' for classification

# Split into training and testing sets (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the train and test sets
X_train.shape, X_test.shape



((8733, 5000), (2184, 5000))

Model Selection and Training

Logistic Regression model

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")


Logistic Regression Accuracy: 0.6122


Naive Bayes Model

In [6]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {accuracy_nb:.4f}")


Naive Bayes Accuracy: 0.5595


 Support Vector Machine (SVM)

In [7]:
from sklearn.svm import SVC

# Initialize and train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")


SVM Accuracy: 0.6113


 Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random Forest Model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")


Random Forest Accuracy: 0.5682


BERT 

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the BERT tokenizer and model for sequence classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=109)

# Define compute_metrics function for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average='weighted'),
        "precision": precision_score(labels, predictions, average='weighted'),
        "recall": recall_score(labels, predictions, average='weighted')
    }

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['cleaned_content'], padding='max_length', truncation=True, max_length=512)

# Convert category labels to integers if they aren't already
def prepare_dataset(df):
    # Create a mapping of unique categories to integers if needed
    if not np.issubdtype(df['category_level_2'].dtype, np.integer):
        categories = df['category_level_2'].unique()
        category_to_id = {category: idx for idx, category in enumerate(categories)}
        df = df.copy()
        df['labels'] = df['category_level_2'].map(category_to_id)
    else:
        df = df.copy()
        df['labels'] = df['category_level_2']
    
    return df[['cleaned_content', 'labels']]

# Prepare the dataset with proper labels
df_prepared = prepare_dataset(df)
dataset = Dataset.from_pandas(df_prepared)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the data into training and test sets
split_ratio = 0.8
train_size = int(split_ratio * len(tokenized_datasets))

# Shuffle the data once, then split
tokenized_datasets = tokenized_datasets.shuffle(seed=42)
train_dataset = tokenized_datasets.select(range(train_size))
test_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Set up the optimized training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",       # Evaluate during training
    eval_steps=500,                    # Evaluate every 500 steps
    save_strategy="steps",             # Save checkpoint on the same schedule
    save_steps=500,                    # Save every 500 steps
    save_total_limit=3,                # Keep only the 3 best checkpoints
    learning_rate=5e-5,                # Slightly higher learning rate
    per_device_train_batch_size=16,    # Increase if your GPU has enough memory
    per_device_eval_batch_size=32,     # Can usually be larger than train batch size
    num_train_epochs=5,                # Train for more epochs
    weight_decay=0.01,                 # Good default for BERT
    load_best_model_at_end=True,       # Load the best model at the end of training
    metric_for_best_model="accuracy",  # Optimize for accuracy
    greater_is_better=True,            # Higher accuracy is better
    warmup_ratio=0.1,                  # Warm up learning rate over 10% of steps
    logging_steps=100,                 # Log training metrics every 100 steps
    gradient_accumulation_steps=2,     # Accumulate gradients to simulate larger batches
    # fp16=True,                       # Enable for faster training if your GPU supports it
)

# Initialize Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Add metrics function
)

# Make sure to install accelerate first if you haven't already
# !pip install accelerate -U

# Train the model
trainer.train()

# Save the final model
trainer.save_model("./final-model")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 10917/10917 [01:01<00:00, 178.52 examples/s]
  trainer = Trainer(
