In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [26]:
import os
import pandas as pd

directory_path = "/kaggle/input/article"
file_name = "article_data.csv"  
file_path = os.path.join(directory_path, file_name)

# Read the CSV file
data = pd.read_csv(file_path)

# Print the first few rows of the dataframe
print(data.head())


                                             Article  Category
0  Sudan Govt rejects call to separate religion, ...         0
1  Hassan:  #39;Abhorrent act #39; says Blair Wes...         0
2  Sharon Says Gaza Evacuation Set for 2005 (AP) ...         0
3  Prince Charles chastised for  quot;old fashion...         0
4  U.S. Says N.Korea Blast Probably Not Nuclear  ...         0


In [27]:
# Overview of the data
print("First few entries in the dataset:")
print(data.head())

print("\nShape of the dataset:")
print(data.shape)

print("\nBasic info of the dataset:")
print(data.info())

First few entries in the dataset:
                                             Article  Category
0  Sudan Govt rejects call to separate religion, ...         0
1  Hassan:  #39;Abhorrent act #39; says Blair Wes...         0
2  Sharon Says Gaza Evacuation Set for 2005 (AP) ...         0
3  Prince Charles chastised for  quot;old fashion...         0
4  U.S. Says N.Korea Blast Probably Not Nuclear  ...         0

Shape of the dataset:
(4000, 2)

Basic info of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Article   4000 non-null   object
 1   Category  4000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 62.6+ KB
None


In [28]:
# Model Building - Sentence Transformer + ML

# Install the sentence-transformers library
!pip install sentence-transformers
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer

# Load dataset
data = pd.read_csv('/kaggle/input/article/article_data.csv')

# Overview of the data
print("First few entries in the dataset:")
print(data.head())

print("\nShape of the dataset:")
print(data.shape)

print("\nBasic info of the dataset:")
print(data.info())

# Define the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the articles
X = model.encode(data['Article'].tolist())
y = data['Category']

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest - Base model
rf_base = RandomForestClassifier(random_state=42)
rf_base.fit(X_train, y_train)

# Predictions and evaluation
y_pred_valid_base = rf_base.predict(X_valid)
print("Base Model - Accuracy:", accuracy_score(y_valid, y_pred_valid_base))
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_valid_base))
print("Classification Report:\n", classification_report(y_valid, y_pred_valid_base))

# Random Forest - Base model with class_weights
rf_base_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_base_weighted.fit(X_train, y_train)

# Predictions and evaluation
y_pred_valid_weighted = rf_base_weighted.predict(X_valid)
print("Base Model with Class Weights - Accuracy:", accuracy_score(y_valid, y_pred_valid_weighted))
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_valid_weighted))
print("Classification Report:\n", classification_report(y_valid, y_pred_valid_weighted))

# Hyperparameter tuning (using GridSearchCV)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42), param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model evaluation
best_model = grid_search.best_estimator_
y_pred_valid_best = best_model.predict(X_valid)
print("Best Model - Accuracy:", accuracy_score(y_valid, y_pred_valid_best))
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_valid_best))
print("Classification Report:\n", classification_report(y_valid, y_pred_valid_best))


  pid, fd = os.forkpty()


First few entries in the dataset:
                                             Article  Category
0  Sudan Govt rejects call to separate religion, ...         0
1  Hassan:  #39;Abhorrent act #39; says Blair Wes...         0
2  Sharon Says Gaza Evacuation Set for 2005 (AP) ...         0
3  Prince Charles chastised for  quot;old fashion...         0
4  U.S. Says N.Korea Blast Probably Not Nuclear  ...         0

Shape of the dataset:
(4000, 2)

Basic info of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Article   4000 non-null   object
 1   Category  4000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 62.6+ KB
None


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Base Model - Accuracy: 0.87625
Confusion Matrix:
 [[182   7  16   4]
 [  4 205   4   0]
 [  5   5 159  25]
 [  6   2  21 155]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.87      0.90       209
           1       0.94      0.96      0.95       213
           2       0.80      0.82      0.81       194
           3       0.84      0.84      0.84       184

    accuracy                           0.88       800
   macro avg       0.87      0.87      0.87       800
weighted avg       0.88      0.88      0.88       800

Base Model with Class Weights - Accuracy: 0.88
Confusion Matrix:
 [[180   9  10  10]
 [  5 206   1   1]
 [  5   3 165  21]
 [  6   2  23 153]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.86      0.89       209
           1       0.94      0.97      0.95       213
           2       0.83      0.85      0.84       194
           3       0.83      0

  pid = os.fork()


Best Model - Accuracy: 0.8825
Confusion Matrix:
 [[180  10  12   7]
 [  2 206   2   3]
 [  5   3 164  22]
 [  7   3  18 156]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.86      0.89       209
           1       0.93      0.97      0.95       213
           2       0.84      0.85      0.84       194
           3       0.83      0.85      0.84       184

    accuracy                           0.88       800
   macro avg       0.88      0.88      0.88       800
weighted avg       0.88      0.88      0.88       800



In [30]:
# Model Building - Transformer

from datasets import Dataset
import pandas as pd

# Load dataset
data = pd.read_csv('/kaggle/input/article/article_data.csv')

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Map label column to integer
dataset = dataset.map(lambda examples: {'labels': examples['Category']}, batched=True)

# Split dataset into train and validation
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
valid_dataset = dataset['test']

# Define a function to preprocess the data with prompt
def preprocess_function(examples):
    prompts = ["Classify the following article: " + text for text in examples['Article']]
    return tokenizer(prompts, padding='max_length', truncation=True, return_tensors='pt')

# Preprocess datasets
train_dataset = train_dataset.map(preprocess_function, batched=True)
valid_dataset = valid_dataset.map(preprocess_function, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [31]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline
import os

# Ensure W&B is not initialized
os.environ['WANDB_DISABLED'] = 'true'

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(data['Category'].unique()))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='steps',    
    save_strategy='steps',          
    save_steps=500,                  
    eval_steps=500,                 
    report_to=None,                 
    load_best_model_at_end=True,    
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=valid_dataset,          
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

print("Evaluation Results:")
print(f"Loss: {results['eval_loss']}")
print(f"Metrics: {results}")

# Example prediction
# Create a classification pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Example prediction
example_text = "Example text for classification."
prediction = classifier(example_text)

print(f"Prediction: {prediction}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss
500,0.1873,0.419737




Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Evaluation Results:
Loss: 0.4197366237640381
Metrics: {'eval_loss': 0.4197366237640381, 'eval_runtime': 13.7389, 'eval_samples_per_second': 58.229, 'eval_steps_per_second': 3.639, 'epoch': 3.0}
Prediction: [{'label': 'LABEL_3', 'score': 0.956001341342926}]


In [35]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load DistilBERT tokenizer and model
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(data['Category'].unique()))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_distilbert',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_distilbert',
    logging_steps=10,
    evaluation_strategy='steps',
    save_strategy='steps',
    save_steps=500,
    eval_steps=500,
    report_to=None,
    load_best_model_at_end=True,
)

# Initialize Trainer for DistilBERT
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train the DistilBERT model
trainer_distilbert.train()

# Evaluate the DistilBERT model
results_distilbert = trainer_distilbert.evaluate()

print("DistilBERT Evaluation Results:")
print(f"Loss: {results_distilbert['eval_loss']}")
print(f"Metrics: {results_distilbert}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss
500,0.3384,0.43559




DistilBERT Evaluation Results:
Loss: 0.43558964133262634
Metrics: {'eval_loss': 0.43558964133262634, 'eval_runtime': 7.2964, 'eval_samples_per_second': 109.644, 'eval_steps_per_second': 6.853, 'epoch': 3.0}


In [37]:
# Apply the Best Model

from transformers import pipeline
from datasets import Dataset
import pandas as pd

# Load test data
test_data = pd.read_csv('/kaggle/input/article/article_data.csv')

# Convert test data to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_data)

# Preprocess test dataset
def preprocess_function(examples):
    return tokenizer(examples['Article'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')

test_dataset = test_dataset.map(preprocess_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Define a function to predict with the best model
def predict(texts):
    # Use the best tokenizer for prediction
    predictions = classifier(texts)
    return predictions

# Create a classification pipeline with the best model
classifier = pipeline('text-classification', model=best_model, tokenizer=best_tokenizer, device=0)  # Use GPU if available

# Apply the best model to test data
test_texts = test_data['Article'].tolist()
test_predictions = predict(test_texts)

# Print example predictions
for i, pred in enumerate(test_predictions[:5]):
    print(f"Example {i}: {pred}")


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Example 0: {'label': 'LABEL_0', 'score': 0.993241548538208}
Example 1: {'label': 'LABEL_0', 'score': 0.9936322569847107}
Example 2: {'label': 'LABEL_0', 'score': 0.9935747981071472}
Example 3: {'label': 'LABEL_0', 'score': 0.9807139039039612}
Example 4: {'label': 'LABEL_0', 'score': 0.9931232333183289}


In [39]:
import pandas as pd

# Reload data
data = pd.read_csv('/kaggle/input/article/article_data.csv')

# Split dataset into train and validation again
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset['train'].to_pandas()
valid_data = dataset['test'].to_pandas()

# Check class distribution in the dataset
print("Training Data Class Distribution:")
print(train_data['Category'].value_counts())

print("Validation Data Class Distribution:")
print(valid_data['Category'].value_counts())

print("Test Data Class Distribution:")
print(test_data['Category'].value_counts())


Training Data Class Distribution:
Category
0    814
3    803
1    800
2    783
Name: count, dtype: int64
Validation Data Class Distribution:
Category
2    217
1    200
3    197
0    186
Name: count, dtype: int64
Test Data Class Distribution:
Category
0    1000
1    1000
2    1000
3    1000
Name: count, dtype: int64


In [40]:
# Ensure test_data is loaded
test_data = pd.read_csv('/kaggle/input/article/article_data.csv')

# Check predictions
test_texts = test_data['Article'].tolist()  
test_predictions = predict(test_texts)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(test_predictions)

# Print first few predictions
print(predictions_df.head())


     label     score
0  LABEL_0  0.993242
1  LABEL_0  0.993632
2  LABEL_0  0.993575
3  LABEL_0  0.980714
4  LABEL_0  0.993123


# Actionable Insights and Recommendations

## Model Performance:

    Base Model Accuracy: Achieved an accuracy of 87.63%, demonstrating strong performance but leaving room for improvement.
    Base Model with Class Weights Accuracy: Slightly improved accuracy at 88%, indicating that class weighting helped balance the performance across classes.
    Best Model Accuracy: Reached an accuracy of 88.25%, confirming that tuning or additional optimizations have positively impacted performance.
    Loss Metrics: The training and validation losses for the best model are 0.1873 and 0.4197 respectively, reflecting good model fit but suggesting there is still some room for reducing overfitting.

## Classification Report:

    Precision and Recall: The model performs well across most classes, with particularly high precision and recall for classes 0 and 1. However, class 2 and class 3 have slightly lower scores, indicating room for improvement.
    F1 Scores: The F1 scores are consistently high, showing a good balance between precision and recall, though class 2 and class 3 could benefit from additional tuning.

## Class Distribution:

    Training Data: Balanced across all categories, ensuring the model is trained on a representative sample.
    Validation Data: Slightly imbalanced but still fairly balanced; validation metrics are reflective of training data performance.
    Test Data: Perfectly balanced, providing a fair assessment of model performance across all classes.