In [31]:
import pandas as pd
import re
import requests
import time
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Scikit-Learn imports
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

import optuna  # For hyperparameter optimization

# PyTorch and Hugging Face Transformers imports
import torch
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer


In [3]:
def fetch_news_from_currents(api_key, query='world', page_size=100, max_records=5000):
    base_url = 'https://api.currentsapi.services/v1/search'
    all_articles = []
    page = 1

    while len(all_articles) < max_records:
        params = {
            'keywords': query,  # query for API
            'page_size': page_size,  # articles per request
            'page': page,  # Pagination
            'apiKey': api_key,  # API Key
        }
        
        response = requests.get(base_url, params=params)
        
        if response.status_code == 200:
            data = response.json()
            articles = data.get('news', [])  #'news' instead of 'articles'
            
            if not articles:
                print("No more articles found. Stopping.")
                break
            
            all_articles.extend(articles)
            print(f"Fetched {len(all_articles)} articles so far...")
            
            # Increment page number
            page += 1

            # API rate limits
            time.sleep(1)  # avoid API rate limit

        else:
            print(f"Error fetching data: {response.status_code} - {response.text}")
            break
    
    print(f"Total articles fetched: {len(all_articles)}")

    # articles to DataFrame
    df = pd.DataFrame(all_articles)
    return df

In [5]:

api_key = 'xzX6dUV0VMlju-t4gao-DX8HkRukXNkl45tD9crd9zuU9q2S' 
df = fetch_news_from_currents(api_key)
print(df.head())

Fetched 100 articles so far...
Fetched 200 articles so far...
Fetched 300 articles so far...
Fetched 400 articles so far...
Fetched 500 articles so far...
Fetched 600 articles so far...
Fetched 700 articles so far...
Fetched 800 articles so far...
Fetched 900 articles so far...
Fetched 1000 articles so far...
Fetched 1100 articles so far...
Fetched 1200 articles so far...
Fetched 1300 articles so far...
Fetched 1400 articles so far...
Fetched 1500 articles so far...
Fetched 1600 articles so far...
Fetched 1700 articles so far...
Fetched 1800 articles so far...
Fetched 1900 articles so far...
Fetched 2000 articles so far...
Fetched 2100 articles so far...
Fetched 2200 articles so far...
Fetched 2300 articles so far...
Fetched 2400 articles so far...
Fetched 2500 articles so far...
Fetched 2600 articles so far...
Fetched 2700 articles so far...
Fetched 2800 articles so far...
Fetched 2900 articles so far...
Fetched 3000 articles so far...
Fetched 3100 articles so far...
Fetched 3200 arti

In [18]:
print(df.columns)


Index(['id', 'title', 'description', 'url', 'author', 'image', 'language',
       'category', 'published'],
      dtype='object')


In [19]:
df

Unnamed: 0,id,title,description,url,author,image,language,category,published
0,becc9e6b-2eb2-4f58-9bff-05e5cb92d774,LNG Tailwind For U.S. Natural Gas Intact As Pr...,The US became the world’s largest exporter of ...,https://seekingalpha.com/article/4720479-lng-t...,VettaFi Research,https://static.seekingalpha.com/cdn/s3/uploads...,en,"[business, finance]",2024-09-11 15:48:00 +0000
1,954c3f4a-8b0f-4e7c-9dfd-273570764987,"Biden, Harris and Trump visit Sept. 11 site in...","New York — President Joe Biden, Vice President...",https://www.voanews.com/a/7780183.html,Reuters,,en,[world],2024-09-11 15:46:35 +0000
2,b14794f5-81d9-4cbc-8ef7-d443755d7f7e,Socceroos’ malaise one of their own making as ...,Australia coach Graham Arnold will ponder mis-...,https://www.theguardian.com/football/blog/arti...,Joey Lynch,https://i.guim.co.uk/img/media/bc549b15a0cb144...,en,[sports],2024-09-11 15:00:10 +0000
3,02edb6ea-95ea-4ed0-bd1d-6a224eb7264a,In Photos: US honors fallen on 23rd anniversar...,"Across the United States, people are rememberi...",https://www.voanews.com/a/in-photos-us-honors-...,VOA,https://gdb.voanews.com/8835a969-2dce-48e7-bc7...,en,[world],2024-09-11 14:48:56 +0000
4,57670674-2107-4551-a9e0-53afd00ad65b,"After Debate, Trump and Harris Meet Again at S...","Setting aside the rancor of their debate, Vice...",https://www.nytimes.com/2024/09/11/us/politics...,Jonathan Weisman,https://static01.nyt.com/images/2024/09/11/mul...,en,"[regional, usa]",2024-09-11 14:15:09 +0000
...,...,...,...,...,...,...,...,...,...
4995,b8d067e8-fd9a-432f-aa7e-7b147ed8d723,Spoofing-Aware Speaker Verification Robust Aga...,arXiv:2409.06327v1 Announce Type: cross \nAbst...,https://arxiv.org/abs/2409.06327,rss-help,,en,"[academic, CS]",2024-09-11 04:00:00 +0000
4996,3f48969c-198d-49c1-9f8c-c0814149249c,Advancing Causal Inference: A Nonparametric Ap...,arXiv:2409.06593v1 Announce Type: cross \nAbst...,https://arxiv.org/abs/2409.06593,rss-help,,en,"[academic, CS]",2024-09-11 04:00:00 +0000
4997,ea362e42-fc76-44a5-bb77-95d84a74d3bd,FC-Planner: A Skeleton-guided Planning Framewo...,arXiv:2309.13882v3 Announce Type: replace \nAb...,https://arxiv.org/abs/2309.13882,rss-help,,en,"[academic, CS]",2024-09-11 04:00:00 +0000
4998,f5f3aa7f-4b5f-4351-b06f-409ba4504a36,Learning Generative Models for Lumped Rainfall...,arXiv:2309.09904v3 Announce Type: replace-cros...,https://arxiv.org/abs/2309.09904,rss-help,,en,"[academic, CS]",2024-09-11 04:00:00 +0000


In [6]:
df.to_csv('articles.csv', index=False)

In [4]:
df_articles = pd.read_csv('articles.csv')

In [5]:

# Multi-label binarization
mlb = MultiLabelBinarizer()
df_articles['category'] = df_articles['category'].apply(lambda x: [x] if isinstance(x, str) else x)
binarized_labels = mlb.fit_transform(df_articles['category'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_articles['title'].tolist(),
    binarized_labels,
    test_size=0.2,
    random_state=42
)

In [6]:
#!pip install accelerate --upgrade
#!pip install transformers[torch] --upgrade


In [9]:
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # Use float for BCEWithLogitsLoss
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
# datasets
train_dataset = tokenize_data(train_texts, train_labels)
val_dataset = tokenize_data(val_texts, val_labels)

News Aggregation with Bert Pre Trained Model

In [11]:
# Tokenizer and model initialization
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(mlb.classes_))

# Tokenize data
def tokenize_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
    return NewsDataset(encodings, labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:

def compute_metrics(p):
    # predictions and labels
    predictions = p.predictions > 0.5  # logits to binary predictions
    labels = p.label_ids

    # multi-label metrics
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='samples')
    precision = precision_score(labels, predictions, average='samples')
    recall = recall_score(labels, predictions, average='samples')

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [15]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Ensure this is correctly split
    compute_metrics=compute_metrics
)

# Training model
trainer.train()

# Evaluating model
results = trainer.evaluate()
print(results)


Step,Training Loss
500,0.015
1000,0.0054
1500,0.0035


{'eval_loss': 0.0034609108697623014, 'eval_accuracy': 0.974, 'eval_f1': 0.974, 'eval_precision': 0.974, 'eval_recall': 0.974, 'eval_runtime': 60.3444, 'eval_samples_per_second': 16.572, 'eval_steps_per_second': 2.071, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


idhar tak

My own model

In [16]:
#data type of the category
print(df_articles['category'].dtype)

print(df_articles['category'].head())


object
0              [['world']]
1           [['regional']]
2    [['regional', 'usa']]
3              [['world']]
4            [['general']]
Name: category, dtype: object


In [65]:
print(train_encodings)


tensor([[  101,  1996,  2088,  ...,     0,     0,     0],
        [  101,  2844, 19143,  ...,   102,     0,     0],
        [  101,  1999,  7760,  ...,     0,     0,     0],
        ...,
        [  101,  6847, 10665,  ...,     0,     0,     0],
        [  101,  4975,  2019,  ...,     0,     0,     0],
        [  101,  2300, 16326,  ...,     0,     0,     0]])


In [107]:
print(type(train_encodings))
print(type(train_labels_binarized))
print(train_encodings.shape)
print(train_labels_binarized.shape)


<class 'torch.Tensor'>
<class 'numpy.ndarray'>
torch.Size([4000, 40])
(4000, 2)


In [108]:
train_encodings = train_encodings.numpy()


In [110]:
print(train_encodings.shape)  
print(train_labels_binarized.shape)


(4000, 40)
(4000, 2)


In [112]:
print(type(train_encodings))  
print(type(train_labels_binarized)) 


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:

# Data Cleaning Function
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

In [18]:

df_articles['title'] = df_articles['title'].apply(clean_text)

In [19]:
# Converting categories to suitable for multi-label classification
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_articles['category'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_articles['title'], y, test_size=0.3, random_state=42)

In [20]:
#pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Text feature extraction
    ('clf', RandomForestClassifier())  # Classifier
])

In [21]:
#hyperparameters to tune for GridSearchCV
param_grid = {
    'tfidf__max_features': [None, 500, 1000],
    'clf__n_estimators': [50, 100, 150],
    'clf__max_depth': [None, 10, 20]
}

In [22]:
# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')

# Fit 
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [23]:
# Best hyperparameters
print("Best hyperparameters found by GridSearchCV:")
print(grid_search.best_params_)

# Prediction on test set
y_pred = grid_search.predict(X_test)

Best hyperparameters found by GridSearchCV:
{'clf__max_depth': None, 'clf__n_estimators': 50, 'tfidf__max_features': None}


In [24]:
# Evaluating the model
print(f"Training accuracy: {grid_search.score(X_train, y_train)}")
print(f"Test accuracy: {grid_search.score(X_test, y_test)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

Training accuracy: 0.9905714285714285
Test accuracy: 0.9886666666666667

Classification Report:
                                          precision    recall  f1-score   support

                            ['academia']       1.00      1.00      1.00        37
                ['academic', 'CS', 'CR']       1.00      1.00      1.00        32
                ['academic', 'CS', 'CV']       1.00      1.00      1.00        33
                ['academic', 'CS', 'DB']       1.00      1.00      1.00        13
                      ['academic', 'CS']       1.00      1.00      1.00       465
                    ['academic', 'ECON']       1.00      1.00      1.00        13
                 ['academic', 'PHYSICS']       1.00      1.00      1.00        24
           ['academic', 'STATS', 'MATH']       1.00      1.00      1.00        12
                  ['books', 'lifestyle']       1.00      1.00      1.00        16
                            ['business']       1.00      1.00      1.00        29
 

In [25]:
# Defining the objective function for Optuna
def objective(trial):
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=trial.suggest_int('tfidf_max_features', 500, 2000),
            ngram_range=(1, trial.suggest_int('ngram_range', 1, 2))
        )),
        ('clf', RandomForestClassifier(
            n_estimators=trial.suggest_int('n_estimators', 50, 200),
            max_depth=trial.suggest_int('max_depth', 10, 30)
        ))
    ])

    # cross-validation
    score = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

In [26]:
#Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2024-09-13 15:10:52,747] A new study created in memory with name: no-name-d054a8f8-178c-41b3-a811-5c7e43d50a74
[I 2024-09-13 15:10:56,324] Trial 0 finished with value: 0.8434257695546923 and parameters: {'tfidf_max_features': 1501, 'ngram_range': 1, 'n_estimators': 121, 'max_depth': 21}. Best is trial 0 with value: 0.8434257695546923.
[I 2024-09-13 15:11:00,502] Trial 1 finished with value: 0.8354224203523325 and parameters: {'tfidf_max_features': 733, 'ngram_range': 2, 'n_estimators': 133, 'max_depth': 22}. Best is trial 0 with value: 0.8434257695546923.
[I 2024-09-13 15:11:03,596] Trial 2 finished with value: 0.41371051544694654 and parameters: {'tfidf_max_features': 1986, 'ngram_range': 1, 'n_estimators': 125, 'max_depth': 11}. Best is trial 0 with value: 0.8434257695546923.
[I 2024-09-13 15:11:08,906] Trial 3 finished with value: 0.9514233375124871 and parameters: {'tfidf_max_features': 1783, 'ngram_range': 1, 'n_estimators': 178, 'max_depth': 28}. Best is trial 3 with value: 0.

In [27]:
# Best hyperparameters
print("Best hyperparameters found by Optuna:")
print(study.best_params)

Best hyperparameters found by Optuna:
{'tfidf_max_features': 1783, 'ngram_range': 1, 'n_estimators': 178, 'max_depth': 28}


In [28]:
# Train the model
best_params = study.best_params
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=best_params['tfidf_max_features'],
        ngram_range=(1, best_params['ngram_range'])
    )),
    ('clf', RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth']
    ))
])

In [29]:
pipeline.fit(X_train, y_train)

# Predict on test set
y_pred = pipeline.predict(X_test)

In [30]:
# Evaluating model
print(f"Training accuracy with Optuna parameters: {pipeline.score(X_train, y_train)}")
print(f"Test accuracy with Optuna parameters: {pipeline.score(X_test, y_test)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


Training accuracy with Optuna parameters: 0.9654285714285714
Test accuracy with Optuna parameters: 0.9473333333333334

Classification Report:
                                          precision    recall  f1-score   support

                            ['academia']       1.00      1.00      1.00        37
                ['academic', 'CS', 'CR']       1.00      1.00      1.00        32
                ['academic', 'CS', 'CV']       1.00      1.00      1.00        33
                ['academic', 'CS', 'DB']       1.00      1.00      1.00        13
                      ['academic', 'CS']       1.00      1.00      1.00       465
                    ['academic', 'ECON']       1.00      1.00      1.00        13
                 ['academic', 'PHYSICS']       1.00      1.00      1.00        24
           ['academic', 'STATS', 'MATH']       1.00      1.00      1.00        12
                  ['books', 'lifestyle']       1.00      1.00      1.00        16
                            ['busines

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
