In [1]:
%pip install python-dotenv  

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install datasets
%pip install evaluate
%pip install transformers
%pip install transformers[torch]
%pip install "accelerate>=0.26.0"


In [None]:
%pip install huggingface_hub[hf_extras]

In [None]:
%pip install lxml

Now we'll import the necessary libraries for web scraping, HTML parsing, and environment variable management:

# NPR News Web Scraper

This notebook demonstrates web scraping of NPR news articles using the Decodo API and BeautifulSoup for HTML parsing.

## Setup and Dependencies

First, we'll install the required Python packages:

In [26]:
import os
import json
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sklearn import preprocessing
import pandas as pd
import huggingface_hub
import re
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import classification_report, confusion_matrix
import evaluate
from sklearn.model_selection import train_test_split


## Environment Configuration

Load environment variables from a `.env` file to securely store the Decodo API authentication token:

In [3]:
load_dotenv()
DECODO_AUTH = os.getenv('DECODO_AUTH_FIELD')
HF_LLM_TOKEN = os.getenv('LLM_MODEL_TOKEN_HF')

Load the environment variables and authenticate with the Decodo API using the token stored in the `.env` file:

## Web Scraping Function

Define a function to crawl URLs using the Decodo API. This function takes a URL and returns the scraped content:

In [None]:
import requests

def crwal_url(URL_TO_CRWAL):  
  url = "https://scraper-api.decodo.com/v2/scrape"
    
  payload = {
        "url": URL_TO_CRWAL
  }
    
  headers = {
      "accept": "application/json",
      "content-type": "application/json",
      "authorization": DECODO_AUTH
  }
    
  response = requests.post(url, json=payload, headers=headers)
  
  return response

## Target URLs Configuration

Define the NPR news category URLs that we want to scrape. Each category has its own endpoint:

In [4]:
urls_to_crawl = {
  "Politics" : "https://www.npr.org/get/1014/render/partial/next", 
  "business" : "https://www.npr.org/get/1006/render/partial/next",
  "Health" : "https://www.npr.org/get/1128/render/partial/next", 
  "Science" : "https://www.npr.org/get/1007/render/partial/next",
  "Climate" : "https://www.npr.org/get/1167/render/partial/next",
}

## Scraping Politics Articles

Test the scraper by crawling the Politics section with pagination parameters (start index and batch size):

In [None]:
category_url = urls_to_crawl["Politics"]
start_index = 1
batch_size = 10
crawled_url = crwal_url(f"{category_url}?start={start_index}&count={batch_size}")


## Processing the Response

Parse the JSON response from the Decodo API to extract the scraped content:

In [None]:
crawled_url_json = json.loads(crawled_url.text)
crawled_url_json['results']

## Extracting HTML Content

Get the HTML content from the first result in the scraped data:

In [None]:
html_string = crawled_url_json['results'][0]['content']
html_string

## HTML Parsing with BeautifulSoup

Parse the HTML content and extract article information. Here we find all article elements and extract the first anchor tag:

In [None]:
soup = BeautifulSoup(html_string,'html.parser')
for article in soup.find_all('article'):
  anchor_tag  = article.find('a')
  article_url = anchor_tag['href']
  break

In [None]:
def get_article_text(article_url):
  try:
    crawled_article = crwal_url(article_url)
    crawled_article_json = json.loads(crawled_article.text)
    if crawled_article_json['results'][0]["status_code"] != 200:
      return None

    html_string = crawled_article_json['results'][0]['content']
    soup = BeautifulSoup(html_string,'html.parser')
    story_div = soup.find('div', id='storytext')
    if story_div is None:
      return None

    article_text = story_div.get_text(strip=True, separator='\n')

    return article_text
  except:
    return None


## Article Text Extraction Function

Define a function to extract the actual text content from individual article URLs. This function handles the full article scraping process:

In [None]:
def get_next_article(category_url, batch_size = 10):
  start_index = 1
  while True:
    crawled_page = crwal_url(f"{category_url}?start={start_index}&count={batch_size}")
    crawled_page_json = json.loads(crawled_page.text)

    if crawled_page_json['results'][0]['status_code'] != 200:
      break

    html_string = crawled_page_json['results'][0]['content']
    soup = BeautifulSoup(html_string,'html.parser')


    for article in soup.find_all('article'):
      anchor_tag = article.find('a')
      if anchor_tag is None:
        continue
      article_url = anchor_tag['href']
      article_text = get_article_text(article_url)
      if article_text is None:
        continue

      yield article_text
    start_index += batch_size



## Article Iterator Function

Create a generator function that iterates through all articles in a category with pagination support:

In [None]:
data = [] 
for news_category, category_url in urls_to_crawl.items():
  print(f"Crawling {news_category}")
  article_crawled_num = 0
  for article_text in get_next_article(category_url):
    data.append({'news_categoty' : news_category, 'article' : article_text})
    article_crawled_num += 1
    print(f"Crawled {article_crawled_num} articles")
    if article_crawled_num >= 5:
      break

## Main Scraping Loop

Execute the main scraping process across all news categories. 

**TODO: When you hit the API limit, create a new DECODO account and re-run these cells**

In [5]:
import os

csv_path = 'news_articles_Dataset.csv'
if os.path.exists(csv_path):
  data = pd.read_csv(csv_path).to_dict(orient='records')
else:
  data = []

In [6]:

df = pd.DataFrame(data)
df.to_csv('news_articles_Dataset.csv',index=False)

## Save Data to CSV

Convert the collected article data into a pandas DataFrame and save it as a CSV file:

In [7]:
head_csv = pd.read_csv('news_articles_Dataset.csv')
head_csv.sample(5)

Unnamed: 0,news_categoty,article
20,Climate,Energy Secretary Chris Wright spearheaded a re...
3,Politics,"Jeanine Pirro, the U.S. attorney for the Distr..."
8,business,"Circa 1750, The 'Spinning Jenny', invented by ..."
18,Science,"Klaus Vedfelt/Getty Images\nFor a long time, s..."
10,Health,Thomas_EyeDesign/iStockphoto/Getty Images\nCre...


--- 


# LLM Training

## Steps to fine tune an LLM
### 1. Define Parameters
### 2. Clean Dataset
### 3. Wrnagle Dataset: Label, Train/Test Split, Vector Dataset and Conveartion
### 4. Train the model
### 5. Evaluate the model
### 7. Model inference


In [8]:
MODEL_ID = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
huggingface_hub.login(HF_LLM_TOKEN)

In [9]:
CSV_FILE_PATH = 'news_articles_Dataset.csv' 
TEXT_COLUMN = 'article'
LABEL_COLUMN = 'news_categoty'
TEST_SIZE = 0.2
NUM_LABELS = max(2,df[LABEL_COLUMN].nunique())

## Clean Data

In [10]:
class Cleaner():
  def __init__(self):
    pass

  def clean(self,text):
    clean_text = self.remove_html(text)
    clean_text = self.remove_spaces(clean_text)
    return clean_text


  def remove_html(self,text):
    clean_text = BeautifulSoup(text, 'html.parser').text
    return clean_text
  
  def remove_spaces(self,text):
    clean_text = re.sub(' +', '  ', text)
    return clean_text

In [11]:
cleaner = Cleaner()
df['text_cleaned'] = df[TEXT_COLUMN].apply(cleaner.clean)
df.sample(5)

Unnamed: 0,news_categoty,article,text_cleaned
19,Science,Three scientists learned they carry genes that...,Three scientists learned they carry genes...
12,Health,"In this photo illustration, Pfizer-BioNTech CO...","In this photo illustration, Pfizer-BioNTec..."
17,Science,Fossils of the creature\nSpicomellus\nrevealed...,Fossils of the creature\nSpicomellus\nrevea...
11,Health,Health and Human Services Secretary Robert F. ...,Health and Human Services Secretary Rober...
9,business,President Donald Trump listens during a meetin...,President Donald Trump listens during a ...


## Wrangle The data

In [12]:
le = preprocessing.LabelEncoder()
le.fit(df[LABEL_COLUMN].tolist())
df['label'] = le.transform(df[LABEL_COLUMN].tolist())
df.sample(5)

Unnamed: 0,news_categoty,article,text_cleaned,label
8,business,"Circa 1750, The 'Spinning Jenny', invented by ...","Circa 1750, The 'Spinning Jenny', invente...",4
2,Politics,The entrance of the U.S. Department of Educati...,The entrance of the U.S. Department of ...,2
12,Health,"In this photo illustration, Pfizer-BioNTech CO...","In this photo illustration, Pfizer-BioNTec...",1
9,business,President Donald Trump listens during a meetin...,President Donald Trump listens during a ...,4
14,Health,NhuNgoc Pham with her family on the day she re...,NhuNgoc Pham with her family on the day...,1


# Train/Test Split

In [13]:
df_train, df_test =  train_test_split(df, test_size=TEST_SIZE)
df_train.shape, df_test.shape

((20, 4), (5, 4))

In [14]:
df_train = df_train[['text_cleaned','label']]
df_test = df_test[['text_cleaned','label']]

# HF Convartion

In [15]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [16]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [17]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
def preprocess_func(examples):
  return tokenizer(examples['text_cleaned'],truncation=True)

In [18]:
tokanized_train = train_dataset.map(preprocess_func, batched=True)
tokanized_test = test_dataset.map(preprocess_func, batched=True)

Map: 100%|██████████| 20/20 [00:00<00:00, 677.49 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 514.42 examples/s]


# Model initialize

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=NUM_LABELS)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model.config.pad_token_id = model.config.eos_token_id

In [21]:
number_of_layers = 0
for param in model.base_model.parameters():
  number_of_layers += 1

print(f"Number of layers in the base model: {number_of_layers}")

Number of layers in the base model: 200


In [22]:
layer_num = 0
for param in model.base_model.parameters():
  if layer_num >= number_of_layers - 25:
    break
  layer_num += 1
  param.requires_grad = False

print(f"Number of layers in the base model: {layer_num}")

Number of layers in the base model: 175


In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric = evaluate.load("accuracy")

def compute_metrics(p):
  logits, labels = p
  preds = np.argmax(logits, axis=-1)
  return metric.compute(predictions=preds, references=labels)


Downloading builder script: 4.20kB [00:00, 2.38MB/s]


In [24]:


training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=2,
    per_device_train_batch_size=2,
    report_to="none",
    fp16=True,
    learning_rate=2e-4,
    weight_decay=0.01,
    save_steps=2000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokanized_train,
    eval_dataset=tokanized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=30, training_loss=6.207253011067708, metrics={'train_runtime': 1149.8576, 'train_samples_per_second': 0.052, 'train_steps_per_second': 0.026, 'total_flos': 678123971100672.0, 'train_loss': 6.207253011067708, 'epoch': 3.0})

# Evaluate Model

In [30]:
preds = trainer.predict(tokanized_test)
preds = np.argmax(preds[:3][0], axis=1)
GT = df_test['label'].tolist()
print(classification_report(GT, preds))



              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0
           2       0.00      0.00      0.00       2.0
           3       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       1.0

    accuracy                           0.00       5.0
   macro avg       0.00      0.00      0.00       5.0
weighted avg       0.00      0.00      0.00       5.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
