# PoC : News RAG LLM

1. Load required libraries
2. load "News_Category_Dataset_v3
3. Preprocess the dataset

In [30]:
#Load required Libraries

import json
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import re
from concurrent.futures import ThreadPoolExecutor
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
data =[]
with open('News_Category_Dataset_v3.json','r', encoding='utf-8') as file:
    for line in file:
        if line.strip(): #empty lines result in false condition
            data.append(json.loads(line)) #parse json file and append to list

print(f"There are {len(data)} articles in the dataset.")

#preview
print(data[:5])

There are 209527 articles in the dataset.
[{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9', 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'category': 'U.S. NEWS', 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'authors': 'Carla K. Johnson, AP', 'date': '2022-09-23'}, {'link': 'https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe', 'headline': 'American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video', 'category': 'U.S. NEWS', 'short_description': "He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.", 'authors': 'Mary Papenfuss', 'date': '2022-09-2

In [3]:
#create df
news_data= pd.DataFrame.from_dict(data, orient = 'columns')
df= news_data.sample(frac=0.1, random_state=42)  #make copy for preprocessing tasks

print(f"The dataset has {df.shape[1]} features and {df.shape[0]} rows.\nThe names of the categories are: {df.columns.tolist()}")

#preview dataframe
df.head()

The dataset has 6 features and 20953 rows.
The names of the categories are: ['link', 'headline', 'category', 'short_description', 'authors', 'date']


Unnamed: 0,link,headline,category,short_description,authors,date
128310,https://www.huffingtonpost.com/entry/what-if-w...,What If We Were All Family Generation Changers?,IMPACT,"What if, in doing so, we won't just create new...","Matt Murrie, ContributorEdupreneur, Cofounder/...",2014-06-20
139983,https://www.huffingtonpost.comhttp://www.washi...,Firestorm At AOL Over Employee Benefit Cuts,BUSINESS,It should have been a glorious week for AOL ch...,,2014-02-08
42339,https://www.huffingtonpost.com/entry/time-runs...,Dakota Access Protesters Arrested As Deadline ...,POLITICS,A few protesters who refused to leave remained...,"Michael McLaughlin & Josh Morgan, The Huffingt...",2017-02-22
131494,https://www.huffingtonpost.com/entry/one-glimp...,One Glimpse Of These Baby Kit Foxes And You'll...,GREEN,,,2014-05-14
163649,https://www.huffingtonpost.com/entry/mens-swea...,"Mens' Sweat Pheromone, Androstadienone, Influe...",SCIENCE,Scientists didn't know if humans played that g...,Melissa Cronin,2013-06-02


In [4]:
#check for missing values
df.isna().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [5]:
def duplicated_manager(dataframe):

    try:
        if dataframe.duplicated().sum() != 0:
            dataframe.drop_duplicates(keep ='first',ignore_index=True, inplace=True )

            #Validate duplicated values removed
            print(f" Duplicates successfully managed. There are now {dataframe.duplicated().sum()} duplicates.")
        else:
            print('There are no duplicates in the data')
        return dataframe
    except Exception as e:
        print(f" Error encountered removing duplicates: {e} ")
    


In [6]:
#check for duplicates

print(f" There are {df.duplicated().sum()} duplicated articles")

df[df.duplicated(keep=False )][:4]

 There are 0 duplicated articles


Unnamed: 0,link,headline,category,short_description,authors,date


In [7]:
#Handle duplicates
df =duplicated_manager(df)



There are no duplicates in the data


### Extract full artcile from links



In [8]:
#test regular expression for identifying more than one url in each row
#url = 'https://www.huffingtonpost.comhttp://www.motherjones.com/politics/2016/05/trump-butler-anthony-senecal-facebook-kill-obama'

#url_pattern = r'(https?://[^\s"<>]*?)(?=(?:https?://|$))'
#found_urls = re.findall(url_pattern, url)
#found_urls

In [9]:
#Check how many rows have more than one link concatenated together 
#urls = df['link'].loc[:].tolist()
#url_pattern = r'(https?://[^\s"<>]*?)(?=(?:https?://|$))'
#i=0
#if urls:
    #for url in urls:
        #if len(re.findall(url_pattern, url)) >1:
            #i+=1
#i


In [10]:
def count_url(url):

    url_pattern = r'(https?://[^\s"<>]*?)(?=(?:https?://|$))'

    if isinstance(url, str):
        return 1 if len(re.findall(url_pattern, url)) >1 else 0
    
    elif isinstance(url,pd.Series):

        count=0
        for val in url:
            if len(re.findall(url_pattern, val)) >1:
                 count+=1
        return f"The dataframe has {count} rows with concatenated URLs."
        
    else:
        raise TypeError("Input must be a string or pandas Series")
             

   

In [11]:
def extract_url(url):

    if not isinstance(url, str):
        raise TypeError("Column name must be a string")
    
    url_pattern = r'(https?://[^\s"<>]*?)(?=(?:https?://|$))'
    try:
        matches= re.findall(url_pattern, url) #separte the urls

        if matches:
            article_url =matches[-1] #taking last url as valid article url 
        return article_url

    except Exception as e:
        print(f"Error removing url: {str(e)}")


In [12]:
df['article_url'] = df['link'].apply(extract_url)

In [13]:
# Check for remaining malformed URLs
bad_urls = df[df['article_url'].str.contains('http://http|https://https')]
print(f"Found {len(bad_urls)} still-malformed URLs")

Found 0 still-malformed URLs


In [14]:
count_url(df['article_url'])

'The dataframe has 0 rows with concatenated URLs.'

In [47]:
def get_article(url):
    if not isinstance(url, str):
        raise TypeError('URL must be a string format')
    
    contents = None
    paragraphs = []  # Initialize paragraphs to avoid reference errors
    
    try:
        # Configure session with redirect limits
        session = requests.Session()

        # Set max_redirects to 5 (default is 30)
        adapter = HTTPAdapter(
            max_retries=Retry(total=3, backoff_factor=1),
            pool_connections=10,         # Connection pool size
            pool_maxsize=10,
        )
        adapter.max_redirects = 5

        session.mount("http://", adapter)
        session.mount("https://", adapter)
        
        print(f"Processing URL: {url}...")  # Show  URL

        headers = {'User-Agent': 'Mozilla/5.0'}
        url_call = session.get(url,
                                headers=headers, 
                                timeout=10,                                
                                )
        
        if url_call.status_code == 200:
            soup = BeautifulSoup(url_call.content, 'html.parser')
            print(" ->Parsing HTML content...")

            # Try multiple selectors - expanded list
            selectors = [
                ('article', 'entry-content'),
                ('article', 'entry__content'),
                ('div', 'article-body'),
                ('div', 'article-content'),
                ('div', 'post-content'),
                ('div', 'hz-editorial')

            ]
            
            for tag, class_name in selectors:
                url_text = soup.find(tag, class_=class_name)
                
                if url_text:
                    print(f" -> Found article with {tag}.{class_name}")
                    paragraphs = [p.get_text(strip=True) for p in url_text.find_all('p')]
                    contents = ' '.join(paragraphs)
                    print(f" ->Extracted {len(paragraphs)} paragraphs ({len(contents)} chars)")
                    return contents
                    
            
            print(" -> No article content found with standard selectors")

    except requests.exceptions.TooManyRedirects:
        print(f" -> Skipping (too many redirects): {url[:60]}...")
    except requests.exceptions.SSLError:
        print(f" -> Skipping (SSL error): {url[:60]}...") 
    except requests.exceptions.ConnectionError:
        print(f" -> Skipping (connection failed): {url[:60]}...")
    except requests.exceptions.Timeout:
        print(f" -> Skipping (timeout): {url[:60]}...")
    except Exception as e:
        print(f" -> Skipping (error: {str(e)[:50]}...): {url[:60]}...")
    
    return None


# Process URLs in batches (adjust max_workers based on your system)

def parallel_scrape(urls):
    with ThreadPoolExecutor() as executor:
        return list(executor.map(get_article, urls))
    

def is_scrapable(url):
    if not isinstance(url, str):
        return False
        
    skip_domains = {
        'houzz.com',
        'theatlanticcities.com', 
        'dailycandy.com'
    }
    domain = url.split('//')[-1].split('/')[0].lower()
    return not any(skip in domain for skip in skip_domains)

# Apply filter before processing
#df['scrapable'] = df['article_url'].apply(is_scrapable)
#scrapable_df = df[df['scrapable']].copy()



In [None]:
#urls= df['article_url'].tolist()

In [49]:
# Initialize article column if needed
if 'article' not in df.columns:
    df['article'] = None

batch_size = 2500
scrapable_idx = df[df['scrapable']].index
total_batches = (len(scrapable_idx)) // batch_size + 1

for batch_num, i in enumerate(range(0, len(scrapable_idx), batch_size)):
    batch_end = min(i + batch_size, len(scrapable_idx))
    batch_indices = scrapable_idx[i:batch_end]
    
    # Clear progress message
    print(f"\nProcessing batch {batch_num + 1}/{total_batches} (rows {i}-{batch_end-1})", end="\r")
    
    # Process batch
    batch_urls = df.loc[batch_indices, 'article_url'].tolist()
    results = parallel_scrape(batch_urls)
    
    # Update DataFrame
    df.loc[batch_indices, 'article'] = results
    
    # Save progress
    df.to_csv("progress_checkpoint.csv", index=False)
    print(f"✅ Batch {batch_num + 1}/{total_batches} complete - {sum(x is not None for x in results)}/{len(results)} successful")


Processing URL: https://www.huffingtonpost.com/entry/what-if-we-were-all-famil_b_5510958.html...
Processing URL: http://www.washingtonpost.com/business/economy/aol-chief-ignites-firestorm-over-401k-cuts-and-distressed-babies-remark/2014/02/07/2116c03a-9012-11e3-b227-12a45d109e03_story.html...
Processing URL: https://www.huffingtonpost.com/entry/time-runs-out-at-main-dakota-access-pipeline-protest-camp_us_58acc6a9e4b04a0b274df548...
Processing URL: https://www.huffingtonpost.com/entry/one-glimpse-of-these-baby_n_5319015.html...
Processing URL: https://www.huffingtonpost.com/entry/mens-sweat-pheromone-cooperation_us_5baeb6f0e4b014374e2e5ae6...
Processing URL: https://www.huffingtonpost.com/entry/sleepover_us_5b9c0b18e4b03a1dcc7bf30b...
Processing URL: https://www.huffingtonpost.com/entry/making-goals_us_5b9c788ee4b03a1dcc7ef3c9...
Processing URL: https://www.huffingtonpost.com/entry/maybe-colleges-should-take-a-lesson-from-zoos_us_591325eee4b0e070cad70a9b...
Processing URL: https://www.

Investigating some of the links that no content was found, it indeed had no content. But for others that was not the case. 
Will proceed for now and revist the scraping function after buidling the multi agent model. 

In [50]:
news_df = pd.read_csv('progress_checkpoint.csv')
print(f"The dataset has {news_df.shape[1]} features and {news_df.shape[0]} rows.\nThe names of the categories are: {news_df.columns.tolist()}")
news_df.head()

The dataset has 9 features and 20953 rows.
The names of the categories are: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'article_url', 'article', 'scrapable']


Unnamed: 0,link,headline,category,short_description,authors,date,article_url,article,scrapable
0,https://www.huffingtonpost.com/entry/what-if-w...,What If We Were All Family Generation Changers?,IMPACT,"What if, in doing so, we won't just create new...","Matt Murrie, ContributorEdupreneur, Cofounder/...",2014-06-20,https://www.huffingtonpost.com/entry/what-if-w...,In Tara Stone's conversation from the What If....,True
1,https://www.huffingtonpost.comhttp://www.washi...,Firestorm At AOL Over Employee Benefit Cuts,BUSINESS,It should have been a glorious week for AOL ch...,,2014-02-08,http://www.washingtonpost.com/business/economy...,,True
2,https://www.huffingtonpost.com/entry/time-runs...,Dakota Access Protesters Arrested As Deadline ...,POLITICS,A few protesters who refused to leave remained...,"Michael McLaughlin & Josh Morgan, The Huffingt...",2017-02-22,https://www.huffingtonpost.com/entry/time-runs...,"CANNON BALL, N.D. ― A few dozen Dakota Access ...",True
3,https://www.huffingtonpost.com/entry/one-glimp...,One Glimpse Of These Baby Kit Foxes And You'll...,GREEN,,,2014-05-14,https://www.huffingtonpost.com/entry/one-glimp...,Photographer Donald Quintana melts our hearts ...,True
4,https://www.huffingtonpost.com/entry/mens-swea...,"Mens' Sweat Pheromone, Androstadienone, Influe...",SCIENCE,Scientists didn't know if humans played that g...,Melissa Cronin,2013-06-02,https://www.huffingtonpost.com/entry/mens-swea...,,True


In [54]:

#re-check feature that has missing value
df[df["category"]=="GREEN"][:5]

Unnamed: 0,link,headline,category,short_description,authors,date,article_url,article,scrapable
131494,https://www.huffingtonpost.com/entry/one-glimp...,One Glimpse Of These Baby Kit Foxes And You'll...,GREEN,,,2014-05-14,https://www.huffingtonpost.com/entry/one-glimp...,Photographer Donald Quintana melts our hearts ...,True
23177,https://www.huffingtonpost.com/entry/californi...,Firefighters Battle To Control 15 Wildfires Ra...,GREEN,The blazes have killed at least 13 people and ...,"By Marc Vartabedian, Reuters",2017-10-10,https://www.huffingtonpost.com/entry/californi...,"SANTA ROSA, Calif., Oct 10 (Reuters) - Firefig...",True
39940,https://www.huffingtonpost.com/entry/surfing-c...,Trump Won't Save Us From Climate Change. Maybe...,GREEN,“Surfers will be the canaries in the coal mine...,Lydia O'Connor,2017-03-22,https://www.huffingtonpost.com/entry/surfing-c...,"For surfers, countering PresidentDonald Trump’...",True
51124,https://www.huffingtonpost.com/entry/military-...,Global Security Leaders Have Some Climate Chan...,GREEN,Too bad he thinks it's a giant hoax!,Chris D'Angelo,2016-11-15,https://www.huffingtonpost.com/entry/military-...,International military and security experts ha...,True
102120,https://www.huffingtonpost.com/entry/why-we-ne...,Why We Need to Change How We Talk About Climat...,GREEN,Presenting climate change and its causes as th...,"Maguire Mealy, ContributorGraduate, McGill Uni...",2015-04-16,https://www.huffingtonpost.com/entry/why-we-ne...,The scientific community has reached a resound...,True


In [58]:
news_df.isna().sum()

link                     0
headline                 2
category                 0
short_description     1972
authors               3767
date                     0
article_url              0
article              16144
scrapable                0
dtype: int64

In [None]:
#check for missing values
df.isna().sum()

link                     0
headline                 0
category                 0
short_description        0
authors                  0
date                     0
article_url              0
article              16139
scrapable                0
dtype: int64

In [62]:
#fill missing values with nothing
news_df.fillna('',inplace=True)

#check for missing values
news_df.isna().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
article_url          0
article              0
scrapable            0
dtype: int64

In [71]:
def create_unified_doc(row):

    parts = []
    # Add components only if they exist and aren't empty
    if pd.notna(row.get('headline')) and str(row['headline']).strip():
        parts.append(f"Headline: {row['headline']}")

    if pd.notna(row.get('date')) and str(row['date']).strip():
        parts.append(f"Date: {row['date']}")
    
    if pd.notna(row.get('authors')) and str(row['authors']).strip():
        parts.append(f"Author: {row['authors']}")
    
    if pd.notna(row.get('short_description')) and str(row['short_description']).strip():
        parts.append(f"Summary: {row['short_description']}")
    
    if pd.notna(row.get('article')) and str(row['article']).strip():
        parts.append(f"Article: {row['article']}")
    
    return '\n'.join(parts) if parts else 'No available content'


news_df['document'] = news_df.apply(create_unified_doc, axis=1)

In [72]:
print(news_df['document'].iloc[0])

Headline: What If We Were All Family Generation Changers?
Date: 2014-06-20
Author: Matt Murrie, ContributorEdupreneur, Cofounder/Chief Curiosity Curator of What If...?
Summary: What if, in doing so, we won't just create new opportunities for ourselves, we'll also uncover ways to create new opportunities for our families that may not have otherwise existed?
Article: In Tara Stone's conversation from the What If...? Conference this past March she begins with the recognition that there is no such thing as a perfect family. It's up to us to be the difference that can make our families better. Tara urges us to go beyond the standards our families have provided and challenges us to push down new paths. What if, in doing so, we won't just create new opportunities for ourselves, we'll also uncover ways to create new opportunities for our families that may not have otherwise existed? The life you have doesn't mean it's the life you've been given. Sometimes we can make the biggest impact on the 

In [73]:
#save preprocessed dataset
news_df.to_csv('news_category_dataset.csv', index=False)