# Scraping Hacker news data for last 24 hours

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime,timedelta

In [14]:
# Define the URL for Google News RSS feed (Rich Site Summary)

url = "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en"

In [19]:
response = requests.get(url)
response

<Response [200]>

In [17]:
# parse Xml content

soup = BeautifulSoup(response.content,'xml')

In [18]:
# Find all item elements which represent individual news articles

articles = soup.find_all('item')

In [20]:
articles

[<item><title>Trump arrives in Miami for arraignment on first-ever federal charges against an ex-president - CNBC</title><link>https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjMvMDYvMTIvZG9uYWxkLXRydW1wLWluZGljdG1lbnQtdHJ1bXAtYXJyaXZlcy1pbi1taWFtaS1mb3ItYXJyYWlnbm1lbnQuaHRtbNIBZ2h0dHBzOi8vd3d3LmNuYmMuY29tL2FtcC8yMDIzLzA2LzEyL2RvbmFsZC10cnVtcC1pbmRpY3RtZW50LXRydW1wLWFycml2ZXMtaW4tbWlhbWktZm9yLWFycmFpZ25tZW50Lmh0bWw?oc=5</link><guid isPermaLink="false">CBMiY2h0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjMvMDYvMTIvZG9uYWxkLXRydW1wLWluZGljdG1lbnQtdHJ1bXAtYXJyaXZlcy1pbi1taWFtaS1mb3ItYXJyYWlnbm1lbnQuaHRtbNIBZ2h0dHBzOi8vd3d3LmNuYmMuY29tL2FtcC8yMDIzLzA2LzEyL2RvbmFsZC10cnVtcC1pbmRpY3RtZW50LXRydW1wLWFycml2ZXMtaW4tbWlhbWktZm9yLWFycmFpZ25tZW50Lmh0bWw</guid><pubDate>Mon, 12 Jun 2023 21:49:44 GMT</pubDate><description>&lt;ol&gt;&lt;li&gt;&lt;a href="https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjMvMDYvMTIvZG9uYWxkLXRydW1wLWluZGljdG1lbnQtdHJ1bXAtYXJyaXZlcy1pbi1

In [30]:
data = []                                                                                                      

In [31]:
cutoff_time = datetime.now()-timedelta(hours=24)

In [32]:
for article in articles :
    # Extract relevant data from each article 
    
    pub_date = datetime.strptime(article.pubDate.text,"%a, %d %b %Y %H:%M:%S %Z")
    
    #check if the article was published within the last 24 hours
    if pub_date >= cutoff_time :
        title = article.title.text 
        link = article.link.text 
        description = article.description.text
        
        data.append([title,link,description])
        
df = pd.DataFrame(data,columns=['Title','Link','Description'])
    
        
        
        
        
        

In [34]:
df.head()

Unnamed: 0,Title,Link,Description
0,Trump arrives in Miami for arraignment on firs...,https://news.google.com/rss/articles/CBMiY2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,California Gov. Newsom spars with Fox News hos...,https://news.google.com/rss/articles/CBMiaWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,"Kryvyi Rih Hit, S-300 Wreckage Found In Dnipro...",https://news.google.com/rss/articles/CCAiC3VGV...,"<ol><li><a href=""https://news.google.com/rss/a..."
3,The different ways Republicans defend Trump ov...,https://news.google.com/rss/articles/CBMiMWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,Fact checking Chris Christie’s CNN town hall -...,https://news.google.com/rss/articles/CBMiWmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."


In [35]:
# Apply unsupervised clustering algorihtm to get the labelled data

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [58]:
vectorizer = TfidfVectorizer()
X =  vectorizer.fit_transform(df['Description'])

In [59]:
k=5
kmeans = KMeans(n_clusters=k,random_state=42)
kmeans.fit(X)



In [60]:
labels = kmeans.labels_

In [61]:
labels

array([4, 4, 1, 4, 4, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 3, 3, 1, 1], dtype=int32)

In [62]:
df['Cluster'] = labels

In [64]:
cluster_counts = df['Cluster'].value_counts().sort_index()

In [65]:
cluster_counts

Cluster
0     4
1    23
2     2
3     2
4     4
Name: count, dtype: int64