In [None]:
# Task 1 : Multi-source Financial News Collection
import requests 
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
def fetch_news(source,url,articles,timeout=10):
    """Download one RSS feed and append its headlines to ``articles``.

    Parameters
    ----------
    source : str
        Feed name stored in the "source" field of every record.
    url : str
        Address of the RSS (XML) feed to download.
    articles : list
        Extended in place with one dict per <item> element, keyed
        "source", "headline" and "pub_Date". A missing <title> or
        <pubDate> is recorded as "N/A".
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list
        The records added by this call; an empty list when the feed could
        not be retrieved (in which case ``articles`` is left untouched).
    """
    try:
        # Without a timeout a stalled server would hang the notebook forever.
        response=requests.get(url,timeout=timeout)
    except requests.RequestException as exc:
        print(f"failed to retrieve '{source}' news: {exc}")
        return []
    if response.status_code!=200:
        print(f"failed to retrieve '{source}' news (HTTP {response.status_code})")
        return []

    soup=BeautifulSoup(response.text,"xml")
    new_articles=[]
    for item in soup.find_all("item"):
        headline=item.find("title")
        pub_Date=item.find("pubDate")
        new_articles.append({
            "source":source,
            "headline":headline.text if headline is not None else "N/A",
            "pub_Date":pub_Date.text if pub_Date is not None else "N/A"
        })
    articles.extend(new_articles)
    return new_articles

    
# Feeds to collect, listed in fetch order: display name -> RSS URL.
rss_feeds={
    'Wall Street Journal':'https://feeds.content.dowjones.io/public/rss/RSSUSnews',
    'MarketWatch':'https://feeds.content.dowjones.io/public/rss/mw_marketpulse',
    'Bloomberg':'https://feeds.bloomberg.com/markets/news.rss'
}

# Collect every feed into one shared list of records.
articles=[]
for feed_name,feed_url in rss_feeds.items():
    fetch_news(feed_name,feed_url,articles)

# Naming the columns explicitly keeps the CSV header present even when no
# article could be fetched, so the cleaning step can still read the file.
df=pd.DataFrame(articles,columns=["source","headline","pub_Date"])

# Written relative to the working directory, which is where the cleaning
# step looks for 'news_raw.csv'.
df.to_csv("news_raw.csv",index=False)

# Task 2: XML Structure Understanding

## Which XML tags were used to extract the headlines?
The headlines were extracted using three XML tags: `<item>` to locate each news entry, `<title>` for the headline text, and `<pubDate>` for the publication date.

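## What is the role of the `<item>` tag in RSS feeds?
The `<item>` tag represents a single entry (one news article) in an RSS feed. It wraps everything that belongs to that article, such as its `<title>` and `<pubDate>`, so each `<item>` marks where one article begins and ends.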

## How does an RSS feed differ from a normal HTML webpage?
An RSS feed is written for machine readability, while a normal HTML webpage is designed to be read by humans in a structured visual format. RSS feeds use standardized tags that make it easy to extract specific information with a program, whereas HTML pages require web scraping and are less consistent in structure.

In [None]:
# Task 3 : News Data Cleaning and Standardization
import pandas as pd 
import numpy as np
# Display settings: never truncate cell text and never hide columns.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# Load the raw news CSV from the working directory.
news_df = pd.read_csv('news_raw.csv')

# Parse the publication timestamps into timezone-aware (UTC) datetimes.
news_df['pub_Date'] = pd.to_datetime(news_df['pub_Date'], utc=True)

# Derive the calendar date and the character count of each headline.
news_df = news_df.assign(
    date=news_df["pub_Date"].dt.date,
    headline_length=news_df["headline"].str.len(),
)

# Persist the cleaned table without the row index.
news_df.to_csv("news_cleaned.csv", index=False)

In [None]:
# Task 4 : Stock Price Data Collection
import pandas as pd 
import numpy as np
import yfinance as yf 
# Symbol whose price history is downloaded.
ticker = "TSLA"

# Request daily rows ("1d") covering a "10d" period through yfinance.
stock_df = yf.download(
    ticker,
    period="10d",
    interval="1d",
)

# Save the prices with the index written as the first CSV column,
# then print the table.
stock_df.to_csv("stock_data.csv", index=True)
print(stock_df)

# Task 5: Market Calendar Awareness 

# Which dates in your news data are non-trading days?
Based on the news in the dataset, the non-trading days are:
- 01-01-2026 (New Year's Day, market holiday)
- 28-12-2025 (Sunday)
- 27-12-2025 (Saturday)
- 21-12-2025 (Sunday)
- 20-12-2025 (Saturday)
- 28-12-2024 (Saturday)

# Why does the stock market not trade on those days?
The U.S. stock market does not operate on:
- Weekends, because financial exchanges such as the NYSE and NASDAQ are officially closed on Saturdays and Sundays.
- Federal holidays, such as New Year's Day (January 1st), when markets are closed to observe national holidays.

On these days, no official stock price data is generated.

# How many news articles fall on non-trading days?
- 10 news articles were published on weekends.
- 18 news articles were published on New Year's Day.
- In total, 28 news articles were published on non-trading days.

In [None]:
# Task 6 : Intelligent Data Merging
import pandas as pd 
import numpy as np
def _parse_date_column(values):
    """Parse a column of date strings into datetimes.

    The day-month-year layout ("%d-%m-%Y") is tried first. If the strings do
    not match it (pandas raises ValueError), parsing falls back to pandas'
    own format inference, which handles the ISO "YYYY-MM-DD" strings that
    ``to_csv`` writes for date values.
    """
    try:
        return pd.to_datetime(values, format="%d-%m-%Y")
    except ValueError:
        return pd.to_datetime(values)


news_df=pd.read_csv("news_cleaned.csv")

# skiprows=2 drops the first two lines of stock_data.csv so that the line
# starting with "Date" becomes the header; its remaining cells are empty and
# are therefore read as "Unnamed: 1" ... "Unnamed: 5".
# NOTE(review): the mapping below assumes the skipped header listed the price
# columns as Close, High, Low, Open, Volume -- confirm against the CSV.
stock_df=pd.read_csv("stock_data.csv",skiprows=2).rename(columns={
    "Unnamed: 1":"Close",
    "Unnamed: 2":"High",
    "Unnamed: 3":"Low",
    "Unnamed: 4":"Open",
    "Unnamed: 5":"Volume",
})

# Bring both join keys to the same datetime dtype before merging.
stock_df["Date"] = _parse_date_column(stock_df["Date"])
news_df["date"] = _parse_date_column(news_df["date"])

# Left join keeps every article; price columns stay empty when the article's
# date has no row in the stock data.
merged_df=pd.merge(
    news_df,
    stock_df,
    left_on="date",
    right_on="Date",
    how="left"
)

# True when the article's date matched a stock row. Dates that fall outside
# the range covered by stock_data.csv are also reported as False.
merged_df["is_trading_day"] = merged_df["Date"].notna()

merged_df.to_csv("merged_midterm_data.csv", index=False)

In [None]:
# Task 7 : Plotting Various Insights from Merged Data
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
def _plot_counts(counts, kind, xlabel, ylabel, title):
    """Draw ``counts`` on a fresh figure with the given labels and show it."""
    fig, ax = plt.subplots()
    counts.plot(kind=kind, ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    plt.show()


merged_df = pd.read_csv('merged_midterm_data.csv')

# Number of articles published on each date, drawn as a line.
news_per_day = merged_df.groupby('date').size()
_plot_counts(
    news_per_day,
    kind="line",
    xlabel="Date",
    ylabel="Number of News Articles",
    title="Daily News Article Count Over Time",
)

# Number of articles contributed by each source, drawn as bars.
news_by_source = merged_df["source"].value_counts()
_plot_counts(
    news_by_source,
    kind="bar",
    xlabel="News Source",
    ylabel="Number of Articles",
    title="News Distribution Across Sources",
)

# Number of articles on trading versus non-trading days, drawn as bars.
trading_vs_non = merged_df["is_trading_day"].value_counts()
_plot_counts(
    trading_vs_non,
    kind="bar",
    xlabel="Is Trading Day",
    ylabel="Number of Articles",
    title="Trading vs Non-Trading Day News Frequency",
)