In [None]:
!pip install vaderSentiment
!pip install pandas_market_calendars

In [None]:
# Alternative data source (disabled): mount Google Drive and read the CSV from
# there instead of uploading it by hand.
#from google.colab import drive
#drive.mount('/content/drive')

# Colab-only: bring the input CSV into the runtime's working directory.
# `uploaded` is not referenced again in the visible cells; the data is read
# back by file name ("HackerNews.csv") in the next cell.
# NOTE(review): presumably this opens Colab's interactive upload prompt, so the
# notebook cannot run unattended -- confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd
import re
import pandas_market_calendars as mcal

# Read the Hacker News export from the working directory.
df = pd.read_csv(filepath_or_buffer="HackerNews.csv")
# Google Drive variant (use together with the drive.mount cell):
#df = pd.read_csv('/content/drive/MyDrive/NewsDataCX4240/HackerNews.csv')

#clean text function
def clean_text(text):
    """Normalise one raw comment body for keyword matching and sentiment scoring.

    Parameters
    ----------
    text : str or missing value
        Raw comment text, possibly containing ``<p>`` paragraph tags and HTML
        character references such as ``&amp;`` or ``&#x27;``.

    Returns
    -------
    str
        Lower-cased text with ``<p>`` replaced by a single space and HTML
        character references decoded. Missing input (``None``/``NaN``) yields
        the empty string.
    """
    # Local import keeps this function self-contained within its cell.
    import html

    if pd.isna(text):
        return ""
    # A paragraph break separates words, so it must become whitespace;
    # deleting it would fuse "apple<p>is" into the single token "appleis".
    text = text.replace("<p>", " ")
    # Decode entities instead of deleting them so that "&amp;" stays "&" and
    # "&#x27;" stays "'" -- keywords like "johnson & johnson" and "mickey d's"
    # depend on those characters being present.
    text = html.unescape(text)
    return text.lower()

# Normalised copy of each comment body; "text" itself is left untouched.
df["clean_text"] = df["text"].apply(clean_text)

#convert timestamp and extract date
# NOTE(review): the calendar date is taken in whatever timezone the timestamps
# are stored in; nothing here converts to exchange-local time -- confirm that
# is the intended day boundary.
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["date"] = df["timestamp"].dt.date

#get NYSE trading days
# valid_days() is queried over the span of the data; `.date` turns the result
# into plain date objects so it can be compared with df["date"].
nyse = mcal.get_calendar("NYSE")
trading_days = nyse.valid_days(
    start_date=str(df["date"].min()),
    end_date=str(df["date"].max())
).date

#filter to trading days only
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells (matched_ticker, date, sentiment) write to a real
# frame and do not trigger pandas' chained-assignment warning.
df = df[df["date"].isin(trading_days)].copy()


# Ticker symbol -> list of lower-case keywords (symbol plus common names).
#
# Keywords are lower-case because they are searched in the lower-cased
# "clean_text" column, and each one is matched as a whole word (regex \b...\b)
# by find_matching_ticker.
#
# Order matters: find_matching_ticker walks this dict in insertion order and
# returns the FIRST ticker with a matching keyword, so a comment that mentions
# several companies is attributed to whichever appears earliest here.
#
# NOTE(review): several keywords are ordinary words or names ("meta", "cost",
# "pm", "hd", "pg", "ko", "elon", "archer", "johnson", "progressive", "coke")
# and will match any comment that contains them as a whole word -- verify the
# false-positive rate is acceptable.
ticker_to_keywords = {
    "AAPL": ["aapl", "apple"],
    "MSFT": ["msft", "microsoft"],
    "NVDA": ["nvda", "nvidia"],
    "AMZN": ["amzn", "amazon"],
    "META": ["meta", "facebook", "meta platforms"],
    "BRK.B": ["brk.b", "berkshire", "buffett", "berkshire hathaway"],
    # GOOGL precedes GOOG and shares "alphabet"/"google" with it, so those two
    # keywords always resolve to GOOGL; GOOG can only be reached via "goog".
    "GOOGL": ["googl", "alphabet", "google"],
    "GOOG": ["goog", "alphabet", "google"],
    "TSLA": ["tsla", "tesla", "elon"],
    "JPM": ["jpm", "jpmorgan", "jp morgan"],
    "UNH": ["unh", "unitedhealth"],
    "ACHR": ["archer", "archer aviation", "achr"],
    "COST": ["cost", "costco"],
    "NFLX": ["nflx", "netflix"],
    "WMT": ["wmt", "walmart"],
    "PG": ["pg", "procter", "procter and gamble"],
    "JNJ": ["jnj", "johnson", "johnson & johnson"],
    "HD": ["hd", "home depot"],
    "KO": ["ko", "coca cola", "coke"],
    "CRM": ["crm", "salesforce"],
    "PM": ["pm", "philip morris"],
    "CVX": ["cvx", "chevron"],
    "CRWD": ["crwd", "crowdstrike"],
    "MCD": ["mcd", "mcdonalds", "mickey d's"],
    "ORCL": ["orcl", "oracle"],
    "ABT": ["abt", "abbott"],
    "IBM": ["ibm", "big blue"],
    "WFC": ["wfc", "wells fargo"],
    "PEP": ["pep", "pepsi", "pepsico"],
    "MRK": ["mrk", "merck"],
    "PLTR": ["pltr", "palantir"],
    "VZ": ["vz", "verizon"],
    "ACN": ["acn", "accenture"],
    "ISRG": ["isrg", "intuitive surgical"],
    "RTX": ["rtx", "raytheon", "rtx corp"],
    "TMO": ["tmo", "thermo fisher"],
    "INTU": ["intu", "intuit"],
    "PGR": ["pgr", "progressive"],
}

#match ticker using keywords
def find_matching_ticker(text):
    """Return the first ticker whose keyword occurs in ``text`` as a whole word.

    Tickers are tried in the insertion order of ``ticker_to_keywords``; the
    search stops at the first ticker that has any keyword matching between
    word boundaries. Returns ``None`` when no keyword matches.
    """
    hits = (
        symbol
        for symbol, aliases in ticker_to_keywords.items()
        if any(re.search(rf"\b{re.escape(alias)}\b", text) for alias in aliases)
    )
    # next() pulls only the first hit, so later tickers are never evaluated.
    return next(hits, None)

# Tag every comment with the first ticker its cleaned text mentions
# (None when no keyword matches).
df["matched_ticker"] = df["clean_text"].apply(find_matching_ticker)

# Preview the first ten comments that were attributed to a ticker.
matched_df = df.loc[
    df["matched_ticker"].notna(),
    ["timestamp", "clean_text", "matched_ticker"],
]
print(matched_df.head(10))


In [None]:
# How many comments were attributed to each ticker, most frequent first.
ticker_counts_df = (
    df["matched_ticker"]
    .value_counts()
    .rename_axis("ticker")
    .reset_index(name="count")
)
print(ticker_counts_df)

In [None]:
# Mentions per ticker per day.
# Re-derive the calendar date from the timestamp column.
df["date"] = pd.to_datetime(df["timestamp"]).dt.date

# Only comments that were attributed to a ticker take part in the count.
filtered_df = df.loc[df["matched_ticker"].notna()]

# One row per (date, ticker) with its comment count, newest day first and,
# within a day, the most-mentioned ticker first.
mentions_per_day = (
    filtered_df
    .groupby(["date", "matched_ticker"])
    .size()
    .reset_index(name="count")
    .sort_values(by=["date", "count"], ascending=False)
)

print(mentions_per_day.head(34))

In [None]:
# Total ticker mentions per day (all tickers combined).
df["date"] = pd.to_datetime(df["timestamp"]).dt.date

# Restrict to comments that matched a ticker.
filtered_df = df.loc[df["matched_ticker"].notna()]

# One row per date with the number of matched comments on that date.
daily_mentions = (
    filtered_df
    .groupby("date")
    .size()
    .reset_index(name="total_mentions")
)

print(daily_mentions)


In [None]:
#sentiment analysis using VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single module-level analyzer instance; get_sentiment below reads this name,
# so it is built once and reused for every comment.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return the overall ("compound") sentiment score for ``text``.

    Delegates to the module-level ``analyzer`` and keeps only the
    ``"compound"`` entry of the score mapping it returns.
    """
    return analyzer.polarity_scores(text)["compound"]

# Score every cleaned comment; rows without a ticker are scored too and
# dropped afterwards by the filter below.
df["sentiment"] = df["clean_text"].apply(get_sentiment)

# Re-derive the calendar date used as the grouping key.
df["date"] = pd.to_datetime(df["timestamp"]).dt.date

# Only comments attributed to a ticker feed the averages.
filtered_df = df.loc[df["matched_ticker"].notna()]

# Mean sentiment per day across all matched comments.
daily_sentiment = (
    filtered_df
    .groupby("date")["sentiment"]
    .mean()
    .reset_index(name="average_sentiment")
)

# Mean sentiment per (day, ticker).
sentiment_by_ticker_day = (
    filtered_df
    .groupby(["date", "matched_ticker"])["sentiment"]
    .mean()
    .reset_index(name="average_sentiment")
    .rename(columns={"matched_ticker": "ticker"})
)

# Show a sample of both tables.
print("Average Sentiment per Day:")
print(daily_sentiment.head(10))

print("\nAverage Sentiment per Ticker per Day:")
print(sentiment_by_ticker_day.head(34))

In [None]:
import pandas as pd
import pandas_market_calendars as mcal

# Build the 3-year dataset: one row per (NYSE trading day, ticker) holding the
# comment count and mean sentiment, written to "year3news.csv".

# Comments per day per ticker.
ticker_volume_by_day = (
    filtered_df
    .groupby(["date", "matched_ticker"])
    .size()
    .reset_index(name="comment_volume")
)

# Mean sentiment per day per ticker.
ticker_sentiment_by_day = (
    filtered_df
    .groupby(["date", "matched_ticker"])["sentiment"]
    .mean()
    .reset_index(name="average_sentiment")
)

# Join the two aggregates on their shared (date, ticker) key and expose the
# ticker column under a shorter name.
final_data = ticker_volume_by_day.merge(
    ticker_sentiment_by_day,
    on=["date", "matched_ticker"],
    how="inner",
).rename(columns={"matched_ticker": "ticker"})

# Date window and ticker universe for this export.
yr3start_date = "2022-01-10"
yr3end_date = "2025-04-11"

yr3selected_tickers = [
    'PG', 'MCD', 'TSLA', 'JPM', 'GOOGL', 'AAPL', 'COST', 'KO', 'NFLX', 'PM', 'META',
    'NVDA', 'BRK.B', 'WMT', 'AMZN', 'HD', 'GOOG', 'MSFT', 'PLTR', 'INTU', 'JNJ',
    'ORCL', 'PEP', 'RTX', 'VZ', 'IBM', 'CRM', 'PGR'
]

# Dates were plain date objects until now; convert them to midnight timestamps
# so they line up with the trading-day index built below.
final_data["date"] = pd.to_datetime(final_data["date"]).dt.normalize()

# Keep only the chosen tickers inside the (inclusive) date window.
filtered_data3 = final_data[
    final_data["ticker"].isin(yr3selected_tickers)
    & final_data["date"].between(
        pd.to_datetime(yr3start_date), pd.to_datetime(yr3end_date)
    )
]

# NYSE trading days in the window, made timezone-naive and set to midnight so
# they compare equal to final_data["date"].
nyse = mcal.get_calendar("NYSE")
nyse_days = pd.to_datetime(
    nyse.valid_days(start_date=yr3start_date, end_date=yr3end_date).tz_localize(None)
).normalize()

# Every (trading day, ticker) combination that the output must contain.
full_index = pd.MultiIndex.from_product(
    [nyse_days, yr3selected_tickers],
    names=["date", "ticker"],
)

# Expand to the full grid; combinations with no comments come back as NaN and
# are recorded as zero volume / zero sentiment. Rows are then ordered by date
# and ticker.
filled_data = (
    filtered_data3
    .set_index(["date", "ticker"])
    .reindex(full_index)
    .reset_index()
    .assign(
        comment_volume=lambda grid: grid["comment_volume"].fillna(0),
        average_sentiment=lambda grid: grid["average_sentiment"].fillna(0),
    )
    .sort_values(by=["date", "ticker"])
)

# Persist and preview.
filled_data.to_csv("year3news.csv", index=False)

print(filled_data.head(20))

# uncomment below to download csv
#from google.colab import files
#files.download("year3news.csv")


In [None]:
# Build the 1-year dataset: one row per (NYSE trading day, ticker) for a
# smaller ticker universe. Reads `final_data` and `nyse` from the 3-year cell,
# so that cell must have been run first.
selected_tickers = [
    "NVDA",  # Nvidia
    "AAPL",  # Apple
    "COST",  # Costco Wholesale Corp.
    "TSLA",  # Tesla, Inc.
    "GOOGL", # Alphabet Inc. Class A
    "PM",    # Philip Morris International Inc.
    "MSFT",  # Microsoft
    "META",  # Meta Platforms, Inc. Class A
    "KO",    # Coca-Cola Company
    "AMZN",  # Amazon.com Inc.
    "PG",    # Procter & Gamble Company
    "BRK.B", # Berkshire Hathaway Class B
    "HD",    # Home Depot, Inc.
    "GOOG",  # Alphabet Inc. Class C
    "PLTR"   # Palantir
]

# Inclusive date window for this export.
start_date = "2024-02-02"
end_date = "2025-04-11"


# Filter the data by ticker and date range
filtered_data = final_data[
    (final_data['ticker'].isin(selected_tickers)) &
    (final_data['date'] >= pd.to_datetime(start_date)) &
    (final_data['date'] <= pd.to_datetime(end_date))
]

# NYSE trading days in the window, timezone-naive and at midnight so they
# compare equal to final_data["date"].
nyse_days1 = nyse.valid_days(start_date=start_date, end_date=end_date).tz_localize(None)
nyse_days1 = pd.to_datetime(nyse_days1).normalize()

# Create full date-ticker index
full_index1 = pd.MultiIndex.from_product(
    [nyse_days1, selected_tickers],
    names=["date", "ticker"]
)

# Reindex to complete grid; combinations with no comments become NaN rows.
filled_data1 = filtered_data.set_index(["date", "ticker"]).reindex(full_index1).reset_index()

# Fill missing values with 0 (no comments -> zero volume, zero sentiment)
filled_data1["comment_volume"] = filled_data1["comment_volume"].fillna(0)
filled_data1["average_sentiment"] = filled_data1["average_sentiment"].fillna(0)

# Sort by date, then ticker (saving is optional -- see below)
filled_data1 = filled_data1.sort_values(by=["date", "ticker"])

# Uncomment below to save csv
#from google.colab import files
#filled_data1.to_csv("News1year.csv", index=False)

# Preview the 1-year frame built in this cell. The original printed
# `filled_data`, which is the 3-year frame from the previous cell.
print(filled_data1.head(20))