In [2]:
# --- Cell 1: imports + helpers ---
import time
import re
import json
import html
from datetime import datetime, timezone

import requests

APP_ID = 2344520  # Diablo IV on Steam (SteamDB confirms)  :contentReference[oaicite:5]{index=5}

def get_json(url, params=None, timeout=30):
    r = requests.get(url, params=params, timeout=timeout)
    # Keep response text for debugging if JSON decode fails
    try:
        data = r.json()
    except Exception:
        raise RuntimeError(f"Non-JSON response (status={r.status_code}): {r.text[:300]}")
    return r.status_code, data

def strip_html(text):
    if text is None:
        return ""
    text = html.unescape(text)
    # very simple tag stripper (good enough for a first pass)
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def unix_to_iso(ts):
    return datetime.fromtimestamp(int(ts), tz=timezone.utc).isoformat()


In [3]:
# --- Cell 2: try Steam "News API" (ISteamNews/GetNewsForApp) ---
# NOTE: Steamworks docs have a publisher-only authed method; the public method is commonly used.
# We'll try the public endpoint first, then fall back to RSS if blocked.

def fetch_steam_news_api(appid, count=100, maxlength=0, feeds="steam_community_announcements"):
    url = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/"
    params = {
        "appid": appid,
        "count": count,
        "maxlength": maxlength,
        "format": "json",
        "feeds": feeds,  # often used to narrow to announcements
    }
    status, data = get_json(url, params=params)
    if status != 200:
        raise RuntimeError(f"News API HTTP {status}: {str(data)[:200]}")
    if "appnews" not in data or "newsitems" not in data["appnews"]:
        raise RuntimeError(f"Unexpected news payload: {list(data.keys())}")
    items = data["appnews"]["newsitems"]
    # normalize a bit
    norm = []
    for it in items:
        norm.append({
            "source": "steam_news_api",
            "gid": it.get("gid"),
            "title": it.get("title"),
            "url": it.get("url"),
            "date_utc": unix_to_iso(it.get("date", 0)) if it.get("date") else None,
            "contents": strip_html(it.get("contents", "")),
            "feedname": it.get("feedname"),
            "feedlabel": it.get("feedlabel"),
        })
    return norm

news_items = []
try:
    news_items = fetch_steam_news_api(APP_ID, count=100, maxlength=0)
    print(f"‚úÖ News API worked. Items: {len(news_items)}")
except Exception as e:
    print(f"‚ö†Ô∏è News API failed: {e}")
    news_items = []


‚úÖ News API worked. Items: 40


In [4]:
# --- Cell 3: RSS fallback for Steam News (works even when API is blocked) ---
import xml.etree.ElementTree as ET

def fetch_steam_news_rss(appid):
    # RSS URL pattern is commonly used for Steam News feeds
    rss_url = f"https://store.steampowered.com/feeds/news/app/{appid}/"
    r = requests.get(rss_url, timeout=30)
    if r.status_code != 200:
        raise RuntimeError(f"RSS HTTP {r.status_code}: {r.text[:200]}")
    root = ET.fromstring(r.text)

    # RSS structure: rss > channel > item*
    channel = root.find("channel")
    if channel is None:
        raise RuntimeError("RSS: missing channel")

    items = []
    for item in channel.findall("item"):
        title = (item.findtext("title") or "").strip()
        link = (item.findtext("link") or "").strip()
        pubdate = (item.findtext("pubDate") or "").strip()
        desc = strip_html(item.findtext("description") or "")

        items.append({
            "source": "steam_news_rss",
            "title": title,
            "url": link,
            "date_raw": pubdate,
            "contents": desc
        })

    return items

if not news_items:
    try:
        news_items = fetch_steam_news_rss(APP_ID)
        print(f"‚úÖ RSS worked. Items: {len(news_items)}")
    except Exception as e:
        print(f"‚ùå RSS failed too: {e}")
        news_items = []


In [5]:
# --- Cell 4: filter "patch notes" style posts from news/announcements ---
def looks_like_patch_note(title):
    t = (title or "").lower()
    keywords = ["patch", "hotfix", "patch notes", "update", "season", "vessel of hatred"]
    return any(k in t for k in keywords)

patch_posts = [x for x in news_items if looks_like_patch_note(x.get("title"))]

print(f"Total news items: {len(news_items)}")
print(f"Patch-like posts: {len(patch_posts)}")

for p in patch_posts[:5]:
    print("-" * 80)
    print(p.get("title"))
    print(p.get("url"))
    print((p.get("date_utc") or p.get("date_raw") or ""))
    print(p.get("contents", "")[:300], "...")


Total news items: 40
Patch-like posts: 23
--------------------------------------------------------------------------------
Diablo IV | Season of Divine Intervention | Season 11 Now Live!
https://steamstore-a.akamaihd.net/news/externalpost/steam_community_announcements/1819386365083889
2025-12-16T23:16:59+00:00
[p][/p][previewyoutube="qvkD6xHuuqo;full"][/previewyoutube][p]‚ÄúDarkness gathers. But fear not, Wanderer. A new day is coming. And you will be the dawn.‚Äù - Hadriel [/p][p][/p][p]Beat back the Lesser Evils with the power of the Divine Gifts. Partake in evolved monster combat, a reworked item journey,  ...
--------------------------------------------------------------------------------
Diablo IV | Season of Infernal Chaos | Season 10 Now Live!
https://steamstore-a.akamaihd.net/news/externalpost/steam_community_announcements/1811138915560849
2025-09-23T17:00:25+00:00
[p][/p][previewyoutube="kDc2ue8cAJo;full"][/previewyoutube][p]Hunt for Chaos Armor with max power and rare affixes

In [6]:
# --- Cell 5: fetch Steam reviews (store.steampowered.com/appreviews/<appid>?json=1) ---
# Steam documents cursor paging ("*" for first page) and parameters like filter, day_range, review_type, num_per_page. :contentReference[oaicite:6]{index=6}

def fetch_steam_reviews(appid, max_reviews=200, language="english", review_type="all", filter_mode="recent", num_per_page=100, sleep_s=1.0):
    url = f"https://store.steampowered.com/appreviews/{appid}"
    cursor = "*"
    all_reviews = []
    query_summary = None

    while len(all_reviews) < max_reviews:
        params = {
            "json": 1,
            "cursor": cursor,
            "language": language,
            "review_type": review_type,
            "filter": filter_mode,
            "num_per_page": min(int(num_per_page), 100),
        }
        r = requests.get(url, params=params, timeout=30)
        if r.status_code != 200:
            raise RuntimeError(f"Reviews HTTP {r.status_code}: {r.text[:200]}")
        data = r.json()

        if data.get("success") != 1:
            raise RuntimeError(f"Reviews API returned success={data.get('success')}: {str(data)[:200]}")

        if query_summary is None:
            query_summary = data.get("query_summary")

        reviews = data.get("reviews", [])
        if not reviews:
            break

        for rev in reviews:
            all_reviews.append({
                "recommendationid": rev.get("recommendationid"),
                "created_utc": unix_to_iso(rev.get("timestamp_created", 0)),
                "updated_utc": unix_to_iso(rev.get("timestamp_updated", 0)),
                "voted_up": rev.get("voted_up"),
                "votes_up": rev.get("votes_up"),
                "weighted_vote_score": rev.get("weighted_vote_score"),
                "playtime_at_review": (rev.get("author") or {}).get("playtime_at_review"),
                "review": rev.get("review", ""),
            })
            if len(all_reviews) >= max_reviews:
                break

        cursor = data.get("cursor", cursor)
        time.sleep(sleep_s)  # be polite

    return query_summary, all_reviews

summary, reviews = fetch_steam_reviews(APP_ID, max_reviews=120, filter_mode="recent", num_per_page=100, sleep_s=1.0)
print("‚úÖ Pulled reviews:", len(reviews))
print("Summary keys:", list((summary or {}).keys())[:10])
print("First review snippet:", reviews[0]["review"][:250])


‚úÖ Pulled reviews: 120
Summary keys: ['num_reviews', 'review_score', 'review_score_desc', 'total_positive', 'total_negative', 'total_reviews']
First review snippet: wholesome...needs a guide to how to do things for best experience


In [7]:
# --- Cell 6: save your test pull to disk (JSONL is convenient for NLP pipelines) ---
def write_jsonl(path, rows):
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

write_jsonl("diablo4_steam_news.jsonl", patch_posts)
write_jsonl("diablo4_steam_reviews.jsonl", reviews)

print("Saved:")
print(" - diablo4_steam_news.jsonl")
print(" - diablo4_steam_reviews.jsonl")


Saved:
 - diablo4_steam_news.jsonl
 - diablo4_steam_reviews.jsonl


In [8]:
import json
import pandas as pd

def read_jsonl(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                rows.append(json.loads(line))
    return rows

reviews = read_jsonl("diablo4_steam_reviews.jsonl")
news    = read_jsonl("diablo4_steam_news.jsonl")

df_reviews = pd.DataFrame(reviews)
df_news    = pd.DataFrame(news)

df_reviews["created_utc"] = pd.to_datetime(df_reviews["created_utc"])
df_news["date_utc"] = pd.to_datetime(df_news["date_utc"])

display(df_reviews.head(3))
display(df_news[["date_utc","title","url"]].head(10))


Unnamed: 0,recommendationid,created_utc,updated_utc,voted_up,votes_up,weighted_vote_score,playtime_at_review,review
0,217559543,2026-02-04 14:00:16+00:00,2026-02-04T14:00:16+00:00,True,0,0.5,1477,wholesome...needs a guide to how to do things ...
1,217556431,2026-02-04 13:00:21+00:00,2026-02-04T13:00:21+00:00,True,1,0.4897739589214324,4532,DABLO
2,217553728,2026-02-04 12:02:00+00:00,2026-02-04T12:02:00+00:00,True,0,0.5,7786,Old school game revisited time and time again....


Unnamed: 0,date_utc,title,url
0,2025-12-16 23:16:59+00:00,Diablo IV | Season of Divine Intervention | Se...,https://steamstore-a.akamaihd.net/news/externa...
1,2025-09-23 17:00:25+00:00,Diablo IV | Season of Infernal Chaos | Season ...,https://steamstore-a.akamaihd.net/news/externa...
2,2025-07-01 17:00:26+00:00,Diablo IV | Sins of the Horadrim | Season 9 No...,https://steamstore-a.akamaihd.net/news/externa...
3,2025-01-14 18:39:35+00:00,Master the Occult in Season of Witchcraft (par...,https://steamstore-a.akamaihd.net/news/externa...
4,2025-01-14 18:38:54+00:00,Master the Occult in Season of Witchcraft (par...,https://steamstore-a.akamaihd.net/news/externa...
5,2024-10-08 17:03:27+00:00,Diablo IV | Vessel of Hatred,https://steamstore-a.akamaihd.net/news/externa...
6,2024-10-01 16:03:00+00:00,What You Need to Know for Vessel of Hatred‚Äôs L...,https://steamstore-a.akamaihd.net/news/externa...
7,2024-06-09 18:18:14+00:00,Prepare to embody the Spiritborn: Pre-purchase...,https://steamstore-a.akamaihd.net/news/externa...
8,2024-05-14 18:26:34+00:00,Galvanize your Legend in Season 4: Loot Reborn,https://steamstore-a.akamaihd.net/news/externa...
9,2024-01-23 19:46:33+00:00,Diablo IV | Season of the Construct | Gameplay...,https://steamstore-a.akamaihd.net/news/externa...


In [9]:
import re
import html

def steam_bbcode_to_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = html.unescape(s)

    # Convert [url=LINK]text[/url] -> text (and keep link separately if you want)
    s = re.sub(r"\[url=([^\]]+)\]([^\[]+)\[/url\]", r"\2", s)

    # Drop common tags
    s = re.sub(r"\[/?(b|i|u|h1|h2|h3|p|list|quote|previewyoutube)[^\]]*\]", " ", s)
    s = re.sub(r"\[\*\]|\[\/\*\]", " ", s)

    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

def extract_urls(s: str):
    if not isinstance(s, str):
        return []
    # capture [url=LINK]...[/url]
    return re.findall(r"\[url=([^\]]+)\]", s)

df_news["blizzard_urls"] = df_news["contents"].apply(extract_urls)
df_news["text_clean"] = df_news["contents"].apply(steam_bbcode_to_text)

df_news[["title","date_utc","blizzard_urls","text_clean"]].head(5)


Unnamed: 0,title,date_utc,blizzard_urls,text_clean
0,Diablo IV | Season of Divine Intervention | Se...,2025-12-16 23:16:59+00:00,"[""https://blizz.ly/3KybnQ4""]","‚ÄúDarkness gathers. But fear not, Wanderer. A n..."
1,Diablo IV | Season of Infernal Chaos | Season ...,2025-09-23 17:00:25+00:00,"[""https://www.youtube.com/hashtag/diabloiv"", ""...",Hunt for Chaos Armor with max power and rare a...
2,Diablo IV | Sins of the Horadrim | Season 9 No...,2025-07-01 17:00:26+00:00,"[""https://www.youtube.com/hashtag/diabloiv""]",Some secrets are too dark to be contained üíÄ ‚ö° ...
3,Master the Occult in Season of Witchcraft (par...,2025-01-14 18:39:35+00:00,[],The Tree of Whispers‚Äô most prized possessions ...
4,Master the Occult in Season of Witchcraft (par...,2025-01-14 18:38:54+00:00,"[https://news.blizzard.com/diablo4/24158873/, ...",New Legendary Aspects and Unique Items New pow...


In [12]:
import re

pattern = r"(?:patch notes|hotfix|\bpatch\b)"
mask = df_news["title"].str.contains(pattern, case=False, regex=True, na=False)
df_patch = df_news[mask].copy()


df_patch = df_patch.sort_values("date_utc", ascending=False)
df_patch[["date_utc", "title", "url"]].head(20)


Unnamed: 0,date_utc,title,url
12,2023-12-06 00:45:20+00:00,Diablo IV Patch Notes: 1.2.2b Build #47240,https://steamstore-a.akamaihd.net/news/externa...
13,2023-11-16 18:23:03+00:00,Diablo IV Patch Notes: 1.2.2b Build #47240,https://steamstore-a.akamaihd.net/news/externa...
14,2023-11-08 18:53:54+00:00,Diablo IV Patch Notes: 1.2.2a Build #47002,https://steamstore-a.akamaihd.net/news/externa...
15,2023-11-07 18:15:18+00:00,Diablo IV Patch Notes: 1.2.2 Build #46837,https://steamstore-a.akamaihd.net/news/externa...
16,2023-10-27 17:01:14+00:00,Diablo IV Patch Notes: 1.2.1 Build #46666,https://steamstore-a.akamaihd.net/news/externa...
17,2023-10-25 01:28:24+00:00,Diablo IV Patch Notes: 1.2.0d,https://steamstore-a.akamaihd.net/news/externa...
18,2023-10-20 18:19:11+00:00,Diablo IV Patch Notes: 1.2.0a and 1.2.0b,https://steamstore-a.akamaihd.net/news/externa...
20,2023-10-16 21:03:44+00:00,Diablo IV Patch Notes,https://steamstore-a.akamaihd.net/news/externa...


In [13]:
import time, requests, pandas as pd

APP_ID = 2344520

def fetch_news_page(appid, count=100, maxlength=0, enddate=None, feeds="steam_community_announcements"):
    url = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/"
    params = {"appid": appid, "count": count, "maxlength": maxlength, "format": "json", "feeds": feeds}
    if enddate is not None:
        params["enddate"] = int(enddate)
    data = requests.get(url, params=params, timeout=30).json()
    return data["appnews"]["newsitems"]

def fetch_all_news(appid, max_pages=50, sleep_s=0.7):
    all_items = []
    enddate = None
    for _ in range(max_pages):
        items = fetch_news_page(appid, count=100, maxlength=0, enddate=enddate)
        if not items:
            break
        all_items.extend(items)
        oldest = min(i["date"] for i in items)  # unix epoch
        enddate = oldest - 1
        time.sleep(sleep_s)
    return all_items

all_news_raw = fetch_all_news(APP_ID, max_pages=30)
df_all_news = pd.DataFrame(all_news_raw)
df_all_news["date_utc"] = pd.to_datetime(df_all_news["date"], unit="s", utc=True)

df_all_news[["date_utc","title","url"]].sort_values("date_utc", ascending=False).head(20)


Unnamed: 0,date_utc,title,url
0,2026-02-06 08:05:26+00:00,Diablo 30th Anniversary Spotlight Livestream,https://steamstore-a.akamaihd.net/news/externa...
1,2025-12-17 23:25:50+00:00,Stand Against Mephisto: Pre-Purchase Lord of H...,https://steamstore-a.akamaihd.net/news/externa...
2,2025-12-16 23:16:59+00:00,Diablo IV | Season of Divine Intervention | Se...,https://steamstore-a.akamaihd.net/news/externa...
3,2025-11-25 19:11:07+00:00,Diablo IV - Steam Awards Nomination,https://steamstore-a.akamaihd.net/news/externa...
4,2025-09-23 17:00:25+00:00,Diablo IV | Season of Infernal Chaos | Season ...,https://steamstore-a.akamaihd.net/news/externa...
5,2025-07-01 17:00:26+00:00,Diablo IV | Sins of the Horadrim | Season 9 No...,https://steamstore-a.akamaihd.net/news/externa...
6,2025-03-11 16:03:15+00:00,BlizzCon Returns In 2026,https://steamstore-a.akamaihd.net/news/externa...
7,2025-01-14 18:39:35+00:00,Master the Occult in Season of Witchcraft (par...,https://steamstore-a.akamaihd.net/news/externa...
8,2025-01-14 18:38:54+00:00,Master the Occult in Season of Witchcraft (par...,https://steamstore-a.akamaihd.net/news/externa...
9,2024-10-09 16:36:58+00:00,The expansion is here and has left its mark on...,https://steamstore-a.akamaihd.net/news/externa...


In [14]:
import re, html

def steam_bbcode_to_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = html.unescape(s)
    # [url=LINK]text[/url] -> text
    s = re.sub(r"\[url=([^\]]+)\]([^\[]+)\[/url\]", r"\2", s)
    # turn bullet marker into newline
    s = s.replace("[*]", "\n- ")
    # drop common tags
    s = re.sub(r"\[/?(b|i|u|h1|h2|h3|p|list|quote|previewyoutube)[^\]]*\]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def chunk_lines(text, min_len=40):
    lines = [ln.strip(" -\t") for ln in text.split("\n")]
    return [ln for ln in lines if len(ln) >= min_len]

# build patch df from df_all_news
pattern = r"(?:patch notes|hotfix|\bpatch\b)"
df_patch = df_all_news[df_all_news["title"].str.contains(pattern, case=False, regex=True, na=False)].copy()

df_patch["text_clean"] = df_patch["contents"].apply(steam_bbcode_to_text)
df_patch = df_patch.sort_values("date_utc", ascending=False)

# explode into chunks
rows = []
for _, r in df_patch.iterrows():
    chunks = chunk_lines(r["text_clean"])
    for j, ch in enumerate(chunks):
        rows.append({
            "patch_gid": r.get("gid"),
            "patch_title": r.get("title"),
            "patch_date_utc": r.get("date_utc"),
            "source_url": r.get("url"),
            "chunk_id": f'{r.get("gid","nogid")}_{j}',
            "chunk_text": ch
        })

df_patch_chunks = pd.DataFrame(rows)
df_patch_chunks.head(10)


Unnamed: 0,patch_gid,patch_title,patch_date_utc,source_url,chunk_id,chunk_text
0,5395938616788903471,Diablo IV Patch Notes: 1.2.2b Build #47240,2023-12-06 00:45:20+00:00,https://steamstore-a.akamaihd.net/news/externa...,5395938616788903471_0,"1.2.3 Build #47693 December 5, 2023 Gameplay U..."
1,6560117310264940302,Diablo IV Patch Notes: 1.2.2b Build #47240,2023-11-16 18:23:03+00:00,https://steamstore-a.akamaihd.net/news/externa...,6560117310264940302_0,"1.2.2b Build #47240 (PC) - November 15, 2023 -..."
2,5302486387995302486,Diablo IV Patch Notes: 1.2.2a Build #47002,2023-11-08 18:53:54+00:00,https://steamstore-a.akamaihd.net/news/externa...,5302486387995302486_0,1.2.2a Build #47002 (All Platforms) - November...
3,5288974951225515255,Diablo IV Patch Notes: 1.2.2 Build #46837,2023-11-07 18:15:18+00:00,https://steamstore-a.akamaihd.net/news/externa...,5288974951225515255_0,1.2.2 Build #46837 (All Platforms) - November ...
4,5229301621580318284,Diablo IV Patch Notes: 1.2.1 Build #46666,2023-10-27 17:01:14+00:00,https://steamstore-a.akamaihd.net/news/externa...,5229301621580318284_0,1.2.1 Build #46666 (All Platforms) - October 3...
5,5229301621570664891,Diablo IV Patch Notes: 1.2.0d,2023-10-25 01:28:24+00:00,https://steamstore-a.akamaihd.net/news/externa...,5229301621570664891_0,1.2.0d Build #46536 (All Platforms) - October ...
6,5218041989047375115,Diablo IV Patch Notes: 1.2.0a and 1.2.0b,2023-10-20 18:19:11+00:00,https://steamstore-a.akamaihd.net/news/externa...,5218041989047375115_0,The Diablo IV team has been diligently monitor...
7,6438516944136281774,Diablo IV Patch Notes,2023-10-16 21:03:44+00:00,https://steamstore-a.akamaihd.net/news/externa...,6438516944136281774_0,The Diablo IV team has been diligently monitor...


In [15]:
import pandas as pd

df_reviews["created_utc"] = pd.to_datetime(df_reviews["created_utc"], utc=True)
df_reviews["is_positive"] = df_reviews["voted_up"].astype(int)

def pre_post_sentiment(reviews_df, patch_date, days=7):
    pre = reviews_df[(reviews_df["created_utc"] >= patch_date - pd.Timedelta(days=days)) &
                     (reviews_df["created_utc"] <  patch_date)]
    post = reviews_df[(reviews_df["created_utc"] >= patch_date) &
                      (reviews_df["created_utc"] <  patch_date + pd.Timedelta(days=days))]
    return {
        "patch_date": patch_date,
        "pre_n": len(pre),
        "post_n": len(post),
        "pre_pos_rate": pre["is_positive"].mean() if len(pre) else None,
        "post_pos_rate": post["is_positive"].mean() if len(post) else None,
        "delta_pos_rate": (post["is_positive"].mean() - pre["is_positive"].mean()) if (len(pre) and len(post)) else None
    }

patch_summary = []
for _, r in df_patch.drop_duplicates(subset=["gid"]).iterrows():
    patch_summary.append({
        "title": r["title"],
        **pre_post_sentiment(df_reviews, r["date_utc"], days=7)
    })

df_patch_impact = pd.DataFrame(patch_summary).sort_values("patch_date", ascending=False)
df_patch_impact.head(10)


Unnamed: 0,title,patch_date,pre_n,post_n,pre_pos_rate,post_pos_rate,delta_pos_rate
0,Diablo IV Patch Notes: 1.2.2b Build #47240,2023-12-06 00:45:20+00:00,0,0,,,
1,Diablo IV Patch Notes: 1.2.2b Build #47240,2023-11-16 18:23:03+00:00,0,0,,,
2,Diablo IV Patch Notes: 1.2.2a Build #47002,2023-11-08 18:53:54+00:00,0,0,,,
3,Diablo IV Patch Notes: 1.2.2 Build #46837,2023-11-07 18:15:18+00:00,0,0,,,
4,Diablo IV Patch Notes: 1.2.1 Build #46666,2023-10-27 17:01:14+00:00,0,0,,,
5,Diablo IV Patch Notes: 1.2.0d,2023-10-25 01:28:24+00:00,0,0,,,
6,Diablo IV Patch Notes: 1.2.0a and 1.2.0b,2023-10-20 18:19:11+00:00,0,0,,,
7,Diablo IV Patch Notes,2023-10-16 21:03:44+00:00,0,0,,,


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vec = TfidfVectorizer(stop_words="english", max_features=20000)
X = vec.fit_transform(df_patch_chunks["chunk_text"].fillna(""))

def search_patch_chunks(query, k=5):
    q = vec.transform([query])
    sims = cosine_similarity(q, X).ravel()
    top = sims.argsort()[::-1][:k]
    return df_patch_chunks.iloc[top][["patch_title","patch_date_utc","source_url","chunk_text"]]

search_patch_chunks("performance stutter frame generation", k=5)


Unnamed: 0,patch_title,patch_date_utc,source_url,chunk_text
3,Diablo IV Patch Notes: 1.2.2 Build #46837,2023-11-07 18:15:18+00:00,https://steamstore-a.akamaihd.net/news/externa...,1.2.2 Build #46837 (All Platforms) - November ...
0,Diablo IV Patch Notes: 1.2.2b Build #47240,2023-12-06 00:45:20+00:00,https://steamstore-a.akamaihd.net/news/externa...,"1.2.3 Build #47693 December 5, 2023 Gameplay U..."
4,Diablo IV Patch Notes: 1.2.1 Build #46666,2023-10-27 17:01:14+00:00,https://steamstore-a.akamaihd.net/news/externa...,1.2.1 Build #46666 (All Platforms) - October 3...
7,Diablo IV Patch Notes,2023-10-16 21:03:44+00:00,https://steamstore-a.akamaihd.net/news/externa...,The Diablo IV team has been diligently monitor...
6,Diablo IV Patch Notes: 1.2.0a and 1.2.0b,2023-10-20 18:19:11+00:00,https://steamstore-a.akamaihd.net/news/externa...,The Diablo IV team has been diligently monitor...
