# No Man's Sky (Hello Games) Patch Note Scraping

In [1]:
import numpy as np
import pandas as pd
import requests
import re
import time
import random
import copy
import os
from tqdm import tqdm
from bs4 import BeautifulSoup, SoupStrainer
from bs4.diagnose import diagnose

In [12]:
def get_patch_page_urls(
    url="https://www.nomanssky.com/",
    releaseLogPage="/release-log/",
    patchNoteSection=("a","link link--inherit")
):
    
    page = requests.get(url+releaseLogPage)
    soup = BeautifulSoup(page.content, "html.parser")

    ## Grab the list of individual patch release pages 
    if len(patchNoteSection)==1:
        ## UNTESTED
        htmlTarget = patchNoteSection[0]
        patchPages = soup.find_all(htmlTarget, href=True)
    elif len(patchNoteSection)==2:
        htmlTarget = patchNoteSection[0]
        htmlClass = patchNoteSection[1]
        patchPages = soup.find_all(htmlTarget, class_=htmlClass) 
    else:
        raise ValueError("Invalid patchNoteSection format.")
     
    links = [patch.get("href") for patch in patchPages]

    ## Clean links of any base URL ('url' variable) content. 
    ## This also helps address bugs introduced wherein some pages links are given in
    ##   `www.nommanssky.com` format (equivalent to http://www.nommanssky.com) rather than
    ## the expected 
    ##   `https://www.nomanssky.com/` format.
    splitLinks = [link.split(".com")[-1] if len(link.split(".com"))>1 else link for link in links]

    ## Clean whitespace
    finalLinks = [link.strip() for link in splitLinks]
    
    return finalLinks

def _class_not_patch_note_button(class_):
    if not class_:
        return True  # Exclude elements with no class

    ## check to ensure class_ var hasn't been passed a list
    if isinstance(class_, list):
        ## in which case we need to examine the list rather than an exact match
        if ("btn" in class_ and "btn--primary" in class_): 
            return False  # Exclude if both are present
    
    return class_ != "btn btn--primary"  # Default behaviour: return True when not patch note button, False when it is an exact match with button definition

def get_patch_notes_from_page(
    patchPageUrl,
    baseUrl="https://www.nomanssky.com/",
    identifyPatchNotesString="Patch Notes",
    identifyBugFixesString="Bug Fixes"
):
    output = {
        "Title": None,
        "Month": None,
        "Year": None,
        "Notes": None,
        "Patch update": False,
        "Bug fix update": False,
        "Scrape error": False,
        "Patch page url": copy.deepcopy(baseUrl)+copy.deepcopy(patchPageUrl),
    }

    try:
        output["Year"], output["Month"], output["Title"] = patchPageUrl[1:-1].split("/")
    except:
        title = patchPageUrl[1:-1].split("/")

        if type(title)==list:
            output["Title"] = title[0]
        else:
            output["Title"] = title
        
    page = requests.get(baseUrl+patchPageUrl)
    soup = BeautifulSoup(page.content, "html.parser")

    ## Retrieve all elements that contain the patch notes string
    all_notes = soup.find_all(string=re.compile(identifyPatchNotesString, flags=re.IGNORECASE))
    # print("all_notes:", all_notes) # Debugging step
    
    ## Filter out elements with "btn btn--primary"
    noteSection = [tag for tag in all_notes if _class_not_patch_note_button(tag.parent.get("class"))]  
    # print("Final Filtered NoteSection:", noteSection)  # Debugging step

    ## If no Patch Notes info has been found, try scraping for bug fix notes instead
    if not noteSection:
        output["Bug fix update"] = True
        ## Retrieve all elements that contain the bug fix string
        all_notes = soup.find_all(string=re.compile(identifyBugFixesString, flags=re.IGNORECASE))
        # print("all_notes:", all_notes) # Debugging step
        ## Filter out elements with "btn btn--primary"
        noteSection = [tag for tag in all_notes if _class_not_patch_note_button(tag.parent.get("class"))]  
        # print("Final Filtered NoteSection:", noteSection)  # Debugging step
    else:
        output["Patch update"] = True

    ## Now that we've located the relevant section of the page, let us go
    ##  back up the beautiful soup html tree, to identify the notes that follow
    notes = []
    for nn in noteSection:
        ## We need to climb several levels. At least one for the string itself, one for the placeholder of that string on page
        ##  and one more for that page section. However, there may be more, hence the while loop.
        ## Finally, we should find the section containing the patch/bug fix notes

        parentLevel = 0
        pageSection = nn.parent
        search = pageSection.find_all("li")
        # print(pageSection)
        while ((parentLevel <=5)&(len(search)<=3)):
            # print("\n")
            # print(pageSection)
            newSection = pageSection.parent
            search = newSection.find_all("li")
            pageSection = newSection
            parentLevel+=1

        # print(parentLevel)
        # print(len(search))
        for ee in pageSection.stripped_strings:
            notes.append(repr(ee))

    if not notes:
        output["Scrape error"] = True
        
    output["Notes"] = "\n".join(notes)
    return output

def get_patch_notes(
    url="https://www.nomanssky.com/",
    releaseLogPage="/release-log/",
    patchNoteSection=("a","link link--inherit"),
    identifyPatchNotesString="Patch Notes",
    identifyBugFixesString="Bug Fixes"):

    out = {}
    print(f"Finding individual patch release pages from {url+releaseLogPage} ...")
    patchPages = get_patch_page_urls(
        url=url,
        releaseLogPage=releaseLogPage,
        patchNoteSection=patchNoteSection
    )

    nPages = len(patchPages)
    print(f"Found {nPages} individual patch release pages!")
    print(f"Scraping patch notes...")
    # print(f"DEBUG Mod active!! Pages gathered limited!!!!")
    for ii,page in tqdm(enumerate(patchPages),total=nPages):
        ## Reduce request frequency in less 'robotic' fashion to prevent sites blocking access
        time.sleep(random.randint(0,3))
        notes = get_patch_notes_from_page(
            patchPageUrl=page,
            baseUrl=url,
            identifyPatchNotesString=identifyPatchNotesString,
            identifyBugFixesString=identifyBugFixesString
        )
        out[ii] = copy.deepcopy(notes)

    return out

## Main

### Scrape data

In [13]:
nmsPatchNotes = get_patch_notes()

Finding individual patch release pages from https://www.nomanssky.com//release-log/ ...
Found 238 individual patch release pages!
Scraping patch notes...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 238/238 [07:53<00:00,  1.99s/it]


### Convert scraped data to dataframe

In [14]:
df = pd.DataFrame.from_dict(nmsPatchNotes, orient="index")

In [15]:
print(df.head(),"\n"*2,"="*20)
print(df.describe(),"\n"*2,"="*20)
print(df.info(),"\n"*2,"="*20)

                        Title Month  Year  \
0         worlds-part-ii-5-57    02  2025   
1         worlds-part-ii-5-56    02  2025   
2  expedition-seventeen-titan    02  2025   
3         worlds-part-ii-5-55    02  2025   
4         worlds-part-ii-5-54    02  2025   

                                               Notes  Patch update  \
0  'Worlds Part II - 5.57'\n'February 14, 2025'\n...         False   
1  'Worlds Part II - 5.56'\n'February 13, 2025'\n...         False   
2  'Expedition Seventeen: Titan'\n'February 12, 2...          True   
3  'Worlds Part II - 5.55'\n'February 12, 2025'\n...         False   
4  'Worlds Part II - 5.54'\n'February 06, 2025'\n...         False   

   Bug fix update  Scrape error  \
0            True         False   
1            True         False   
2           False         False   
3            True         False   
4            True         False   

                                      Patch page url  
0  https://www.nomanssky.com//2025/02/worl

Convert dtypes

In [16]:
df = df.astype({'Title':str,'Month':pd.Int64Dtype(),'Year':pd.Int64Dtype(),'Bug fix update':"boolean",'Patch update':"boolean",'Scrape error':"boolean",'Notes':str,'Patch page url':str})

Sort and reverse row indices --- this (hopefully) future-proofs the dataset if/when new patches are released by having the latest patch be the page with the highest index value.

In [17]:
df = df.loc[::-1].reset_index(drop=True)

In [18]:
print(df.head(),"\n"*2,"="*20)
print(df.describe(),"\n"*2,"="*20)
print(df.info(),"\n"*2,"="*20)

           Title  Month  Year Notes  Patch update  Bug fix update  \
0  pc-patch-1-04   <NA>  <NA>               False            True   
1  pc-patch-1-05   <NA>  <NA>               False            True   
2  pc-patch-1-06   <NA>  <NA>               False            True   
3  pc-patch-1-07   <NA>  <NA>               False            True   
4  pc-patch-1-08   <NA>  <NA>               False            True   

   Scrape error                             Patch page url  
0          True  https://www.nomanssky.com//pc-patch-1-04/  
1          True  https://www.nomanssky.com//pc-patch-1-05/  
2          True  https://www.nomanssky.com//pc-patch-1-06/  
3          True  https://www.nomanssky.com//pc-patch-1-07/  
4          True  https://www.nomanssky.com//pc-patch-1-08/   

          Month         Year
count     199.0        199.0
mean   7.040201  2021.150754
std    2.969269     2.277999
min         1.0       2016.0
25%         5.0       2019.0
50%         8.0       2021.0
75%         9.

In [19]:
savePath = "./Data/"
saveFile = "NMS_patch_notes.xlsx"

tmp = ""
for savePathChunk in savePath.split("/")[:-1]:
    tmp += savePathChunk + "/"
    try:
        os.mkdir(tmp)
    except:
        pass

with pd.ExcelWriter(path=savePath+saveFile,mode="w") as writer:
    df.to_excel(writer)

In [34]:
df.loc[df["Scrape error"] == True]

Unnamed: 0,Title,Month,Year,Notes,Patch update,Bug fix update,Scrape error,Patch page url
0,pc-patch-1-04,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-04/
1,pc-patch-1-05,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-05/
2,pc-patch-1-06,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-06/
3,pc-patch-1-07,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-07/
4,pc-patch-1-08,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-08/
5,pc-patch-1-09,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-09/
6,ps4-1-04-1-07,,,,False,True,True,https://www.nomanssky.com//ps4-1-04-1-07/
7,ps4-patch-1-08,,,,False,True,True,https://www.nomanssky.com//ps4-patch-1-08/
8,ps4-patch-1-09,,,,False,True,True,https://www.nomanssky.com//ps4-patch-1-09/
10,patch-1-12,12.0,2016.0,,False,True,True,https://www.nomanssky.com//2016/12/patch-1-12/


In [33]:
df.loc[df["Year"].isnull() == True]

Unnamed: 0,Title,Month,Year,Notes,Patch update,Bug fix update,Scrape error,Patch page url
0,pc-patch-1-04,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-04/
1,pc-patch-1-05,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-05/
2,pc-patch-1-06,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-06/
3,pc-patch-1-07,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-07/
4,pc-patch-1-08,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-08/
5,pc-patch-1-09,,,,False,True,True,https://www.nomanssky.com//pc-patch-1-09/
6,ps4-1-04-1-07,,,,False,True,True,https://www.nomanssky.com//ps4-1-04-1-07/
7,ps4-patch-1-08,,,,False,True,True,https://www.nomanssky.com//ps4-patch-1-08/
8,ps4-patch-1-09,,,,False,True,True,https://www.nomanssky.com//ps4-patch-1-09/
9,foundation-updat,,,'Patch notes'\n'General Gameplay'\n'Fixed tech...,True,False,False,https://www.nomanssky.com//foundation-update


In [33]:
debugPatchPages = get_patch_page_urls(
    url="https://www.nomanssky.com/",
    releaseLogPage="/release-log/",
    patchNoteSection=("a","link link--inherit")
)

In [34]:
debugPatchPages

['/2025/02/worlds-part-ii-5-57/',
 '/2025/02/worlds-part-ii-5-56/',
 '/2025/02/expedition-seventeen-titan/',
 '/2025/02/worlds-part-ii-5-55/',
 '/2025/02/worlds-part-ii-5-54/',
 '/2025/02/worlds-part-ii-5-53/',
 '/2025/02/worlds-part-ii-5-52/',
 '/2025/01/worlds-part-ii-5-51/',
 '/worlds-part-ii-update/',
 '/2024/12/holiday-2024-expeditions-5-29/',
 '/2024/11/holiday-2024-expeditions/',
 '/2024/11/cross-save-ps5-pro-normandy-5-28/',
 '/2024/11/cross-save-ps5-pro-normandy-5-27/',
 '/2024/11/cross-save-ps5-pro-normandy-5-26/',
 '/2024/11/cross-save-ps5-pro-normandy',
 '/2024/10/the-cursed-5-22/',
 '/2024/10/the-cursed-5-21/',
 '/2024/10/expedition-sixteen-the-cursed/',
 '/2024/09/aquarius-5-12/',
 '/2024/09/aquarius-5-11/',
 '/2024/09/aquarius-update/',
 '/2024/08/worlds-part-1-5-05/',
 '/2024/07/worlds-part-1-5-03/',
 '/2024/07/worlds-part-1-5-02/',
 '/2024/07/worlds-part-1-5-1-1/',
 '/2024/07/worlds-part-1-5-01/',
 '/2024/07/worlds-part-1-5-00-1/',
 '/worlds-part-i-update/',
 '/2024/06

In [16]:
debugNotes = get_patch_notes_from_page("https://www.nomanssky.com/2022/06/leviathan-patch-3-93/")

In [17]:
debugNotes

{'Title': 'ttps:',
 'Month': None,
 'Year': None,
 'Notes': '',
 'Patch update': False,
 'Bug fix update': True,
 'Scrape error': True,
 'Patch page url': 'https://www.nomanssky.com/https://www.nomanssky.com/2022/06/leviathan-patch-3-93/'}

In [57]:
saveFile = "NMS_patch_notes.hdf5"
dataHomeGroupDir = "PatchNotes"
with h5py.File(saveFile,"w") as file:
    for pageNumber, patchNotes in nmsPatchNotes.items():
        page = dataHomeGroupDir+f"/{int(pageNumber)}"
        grp = file.create_group(page)
        for key,data in patchNotes.items():
            if data is not None:
                grp.create_dataset(str(key),data=copy.deepcopy(data))
            else:
                grp.create_dataset(str(key),data=h5py.Empty("f"))

In [62]:
with h5py.File(saveFile,"r") as file:
    ii = 0
    for pageNumber, data in file["PatchNotes"].items():
        if ii>4:
            assert False
        print(f"Patch page number {pageNumber}")
        ii+=1
        for kk, dd in data.items():
            print(kk, dd[()])

Patch page number 0
Bug fix update True
Month b'02'
Notes b"'Worlds Part II - 5.57'\n'February 14, 2025'\n'.'\n'Hello Everyone,'\n'Thank you to everyone playing No Man\xe2\x80\x99s Sky \xe2\x80\x93 Worlds Part II, especially those taking the time to report any issues they encounter via Zendesk or console crash reporting.'\n'We are listening closely to your feedback, and have identified and resolved a number of issues. These fixes are included in patch 5.57, which is now live on Steam and will be coming to other platforms as soon as possible.'\n'Bug Fixes'\n'Fixed an issue with the Hazard Pay milestone in the Titan expedition that could cause notification messages to flicker.'\n'Fixed an issue that prevented an underwater crashed freighter from spawning at the second rendezvous point in the Titan expedition.'\n'Implemented a recovery for players who have lost their hyperdrive in the Titan expedition.'\n'Fixed a number of issues that could prevent Exocraft scanners from locating building

AssertionError: 

<Closed HDF5 dataset>

## Initial debugging tests

In [59]:
url="https://www.nomanssky.com/"
patchPages = get_patch_page_urls()
testPage = get_patch_notes_from_page(patchPages[0])
print(patchPages[0])
print(testPage)

/2025/02/worlds-part-ii-5-57/
{'Title': 'worlds-part-ii-5-57', 'Month': '02', 'Year': '2025', 'Notes': "'Worlds Part II - 5.57'\n'February 14, 2025'\n'.'\n'Hello Everyone,'\n'Thank you to everyone playing No Man’s Sky – Worlds Part II, especially those taking the time to report any issues they encounter via Zendesk or console crash reporting.'\n'We are listening closely to your feedback, and have identified and resolved a number of issues. These fixes are included in patch 5.57, which is now live on Steam and will be coming to other platforms as soon as possible.'\n'Bug Fixes'\n'Fixed an issue with the Hazard Pay milestone in the Titan expedition that could cause notification messages to flicker.'\n'Fixed an issue that prevented an underwater crashed freighter from spawning at the second rendezvous point in the Titan expedition.'\n'Implemented a recovery for players who have lost their hyperdrive in the Titan expedition.'\n'Fixed a number of issues that could prevent Exocraft scanners 

In [60]:
url="https://www.nomanssky.com/"
patchNoteSection=("a","link link--inherit")
patchPages = get_patch_page_urls()
testPage = get_patch_notes_from_page(patchPages[1])
print(patchPages[1])
print(testPage)

/2025/02/worlds-part-ii-5-56/
{'Title': 'worlds-part-ii-5-56', 'Month': '02', 'Year': '2025', 'Notes': '\'Worlds Part II - 5.56\'\n\'February 13, 2025\'\n\'.\'\n\'Hello Everyone,\'\n\'Thank you to everyone playing No Man’s Sky – Worlds Part II, especially those taking the time to report any issues they encounter via Zendesk or console crash reporting.\'\n\'We are listening closely to your feedback, and have identified and resolved a number of issues. These fixes are included in patch 5.56, which is now live on Steam and will be coming to other platforms as soon as possible.\'\n\'Bug Fixes\'\n\'Fixed an issue that could caused the Pillar of Titan reward to appear as already claimed after transferring back to the main save after completing the Titan Expedition.\'\n\'Fixed an issue that prevented packaged technology from being deployed into the Colossus.\'\n\'Fixed an issue in the Titan Expedition where building the floating variant of the Nautilon Chamber would not allow missions to prog

In [61]:
url="https://www.nomanssky.com/"
patchNoteSection=("a","link link--inherit")
patchPages = get_patch_page_urls()
testPage = get_patch_notes_from_page(patchPages[2])
print(patchPages[2])
print(testPage)

/2025/02/expedition-seventeen-titan/


In [32]:
# Grab individual patch note releases from nms main release=log page
URL = "https://www.nomanssky.com/worlds-part-ii-update/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
not_patch_button = SoupStrainer(class_=_class_not_patch_note_button)
# patch_note_sections = soup.find_all(class_=_class_not_patch_note_button)#,string=re.compile("Patch Notes",flags=re.IGNORECASE))
patch_note_sections = soup.find_all(class_=_class_not_patch_note_button, string=re.compile("Patch Notes", flags=re.IGNORECASE))

In [33]:
all_notes = soup.find_all(string=re.compile("Patch Notes", flags=re.IGNORECASE))
print("All notes:", all_notes)
filtered_notes = [tag for tag in all_notes if _class_not_patch_note_button(tag.parent.get("class"))]  

print("Final Filtered Notes:", filtered_notes)  # Debugging step


All notes: ['Latest patch notes', '5.5 Patch notes']
Final Filtered Notes: ['5.5 Patch notes']
