# No Man's Sky (Hello Games) Patch Note Scraping

In [97]:
import numpy as np
import pandas as pd
import requests
import re
import time
from datetime import datetime
import random
import copy
import os
from tqdm import tqdm
from bs4 import BeautifulSoup, SoupStrainer
from bs4.diagnose import diagnose

In [2]:
def get_patch_page_urls(
    url="https://www.nomanssky.com/",
    releaseLogPage="/release-log/",
    patchNoteSection=("a","link link--inherit")
):
    
    page = requests.get(url+releaseLogPage)
    soup = BeautifulSoup(page.content, "html.parser")

    ## Grab the list of individual patch release pages 
    if len(patchNoteSection)==1:
        ## UNTESTED
        htmlTarget = patchNoteSection[0]
        patchPages = soup.find_all(htmlTarget, href=True)
    elif len(patchNoteSection)==2:
        htmlTarget = patchNoteSection[0]
        htmlClass = patchNoteSection[1]
        patchPages = soup.find_all(htmlTarget, class_=htmlClass) 
    else:
        raise ValueError("Invalid patchNoteSection format.")
     
    links = [patch.get("href") for patch in patchPages]

    ## Clean links of any base URL ('url' variable) content. 
    ## This also helps address bugs introduced wherein some pages links are given in
    ##   `www.nommanssky.com` format (equivalent to http://www.nommanssky.com) rather than
    ## the expected 
    ##   `https://www.nomanssky.com/` format.
    splitLinks = [link.split(".com")[-1] if len(link.split(".com"))>1 else link for link in links]

    ## Clean whitespace
    finalLinks = [link.strip() for link in splitLinks]
    
    return finalLinks

def _class_not_patch_note_button(class_):
    if not class_:
        return True  # Exclude elements with no class

    ## check to ensure class_ var hasn't been passed a list
    if isinstance(class_, list):
        ## in which case we need to examine the list rather than an exact match
        if ("btn" in class_ and "btn--primary" in class_): 
            return False  # Exclude if both are present
    
    return class_ != "btn btn--primary"  # Default behaviour: return True when not patch note button, False when it is an exact match with button definition

def get_patch_notes_from_page(
    patchPageUrl,
    baseUrl="https://www.nomanssky.com/",
    identifyPatchNotesString="Patch Notes",
    identifyBugFixesString="Bug Fixes"
):
    output = {
        "Title": None,
        "Date": None,
        "Notes": None,
        "Patch update": False,
        "Bug fix update": False,
        "Scrape error": False,
        "Patch page url": copy.deepcopy(baseUrl)+copy.deepcopy(patchPageUrl),
    }

    page = requests.get(baseUrl+patchPageUrl)
    soup = BeautifulSoup(page.content, "html.parser")
        
    ## Try default page title assignment
    try:
        releaseTitleNote = soup.find(class_="text--heading-centered margin--bottom-default")
        releaseTitleString = str(list(releaseTitleNote.stripped_strings)[0])
        output["Title"] = releaseTitleString
    except:
        output["Title"] = None 

    rawUrlNote = patchPageUrl.split("/")
    urlNote = [xx for xx in rawUrlNote if xx] # Clean out any empty strings from 'str.split("/")'
    
    ## Backup method to find release title
    if output["Title"] == None:
        ## Title patch page entry by default naming convention of NMS URLs    
        output["Title"] = urlNote[-1]
    

    ## Attempt to assign date
    try:
        ## Frequently the date occurs in a html span objecth class_=="date"
        ## This tends to be given in format of, for example: "August 18, 2016."
        dateNote = soup.find(class_="date")
        dateString = str(list(dateNote.stripped_strings)[0])
        ## Note the dropped final full-stop. `stripped_strings` line above seems to remove this...
        ## We also want to remove the comma, as there are edge-cases where the comma is absent
        output["Date"] = datetime.strptime(dateString.replace(",",""),"%B %d %Y")
    except:
        ## Okay, no date in this format
        output["Date"] = None

    ## Attempt backup. so, if we found no date, try the url format instead
    if output["Date"] == None:
        try:
            dateNote = urlNote[-2]
            ## No day of month given in the url format, so will default to 1st of the month for all
            dateNote = dateNote + ["01"]
            dateString = " ".join(dateNote) 
            dateString = dateString.strip()
            output["Date"] = datetime.strptime(dateString,"%Y %m %d")
        except:
            ## Okay, no date in this format either. For now, set this Date value to None.
            output["Date"] = None

        
    ## Retrieve all elements that contain the patch notes string
    allNotes = soup.find_all(string=re.compile(identifyPatchNotesString, flags=re.IGNORECASE))
    
    ## Filter out elements with "btn btn--primary"
    noteSection = [tag for tag in allNotes if _class_not_patch_note_button(tag.parent.get("class"))]  

    ## If no patch notes found in this format, check old patch release page format
    ## Where patch note string is just in `Patch [number]` format
    ## Frequently these seem to actually be Bug Fixes, and should be flagged accordingly...
    if not noteSection:
        output["Bug fix update"] = True
        oldIdentifyPatchNotesString = identifyPatchNotesString.split(" ")[0]
        ## Retrieve all elements that contain the patch notes string
        allNotes = soup.find_all(string=re.compile(oldIdentifyPatchNotesString, flags=re.IGNORECASE))
    
        ## Filter out elements with "btn btn--primary"
        noteSection = [tag for tag in allNotes if _class_not_patch_note_button(tag.parent.get("class"))]  
        
    
    ## If no Patch Notes info has been found, try scraping for bug fix notes instead
    if not noteSection:
        output["Bug fix update"] = True
        ## Retrieve all elements that contain the bug fix string
        allNotes = soup.find_all(string=re.compile(identifyBugFixesString, flags=re.IGNORECASE))

        ## Filter out elements with "btn btn--primary"
        noteSection = [tag for tag in allNotes if _class_not_patch_note_button(tag.parent.get("class"))]  


    if output["Bug fix update"] == False:
        output["Patch update"] = True
    
    ## Now that we've located the relevant section of the page, let us go
    ##  back up the beautiful soup html tree, to identify the notes that follow
    notes = []
    for nn in noteSection:
        ## We need to climb several levels. At least one for the string itself, one for the placeholder of that string on page
        ##  and one more for that page section. However, there may be more, hence the while loop.
        ## Finally, we should find the section containing the patch/bug fix notes

        parentLevel = 0
        pageSection = nn.parent
        search = pageSection.find_all("li")

        while ((parentLevel <=5)&(len(search)<=3)):
            newSection = pageSection.parent
            search = newSection.find_all("li")
            pageSection = newSection
            parentLevel+=1

        for ee in pageSection.stripped_strings:
            notes.append(repr(ee))

    if not notes:
        output["Scrape error"] = True
        
    output["Notes"] = "\n".join(notes)
    return output

def get_patch_notes(
    url="https://www.nomanssky.com/",
    releaseLogPage="/release-log/",
    patchNoteSection=("a","link link--inherit"),
    identifyPatchNotesString="Patch Notes",
    identifyBugFixesString="Bug Fixes"):

    out = {}
    print(f"Finding individual patch release pages from {url+releaseLogPage} ...")
    patchPages = get_patch_page_urls(
        url=url,
        releaseLogPage=releaseLogPage,
        patchNoteSection=patchNoteSection
    )

    nPages = len(patchPages)
    print(f"Found {nPages} individual patch release pages!")
    print(f"Scraping patch notes...")
    # print(f"DEBUG mode active!! Pages gathered limited!!!!")
    for ii,page in tqdm(enumerate(patchPages),total=nPages):
        ## Reduce request frequency in less 'robotic' fashion to prevent sites blocking access
        time.sleep(random.uniform(0.0,3.0))
        notes = get_patch_notes_from_page(
            patchPageUrl=page,
            baseUrl=url,
            identifyPatchNotesString=identifyPatchNotesString,
            identifyBugFixesString=identifyBugFixesString
        )
        out[ii] = copy.deepcopy(notes)

    return out

## Main

### Part 1: Scrape NMS webpage data

In [7]:
nmsPatchNotes = get_patch_notes()

Finding individual patch release pages from https://www.nomanssky.com//release-log/ ...
Found 239 individual patch release pages!
Scraping patch notes...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 239/239 [08:55<00:00,  2.24s/it]


####

### Convert scraped data to dataframe

In [8]:
df = pd.DataFrame.from_dict(nmsPatchNotes, orient="index")

In [9]:
print(df.head(),"\n"*2,"="*20)
print(df.describe(),"\n"*2,"="*20)
print(df.info(),"\n"*2,"="*20)

                         Title       Date  \
0        Worlds Part II - 5.58 2025-03-04   
1        Worlds Part II - 5.57 2025-02-14   
2        Worlds Part II - 5.56 2025-02-13   
3  Expedition Seventeen: Titan 2025-02-12   
4        Worlds Part II - 5.55 2025-02-12   

                                               Notes  Patch update  \
0  'Worlds Part II - 5.58'\n'March 04, 2025'\n'.'...         False   
1  'Worlds Part II - 5.57'\n'February 14, 2025'\n...         False   
2  'Worlds Part II - 5.56'\n'February 13, 2025'\n...         False   
3  'Expedition Seventeen: Titan'\n'February 12, 2...          True   
4  'Worlds Part II - 5.55'\n'February 12, 2025'\n...         False   

   Bug fix update  Scrape error  \
0            True         False   
1            True         False   
2            True         False   
3           False         False   
4            True         False   

                                      Patch page url  
0  https://www.nomanssky.com//2025/03/worl

Convert dtypes

In [10]:
df = df.astype({'Title':str,'Date':"datetime64[ns]",'Bug fix update':"boolean",'Patch update':"boolean",'Scrape error':"boolean",'Notes':str,'Patch page url':str})

Sort and reverse row indices --- this (hopefully) future-proofs the dataset if/when new patches are released by having the latest patch be the page with the highest index value.

In [11]:
df = df.loc[::-1].reset_index(drop=True)

####

In [12]:
print(df.head(),"\n"*2,"="*20)
print(df.describe(),"\n"*2,"="*20)
print(df.info(),"\n"*2,"="*20)

           Title       Date  \
0  PC Patch 1.04 2016-08-18   
1  PC Patch 1.05 2016-08-19   
2  PC Patch 1.06 2016-08-20   
3  PC Patch 1.07 2016-09-02   
4  PC Patch 1.08 2016-09-04   

                                               Notes  Patch update  \
0  "PC Patch 1.04 - No Man's Sky"\n'Menu'\n'Lates...         False   
1  "PC Patch 1.05 - No Man's Sky"\n'Menu'\n'Lates...         False   
2  "PC Patch 1.06 - No Man's Sky"\n'Menu'\n'Lates...         False   
3  "PC Patch 1.07 - No Man's Sky"\n'Menu'\n'Lates...         False   
4  "PC Patch 1.08 - No Man's Sky"\n'Menu'\n'Lates...         False   

   Bug fix update  Scrape error                             Patch page url  
0            True         False  https://www.nomanssky.com//pc-patch-1-04/  
1            True         False  https://www.nomanssky.com//pc-patch-1-05/  
2            True         False  https://www.nomanssky.com//pc-patch-1-06/  
3            True         False  https://www.nomanssky.com//pc-patch-1-07/  
4      

### Part 2: Scrape wikipedia.org NMS page to get named releases' date of release data

In [99]:
wikipediaURL ="https://en.wikipedia.org/wiki/No_Man%27s_Sky"
page = requests.get(wikipediaURL)
soup = BeautifulSoup(page.content, "html.parser")

In [100]:
for caption in soup.find_all("caption"):
    captionsList = list(caption.strings)
    tokenisedCaptions = [capt.lower().split(" ") for capt in captionsList]
    for capt in tokenisedCaptions:
        if ("list" in capt)&("updates" in capt):
            table = caption.find_parent("table", {"class": "wikitable"})

In [101]:
wikiData = {}
for ii, entry in enumerate(table.find_all("tr")):
    output = {
        "Title": None,
        "Date": None,
        "Notes": None,
        "Ref": None
        }
    content = [" ".join(list(val.strings)) for val in entry.contents if val != "\n"]
    output["Date"], output["Title"], output["Notes"], output["Ref"] = content

    ## If the entry is a valid row, it will have a date associated with it. The table headers row, however, will not. So we will skip that first row with this try, except block.
    try:
        dateString = output["Date"].strip()
        output["Date"] = datetime.strptime(dateString,"%B %Y")
    except:
        continue
        
    wikiData[ii] = output.copy()

In [102]:
wikiDF =  pd.DataFrame.from_dict(wikiData, orient="index")
wikiDF = wikiDF.astype({'Title':str,'Date':"datetime64[ns]",'Notes':str,'Ref':str})
wikiDF.head()

Unnamed: 0,Title,Date,Notes,Ref
1,Foundation\n,2016-11-01,"User-made bases built from modular components,...",[ 38 ] \n
2,Pathfinder\n,2017-03-01,Ability to share bases with other players New ...,[ 39 ] [ 40 ] [ 41 ] \n
3,Atlas Rises\n,2017-08-01,Addition of new narrative branch to main game'...,[ 43 ] [ 42 ] \n
4,Next\n,2018-07-01,Added support for Xbox One and WeChat Full ...,[ 44 ] [ 45 ] [ 46 ] [ 47 ] [ 48 ] [ 49 ] [ 50...
5,The Abyss\n,2018-10-01,Expanded aquatic biomes features include new c...,[ 52 ] \n


### Save the data

In [13]:
savePath = "./Data/Raw/"
saveFile = "NMS_patch_notes.xlsx"

tmp = ""
for savePathChunk in savePath.split("/")[:-1]:
    tmp += savePathChunk + "/"
    try:
        os.mkdir(tmp)
    except:
        pass

with pd.ExcelWriter(path=savePath+saveFile,mode="w") as writer:
    df.to_excel(writer)

In [104]:
savePath = "./Data/Raw/"
saveFile = "NMS_wikipedia_major_releases_notes.xlsx"

tmp = ""
for savePathChunk in savePath.split("/")[:-1]:
    tmp += savePathChunk + "/"
    try:
        os.mkdir(tmp)
    except:
        pass

with pd.ExcelWriter(path=savePath+saveFile,mode="w") as writer:
    wikiDF.to_excel(writer)

#

#