Added support for Sanoma My Place
FelixFrog committed Jul 10, 2024
1 parent 8fde469 commit 6264916
Showing 4 changed files with 220 additions and 2 deletions.
README.md (3 changes: 2 additions & 1 deletion)
@@ -25,6 +25,7 @@ Please contact me on "Issues" if you want to issue a takedown notice.
| Palumbo Editore - Saggi Digitali | yes | yes | average/big | no | no | no | yes | no | no | |
| Cengage Read | yes | yes | enormous (500+ mb) | no | yes | ? | yes | yes | yes | |
| Oxford Reading Club | yes | yes/(depends on vendor) | big | 2 | no | ? | yes | no | no | |
| Sanoma My Place | yes | yes | (unimplemented)/excellent | no | yes | very quickly | yes | (yes, not implemented) | no | |

Apps that I am aware of but I can't work with because I don't have books:
- ~~Appbook (might be a shitty html webview)~~
@@ -36,7 +37,7 @@ Apps that I am aware of but I can't work with because I don't have books:
## TODO
- Make a CLI interface with argparser
- Token versioning
- pdfgrabber versining
- pdfgrabber versioning
- Add anonymous user
- ~~Add ability to pass options to scripts~~
- Add "perfect" token checks
config-default.ini (8 changes: 8 additions & 0 deletions)
@@ -31,6 +31,10 @@ Author =
# Show additional format information in the library. Mostly used for debugging
ShowFormat = no

# Prefer the vendor-provided toc (table of contents) and page labels when the pdf is downloaded directly
PreferOriginalToc = no
PreferOriginalLabels = no

[znc]
# Add a pdf index downloaded from Zanichelli's catalog. If "SkipFirstChapter" is set and the index pdf is found, the first chapter is deleted. "IndexName" sets the name in the toc
SearchIndex = no
@@ -77,3 +81,7 @@ RenderScaleLiquidBooks = 0.7
[orc]
# Show expired books in the library. The server doesn't check ownership, so all books are downloadable
AllowExpired = yes

[snm]
# Use a regex to search master.json for a possible link to the original pdf. Disable if the documents aren't accessible anymore or if there are false positives
SearchForOriginal = yes
services/snm.py (209 changes: 209 additions & 0 deletions)
@@ -0,0 +1,209 @@
import fitz
import requests
from base64 import b64decode
import json
from io import BytesIO
import tempfile
from zipfile import ZipFile
from playwright.sync_api import sync_playwright  # unused for now; presumably reserved for the manual-rendering fallback mentioned in downloadbook
from pathlib import Path
import re
import lib
import config

service = "snm"

# hardcoded key consumed by decrypt() below
key = "1cff42dabb60beaf1e3b57988af787246c63613ef60435a05c9c79b98a9b41c8"

configfile = config.getconfig()

def getlogindata(username, password):
    r = requests.post("https://npmoffline.sanoma.it/mcs/api/v1/login", json={"id": username, "password": password}, headers={"x-timezone-offset": "+0000"})
    return r.json()

def getlibrary(token):
    r = requests.get("https://npmoffline.sanoma.it/mcs/api/v1/books", headers={"x-auth-token": f"Bearer {token}"}, params={"app": "true"})
    return r.json()

def getuserproducts(token, username):
    r = requests.get(f"https://npmoffline.sanoma.it/mcs/users/{username}/products", headers={"x-auth-token": f"Bearer {token}"})
    return r.json()

def getbookinfo(token, username, bookid):
    r = requests.get(f"https://npmoffline.sanoma.it/mcs/users/{username}/products/books/{bookid}", params={"app": "true", "light": "true"}, headers={"x-auth-token": f"Bearer {token}"})
    return r.json()

def downloadzip(url, tmpfile, progress=False, total=0, done=0):
    # despite the name, this streams any url into a writable file-like object;
    # it is reused below for the direct pdf download as well
    showprogress = bool(progress)
    r = requests.get(url, stream=showprogress)
    length = int(r.headers.get("content-length", 1))
    for data in r.iter_content(chunk_size=102400):
        tmpfile.write(data)
        if showprogress:
            progress(round(done + tmpfile.tell() / length * total))

def getcover(url):
    r = requests.get(url)
    return r.content

def extractusername(token):
    # the access token is a JWT: take the payload (second dot-separated segment)
    # and over-pad with "===" so b64decode accepts any segment length
    contents = json.loads(b64decode(token.split(".")[1] + "==="))
    return contents["pes_authorization"]["id"]

def decrypt(data):
    # the original implementation used JS's String.charCodeAt(p) which returns a utf-16 codepoint value
    # we can't use urllib.parse.unquote because it doesn't support the "%uXXXX" escaping (understandably, since it has never been part of any RFC)
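    # illustrative round trip, computed from the scheme below (not from real book data):
    # b64decode("JXUwMDc5") == b"%u0079"; the first character uses key[-1] == "8",
    # so decrypt("JXUwMDc5") == chr(0x79 - ord("8")) == "A"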

    res = ""
    raw = BytesIO(b64decode(data))
    i = 0

    while (c := raw.read(1)):
        keyval = ord(key[(i - 1) % len(key)])
        if c != b"%":
            # if not escaped we use the ASCII value directly as the unicode codepoint
            res += chr(ord(c) - keyval)
        else:
            if ((c := raw.read(1)) == b"u"):
                # if escaped as "%uXXXX" we consider XXXX as the unicode codepoint value
                c = raw.read(4)
                res += chr(int(c, 16) - keyval)
            else:
                # if escaped as "%XX" we consider XX as a raw byte value
                c += raw.read(1)
                res += chr(int(c, 16) - keyval)
        i += 1
    return res

def parsestructure(mobj):
    # this is the most complex structure I have ever seen
    # each unit can be referred to by either its id or its "idUnit" string only if it is a top-level unit (WTF)
    # we have to create a tree to enumerate where each page is contained to be able to get the position of the bookmarks
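    # inferred shape of the relevant master.json fields (from the accesses below):
    #   mobj["units"] -> [{"id", "idUnit", "title", "parent_unit" (optional)}, ...]
    #   mobj["pages"] -> [{"id", "label", "order", "idUnit", "chapter": {"chapter_id": ...} or null}, ...]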
    toc, labels = [], []

    unitnames = {i["id"]: i["title"] for i in mobj["units"]}
    groupcodetoid = {i["idUnit"]: i["id"] for i in mobj["units"]}
    pageidtolabel = {i["id"]: i["label"] for i in mobj["pages"]}
    first = []

    children = {}

    for i in mobj["units"]:
        if i["id"] not in children:
            children[i["id"]] = []
        if p := i.get("parent_unit"):
            children[p].append(i["id"])
        else:
            first.append(i["id"])

    for i in mobj["pages"]:
        if not i["chapter"]:
            parentid = groupcodetoid[i["idUnit"]]
        else:
            parentid = i["chapter"]["chapter_id"]
        children[parentid].append(i["id"])

    pageorder = {i["id"]: i["order"] for i in mobj["pages"]}
    def order(objid):
        # pages sort by their own "order"; a unit sorts by its earliest descendant page
        if objid in pageidtolabel:
            return pageorder[objid]
        else:
            return min([order(i) for i in children[objid]] + [len(pageidtolabel) + 1])

    def generatetoc(ch, level):
        ch.sort(key=order)
        for i in ch:
            if i in pageidtolabel:
                labels.append(pageidtolabel[i])
            else:
                toc.append([level, unitnames[i].strip(), len(labels) + 1])
                generatetoc(children[i], level + 1)

    first = [i for i in first if children[i]]
    generatetoc(first, 1)

    return toc, labels
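
# illustrative parsestructure round trip (hypothetical minimal input, not real book data):
#   units = [{"id": 10, "idUnit": "U1", "title": "Grammar"}]
#   pages = [{"id": 1, "label": "1", "order": 1, "idUnit": "U1", "chapter": None},
#            {"id": 2, "label": "2", "order": 2, "idUnit": "U1", "chapter": None}]
# gives toc == [[1, "Grammar", 1]] and labels == ["1", "2"]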

def checkrequest(res):
    # every endpoint wraps its payload as {"result": ..., "code": ..., "message": ...};
    # a null result or code 3 marks a failed request
    return not (res["result"] is None or res["code"] == 3)

def login(username, password):
    data = getlogindata(username, password)
    if not checkrequest(data):
        print("Login failed: " + data["message"])
    else:
        # keep both tokens in one string, separated by "|"
        return data["result"]["data"]["access_token"] + "|" + data["result"]["data"]["refresh_token"]

def library(token):
    accesstoken, refreshtoken = token.split("|")
    books = {}
    library = getlibrary(accesstoken)
    if checkrequest(library):
        for book in library["result"]["data"]:
            books[str(book["gedi"])] = {"title": book["name"], "cover": book["image_url"], "isbn": book["isbn"]}
    return books

def cover(token, bookid, data):
    return getcover(data["cover"])

def checktoken(token):
    accesstoken, refreshtoken = token.split("|")
    username = extractusername(accesstoken)
    '''
    library = getlibrary(accesstoken)
    return checkrequest(library)
    '''
    products = getuserproducts(accesstoken, username)
    return checkrequest(products)

def downloadbook(token, bookid, data, progress):
    accesstoken, refreshtoken = token.split("|")
    username = extractusername(accesstoken)

    progress(1, "Getting book info")
    bookinfo = getbookinfo(accesstoken, username, bookid)
    if not checkrequest(bookinfo):
        print(f"Unable to get book info: {bookinfo['message']}")
        return
    url = bookinfo["result"]["data"]["url_download"]

    pdf = fitz.Document()

    with tempfile.TemporaryDirectory(prefix="sanoma.", ignore_cleanup_errors=True) as tmpdirfull:
        tmpdir = Path(tmpdirfull)
        zippath = tmpdir / f"{bookid}_light.zip"

        progress(3, "Downloading zip")
        # close the handle before reading the zip back, so everything is flushed to disk
        with open(zippath, "wb") as zf:
            downloadzip(url, zf, progress, 40, 3)

        progress(45, "Extracting zip")
        with ZipFile(zippath, "r") as bookzip:
            bookzip.extractall(path=tmpdir)

        with open(tmpdir / "data" / "master.json") as master:
            masterjson = master.read()

        # master.json stores urls with escaped slashes ("\/"), hence the "\\\/" in the pattern
        urlmatch = re.search(r"https:\\\/\\\/npmitaly-pro-gpd-files\.santillana\.es\\\/editorLM50\\\/([0-9]{6,8})\\\/pdf\\\/(.+?)\.pdf", masterjson)

        mobj = json.loads(masterjson)
        toc, labels = parsestructure(mobj)

        if urlmatch and configfile.getboolean(service, "SearchForOriginal", fallback=True):
            finalurl = urlmatch.group(0).replace(r"\/", r"/")
            pdfbytes = BytesIO()
            progress(47, "Downloading pdf")
            downloadzip(finalurl, pdfbytes, progress, 45, 47)
            pdf = fitz.Document(stream=pdfbytes, filetype="pdf")
        else:
            print("Error: can't find the source pdf of the book, resorting to manual rendering")
            print("Manual rendering not implemented yet, contact the developer!")
            exit()

    progress(98, "Applying toc/labels")
    if not pdf.get_toc() or not configfile.getboolean(service, "PreferOriginalToc", fallback=False):
        pdf.set_toc(toc)

    if not pdf.get_page_labels() or not configfile.getboolean(service, "PreferOriginalLabels", fallback=False):
        pdf.set_page_labels(lib.generatelabelsrule(labels))
    return pdf
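
For orientation, a minimal driver showing how these functions fit together. This is a hypothetical sketch inferred from the signatures above, not part of the commit; `progress` is any callable accepting a percentage and an optional message.

# hypothetical usage sketch, assuming pdfgrabber's service calling convention
import services.snm as snm

token = snm.login("user@example.com", "hunter2")  # "access|refresh" on success, None on failure
if token and snm.checktoken(token):
    books = snm.library(token)                    # {gedi: {"title", "cover", "isbn"}, ...}
    bookid, bookdata = next(iter(books.items()))
    pdf = snm.downloadbook(token, bookid, bookdata, lambda pct, msg="": print(f"{pct}% {msg}"))
    if pdf is not None:
        pdf.save(f"{bookid}.pdf")
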
utils.py (2 changes: 1 addition & 1 deletion)
@@ -15,7 +15,7 @@
tokentable = db.table("tokens")
booktable = db.table("books")

services = {"bsm": "bSmart", "ees": "easyeschool", "hbs": "Mondadori HUB Scuola", "mcm": "MEE2", "myl": "MyLim", "prs": "Pearson Reader+ / Pearson+", "dbk": "Laterza diBooK", "olb": "Oxford Learner’s Bookshelf", "rfl": "Raffaello Player", "cmb": "Cambridge GO", "blk": "Blinklearning", "hoe": "HoepliAcademy+", "pmb": "Palumbo Editore - Saggi Digitali", "cng": "Cengage Read", "orc": "Oxford Reading Club"}
services = {"bsm": "bSmart", "ees": "easyeschool", "hbs": "Mondadori HUB Scuola", "mcm": "MEE2", "myl": "MyLim", "prs": "Pearson Reader+ / Pearson+", "dbk": "Laterza diBooK", "olb": "Oxford Learner’s Bookshelf", "rfl": "Raffaello Player", "cmb": "Cambridge GO", "blk": "Blinklearning", "hoe": "HoepliAcademy+", "pmb": "Palumbo Editore - Saggi Digitali", "cng": "Cengage Read", "orc": "Oxford Reading Club", "snm": "Sanoma My Place"}
nologin = ["pmb"]
oneshots = {"gnt": "mydBook Giunti TVP", "apb": "AppBook"}

