Updated wiki download scripts
yorikvanhavre committed Feb 26, 2018
1 parent 3dc0f77 commit 8aad83f
Showing 5 changed files with 249 additions and 29 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -28,8 +28,7 @@ install_manifest.txt
/ZERO_CHECK.dir/
/build/
/src/Tools/offlinedoc/localwiki/
/src/Tools/offlinedoc/todolist.txt
/src/Tools/offlinedoc/wikifiles.txt
/src/Tools/offlinedoc/*.txt
OpenSCAD_rc.py
.subuser-dev
/\.idea/
11 changes: 10 additions & 1 deletion src/Tools/offlinedoc/README
@@ -16,4 +16,13 @@ download and another to actually download the files.

4) run "buildpdf.py" to generate freecad.pdf (wkhtmltopdf must be installed)

5) the qhelp files can be tested with "assistant -collectionFile freecad.qhc"

6) If you have already downloaded the whole wiki, run "update.py" immediately
afterwards to create a list of revision IDs for each page.

7) Once the initial revisions list has been created, the "update.py" script
can be run at any time in the future to check for pages that have changed
since the stored revision ID. The script is meant to be run twice: once to get
a list of pages that have changed, and a second time to download the changed
pages (and all their dependencies) again. See the example below.
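
For example, assuming the scripts are run from src/Tools/offlinedoc with a
Python 2 interpreter (a sketch of the intended sequence, not an exact
transcript; adapt the interpreter name to your system):

   python2 update.py    # right after a full wiki download: creates revisions.txt
   # ... some time later ...
   python2 update.py    # first pass: compares revision IDs, writes updates.txt
   python2 update.py    # second pass: re-downloads the changed pages, refreshes
                        # wikifiles.txt and revisions.txt, removes updates.txt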
62 changes: 37 additions & 25 deletions src/Tools/offlinedoc/buildwikiindex.py
@@ -36,9 +36,9 @@

# CONFIGURATION #################################################

URL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
@@ -48,35 +48,37 @@

wikiindex = "/index.php?title="

def crawl(pagename):
def crawl(pagename=[]):
"downloads an entire wiki site"
todolist = []
processed = []
count = 1
if os.path.exists("wikifiles.txt"):
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if VERBOSE: print "Adding ",l
processed.append(l.strip())
f.close()
if os.path.exists("todolist.txt"):
f = open("todolist.txt","r")
if VERBOSE: print "Reading existing todo list..."
for l in f.readlines():
if l.strip() != "":
todolist.append(l.strip())
f.close()
if pagename:
if not isinstance(pagename,list):
pagename = [pagename]
todolist = pagename
else:
if pagename:
todolist = pagename
if os.path.exists("wikifiles.txt"):
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if VERBOSE: print "Adding ",l
processed.append(l.strip())
f.close()
if os.path.exists("todolist.txt"):
f = open("todolist.txt","r")
if VERBOSE: print "Reading existing todo list..."
for l in f.readlines():
if l.strip() != "":
todolist.append(l.strip())
f.close()
else:
indexpages,imgs = get(INDEX)
todolist.extend(indexpages)
while todolist:
targetpage = todolist.pop()
if not targetpage in NORETRIEVE:
if (not targetpage in NORETRIEVE):
if VERBOSE: print count, ": Scanning ", targetpage
pages,images = get(targetpage)
count += 1
@@ -92,6 +94,8 @@ def crawl(pagename):
if VERBOSE: print "Fetched ", count, " pages"
if not WRITETHROUGH:
writeList(processed)
if pagename:
return processed
return 0

def get(page):
@@ -126,33 +130,40 @@ def cleanhtml(html):

def getlinks(html):
"returns a list of wikipage links in html file"
global NORETRIEVE
links = re.findall('<a[^>]*>.*?</a>',html)
pages = []
for l in links:
# rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l)
if not rg:
rg = re.findall('href="\/wiki\/(.*?)"',l)
if "images" in rg:
rg = None
if rg:
rg = rg[0]
if not "Command_Reference" in rg:
if "#" in rg:
rg = rg.split('#')[0]
if ":" in rg:
NORETRIEVE.append(rg)
if ";" in rg:
NORETRIEVE.append(rg)
if "&" in rg:
NORETRIEVE.append(rg)
if ";" in rg:
NORETRIEVE.append(rg)
if "/" in rg:
if not GETTRANSLATIONS:
NORETRIEVE.append(rg)
pages.append(rg)
if not rg in NORETRIEVE:
pages.append(rg)
print "got link: ",rg
return pages

def getimagelinks(html):
"returns a list of image links found in an html file"
return re.findall('<img.*?src="(.*?)"',html)
imlinks = re.findall('<img.*?src="(.*?)"',html)
imlinks = [l for l in imlinks if not l.startswith("http")] # remove external images
return imlinks

def fetchpage(page):
"retrieves given page from the wiki"
@@ -165,6 +176,7 @@ def fetchpage(page):
except HTTPError:
failcount += 1
print 'Error: unable to fetch page ' + page
sys.exit()

def cleanList(pagelist):
"cleans the list"
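As a side note, the new default argument and return value turn crawl() into a
function that update.py (added further down) can reuse for single pages. A
minimal sketch of that usage, assuming buildwikiindex.py is importable from
the current directory and using a made-up page name:

    import buildwikiindex
    buildwikiindex.VERBOSE = False             # run quietly
    deps = buildwikiindex.crawl("Draft_Wire")  # hypothetical page name
    # deps is the crawler's "processed" list: the wiki pages and image
    # files it visited while scanning that page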
4 changes: 3 additions & 1 deletion src/Tools/offlinedoc/downloadwiki.py
@@ -35,7 +35,7 @@

# CONFIGURATION #################################################

DEFAULTURL = "http://www.freecadweb.org/wiki" #default URL if no URL is passed
DEFAULTURL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too.
@@ -189,6 +189,8 @@ def getlinks(html):
for l in links:
# rg = re.findall('php\?title=(.*)\" title',l)
rg = re.findall('href=.*?php\?title=(.*?)"',l)
if not rg:
rg = re.findall('href="\/wiki\/(.*?)"',l)
if rg:
rg = rg[0]
if not "Command_Reference" in rg:
198 changes: 198 additions & 0 deletions src/Tools/offlinedoc/update.py
@@ -0,0 +1,198 @@
#!/usr/bin/env python

#***************************************************************************
#* *
#* Copyright (c) 2017 Yorik van Havre <yorik@uncreated.net> *
#* *
#* This program is free software; you can redistribute it and/or modify *
#* it under the terms of the GNU Lesser General Public License (LGPL) *
#* as published by the Free Software Foundation; either version 2 of *
#* the License, or (at your option) any later version. *
#* for detail see the LICENCE text file. *
#* *
#* This program is distributed in the hope that it will be useful, *
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
#* GNU Library General Public License for more details. *
#* *
#* You should have received a copy of the GNU Library General Public *
#* License along with this program; if not, write to the Free Software *
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
#* USA *
#* *
#***************************************************************************

__title__="update.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"

"""
This script needs to be run after the wiki has been fully downloaded. It has three modes of use:
1) If no revisions.txt file is found, it parses the contents of the wikifiles.txt file,
retrieves the current revision ID for each entry, and creates a revisions.txt file.
2) If a revisions.txt file exists but no updates.txt file exists, it crawls through all entries of
wikifiles.txt and, for each one, compares the current revision with the one stored in revisions.txt.
An updates.txt file is created with all pages that have different revision IDs.
3) If updates.txt exists, each entry of it is scanned again for new links and all the needed
files are downloaded. revisions.txt and wikifiles.txt also get updated.
"""

import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

# CONFIGURATION #################################################

URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.

# END CONFIGURATION ##############################################

wikiindex = "/index.php?title="

def update(pagename=None):

if not os.path.exists("revisions.txt"): # case 1)
if not os.path.exists("wikifiles.txt"):
print "No wikifiles.txt found. Aborting"
sys.exit()
pages = []
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading existing list..."
for l in f.readlines():
if l.strip() != "":
if not "/wiki/" in l:
if VERBOSE: print "Adding ",l.strip()
pages.append(l.strip())
f.close()
if VERBOSE: print "Added ",str(len(pages))," entries"
i = 1
revs = []
for page in pages:
rev = getRevision(page)
if VERBOSE: print str(i)," revision: ",rev
revs.append(page+":"+rev)
i += 1
writeList(revs,"revisions.txt")
print "All done. Successfully written revisions.txt with ",len(revs)," entries."

elif os.path.exists("revisions.txt") and (not os.path.exists("updates.txt")): # case 2)
f = open("revisions.txt","r")
if VERBOSE: print "Reading revisions list..."
revisions = {}
for l in f.readlines():
if l.strip() != "":
r = l.strip().split(":")
p = ":".join(r[:-1])
if VERBOSE: print "Adding ",p
revisions[p] = r[1]
f.close()
if VERBOSE: print "Added ",str(len(revisions.keys()))," entries"
updates = []
i = 1
for page in revisions.keys():
rev = getRevision(page)
if rev != revisions[page]:
if VERBOSE: print str(i),page," has a new revision: ",rev
updates.append(page)
else:
if VERBOSE: print str(i),page," is up to date "
i += 1
if updates:
writeList(updates,"updates.txt")
print "All done. Successfully written updates.txt with ",len(updates)," entries."
else:
print "Everything up to date. Nothing to be done."

elif os.path.exists("revisions.txt") and os.path.exists("updates.txt"): # case 3)
if not os.path.exists("wikifiles.txt"):
print "No wikifiles.txt found. Aborting"
sys.exit()
wikifiles = []
f = open("wikifiles.txt","r")
if VERBOSE: print "Reading wikifiles list..."
for l in f.readlines():
if l.strip() != "":
wikifiles.append(l.strip())
f.close()
if VERBOSE: print "Read ",str(len(wikifiles))," entries"
f = open("revisions.txt","r")
if VERBOSE: print "Reading revisions list..."
revisions = {}
for l in f.readlines():
if l.strip() != "":
r = l.strip().split(":")
p = ":".join(r[:-1])
revisions[p] = r[1]
f.close()
todo = []
f = open("updates.txt","r")
if VERBOSE: print "Reading updates list..."
for l in f.readlines():
if l.strip() != "":
todo.append(l.strip())
f.close()
if VERBOSE: print str(len(todo))," pages to scan..."
import buildwikiindex
buildwikiindex.WRITETHROUGH = False
buildwikiindex.VERBOSE = VERBOSE
updates = []
for t in todo:
if VERBOSE: print "Scanning ",t
updates.extend(buildwikiindex.crawl(t))
updates = [u for u in updates if not u in wikifiles]
if VERBOSE: print str(len(updates))," files to download..."
import downloadwiki
i = 1
for u in updates:
if VERBOSE: print i, ": Fetching ", u
downloadwiki.get(u)
if not "/wiki/" in u:
rev = getRevision(u)
revisions[u] = rev
if not u in wikifiles:
wikifiles.append(u)
i += 1
if VERBOSE: print "Updating wikifiles and revisions..."
writeList(wikifiles,"wikifiles.txt")
updatedrevs = []
for k in revisions.keys():
updatedrevs.append(k+":"+revisions[k])
writeList(updatedrevs,"revisions.txt")
os.remove("updates.txt")
if VERBOSE: print "All done!"

def getRevision(page):
html = fetchPage(page)
revs = re.findall("wgCurRevisionId\"\:(.*?),",html)
if len(revs) == 1:
return revs[0]
print 'Error: unable to get revision ID of ' + page
sys.exit()

def fetchPage(page):
"retrieves given page from the wiki"
print "fetching: ",page
failcount = 0
while failcount < MAXFAIL:
try:
html = (urlopen(URL + wikiindex + page).read())
return html
except HTTPError:
failcount += 1
print 'Error: unable to fetch page ' + page
sys.exit()

def writeList(pages,filename):
f = open(filename,"wb")
for p in pages:
f.write(p+"\n")
f.close()
if VERBOSE: print "written ",filename

if __name__ == "__main__":
update(sys.argv[1:])
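
For reference, the bookkeeping files handled by update.py are plain text with
one entry per line: wikifiles.txt holds the page names and image paths
collected by the crawler, updates.txt holds the names of pages whose revision
has changed, and revisions.txt stores "Pagename:revisionID" pairs, the ID
being the wgCurRevisionId value extracted from the live page. A hypothetical
excerpt of revisions.txt (real page names, made-up IDs):

    Online_Help_Toc:287311
    Draft_Wire:286950

Which of the three modes described in the module docstring runs is decided by
which of these files already exist: no revisions.txt selects mode 1,
revisions.txt without updates.txt selects mode 2, and both present select
mode 3, which deletes updates.txt at the end so the next run starts again at
mode 2.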
