This repository has been archived by the owner on Mar 31, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
export_things.py
303 lines (248 loc) · 11.1 KB
/
export_things.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#!/usr/bin/python
# Thingiverse* exporter
# by Carlos Garcia Saura
# CC-BY-SA license (http://creativecommons.org/licenses/by-sa/3.0/)
# https://github.com/carlosgs/export-things
# *Unofficial program, not associated with Thingiverse
# Use at your own risk!
# Modules
import requests
from BeautifulSoup import BeautifulSoup
import os
import re
import urllib
import time
import pickle # For file saving
# EDIT THIS!
user = "carlosgs" # User from Thingiverse (as in the profile URL)
authorName = "Carlos Garcia Saura (carlosgs)" # Any string is OK
authorDescription = "<http://example.com/>"
readmeHeader = "**Please note: This list of things was [automatically generated](https://github.com/carlosgs/export-things). Make sure to check the individual licenses and authorships.** \n"
thingReadmeHeader = "**Please note: This thing is part of a list that was [automatically generated](https://github.com/carlosgs/export-things) and may have been updated since then. Make sure to check for the current license and authorship.** \n"
listPageTitle = "Things designed by " + authorName
#listPageTitle = "Things liked by " + authorName
urlPathToDownload = "/designs/page:" # "/likes/page:" # Set to the url you want to download from (either your posted designs or your liked designs)
authorMark = True # If set true, will write your author name and description at the bottom of all pages
downloadFiles = True # If set to false, will link to original files instead of downloading them
redownloadExistingFiles = False # This saves time when re-running the script in long lists (but be careful, it only checks if file already exists -not that it is good-)
redownloadExistingThings = True # If set False, it won't re-download anything from things that already have a folder (be careful, it ONLY checks if the THING FOLDER already exists -not that it is good-). Useful to save time when resuming long lists
url = "https://www.thingiverse.com"
# Helper function to dump data to files with Pickle
def saveToFile(data, path, filename):
    # Dump 'data' to path+filename with pickle (protocol 2, py2/py3 compatible).
    # Returns None (pickle.dump's return value, same as before).
    # Raises a descriptive Exception when the file cannot be written.
    # (The original had this raise AFTER a return statement, so it was
    # unreachable dead code; the explicit close() inside 'with' was redundant.)
    try:
        with open(path + filename, 'wb') as path_file:
            return pickle.dump(data, path_file, protocol=2)
    except (IOError, OSError):
        raise Exception("Could not save " + path + filename)
# Helper function to load data from files with Pickle
def loadFromFile(path, filename):
    # Load and return a pickled object from path+filename.
    # Raises a descriptive Exception when the file cannot be opened/read.
    # (The original 'raise' was after a return statement -- unreachable dead
    # code -- and the close() inside 'with' was redundant.)
    try:
        with open(path + filename, 'rb') as path_file:
            return pickle.load(path_file)
    except (IOError, OSError):
        raise Exception("Could not load " + path + filename)
# Helper function to create directories
def makeDirs(path):
    # Create the directory tree 'path'.
    # Returns 0 on success, -1 when it cannot be created (typically because
    # it already exists). Narrowed the bare 'except:' (which also swallowed
    # KeyboardInterrupt/SystemExit) to OSError, the only error os.makedirs
    # raises for existing/uncreatable paths.
    try:
        os.makedirs(path)
    except OSError:
        return -1
    return 0
# Helper function to perform the required HTTP requests
def httpGet(page, filename=False, redir=True):
    # Fetch 'page' over HTTP (module-level 'requests' and config globals).
    # - filename=False: return the response body with all non-ascii bytes
    #   stripped (Python 2: r.content is a byte string, hence ord(c)).
    # - filename=<path>: stream the body to that file; returns r.history
    #   (the redirect chain), or [] when the file already exists and
    #   redownloadExistingFiles is False.
    # Returns -1 on any non-200 status code.
    if filename and not redownloadExistingFiles and os.path.exists(filename):
        return [] # Simulate download OK for existing file
    try:
        r = requests.get(page, allow_redirects=redir)
    except requests.exceptions.RequestException:
        # Network hiccup: wait and retry (indefinitely, as before). Narrowed
        # from a bare 'except:' so Ctrl-C / SystemExit are no longer swallowed
        # into an endless retry loop.
        time.sleep(10)
        return httpGet(page, filename, redir)
    if r.status_code != 200:
        print(r.status_code)
        return -1
    if not filename:
        # Remove all non ascii characters # changed from r.text to r.content
        text = (c for c in r.content if 0 < ord(c) < 127)
        text = ''.join(text)
        return text.encode('ascii', 'ignore')
    else:
        # 'with' already closes the file; the old explicit fd.close() inside
        # the with-block was redundant and has been removed.
        with open(filename, 'wb') as fd:
            for chunk in r.iter_content(512):
                fd.write(chunk)
        return r.history
# Helper function to remove all html tags and format to a BeautifulSoup object
# This is a patch, since the getText function gives problems with non-ascii characters
def myGetText(BScontent):
    # Return the plain text of a BeautifulSoup node with all HTML tags removed
    # and leading/trailing whitespace stripped.
    # getText() fails on non-ascii characters under Python 2 (and on objects
    # without a getText method); in that case fall back to stripping tags
    # manually with a regular expression. Narrowed the bare 'except:' to
    # Exception so KeyboardInterrupt/SystemExit still propagate.
    try:
        text = str(BScontent.getText(separator=u' ')) # Won't work with non-ascii characters
    except Exception:
        text = re.sub('<[^<]+?>', '', str(BScontent)) # Strip tags manually
    return text.strip() # Remove leading and trailing spaces
# ---------------------------------------------------------------------------
# Main script: walk every listing page of the configured user, download each
# thing's metadata, preview images and files, write one README.md per thing,
# and build a global README.md index. Finally pickle all scraped metadata.
# NOTE(review): indentation below was reconstructed from the flattened source;
# nesting inferred from variable scope and the script's control flow.
# ---------------------------------------------------------------------------
thingList = {}  # Maps sequential thing number -> dict of scraped metadata
print("Username: " + user)
with open("README.md", 'w') as fdr: # Generate the global README file with the list of the things
    fdr.write(listPageTitle)
    fdr.write("\n===============\n\n")
    fdr.write(readmeHeader)
    thingCount = 1  # Sequential index, used as key into thingList
    pgNum = 1       # Current listing page being scraped
    while 1: # Iterate over all the pages of things
        print("\nPage number: " + str(pgNum))
        # if pgNum < 17:
        #     pgNum += 1
        #     continue
        res = httpGet(url + "/" + user + urlPathToDownload + str(pgNum), redir=False)#, filename="test" + str(pgNum) + ".html")
        if res == -1: break  # httpGet returns -1 on non-200: no more pages
        res_xml = BeautifulSoup(res, convertEntities=BeautifulSoup.HTML_ENTITIES)
        things = res_xml.findAll("div", { "class":"thing thing-interaction-parent" })
        for thing in things: # Iterate over each thing
            thingList[thingCount] = {}
            #title = str(thing["title"])
            title = str(thing.findAll("span", { "class":"thing-name" })[0].text.encode('utf-8', 'ignore'))
            title = re.sub("\[[^\]]*\]","", title) # Optional: Remove text within brackets from the title
            title = title.strip()
            id = str(thing["data-thing-id"]) # Get title and id of the current thing
            thingList[thingCount]["title"] = title
            thingList[thingCount]["id"] = id
            print("\nProcessing thing: " + id + " : " + title)
            # if id != "59196": continue
            folder = "-".join(re.findall("[a-zA-Z0-9]+", title)) # Create a clean title for our folder
            print(folder)
            previewImgUrl = str(thing.findAll("img", { "class":"thing-img" })[0]["src"]) # Get the link for the preview image
            previewImgName = previewImgUrl.split('/')[-1]
            previewImgFile = folder + "/img/" + previewImgName
            thingList[thingCount]["folder"] = folder
            thingList[thingCount]["previewImgUrl"] = previewImgUrl
            thingList[thingCount]["previewImgName"] = previewImgName
            thingList[thingCount]["previewImgFile"] = previewImgFile
            # Skip the expensive per-thing download when the folder already
            # exists and redownloadExistingThings is False
            if redownloadExistingThings or not os.path.exists(folder):
                makeDirs(folder) # Create the required directories
                makeDirs(folder + "/img")
                print("Downloading preview image ( " + previewImgName + " )")
                httpGet(previewImgUrl, previewImgFile) # Download the preview image
                print("Loading thing data")
                res = httpGet(url + "/thing:" + id, redir=False) # Load the page of the thing
                if res == -1:
                    print("Error while downloading " + id + " : " + title)
                    exit()
                res_xml = BeautifulSoup(res, convertEntities=BeautifulSoup.HTML_ENTITIES)
                description = res_xml.findAll("div", { "id":"description" })
                if description:
                    description = "".join(str(item) for item in description[0].contents) # Get the description
                    description = description.strip()
                else:
                    description = "None"
                thingList[thingCount]["description"] = description
                instructions = res_xml.findAll("div", { "id":"instructions" })
                if instructions:
                    instructions = "".join(str(item) for item in instructions[0].contents) # Get the instructions
                    instructions = instructions.strip()
                else:
                    instructions = "None"
                thingList[thingCount]["instructions"] = instructions
                license = res_xml.findAll("div", { "class":"license-text" })
                if license:
                    license = myGetText(license[0]) # Get the license
                else:
                    license = "CC-BY-SA (default, check actual license)"
                thingList[thingCount]["license"] = license
                tags = res_xml.findAll("div", { "class":"thing-info-content thing-detail-tags-container" })
                if tags:
                    tags = myGetText(tags[0]) # Get the tags
                else:
                    tags = "None"
                if len(tags) < 2: tags = "None"
                thingList[thingCount]["tags"] = tags
                header = res_xml.findAll("div", { "class":"thing-header-data" })
                if header:
                    header = myGetText(header[0]) # Get the header (title + date published)
                else:
                    header = "None"
                if len(header) < 2: header = "None"
                thingList[thingCount]["header"] = header
                files = {}
                for file in res_xml.findAll("div", { "class":"thing-file" }): # Parse the files and download them
                    fileUrl = url + str(file.a["href"])
                    fileName = str(file.a["data-file-name"])
                    filePath = folder + "/" + fileName
                    if downloadFiles:
                        print("Downloading file ( " + fileName + " )")
                        httpGet(fileUrl, filePath)
                    else:
                        print("Skipping download for file: " + fileName + " ( " + fileUrl + " )")
                    filePreviewUrl = str(file.img["src"])
                    filePreviewPath = filePreviewUrl.split('/')[-1]
                    filePreview = folder + "/img/" + filePreviewPath
                    print("-> Downloading preview image ( " + filePreviewPath + " )")
                    httpGet(filePreviewUrl, filePreview)
                    files[filePath] = {}
                    files[filePath]["url"] = fileUrl
                    files[filePath]["name"] = fileName
                    files[filePath]["preview"] = filePreviewPath
                thingList[thingCount]["files"] = files
                gallery = res_xml.findAll("div", { "class":"thing-page-slider main-slider" })[0]
                images = []       # Image file names, in page order (first one is the cover)
                images_full = {}  # Maps local image path -> {url, name}
                for image in gallery.findAll("div", { "class":"thing-page-image featured" }): # Parse the images and download them
                    imgUrl = str(image["data-large-url"])
                    imgName = imgUrl.split('/')[-1]
                    imgFile = folder + "/img/" + imgName
                    print("Downloading image ( " + imgName + " )")
                    httpGet(imgUrl, imgFile)
                    images.append(imgName)
                    images_full[imgFile] = {}
                    images_full[imgFile]["url"] = imgUrl
                    images_full[imgFile]["name"] = imgName
                thingList[thingCount]["images"] = images_full
                # Write in the page for the thing
                with open(folder + "/README.md", 'w') as fd: # Generate the README file for the thing
                    fd.write(title)
                    fd.write("\n===============\n")
                    fd.write(thingReadmeHeader + "\n")
                    fd.write(header)
                    if len(images) > 0:
                        fd.write('\n\n![Image](img/' + urllib.quote(images[0]) + ')\n\n')
                    fd.write("Description\n--------\n")
                    fd.write(description)
                    fd.write("\n\nInstructions\n--------\n")
                    fd.write(instructions)
                    fd.write("\n\nFiles\n--------\n")
                    for path in files.keys():
                        file = files[path]
                        fileurl = file["url"]
                        if downloadFiles:
                            fileurl = file["name"]  # Link to the local copy instead of the original URL
                        fd.write('[![Image](img/' + urllib.quote(file["preview"]) + ')](' + file["name"] + ')\n')
                        fd.write(' [ ' + file["name"] + '](' + fileurl + ') \n\n')
                    if len(images) > 1:
                        fd.write("\n\nPictures\n--------\n")
                        for image in images[1:]:
                            fd.write('![Image](img/' + urllib.quote(image) + ')\n')
                    fd.write("\n\nTags\n--------\n")
                    fd.write(tags + " \n\n")
                    fd.write(" \n\nLicense\n--------\n")
                    fd.write(license + " \n\n")
                    if authorMark:
                        fd.write("\n\nBy: " + authorName + "\n--------\n")
                        fd.write(authorDescription)
                    fd.close()
            thing = thingList[thingCount]
            # Add to the global thing list
            fdr.write(str(thingCount) + '. [' + thing["title"] + '](' + thing["folder"] + '/)\n')
            fdr.write("--------\n")
            fdr.write('[![Image](' + thing["previewImgFile"] + ')](' + thing["folder"] + '/) \n\n')
            fdr.flush()
            thingCount += 1
            #if thingCount > 2: break
        #if thingCount > 2: break
        pgNum += 1
    fdr.write(" \n\nLicense\n--------\n")
    # fdr.write("CC-BY-SA (unless other specified)\n\n")
    fdr.write("Please check the individual pages for each design\n\n")
    if authorMark:
        fdr.write("\n\nBy: " + authorName + "\n--------\n")
        fdr.write(authorDescription)
    fdr.close()
saveToFile(thingList,"./","thingList_data.p") # Persist all scraped metadata with pickle
print("\n\nIt's done!! Keep knowledge free!! Au revoir Thingiverse!!\n")