-
Notifications
You must be signed in to change notification settings - Fork 0
/
findEtextbooks.py
173 lines (158 loc) · 6.27 KB
/
findEtextbooks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import datetime
import json
import os
import re
import requests
import string
from checksumdir import dirhash # folder md5
from xlrd import open_workbook # Excel files
from src.utils import *
from src.ProgressBar import ProgressBar
# Term being analysed; the digit after the year is 1 for fall, 0 for spring.
currentPeriod = "2018_0_spring"
print ("\nCURRENT PERIOD: %s\n" % currentPeriod)

# Sent as the `ai` query parameter of the WorldCat xISBN requests.
worldcatAI = 'falveylibrary'

# Input folders read by this script.
(pubFilePath, storeFilePath, catFilePath) = (
    "PublisherFiles",  # DRM free available books
    "BookstoreFiles",  # Course requirements
    "CatalogFiles",    # What we have
)
def getISBNsFromFolder(foldername, prefix=''):
    """Return the ISBNs found in the files of `foldername`, using a disk cache.

    The folder's md5 (via `dirhash`) names a cache file
    ``hashes/<prefix>-<md5>.txt``. If that file exists, its lines are
    returned; otherwise every file in the folder is scanned, the combined
    result is passed through `sortUnique`, written to the cache file (one
    ISBN per line) and returned.

    Args:
        foldername: folder whose files are scanned for ISBNs.
        prefix: label placed in the cache file name so different folders
            do not share a cache entry.

    Returns:
        A list of ISBN strings, or an empty list when the folder does not
        exist.
    """
    if not os.path.exists(foldername):
        print ('\nNo %s' % foldername)
        return []

    print ('\n%s/' % foldername)
    folderHash = dirhash(foldername, 'md5')
    cachePath = 'hashes/%s-%s.txt' % (prefix, folderHash)
    print ('= %s' % folderHash)

    # Folder contents unchanged since the last run: reuse the cached list.
    if os.path.exists(cachePath):
        with open(cachePath, "r") as cacheFile:
            isbns = [line.strip() for line in cacheFile]
        print ('= Loaded %s from file (%s)' % (foldername, comma(len(isbns))))
        return isbns

    isbns = []
    for filename in os.listdir(foldername):
        # Compare the real extension; slicing the last four characters
        # would also match names that merely end in "xlsx".
        ext = os.path.splitext(filename)[1].lower()
        if ext in ('.xls', '.xlsx'):
            isbns.extend(findExcelISBNs(filename, foldername))  # EXCEL
        else:
            isbns.extend(findISBNs(filename, foldername))
    isbns = sortUnique(isbns)

    # Make sure the cache folder exists before the first write.
    cacheDir = os.path.dirname(cachePath)
    if not os.path.isdir(cacheDir):
        os.makedirs(cacheDir)
    with open(cachePath, "w") as cacheFile:
        cacheFile.write("%s" % '\n'.join(isbns))
    return isbns
def getMetadata (matchingISBNs, outFileName, exact=False):
    """Write one line per ISBN to ``<outFileName>.csv`` using WorldCat xISBN.

    For each ISBN the xISBN `getMetadata` endpoint is queried in CSV format.
    A response whose first character is '9' is written as-is (stripped);
    any other response, or a failed request, is recorded as the bare ISBN.
    When `matchingISBNs` is empty the file contains only the word
    ``nothing``.

    Args:
        matchingISBNs: ISBN strings to look up.
        outFileName: output path without the ``.csv`` extension.
        exact: selects the denominator of the percentage shown when the
            progress bar finishes: the module-level `bookstoreISBNs` when
            True, `xCourseISBNs` otherwise.

    Returns:
        None.
    """
    with open ("%s.csv" % outFileName, "w") as csvfile:
        if not matchingISBNs:
            csvfile.write("nothing")
            return
        fields = 'isbn,year,ed,title,author,lang,url,publisher,form,city'
        bar = ProgressBar(len(matchingISBNs), label='%s: %u ' % (outFileName, len(matchingISBNs)))
        for isbn in matchingISBNs:
            urlz = ('http://xisbn.worldcat.org/webservices/xid/isbn/%s'
                    '?method=getMetadata&fl=%s&format=csv&ai=%s'
                    % (isbn, fields, worldcatAI))
            try:
                # Without a timeout a stalled connection blocks the run forever.
                body = str(requests.get(urlz, timeout=30).text)
            except requests.RequestException:
                # One unreachable lookup should not discard the whole report;
                # fall back to the bare ISBN like any other unusable response.
                body = ''
            # NOTE(review): '9' presumably identifies a CSV row that starts
            # with an ISBN-13; confirm against the service's error format.
            if body[:1] == '9':
                csvfile.write("%s\n" % body.strip())
            else:
                csvfile.write("%s\n" % isbn)
            bar.progress()

    total = len(bookstoreISBNs) if exact else len(xCourseISBNs)
    # 100.0 forces float division so the percentage keeps its decimals on
    # Python 2 as well; an empty denominator reports 0 instead of crashing.
    percent = (100.0 * len(matchingISBNs) / total) if total else 0.0
    bar.finish("%.3f%%" % percent)
# Course requirements: every file in the bookstore folder is parsed as JSON
# and its entries are appended to one combined list.
print ('BookstoreFiles/')
bookstoreISBNs, bookstoreJSON = [], []
for storeFile in os.listdir(storeFilePath):
    storePath = os.path.join(storeFilePath, storeFile)
    with open(storePath, 'r') as jsonFile:
        bookstoreJSON.extend(json.load(jsonFile))
bookstoreISBNs = [entry['isbn'] for entry in bookstoreJSON]

# Expanded ISBNs: read them from the cache file when it exists, otherwise
# build them from the bookstore entries.
expandedHashFile = expandedHashPath()
print ('= %s' % expandedHashFile)
if os.path.exists(expandedHashFile):
    with open(expandedHashFile, "r") as courseFile:
        xCourseISBNs = [line.strip() for line in courseFile]
    print ('> Editions loaded from file (%s)' % comma(len(xCourseISBNs)))
else:
    xCourseISBNs = expandCourseISBNs(bookstoreJSON, worldcatAI)

# Publisher (DRM free) and catalog (what we have) ISBN pools.
pubISBNs = getISBNsFromFolder(pubFilePath, prefix='pub')
catISBNs = getISBNsFromFolder(catFilePath, prefix='cat')
# Match the files. Every expanded course ISBN lands in at most one bucket:
#   ebookMatches - in the publisher files and in the catalog
#   needToBuy    - in the publisher files but not in the catalog
#   printBooks   - in the catalog but not in the publisher files
#   noMatch      - in neither, and listed by the bookstore itself
# exactEbooks / exactPrint hold the members of ebookMatches / printBooks
# that the bookstore lists directly.
bar = ProgressBar(
    len(xCourseISBNs),
    label='Looking for %s ISBNs in a pool of %s ' % (
        comma(len(xCourseISBNs)),
        comma(len(pubISBNs) + len(catISBNs))
    )
)
ebookMatches = []
exactEbooks = []
printBooks = []
exactPrint = []
needToBuy = []
noMatch = []

# Sets give constant-time membership tests and, unlike a merge over sorted
# lists, give the right answer even if an input list is not sorted.
catSet = set(catISBNs)
pubSet = set(pubISBNs)
bookstoreSet = set(bookstoreISBNs)

for x in xCourseISBNs:
    bar.progress()
    inCats = x in catSet
    inPubs = x in pubSet
    isExact = x in bookstoreSet
    if inPubs and inCats:
        if isExact:
            exactEbooks.append(x)
        ebookMatches.append(x)
    elif inPubs:
        needToBuy.append(x)
    elif inCats:
        if isExact:
            exactPrint.append(x)
        printBooks.append(x)
    elif isExact:
        noMatch.append(x)
bar.finish()
# Save a JSON summary of this run under hashes/reports/<period>.json.
print ("\nSaving Report for posterity...")
if not os.path.exists("hashes/reports/"):
    # makedirs also creates hashes/ itself when it is missing.
    os.makedirs("hashes/reports/")
# A report for this period already exists: add today's date to the name
# instead of overwriting it.
if os.path.exists("hashes/reports/%s.json" % currentPeriod):
    currentPeriod += datetime.datetime.now().strftime("_%y_%m_%d")

# Build the report before opening the file so a failure while hashing the
# folders cannot leave an empty report behind.
report = {
    "bookstore": {
        "total-books": len(bookstoreJSON),
        "expanded": len(xCourseISBNs),
    },
    "ebooks": {
        "exact-matches": len(exactEbooks),
        "expanded-matches": len(ebookMatches),
    },
    "hashes": {
        # Same folders as the configured input paths, not repeated literals.
        "catalog": "hashes/cat-%s.txt" % dirhash(catFilePath, "md5"),
        "expanded": expandedHashPath(),
        "map": mapHashPath(),
        "publishers": "hashes/pub-%s.txt" % dirhash(pubFilePath, "md5"),
    },
    "no-matches": noMatch,
    "print": {
        "exact-matches": len(exactPrint),
        "expanded-matches": len(printBooks),
    },
}
with open ("hashes/reports/%s.json" % currentPeriod, "w") as hashreport:
    json.dump(report, hashreport, sort_keys=True, indent=4)
# Write one CSV per result bucket into reports/.
print ("\nPrinting results...")
if not os.path.exists("reports/"):
    os.mkdir("reports/")

# (ISBN list, output path without extension, exact flag) in output order.
reportJobs = [
    # have and open access
    (ebookMatches, "reports/have-ebooks", False),
    # exact class matches for the bucket above
    (exactEbooks, "reports/have-ebooks-exact", True),
    # have and not open access: physical books, CASA catalog, restricted ebooks
    (printBooks, "reports/have-print", False),
    # exact class matches for the bucket above
    (exactPrint, "reports/have-print-exact", True),
    # don't have
    (needToBuy, "reports/ebooks-available-for-purchase", False),
    # don't have, and no ebook is available
    (noMatch, "reports/dont-have-no-ebook", True),
]
for jobISBNs, jobName, jobExact in reportJobs:
    getMetadata (jobISBNs, jobName, exact=jobExact)
# print ('no matches: %s (%.3f%%)\n' % (comma(len(noMatch)), 100 * len(noMatch) / len(bookstoreISBNs)))