-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
executable file
·46 lines (40 loc) · 1.57 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python3
import hashlib
import html
import os.path

from newspaper import Article
# Paths are relative to the current working directory, exactly as before.
DATA_PATH = 'data.txt'
KEY_PATH = '../../key.txt'
ARTICLE_DIR = 'articles'


def _read_lines(path):
    """Return the lines of *path* with their newline characters stripped."""
    with open(path, 'r', encoding='utf-8') as handle:
        return [line.strip('\n') for line in handle]


def _read_key(path):
    """Return the contents of the key file without surrounding newlines."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().strip('\n')


def _article_path(url):
    """Return the cache file path for *url* (MD5 hex digest of the URL)."""
    guid = hashlib.md5(url.encode('utf-8')).hexdigest()
    return os.path.join(ARTICLE_DIR, guid + '.html')


def _render_article(title, text, url, key, index):
    """Build the stand-alone HTML page for one article.

    Title, text and URL come from the scraped web page, so all of them are
    HTML-escaped before being embedded; the original concatenated them raw.
    *index* is the zero-based position of the article's line in the data
    file and is passed to ``read.php`` as ``id``.
    """
    safe_title = html.escape(title)
    safe_url = html.escape(url, quote=True)
    # Escaping the href turns '&' into '&amp;', which the browser decodes
    # back to '&' when following the link.
    mark_read_href = html.escape(
        '../read.php?key=' + key + '&id=' + str(index), quote=True)
    mark_read = '<p>[<a href="' + mark_read_href + '">mark read</a>]</p>'
    return ''.join([
        '<!DOCTYPE html><html lang="en"><head>',
        # The file is written as UTF-8, so declare it for the browser.
        '<meta charset="utf-8">',
        '<title>' + safe_title + '</title>',
        '<style type="text/css">body{font-size:20px;}</style></head><body>',
        '<h1>' + safe_title + '</h1>',
        '<p><strong>Source: <a href="' + safe_url + '" target="_blank">'
        + safe_url + '</a></strong></p>',
        mark_read,
        '<p>',
        html.escape(text).replace('\n', '<br>'),
        '</p>',
        mark_read,
        '</body></html>',
    ])


def _save_article(url, key, index):
    """Download *url*, write its HTML page to the cache, return its title.

    Any exception raised by the download, parse or file write propagates to
    the caller.
    """
    article = Article(url)
    article.download()
    article.parse()
    # Render fully before opening the file so a rendering error cannot leave
    # an empty cache file behind (which would make the URL look "done").
    page = _render_article(article.title, article.text, url, key, index)
    with open(_article_path(url), 'w', encoding='utf-8') as handle:
        handle.write(page)
    return article.title


def main():
    """Fetch every not-yet-cached article listed in ``data.txt``.

    Each line of ``data.txt`` is split on ``'|'`` and its second field is
    used as the article URL. For URLs without a cached page under
    ``articles/`` the article is downloaded, rendered to HTML, and its title
    is appended to the line as an extra ``'|'`` field. ``data.txt`` is then
    rewritten with the (possibly extended) lines.

    Lines that have no second field are kept unchanged instead of aborting
    the run with ``IndexError``; they still occupy an index so the ``id``
    values of the following lines are unaffected.
    """
    lines = _read_lines(DATA_PATH)
    key = _read_key(KEY_PATH)

    new_lines = []
    for index, line in enumerate(lines):
        fields = line.split('|')
        if len(fields) < 2:
            # Blank or malformed record: nothing to fetch, preserve as-is.
            new_lines.append(line)
            continue

        url = fields[1]
        if not os.path.isfile(_article_path(url)):
            try:
                title = _save_article(url, key, index)
            except Exception as exc:
                # Best-effort: report and move on to the next URL.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print('error:' + url
                      + ' (' + type(exc).__name__ + ': ' + str(exc) + ')')
            else:
                # A line break inside the title would split the record and
                # shift every following id, so flatten it to spaces.
                flat_title = title.replace('\r', ' ').replace('\n', ' ')
                line = line + '|' + flat_title
        new_lines.append(line)

    with open(DATA_PATH, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(new_lines))


if __name__ == '__main__':
    main()