crawler.py
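
"""Simple breadth-first web crawler.

Starting from a seed URL, it follows links whose address contains a given
substring, records the link structure in a networkx DiGraph, and can
periodically pickle its crawl state and write intermediate GML dumps so an
interrupted crawl can be resumed via reloadFromExistingState().
"""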
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import requests
import re
from pattern import web


class Crawler():
    def __init__(self, startURL, matchURL, n=None):
        self.startURL = startURL
        self.regex = matchURL
        self.G = nx.DiGraph()
        self.nPages = n
        self.crawled = list()
        self.crawling = [startURL, ]
        self.currentId = 0
        return

    def getLinks(self, url):
        '''Returns a list of links contained in the DOM of the page at a given URL'''
        data = requests.get(url).text
        dom = web.Element(data)
        links = list()
        for a in dom.by_tag('a'):
            if 'href' in a.attributes:
                links.append(a.attributes['href'].encode('ascii', 'xmlcharrefreplace'))
        return links

    def cleanLinks(self, dirtyList, currentURL):
        '''Removes URLs not matching the given domain and transforms relative paths into absolute URLs'''
        cleanList = list()
        for link in dirtyList:
            url = None
            # print 'In:', link
            if self.regex in link:
                if link[:7] == 'http://':
                    url = link
                elif link[:2] == '//':
                    url = 'http:' + link
                elif len(link) > 0 and link[0] == '/':
                    url = self.startURL + link
            if url not in cleanList and url is not None:
                cleanList.append(url)
                # print 'Out:', cleanList[-1]
        return cleanList

    def updateNetwork(self, G, url, linksList):
        '''Add fetched links to digraph G'''
        edgesList = [(url, l) for l in linksList]
        G.add_node(url)
        G.add_nodes_from(linksList)
        G.add_edges_from(edgesList)
        return G

    def reloadFromExistingState(self, graph, crawled, crawling, dump=None):
        '''Relaunch from an existing dumped state'''
        self.crawled = self.loadFromFile(crawled)
        self.crawling = self.loadFromFile(crawling)
        self.G = nx.read_gml(graph)
        self.currentId = len(self.crawled)
        self.crawl(dump=dump)
        return

    def dumpGraph(self):
        '''Dump graph as a GML file and save a png of the network'''
        # Save graph
        nx.write_gml(self.G, 'graph_full.gml')
        # Plot graph
        pos = nx.spring_layout(self.G)
        nx.draw_networkx_nodes(self.G, pos, cmap=plt.get_cmap('jet'))
        nx.draw_networkx_edges(self.G, pos, edge_color='k', arrows=True)
        plt.savefig("graph.png", bbox_inches="tight")
        print 'Crawled %d pages, %d remaining' % (len(self.crawled), len(self.crawling))

    def dumpToFile(self, data, fname):
        '''Use pickle to dump data to the fname file'''
        f = open(fname, 'wb')
        pickle.dump(data, f)
        f.close()
        return

    def loadFromFile(self, fname):
        '''Use pickle to load data from the fname file and return it'''
        f = open(fname, 'rb')
        data = pickle.load(f)
        f.close()
        return data

    def crawl(self, n=None, dump=None):
        '''Crawl n pages if n is not None; otherwise stop when all reachable pages have been explored'''
        if n is not None:
            self.nPages = n
        print "Start crawling ", self.startURL
        # Stop when the frontier is empty or when the page budget is reached
        while len(self.crawling) > 0 and (self.nPages is None or len(self.crawled) <= self.nPages):
            self.currentId += 1
            if dump is not None and self.currentId % dump == 0:
                # Dump intermediary state in case of crash or interrupt
                nx.write_gml(self.G, 'graph_%06d.gml' % self.currentId)
                self.dumpToFile(self.crawled, 'crawled_%06d.p' % self.currentId)
                self.dumpToFile(self.crawling, 'crawling_%06d.p' % self.currentId)
            currentURL = self.crawling.pop(0)
            print "Crawling page %d of %d:" % (self.currentId, len(self.crawling + self.crawled)), currentURL.encode('ascii', 'xmlcharrefreplace')
            self.crawled.append(currentURL)
            # Get a list of new links from the current page
            dirtyLinks = self.getLinks(currentURL)
            cleanLinks = self.cleanLinks(dirtyLinks, currentURL)
            newLinks = list(set(cleanLinks) - set(self.crawling + self.crawled))
            self.crawling += newLinks
            print '%d of %d new links found on the current page' % (len(newLinks), len(cleanLinks))
            # Build network
            self.G = self.updateNetwork(self.G, currentURL, cleanLinks)
        self.dumpGraph()
        return


if __name__ == "__main__":
    bug = Crawler("http://cdiscount.com", "cdiscount.com")
    bug.crawl(n=None, dump=200)
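
    # A sketch of resuming from a previous run's dumps (hypothetical file
    # names, assuming a crawl that was dumped at iteration 200 following the
    # 'graph_%06d.gml' / 'crawled_%06d.p' / 'crawling_%06d.p' pattern written
    # by crawl(); adjust to the actual files on disk):
    # bug.reloadFromExistingState('graph_000200.gml', 'crawled_000200.p',
    #                             'crawling_000200.p', dump=200)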