Skip to content

Commit

Permalink
Merge pull request #33 from ddworken/master
Browse files Browse the repository at this point in the history
Added parameter to configure the bloom filter size
  • Loading branch information
DanMcInerney committed Dec 18, 2019
2 parents 06ad0aa + a1617d0 commit f6e65c0
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 27 deletions.
23 changes: 0 additions & 23 deletions xsscrapy/bloom.py

This file was deleted.

3 changes: 2 additions & 1 deletion xsscrapy/bloomfilters.py
@@ -1,13 +1,14 @@
from pybloom import BloomFilter
from scrapy.utils.job import job_dir
from scrapy.dupefilters import BaseDupeFilter
from settings import bloomfilterSize

class BloomURLDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter"""

def __init__(self, path=None):
self.file = None
self.fingerprints = BloomFilter(3000000, 0.0001)
self.fingerprints = BloomFilter(bloomfilterSize*10, 0.0001)

@classmethod
def from_settings(cls, settings):
Expand Down
7 changes: 4 additions & 3 deletions xsscrapy/middlewares.py
Expand Up @@ -3,12 +3,13 @@
from pybloom import BloomFilter
import random
import re
from settings import bloomfilterSize

# Filter out duplicate requests with Bloom filters since they're much easier on memory
#URLS_FORMS_HEADERS = BloomFilter(3000000, 0.00001)
URLS_SEEN = BloomFilter(300000, .0001)
FORMS_SEEN = BloomFilter(300000, .0001)
HEADERS_SEEN = BloomFilter(300000, .0001)
URLS_SEEN = BloomFilter(bloomfilterSize, .0001)
FORMS_SEEN = BloomFilter(bloomfilterSize, .0001)
HEADERS_SEEN = BloomFilter(bloomfilterSize, .0001)
USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
Expand Down
2 changes: 2 additions & 0 deletions xsscrapy/settings.py
Expand Up @@ -38,3 +38,5 @@

CONCURRENT_REQUESTS = 30

# Capacity passed to each bloom filter used for duplicate detection: the
# URL/form/header filters use this value directly, and the request dupe
# filter uses 10x this value. If you get bloom filter problems (presumably a
# filter running out of capacity on large crawls), increase this number.
bloomfilterSize = 300000

0 comments on commit f6e65c0

Please sign in to comment.