chg: [Crawler] add launcher and install
Commit 50c8177 by Terrtia, committed Sep 24, 2018 (parent: 6edc1dd)
Showing 11 changed files with 164 additions and 73 deletions.
42 changes: 22 additions & 20 deletions bin/Crawler.py
@@ -40,16 +40,13 @@ def crawl_onion(url, domain, date, date_month, message):
exit(0)

if r.status_code == 200:
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
stdout=subprocess.PIPE)
while process.poll() is None:
time.sleep(1)

if process.returncode == 0:
if r_serv_metadata.exists('paste_children:'+paste):
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
p.populate_set_out(msg, 'Tags')

# onion up
print(process.stdout.read())

else:
@@ -59,14 +56,19 @@ def crawl_onion(url, domain, date, date_month, message):
## FIXME: # TODO: relaunch docker
exit(0)

time.sleep(60)


if __name__ == '__main__':

if len(sys.argv) != 2:
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
if len(sys.argv) != 3:
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
print(sys.argv[1])
print(sys.argv[2])
exit(1)

type_hidden_service = sys.argv[1]
splash_port = sys.argv[2]

publisher.port = 6380
publisher.channel = "Script"
@@ -85,21 +87,19 @@ def crawl_onion(url, domain, date, date_month, message):

if type_hidden_service == 'onion':
regex_hidden_service = url_onion
splash_url = p.config.get("Crawler", "splash_url_onion")
http_proxy = p.config.get("Crawler", "http_proxy_onion")
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
elif type_hidden_service == 'i2p':
regex_hidden_service = url_i2p
splash_url = p.config.get("Crawler", "splash_url_i2p")
http_proxy = p.config.get("Crawler", "http_proxy_i2p")
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_i2p"), splash_port)
elif type_hidden_service == 'regular':
regex_hidden_service = url_i2p
splash_url = p.config.get("Crawler", "splash_url_onion")
http_proxy = p.config.get("Crawler", "http_proxy_onion")
splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port)
else:
print('incorrect crawler type: {}'.format(type_hidden_service))
exit(0)

print(type_hidden_service)
print(splash_url)

crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")

@@ -129,8 +129,6 @@ def crawl_onion(url, domain, date, date_month, message):

# Recovering the streamed message informations. http://eepsites.i2p
message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
#message = 'http://i2pwiki.i2p;test'
#message = 'http://i2host.i2p;test'

# # FIXME: remove
if message is None:
@@ -186,13 +184,16 @@ def crawl_onion(url, domain, date, date_month, message):
# save down onion
if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
#r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
#r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
else:
r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
#r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
p.populate_set_out(msg, 'Tags')

# last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@@ -226,12 +227,13 @@ def crawl_onion(url, domain, date, date_month, message):
r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))

# update list, last crawled onions
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)

#send all crawled domain past
msg = domain
p.populate_set_out(msg, 'DomainSubject')
#msg = domain
#p.populate_set_out(msg, 'DomainSubject')

#time.sleep(30)

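The Crawler.py changes make the Splash port a command-line argument: each crawler instance is started with a hidden-service type and a port, and splash_url is rebuilt from the configured base URL plus that port, while the old http_proxy plumbing is dropped. A minimal sketch of the new setup, assuming a stand-alone configparser load (the real module gets its configuration through the Process helper `p`):

    import sys
    import configparser

    if len(sys.argv) != 3:
        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
        sys.exit(1)

    type_hidden_service = sys.argv[1]
    splash_port = sys.argv[2]

    config = configparser.ConfigParser()
    config.read('packages/config.cfg')   # path assumed relative to bin/

    if type_hidden_service in ('onion', 'regular'):
        # splash_url_onion now holds only the base URL (e.g. http://127.0.0.1); the
        # per-instance port is appended so several Splash containers can run side by side.
        splash_url = '{}:{}'.format(config.get('Crawler', 'splash_url_onion'), splash_port)
    elif type_hidden_service == 'i2p':
        splash_url = '{}:{}'.format(config.get('Crawler', 'splash_url_i2p'), splash_port)
    else:
        sys.exit('incorrect crawler type: {}'.format(type_hidden_service))

    print(type_hidden_service, splash_url)
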
30 changes: 27 additions & 3 deletions bin/LAUNCH.sh
@@ -27,6 +27,7 @@ islogged=`screen -ls | egrep '[0-9]+.Logging_AIL' | cut -d. -f1`
isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1`
isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1`
isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1`
iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. -f1`

function helptext {
echo -e $YELLOW"
@@ -198,6 +199,26 @@ function launching_scripts {

}

function launching_crawler {
CONFIG=$AIL_BIN/packages/config.cfg
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
echo $lport

IFS='-' read -ra PORTS <<< "$lport"
first_port=${PORTS[0]}
last_port=${PORTS[1]}

screen -dmS "Crawler_AIL"
sleep 0.1

for ((i=first_port;i<=last_port;i++)); do
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
sleep 0.1
done

echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
}

function shutting_down_redis {
redis_dir=${AIL_HOME}/redis/src/
bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN'
@@ -406,6 +427,9 @@ function launch_all {
Flask)
launch_flask;
;;
Crawler)
launching_crawler;
;;
Killall)
killall;
;;
@@ -427,13 +451,13 @@ function launch_all {

while [ "$1" != "" ]; do
case $1 in
-l | --launchAuto ) launch_all "automatic";
-l | --launchAuto ) launch_all "automatic"; launching_crawler
;;
-k | --killAll ) killall;
;;
-c | --configUpdate ) checking_configuration "manual";
-t | --thirdpartyUpdate ) update_thirdparty;
;;
-t | --thirdpartyUpdate ) update_thirdparty;
-c | --crawler ) launching_crawler;
;;
-h | --help ) helptext;
exit
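In LAUNCH.sh, the new launching_crawler function pulls the splash_onion_port value out of the [Crawler] section with an awk one-liner, splits the first-last range on '-', and opens one screen window per port, each running ./Crawler.py onion <port>. A Python sketch of that extraction and expansion, assuming the default config path:

    # Python equivalent of the awk one-liner in launching_crawler: return the value of
    # splash_onion_port from the [Crawler] section (config path is an assumption).
    def read_splash_port_range(config_path='bin/packages/config.cfg'):
        in_crawler_section = False
        with open(config_path) as f:
            for line in f:
                if line.startswith('[Crawler]'):
                    in_crawler_section = True
                elif in_crawler_section and line.startswith('splash_onion_port'):
                    # "splash_onion_port = 8050-8050" -> third whitespace-separated field
                    return line.split()[2]
        return None

    lport = read_splash_port_range()                  # e.g. '8050-8050'
    first_port, last_port = lport.split('-')          # mirrors IFS='-' read -ra PORTS
    for port in range(int(first_port), int(last_port) + 1):
        print('./Crawler.py onion {}'.format(port))   # one "onion_crawler:<port>" screen window each
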
64 changes: 38 additions & 26 deletions bin/Onion.py
@@ -113,6 +113,15 @@ def fetch(p, r_cache, urls, domains, path):
message = p.get_from_set()
prec_filename = None

# send to crawler:
activate_crawler = p.config.get("Crawler", "activate_crawler")
if activate_crawler == 'True':
activate_crawler = True
print('Crawler enabled')
else:
activate_crawler = False
print('Crawler disabled')

# Thanks to Faup project for this regex
# https://github.com/stricaud/faup
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -142,6 +151,7 @@ def fetch(p, r_cache, urls, domains, path):
domains_list.append(domain)
urls.append(url)

'''
for x in PST.get_regex(i2p_regex):
# Extracting url with regex
url, s, credential, subdomain, domain, host, port, \
@@ -156,6 +166,7 @@ def fetch(p, r_cache, urls, domains, path):
r_onion.sadd('i2p_domain_crawler_queue', domain)
msg = '{};{}'.format(url,PST.p_path)
r_onion.sadd('i2p_crawler_queue', msg)
'''

# Saving the list of extracted onion domains.
PST.__setattr__(channel, domains_list)
@@ -176,32 +187,33 @@ def fetch(p, r_cache, urls, domains, path):
to_print = 'Onion;{};{};{};'.format(PST.p_source,
PST.p_date,
PST.p_name)
'''
for url in fetch(p, r_cache, urls, domains_list, path):
publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
p.populate_set_out(msg, 'Tags')
'''

date_month = datetime.datetime.now().strftime("%Y%m")
date = datetime.datetime.now().strftime("%Y%m%d")
for url in urls:

domain = re.findall(url_regex, url)
if len(domain) > 0:
domain = domain[0][4]
else:
continue

if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
if not r_onion.sismember('onion_domain_crawler_queue', domain):
print('send to onion crawler')
r_onion.sadd('onion_domain_crawler_queue', domain)
msg = '{};{}'.format(url,PST.p_path)
r_onion.sadd('onion_crawler_queue', msg)
#p.populate_set_out(msg, 'Crawler')

if activate_crawler:
date_month = datetime.datetime.now().strftime("%Y%m")
date = datetime.datetime.now().strftime("%Y%m%d")
for url in urls:

domain = re.findall(url_regex, url)
if len(domain) > 0:
domain = domain[0][4]
else:
continue

if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
if not r_onion.sismember('onion_domain_crawler_queue', domain):
print('send to onion crawler')
r_onion.sadd('onion_domain_crawler_queue', domain)
msg = '{};{}'.format(url,PST.p_path)
r_onion.sadd('onion_crawler_queue', msg)
#p.populate_set_out(msg, 'Crawler')

else:
for url in fetch(p, r_cache, urls, domains_list, path):
publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')

msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
p.populate_set_out(msg, 'Tags')
else:
publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))

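Onion.py now reads the new activate_crawler option: when it is True, extracted .onion domains are pushed to onion_crawler_queue for Crawler.py to pick up; otherwise the old fetch()-based check runs. A sketch of the queueing logic, assuming a direct Redis connection (the real module builds r_onion from its own config; key names come from the diff):

    import datetime
    import redis

    # Connection parameters are assumptions; key names come from the diff.
    r_onion = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

    def queue_for_crawler(url, domain, paste_path):
        """Queue a .onion URL for the crawler unless it was already crawled this month,
        is known to be down today, or is already queued."""
        date_month = datetime.datetime.now().strftime("%Y%m")
        date = datetime.datetime.now().strftime("%Y%m%d")
        if r_onion.sismember('month_onion_up:{}'.format(date_month), domain):
            return False
        if r_onion.sismember('onion_down:' + date, domain):
            return False
        if not r_onion.sismember('onion_domain_crawler_queue', domain):
            r_onion.sadd('onion_domain_crawler_queue', domain)
            r_onion.sadd('onion_crawler_queue', '{};{}'.format(url, paste_path))
            return True
        return False
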
7 changes: 3 additions & 4 deletions bin/packages/config.cfg.sample
@@ -235,8 +235,7 @@ port = 6381
db = 0

[Crawler]
activate_crawler = True
crawler_depth_limit = 1
splash_url_onion = http://127.0.0.1:8050
splash_url_i2p = http://127.0.0.1:8050
http_proxy_onion = http://127.0.0.1:9050
http_proxy_i2p = http://127.0.0.1:9050
splash_url_onion = http://127.0.0.1
splash_onion_port = 8050-8050
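The sample config replaces the full Splash and proxy URLs with a flag, a base URL and a port range: activate_crawler toggles the Onion.py hand-off, splash_url_onion keeps only the host part, and splash_onion_port describes the available Splash instances as first-last. A sketch of how these options combine into endpoints (the 8050-8052 range is illustrative; the sample ships 8050-8050):

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read_string("""
    [Crawler]
    activate_crawler = True
    crawler_depth_limit = 1
    splash_url_onion = http://127.0.0.1
    splash_onion_port = 8050-8052
    """)

    base = cfg.get('Crawler', 'splash_url_onion')
    first, last = (int(p) for p in cfg.get('Crawler', 'splash_onion_port').split('-'))
    # One Splash endpoint per port in the configured range.
    splash_urls = ['{}:{}'.format(base, port) for port in range(first, last + 1)]
    print(splash_urls)   # ['http://127.0.0.1:8050', 'http://127.0.0.1:8051', 'http://127.0.0.1:8052']
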
17 changes: 6 additions & 11 deletions bin/torcrawler/TorSplashCrawler.py
@@ -26,7 +26,7 @@

class TorSplashCrawler():

def __init__(self, splash_url, http_proxy, crawler_depth_limit):
def __init__(self, splash_url, crawler_depth_limit):
self.process = CrawlerProcess({'LOG_ENABLED': False})
self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
@@ -114,7 +114,6 @@ def parse(self,response):
if response.status == 504:
# down ?
print('504 detected')
#elif response.status in in range(400, 600):
elif response.status != 200:
print('other: {}'.format(response.status))
else:
@@ -128,7 +127,7 @@ def parse(self,response):
if self.save_crawled_paste(filename_paste, response.data['html']):

# add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
@@ -157,21 +156,17 @@ def parse(self,response):
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))

#interest = response.data['har']['log']['entries'][0]['response']['header'][0]
with open(filename_screenshot+'har.txt', 'wb') as f:
f.write(json.dumps(response.data['har']).encode())

# save external links in set
lext = LinkExtractor(deny_domains=self.domains, unique=True)
for link in lext.extract_links(response):
self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
#lext = LinkExtractor(deny_domains=self.domains, unique=True)
#for link in lext.extract_links(response):
# self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
# self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

#le = LinkExtractor(unique=True)
le = LinkExtractor(allow_domains=self.domains, unique=True)
for link in le.extract_links(response):
self.r_cache.setbit(link, 0, 0)
self.r_cache.expire(link, 360000)
yield SplashRequest(
link.url,
self.parse,
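TorSplashCrawler loses its http_proxy constructor argument (proxying is handled on the Splash side through the mounted proxy profile), and the per-domain external-link bookkeeping is commented out. Callers therefore pass only the Splash URL and the depth limit; a sketch of the assumed call site (torcrawler/tor_crawler.py is not part of this diff, so the crawl() signature below is an assumption):

    from TorSplashCrawler import TorSplashCrawler   # assumes execution from bin/torcrawler/

    splash_url = 'http://127.0.0.1:8050'   # base URL + per-instance port, as built by Crawler.py
    crawler_depth_limit = 1                # [Crawler] crawler_depth_limit from config.cfg

    crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
    # Assumed entry point, mirroring the arguments Crawler.py passes to tor_crawler.py:
    # crawler.crawl(type_hidden_service, url, domain, paste, super_father)
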
38 changes: 38 additions & 0 deletions bin/torcrawler/launch_splash_crawler.sh
@@ -0,0 +1,38 @@
#!/bin/bash

#usage() { echo "Usage: sudo $0 [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]" 1>&2; exit 1; }

while getopts ":p:f:n:" o; do
case "${o}" in
p)
p=${OPTARG}
;;
f)
f=${OPTARG}
;;
n)
n=${OPTARG}
;;
*)
usage
;;
esac
done
shift $((OPTIND-1))

if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
#usage
echo "usage"
fi

first_port=$p
echo "usage0"
screen -dmS "Docker_Splash"
echo "usage1"
sleep 0.1

for ((i=0;i<=$((${n} - 1));i++)); do
port_number=$((${p} + $i))
screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
sleep 0.1
done

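The new launch_splash_crawler.sh helper starts n Splash containers inside a Docker_Splash screen session, mapping consecutive host ports (starting at -p) onto the container port 8050 and mounting the proxy-profile directory given with -f. A Python sketch of the port mapping the loop performs (the example values for -p, -n and the profile path are illustrative):

    def splash_docker_commands(first_port, count, proxy_profile_dir):
        """Yield one 'docker run' command per Splash instance, mapping host ports
        first_port .. first_port+count-1 onto the container's fixed port 8050."""
        for i in range(count):
            host_port = first_port + i
            yield ('sudo docker run -p {}:8050 --cpus=1 '
                   '-v {}:/etc/splash/proxy-profiles/ --net="bridge" '
                   'scrapinghub/splash'.format(host_port, proxy_profile_dir))

    for cmd in splash_docker_commands(8050, 3, '/home/user/proxy-profiles'):
        print(cmd)   # the script runs each in its own "docker_splash:<i>" screen window
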