#!/bin/bash
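#
# url_scraping.sh
# Recursively collects the URLs of a website by following in-domain links
# up to a fixed depth, writing the results to all_urls.txt.
#
# Requires: curl, tput (ncurses), and a grep with -P (PCRE) support.
#
# Example (example.com as a placeholder target):
#   ./url_scraping.sh https://example.com
#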
## Ctrl+C handling
# Handler: restore the cursor before exiting
signal_handler(){
    echo -e "\n[!] User terminated."
    tput cnorm; exit 1 # restore cursor and exit
}
# Register the handler for SIGINT (Ctrl+C)
trap signal_handler SIGINT
## Functions
# Display the help panel
help(){
    echo -e "\nDescription: recursive web scraping of a website's URLs"
    echo
    echo "[*] Usage: $0 target_url"
    echo
}
# Validate arguments: exactly one target URL is expected
if [[ $# -ne 1 ]]
then
    help
    tput cnorm; exit 1
fi
# Get the URLs of a website, recursing into in-domain links
get_urls(){
    local target_url=$1
    local scan_depth_limit=$2
    # Pull href values out of the page, strip the quotes, and drop
    # fragment-only (#...) and protocol-relative (//...) links
    local website_urls=$(curl -s "$target_url" | grep -oP "href=[\"'](.*?)[\"']" | awk -F'href=' '{print $2}' | tr -d "\"'" | grep -vE "^(#|//)" | sort -u | xargs)
    IFS=' ' read -ra website_urls_array <<< "$website_urls"
    ((scan_depth_limit--))
    for url in "${website_urls_array[@]}"
    do
        # Skip links that leave the target domain
        if [[ "$url" != *"$target_domain"* ]]
        then
            continue
        fi
        echo "$url"
        echo "$url" >> all_urls.txt
        # Recurse in the background while depth remains; at the deepest
        # level the remaining URLs are still printed, just not followed
        if [[ $scan_depth_limit -gt 0 ]]
        then
            get_urls "$url" "$scan_depth_limit" &
        fi
    done; wait # collect all background scans at this level
}
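# Sketch of what the extraction pipeline yields: given a page containing
#   <a href="https://example.com/about">About</a> <a href='#top'>Top</a>
# it produces "https://example.com/about" and discards "#top". Because the
# recursion runs in background subshells, each level's depth counter and
# URL list stay independent; `wait` gathers the jobs before returning.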
# Main flow
# ---------
tput civis # hide cursor (aesthetic)
scan_depth_limit=4 # maximum recursion depth
echo -n '' > all_urls.txt # create/empty the file that collects the urls
target_domain=$(echo "$1" | awk -F'//' '{print $2}' | cut -d'/' -f1) # keep only the host, so the scan stays on this domain
echo -e "\n[*] Scanning site: $1\n"
get_urls "$1" "$scan_depth_limit" # call the recursive function
echo -e "\n[+] Saving output: all_urls.txt"
sort -u all_urls.txt -o all_urls.txt # deduplicate the saved urls in place
tput cnorm # restore cursor
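
# Hypothetical run (example.com as a stand-in target):
#   $ ./url_scraping.sh https://example.com
#   [*] Scanning site: https://example.com
#   https://example.com/about
#   ...
#   [+] Saving output: all_urls.txt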