/
01_fetch_data.sh
executable file
·123 lines (101 loc) · 4.91 KB
/
01_fetch_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env bash
# Stop at the first failing command; a pipeline fails if any of its stages fails.
set -eo pipefail

# dbSNP builds accepted by -d. The first element is used as the default.
declare -a dbsnp_builds=(b149 b147 b146)

# Reference genome builds accepted by -r. The first element is used as the default.
declare -a reference_genome_builds=(GRCh38 GRCh37)
# Print the command-line synopsis and option help to stderr, then exit with
# status 1.
# Globals: dbsnp_builds, reference_genome_builds (read) - the full lists are
#          shown as the allowed values and their first elements as defaults.
usage_exit() {
  {
    printf '%s\n' "Usage: $0 [-d dbsnp_build] [-r reference_genome_build] <data_dir>"
    printf '\n'
    printf '%s\n' "-d dbSNP build version. Set one from (${dbsnp_builds[*]}). Default: ${dbsnp_builds[0]}"
    printf '%s\n' "-r reference genome build version. Set one from (${reference_genome_builds[*]}). Default: ${reference_genome_builds[0]}"
  } >&2
  exit 1
}
# Parse args
# -d <build> selects the dbSNP build, -r <build> the reference genome build.
# The leading ':' in the optstring puts getopts in silent mode; an unknown
# option shows the usage text. A missing option argument (reported as ':')
# matches no arm here and is caught by the positional-argument count below.
while getopts ":d:r:" flag; do
  case "${flag}" in
    d) dbsnp="${OPTARG}" ;;
    r) ref="${OPTARG}" ;;
    \?) usage_exit ;;
  esac
done
shift "$((OPTIND - 1))"

# Exactly one positional argument must remain after the options.
if [[ $# -ne 1 ]]; then
  usage_exit
fi
# Requirements: wget
# Stop with a timestamped message and status 1 when wget cannot be found.
if ! type wget >/dev/null 2>&1; then
  echo "[FATAL] $(date +"%Y-%m-%d %H:%M:%S") Command not found: wget"
  exit 1
fi
# Return 0 when the first argument equals one of the remaining arguments.
# Arguments: $1 - value to look for; $2.. - allowed values
# Returns:   0 if found, 1 otherwise
is_valid_choice() {
  local wanted=$1
  shift
  local candidate
  for candidate in "$@"; do
    if [[ "${candidate}" == "${wanted}" ]]; then
      return 0
    fi
  done
  return 1
}

# Set defaults: fall back to the first supported build when -d / -r was not
# given (or was given as an empty string).
: "${dbsnp:=${dbsnp_builds[0]}}"
: "${ref:=${reference_genome_builds[0]}}"

# Reject builds outside the supported lists. The download paths are looked up
# by "<dbsnp>_<ref>", so an unknown value would otherwise produce an empty
# lookup and a malformed URL.
if ! is_valid_choice "${dbsnp}" "${dbsnp_builds[@]}"; then
  echo "[FATAL] Unsupported dbSNP build: ${dbsnp}" >&2
  usage_exit
fi
if ! is_valid_choice "${ref}" "${reference_genome_builds[@]}"; then
  echo "[FATAL] Unsupported reference genome build: ${ref}" >&2
  usage_exit
fi

# Check args
data_dir=$1
if [[ -z "${data_dir}" ]]; then
  usage_exit
fi

# Quoted, with '--', so directories containing spaces, glob characters or a
# leading '-' are handled. All later commands run inside data_dir.
mkdir -p -- "${data_dir}"
cd -- "${data_dir}"

# Key used to look up download locations for the chosen build combination.
database="${dbsnp}_${ref}"
echo "[INFO] Fetching data for ${database} to ${data_dir}..."
# Directory name under snp/organisms/ used for the OmimVarLocusIdSNP,
# RsMergeArch, SNP and SNP3D downloads, keyed by "<dbsnp build>_<reference build>".
declare -A ftp_main=()
ftp_main[b146_GRCh37]="human_9606_b146_GRCh37p13"
# b147_GRCh37 does not contain main tables, so use ones in GRCh38
ftp_main[b147_GRCh37]="human_9606_b147_GRCh38p2"
# b149_GRCh37 does not contain main tables, so use ones in GRCh38
ftp_main[b149_GRCh37]="human_9606_b149_GRCh38p7"
ftp_main[b146_GRCh38]="human_9606_b146_GRCh38p2"
ftp_main[b147_GRCh38]="human_9606_b147_GRCh38p2"
ftp_main[b149_GRCh38]="human_9606_b149_GRCh38p7"
# Directory name under snp/organisms/ used for the build-specific ContigInfo,
# MapLinkInfo, MapLink, SNPChrPosOnRef and SNPContigLoc downloads, keyed by
# "<dbsnp build>_<reference build>".
declare -A ftp_branch=()
ftp_branch[b146_GRCh37]="human_9606_b146_GRCh37p13"
ftp_branch[b147_GRCh37]="human_9606_b147_GRCh37p13"
ftp_branch[b149_GRCh37]="human_9606_b149_GRCh37p13"
ftp_branch[b146_GRCh38]="human_9606_b146_GRCh38p2"
ftp_branch[b147_GRCh38]="human_9606_b147_GRCh38p2"
ftp_branch[b149_GRCh38]="human_9606_b149_GRCh38p7"
# Numeric suffix embedded in the build-specific table file names
# (e.g. <dbsnp>_ContigInfo_<code>.bcp.gz), keyed by
# "<dbsnp build>_<reference build>".
declare -A ref_code=()
ref_code[b146_GRCh37]="105"
ref_code[b147_GRCh37]="105"
ref_code[b149_GRCh37]="105"
ref_code[b146_GRCh38]="107"
ref_code[b147_GRCh38]="107"
ref_code[b149_GRCh38]="108"
# Download every table together with its .md5 file. Each wget call receives
# the archive URL and the matching ".md5" URL; -c resumes partial downloads.
ncbi_snp="ftp.ncbi.nih.gov/snp"
main_url="${ncbi_snp}/organisms/${ftp_main[${database}]}/database/organism_data"
branch_url="${ncbi_snp}/organisms/${ftp_branch[${database}]}/database/organism_data"
shared_url="${ncbi_snp}/database/shared_data"
branch_suffix="${ref_code[${database}]}"

# Main tables.
# Sizes: OmimVarLocusIdSNP ~200 KB, RsMergeArch ~150 MB, SNP ~1.7 GB, SNP3D ~18 MB
for table in OmimVarLocusIdSNP RsMergeArch SNP SNP3D; do
  wget -c "${main_url}/${table}.bcp.gz" "${main_url}/${table}.bcp.gz.md5"
done

# Build-specific tables, named <dbsnp>_<table>_<code>.bcp.gz.
# Sizes: ContigInfo ~141 KB, MapLinkInfo ~970 KB, MapLink ~1.2 GB,
#        SNPChrPosOnRef ~500 MB, SNPContigLoc ~3.3 GB
for table in ContigInfo MapLinkInfo MapLink SNPChrPosOnRef SNPContigLoc; do
  wget -c \
    "${branch_url}/${dbsnp}_${table}_${branch_suffix}.bcp.gz" \
    "${branch_url}/${dbsnp}_${table}_${branch_suffix}.bcp.gz.md5"
done

# Shared tables.
# Sizes: Allele ~70 MB, SnpChrCode ~1 KB
for table in Allele SnpChrCode; do
  wget -c "${shared_url}/${table}.bcp.gz" "${shared_url}/${table}.bcp.gz.md5"
done
echo "[INFO] Checking md5..."

# Probe for a checksum tool. `type` returns non-zero when the command is
# missing; capturing that status through `||` keeps `set -e` from aborting
# the script before the other tool can be considered.
is_openssl_cmd_found=0
type openssl >/dev/null 2>&1 || is_openssl_cmd_found=$?
is_md5sum_cmd_found=0
type md5sum >/dev/null 2>&1 || is_md5sum_cmd_found=$?

# Check one downloaded file against its "<file>.md5" companion.
# Globals:   is_openssl_cmd_found (read) - 0 selects openssl, otherwise md5sum
# Arguments: $1 - path of the downloaded file
# Returns:   0 when the checksum matches, non-zero otherwise
verify_md5() {
  local file=$1
  if [[ "${is_openssl_cmd_found}" -eq 0 ]]; then
    # The .md5 file is compared verbatim with the output of `openssl md5`.
    diff <(openssl md5 "${file}") <(cat "${file}.md5")
  else
    md5sum -c "${file}.md5"
  fi
}

if [[ "${is_openssl_cmd_found}" -ne 0 && "${is_md5sum_cmd_found}" -ne 0 ]]; then
  # Single quotes keep the backticks literal; inside double quotes they would
  # run `openssl` and `md5sum` as command substitutions.
  echo '[WARN] Command `openssl` or `md5sum` not found. Skipping md5 check.'
else
  for src in *.gz; do
    # Nothing to verify when the glob matched no file.
    if [[ ! -e "${src}" ]]; then
      continue
    fi
    # Symlinks without their own .md5 file (such as the unified-name links
    # this script creates after the check) are left from a previous run.
    if [[ -L "${src}" && ! -e "${src}.md5" ]]; then
      continue
    fi
    # A mismatch stops the script with the status of the failed comparison.
    verify_md5 "${src}" || exit $?
  done
fi
# Unifying bcp name
# Create (or refresh) "<table>.bcp.gz" as a symlink to the build-specific
# file "<dbsnp>_<table>_<code>.bcp.gz" in the current directory.
# Globals:   dbsnp, database, ref_code (read)
# Arguments: $1 - table name, e.g. ContigInfo
link_unified_name() {
  local table=$1
  # -f replaces a link left by a previous run; plain `ln -s` fails when the
  # link already exists, which would abort a re-run under `set -e`.
  ln -sf -- "${dbsnp}_${table}_${ref_code[${database}]}.bcp.gz" "${table}.bcp.gz"
}

for table in ContigInfo MapLinkInfo MapLink SNPChrPosOnRef SNPContigLoc; do
  link_unified_name "${table}"
done

echo "[INFO] Done"