-
Notifications
You must be signed in to change notification settings - Fork 16
/
hdi_copy_table.sh
executable file
·260 lines (201 loc) · 6.37 KB
/
hdi_copy_table.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#!/bin/bash
#-------------------------------------------------------------------------------------------------------
# THIS SCRIPT HELPS IN COPYING A LIST OF TABLES FOR A GIVEN TIME RANGE TO ANOTHER HBASE CLUSTER
#-------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------
# PRINT USAGE INFORMATION
#----------------------------------------------------------------
# Prints the command line help on stdout and then terminates the current
# shell with status 132.
#
# Callers that need a different exit status have to run this function in a
# subshell, because the 'exit' below never returns to the caller.
#
# The text describes the format that copy_tables really parses: table name,
# start time and end time separated by '#', entries separated by ';'.
# $0 is expanded inside the here-document so the help shows the script name.
print_usage()
{
    cat << EOF
Usage:
$0 -t <table1#start_timestamp#end_timestamp;table2#start_timestamp#end_timestamp;...> -p <replication_peer> [-m <hostname>] [-everythingTillNow]
Mandatory arguments:
--------------------
-t, --table-list
A double-quoted, ';' separated list of tables along with the start and end
timestamp range which needs to be copied. For each table, the tablename,
starttime and endtime should be separated by '#'.
If the list of tables is too large and exceeds the limitation of command line
argument, then run this command multiple times with limited tables.
For example:
-t "table1#0#452256397;table2#14141444#452256397"
OR
--table-list="table1#0#452256397;table2#14141444#452256397"
-p, --replication-peer
This is the zookeeper peer address of HBase cluster where table needs to be
copied.
For example:
-p zk5-hbrpl2,zk1-hbrpl2,zk5-hbrpl2:2181:/hbase-unsecure
OR
--replication-peer=dsthbcluster
-m, --machine
This option should be used when running the $0 script as
Script Action from HDInsight portal or Azure Powershell.
It is recommended to set -m as hn1 which is usually idle.
-everythingTillNow
Use this switch when user does not want to compute start/end timestamps.
This switch will copy all rows until current system timestamp.
-h, --help
Displays usage information.
EOF
    exit 132
}
#------------------------------------------------------------------
# INITIALIZE PARAMETERS
#------------------------------------------------------------------
# Defaults for the settings that process_arguments may overwrite.
# Table entries to copy; empty until -t/--table-list is given.
TABLE_LIST=""
# Peer address handed to CopyTable (the variable name's spelling is the one
# used throughout this file); empty until -p/--replication-peer is given.
REPLICAITON_PEER=""
# Machine expected to run the copy; defaults to the local host name.
TARGET_MACHINE="$(hostname)"
# Becomes "true" when -everythingTillNow is passed.
EVERYTHING_TILL_NOW="false"
#------------------------------------------------------------------
# PARSE AND PROCESS COMMAND LINE ARGUMENTS
#------------------------------------------------------------------
# Reports a command line error, shows the usage text and stops the script.
# Arguments: $1 - complete error message (printed on stderr)
# Exits:     always, with status 1
_argument_error()
{
    printf '%s\n' "$1" >&2
    # print_usage terminates the shell it runs in with its own status, so it
    # is confined to a subshell; that lets the 'exit 1' below take effect.
    ( print_usage ) >&2
    exit 1
}

# Parses the command line and stores the recognised options in globals.
# Globals written: TABLE_LIST, REPLICAITON_PEER, TARGET_MACHINE,
#                  EVERYTHING_TILL_NOW
# Arguments:       the script's command line arguments
# Parsing stops at '--' or at the first argument that does not start with
# '-'. Unknown options are reported on stderr and skipped.
# Exits: 0 after printing the help for -h/--help,
#        1 when an option that needs a value is given without one.
process_arguments()
{
    while :; do
        case "$1" in
            -h|--help)
                # Subshell for the same reason as in _argument_error: the
                # help request must end with status 0, not print_usage's.
                ( print_usage )
                exit 0
                ;;
            -t|--table-list)
                [ -n "$2" ] || _argument_error '[ERROR] -t or --table-list requires non-empty list of tables along with start and end timestamps.'
                TABLE_LIST=$2
                shift   # consume the value; the shift after 'esac' consumes the flag
                ;;
            --table-list=?*)
                TABLE_LIST=${1#*=}
                ;;
            --table-list=)
                # Nothing was specified after the '=' sign.
                _argument_error '[ERROR] -t or --table-list requires non-empty list of tables along with start and end timestamps.'
                ;;
            -p|--replication-peer)
                [ -n "$2" ] || _argument_error '[ERROR] -p or --replication-peer requires non-empty value.'
                REPLICAITON_PEER=$2
                shift
                ;;
            --replication-peer=?*)
                REPLICAITON_PEER=${1#*=}
                ;;
            --replication-peer=)
                # Nothing was specified after the '=' sign.
                _argument_error '[ERROR] -p or --replication-peer requires non-empty value.'
                ;;
            -m|--machine)
                [ -n "$2" ] || _argument_error '[ERROR] -m or --machine requires non-empty machine name.'
                TARGET_MACHINE=$2
                shift
                ;;
            --machine=?*)
                TARGET_MACHINE=${1#*=}
                ;;
            --machine=)
                # Nothing was specified after the '=' sign.
                _argument_error '[ERROR] -m or --machine requires non-empty machine name.'
                ;;
            -everythingTillNow)
                EVERYTHING_TILL_NOW=true
                ;;
            --)
                shift
                break
                ;;
            -?*)
                printf '[WARN] Ignoring unknown option: %s\n' "$1" >&2
                ;;
            *)
                # No more options left (also reached when $1 is empty).
                break
                ;;
        esac
        shift
    done
}
#------------------------------------------------------------------
# VALIDATE MANDATORY COMMAND LINE PARAMETERS
#------------------------------------------------------------------
# Checks the parsed settings before any table is copied.
# Globals read:    TABLE_LIST, REPLICAITON_PEER, TARGET_MACHINE
# Globals written: THIS_MACHINE (host name of the local machine)
# Exits: 1 when TABLE_LIST or REPLICAITON_PEER is empty or blank,
#        1 when TARGET_MACHINE does not start with 'hn',
#        0 when the local host name does not start with TARGET_MACHINE.
# Returns normally only when all checks pass.
validate_arguments()
{
    # "${VAR// }" removes every space, so values made only of blanks count
    # as missing.
    if [[ -z "${TABLE_LIST// }" || -z "${REPLICAITON_PEER// }" ]]; then
        printf '[ERROR] Mandatory arguments missing.\n' >&2
        # print_usage terminates the shell it runs in with its own status;
        # the subshell lets the 'exit 1' below decide the script's status.
        ( print_usage ) >&2
        exit 1
    fi

    # Any value starting with 'hn' passes this check.
    if [[ "$TARGET_MACHINE" != hn* ]]; then
        printf '[ERROR] -m accepts only hn0 or hn1 as arguments.\n' >&2
        ( print_usage ) >&2
        exit 1
    fi

    # MACHINE VALIDATION.
    # The target name is quoted so it is compared literally as a prefix of
    # the host name instead of being interpreted as a glob pattern.
    THIS_MACHINE=$(hostname)
    if [[ "$THIS_MACHINE" != "$TARGET_MACHINE"* ]]; then
        printf '[ERROR] Not the correct machine to execute the script. Exiting!\n' >&2
        # NOTE(review): status 0 presumably keeps a run on a non-target node
        # from being reported as a failure - confirm with the callers.
        exit 0
    fi
}
# Runs HBase CopyTable once for every entry of TABLE_LIST.
# Globals read: TABLE_LIST           - entries 'name#start#end', separated by
#                                      ';' or whitespace
#               REPLICAITON_PEER     - value passed as --peer.adr
#               EVERYTHING_TILL_NOW  - 'true' copies from timestamp 0 up to
#                                      the current time in milliseconds and
#                                      ignores the per-entry timestamps
# Outputs: [INFO] lines on stdout, [ERROR] lines on stderr.
# Exits:   1 as soon as an entry has a non-numeric start/end time or a start
#          time greater than its end time.
# Returns: 0 when every CopyTable run succeeded, 1 when at least one failed
#          (the remaining tables are still attempted).
copy_tables()
{
    local -a table_entries=()
    local -r number_regex='^[0-9]+$'
    local entry table_name start_time end_time extra
    local failed=0

    # Turn ';' and any whitespace into single-space separators, then split
    # with 'read' so that no pathname expansion is applied to the entries.
    IFS=' ' read -r -a table_entries <<< "${TABLE_LIST//[;[:space:]]/ }"

    if [[ "$EVERYTHING_TILL_NOW" == true ]]; then
        start_time=0
        # Nanoseconds since the epoch reduced to milliseconds.
        end_time=$(( $(date +%s%N) / 1000000 ))
    fi

    for entry in "${table_entries[@]}"; do
        if [[ "$EVERYTHING_TILL_NOW" == true ]]; then
            # Only the table name is taken from the entry in this mode.
            table_name=${entry%%#*}
        else
            # 'extra' swallows anything after a third '#'.
            IFS='#' read -r table_name start_time end_time extra <<< "$entry"
            if ! [[ "$start_time" =~ $number_regex ]]; then
                echo "[ERROR] Start time '$start_time' for table '$table_name' is not a valid number." >&2
                exit 1
            fi
            if ! [[ "$end_time" =~ $number_regex ]]; then
                echo "[ERROR] End time '$end_time' for table '$table_name' is not a valid number." >&2
                exit 1
            fi
            # '[ -gt ]' compares in base 10; '(( ))' would read values with a
            # leading zero as octal.
            if [ "$start_time" -gt "$end_time" ]; then
                echo "[ERROR] Start time '$start_time' for table '$table_name' is greater than end time '$end_time'." >&2
                exit 1
            fi
        fi

        echo "[INFO] Transferring pre-existing data of table '$table_name' upto END_TIMESTAMP='$end_time'"
        echo "[INFO] Running command: 'hbase org.apache.hadoop.hbase.mapreduce.CopyTable --peer.adr=$REPLICAITON_PEER --starttime=$start_time --endtime=$end_time $table_name'"
        if ! hbase org.apache.hadoop.hbase.mapreduce.CopyTable \
            "--peer.adr=$REPLICAITON_PEER" \
            "--starttime=$start_time" \
            "--endtime=$end_time" \
            "$table_name"; then
            # Remember the failure but keep going with the other tables.
            echo "[ERROR] CopyTable failed for table '$table_name'." >&2
            failed=1
        fi
    done

    return "$failed"
}
#------------------------------------------------------------------
# MAIN
#------------------------------------------------------------------
# Parse the command line, validate the result, then copy the tables.
# "$@" is quoted so every argument reaches process_arguments exactly as it
# was given, without being re-split on whitespace or expanded as a glob.
process_arguments "$@"
validate_arguments
copy_tables