-
Notifications
You must be signed in to change notification settings - Fork 346
/
Copy pathhive_compute_table_stats.py
executable file
·179 lines (154 loc) · 6.66 KB
/
hive_compute_table_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2019-11-26 10:08:52 +0000 (Tue, 26 Nov 2019)
#
# https://github.com/HariSekhon/DevOps-Python-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn
# and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/HariSekhon
#
"""
Connect to a HiveServer2 and compute optimization statistics on all tables,
or only those matching given db / table / partition value regexes
Tested on CDH 5.10, Hive 1.1.0 with Kerberos
Due to a thrift / impyla bug this needs exactly thrift==0.9.3, see
https://github.com/cloudera/impyla/issues/286
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import logging
import os
import re
import socket
import sys
# pylint: disable=import-error
from impala.dbapi import connect
__author__ = 'Hari Sekhon'
__version__ = '0.1.0'
logging.basicConfig()
log = logging.getLogger(os.path.basename(sys.argv[0]))
host_envs = [
'HIVESERVER2_HOST',
'HIVE_HOST',
'HOST'
]
port_envs = [
'HIVESERVER2_PORT',
'HIVE_PORT',
'PORT'
]
def getenvs(keys, default=None):
for key in keys:
value = os.getenv(key)
if value:
return value
return default
def parse_args():
parser = argparse.ArgumentParser(
description="Computes statistics on all Hive tables/partitions matching database / table / partition regexes")
parser.add_argument('-H', '--host', default=getenvs(host_envs, socket.getfqdn()),\
help='HiveServer2 host ' + \
'(default: fqdn of local host, $' + ', $'.join(host_envs) + ')')
parser.add_argument('-P', '--port', type=int, default=getenvs(port_envs, 10000),
help='HiveServer2 port (default: 10000, ' + ', $'.join(port_envs) + ')')
parser.add_argument('-d', '--database', default='.*', help='Database regex (default: .*)')
parser.add_argument('-t', '--table', default='.*', help='Table regex (default: .*)')
parser.add_argument('-p', '--partition', default='.*', help='Partition regex (default: .*)')
parser.add_argument('-k', '--kerberos', action='store_true', help='Use Kerberos (you must kinit first)')
parser.add_argument('-n', '--krb5-service-name', default='hive',
help='Service principal (default: \'hive\')')
parser.add_argument('-S', '--ssl', action='store_true', help='Use SSL')
parser.add_argument('-v', '--verbose', action='store_true', help='Verbose mode')
args = parser.parse_args()
if args.verbose:
log.setLevel(logging.INFO)
return args
def connect_db(args, database):
auth_mechanism = None
if args.kerberos:
auth_mechanism = 'GSSAPI'
log.info('connecting to %s:%s database %s', args.host, args.port, database)
return connect(
host=args.host,
port=args.port,
auth_mechanism=auth_mechanism,
use_ssl=args.ssl,
#user=user,
#password=password,
database=database,
kerberos_service_name=args.krb5_service_name
)
def main():
args = parse_args()
try:
database_regex = re.compile(args.database, re.I)
table_regex = re.compile(args.table, re.I)
partition_regex = re.compile(args.partition, re.I)
except re.error as _:
log.error('error in provided regex: %s', _)
sys.exit(3)
conn = connect_db(args, None)
log.info('querying databases')
with conn.cursor() as db_cursor:
db_cursor.execute('show databases')
for db_row in db_cursor:
database = db_row[0]
if not database_regex.search(database):
log.debug("skipping database '%s', does not match regex '%s'", database, args.database)
continue
log.info('querying tables for database %s', database)
#db_conn = connect_db(args, database)
#with db_conn.cursor() as table_cursor:
with conn.cursor() as table_cursor:
# doesn't support parameterized query quoting from dbapi spec
#table_cursor.execute('use %(database)s', {'database': database})
table_cursor.execute('use {}'.format(database))
table_cursor.execute('show tables')
for table_row in table_cursor:
table = table_row[0]
if not table_regex.search(table):
log.debug("skipping database '%s' table '%s', does not match regex '%s'", \
database, table, args.table)
continue
compute_table_stats(conn, args, database, table, partition_regex)
def compute_table_stats(conn, args, database, table, partition_regex):
log.info("getting partitions for database '%s' table '%s'", database, table)
partitions_found = False
with conn.cursor() as partition_cursor:
# doesn't support parameterized query quoting from dbapi spec
partition_cursor.execute('use {}'.format(database))
partition_cursor.execute('show partitions {}'.format(database))
for partitions_row in partition_cursor:
partition_key = partitions_row[0]
partition_value = partitions_row[1]
partitions_found = True
if not partition_regex.match(partition_value):
# pylint: disable=logging-not-lazy
log.debug("skipping database '%s' table '%s' partition key '%s' value '%s', " +
"value does not match regex '%s'",
database,
table,
partition_key,
partition_value,
args.partition)
continue
# doesn't support parameterized query quoting from dbapi spec
partition_cursor.execute('ANALYZE TABLE {db}.{table} PARTITION({key}={value}) COMPUTE STATISTICS'\
.format(db=database, table=table, key=partition_key, value=partition_value))
if not partitions_found:
log.info("no partitions found for database '%s' table '%s', computing stats for whole table", database, table)
with conn.cursor() as table_cursor:
log.info("running compute stats on table '%s'", table)
# doesn't support parameterized query quoting from dbapi spec
table_cursor.execute('ANALYZE TABLE {db}.{table} COMPUTE STATISTICS'.format(db=database, table=table))
if __name__ == '__main__':
main()