In [1]:
!pip install pyspark==2.4.3

You should consider upgrading via the 'pip install --upgrade pip' command.


In [2]:
import datetime
import os
import pyspark
import pandas
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
import socket
import random

In [3]:
from source_buckets import source_buckets

In [4]:
# Point PySpark at this Python interpreter.
os.environ['PYSPARK_PYTHON'] = '/opt/app-root/bin/python3'
# Disabled settings, kept for reference:
# os.environ['PYSPARK_DRIVER_PYTHON'] = '/opt/app-root/bin/python3'
# spark.jars.ivy={os.environ['HOME']}

# Spark master URL and S3 endpoint used by the rest of the notebook.
SPARK_CLUSTER = 'spark://172.44.45.6:7077'
S3_ENDPOINT = 'https://s3.upshift.redhat.com/'

# The application name embeds the minute at which this cell was executed.
_launch_stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
SPARK_APP_NAME = f'repartition-multiple-buckets-{_launch_stamp}'

# Address that the local hostname resolves to.
HOSTNAME = socket.gethostbyname(socket.gethostname())

print(f'Spark Cluster: {SPARK_CLUSTER}')
print(f'S3 endpoint: {S3_ENDPOINT}')
print(f'Spark App Name: {SPARK_APP_NAME}')
print(f'Hostname: {HOSTNAME}')

Spark Cluster: spark://172.44.45.6:7077
S3 endpoint: https://s3.upshift.redhat.com/
Spark App Name: repartition-multiple-buckets-2020-08-19 11:38
Hostname: 172.44.44.242


In [5]:
def create_spark_config(spark_cluster, executor_memory='4500M', executor_cores='3', max_cores=None):
    """Build the SparkConf that connects this notebook's driver to a cluster.

    The driver is advertised at the module-level ``HOSTNAME`` on fixed ports
    (42000 for the driver, 42100 for its block manager) and binds to
    ``0.0.0.0``.

    The executor arguments used to be accepted but ignored: ``'3'`` cores and
    ``'4500M'`` of memory were always applied and no core cap was set. They
    are honoured now, and their defaults equal those previously hard-coded
    values, so a call without overrides yields the same configuration as
    before.

    Args:
        spark_cluster: Master URL handed to ``SparkConf.setMaster``.
        executor_memory: Value for ``spark.executor.memory``.
        executor_cores: Value for ``spark.executor.cores``.
        max_cores: Optional value for ``spark.cores.max``. When ``None``
            (the default) the property is left unset.

    Returns:
        The populated ``pyspark.SparkConf``.
    """
    print('Spark cluster is: {}'.format(spark_cluster))
    sc_conf = (
        pyspark.SparkConf().setMaster(spark_cluster)
        .set('spark.driver.host', HOSTNAME)
        .set('spark.driver.port', 42000)
        .set('spark.driver.bindAddress', '0.0.0.0')
        .set('spark.driver.blockManager.port', 42100)
        .set('spark.executor.cores', executor_cores)
        .set('spark.executor.memory', executor_memory)
        .set('spark.driver.memory', '4G')
        .set('spark.sql.parquet.enableVectorizedReader', True)
        # NOTE(review): Kubernetes-specific property; confirm it is still
        # needed when the master is a spark:// URL.
        .set('spark.kubernetes.memoryOverheadFactor', '0.20')
    )
    # Only cap the total cores when the caller asks for it, so the default
    # call keeps leaving spark.cores.max unset.
    if max_cores is not None:
        sc_conf.set('spark.cores.max', max_cores)
    return sc_conf

In [6]:
def setup_spark():
    """Create (or reuse) the SparkSession and configure its S3A access.

    The session is built from ``create_spark_config(SPARK_CLUSTER)`` with
    Hive support enabled, its log level is set to ``ERROR``, and the Hadoop
    configuration is pointed at ``S3_ENDPOINT`` with path-style access and
    the credentials taken from the environment.

    Returns:
        The configured ``SparkSession``.

    Raises:
        RuntimeError: If ``AWS_ACCESS_KEY_ID`` or ``AWS_SECRET_ACCESS_KEY``
            is not set in the environment.
    """
    # Validate credentials before any session is created, so a missing
    # variable is reported by name up front instead of surfacing from the
    # JVM-side configuration call with a value of None.
    credential_vars = ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
    missing = [name for name in credential_vars if os.environ.get(name) is None]
    if missing:
        raise RuntimeError(
            'Missing required environment variable(s): {}'.format(', '.join(missing))
        )

    spark_config = create_spark_config(SPARK_CLUSTER)
    print('spark_config is: {}'.format(spark_config))
    print("Creating Spark Session at cluster: {}".format(SPARK_CLUSTER))
    spark = (
        SparkSession.builder
        .appName(SPARK_APP_NAME)
        .enableHiveSupport()
        .config(conf=spark_config)
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel('ERROR')

    hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoopConf.set('fs.s3a.endpoint', S3_ENDPOINT)
    hadoopConf.set('fs.s3a.path.style.access', 'true')
    hadoopConf.set('fs.s3a.access.key', os.environ['AWS_ACCESS_KEY_ID'])
    hadoopConf.set('fs.s3a.secret.key', os.environ['AWS_SECRET_ACCESS_KEY'])
    hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    print("hadoop is configured!")
    return spark

In [7]:
def form_path_string(bucket_name, repartitioned_data=False):
    """Return the s3a:// parquet location for a bucket's table.

    The table name is ``bucket_name`` without its first three and last four
    characters, lower-cased, with every ``-`` turned into ``_``.

    Args:
        bucket_name: Name of the source bucket.
        repartitioned_data: When truthy, the path points into the
            ``DH-SECURE-SOSREPORTS`` bucket instead of ``bucket_name``.

    Returns:
        A string of the form ``s3a://<bucket>/extraction/sos/parquet/<table>/``.
    """
    trimmed = bucket_name[3:-4]
    table_name = '_'.join(trimmed.lower().split('-'))
    target_bucket = 'DH-SECURE-SOSREPORTS' if repartitioned_data else bucket_name
    return 's3a://{}/extraction/sos/parquet/{}/'.format(target_bucket, table_name)

In [8]:
def read_dataframe_from_bucket(bucket_name, repartitioned_data=False):
    """Read a bucket's parquet data and summarise it.

    The location comes from ``form_path_string`` and is read through the
    module-level ``spark`` session.

    Args:
        bucket_name: Name of the source bucket.
        repartitioned_data: Forwarded to ``form_path_string`` to select the
            repartitioned copy instead of the original.

    Returns:
        A tuple ``(row count, distinct row count, number of partitions)``.
    """
    parquet_location = form_path_string(bucket_name, repartitioned_data)
    frame = spark.read.parquet(parquet_location)
    total_rows = frame.count()
    unique_rows = frame.distinct().count()
    partition_total = frame.rdd.getNumPartitions()
    return total_rows, unique_rows, partition_total

In [9]:
# Stop any session left over from an earlier run of this cell, then create a
# fresh one. setup_spark() runs exactly once, so a failure inside it surfaces
# directly instead of being swallowed and retried.
try:
    spark.stop()
except NameError:
    # Fresh kernel: no `spark` session exists yet, so there is nothing to stop.
    pass
except Exception as stop_error:
    # Stopping is best effort; report the problem and carry on.
    print('Ignoring error while stopping previous Spark session: {}'.format(stop_error))
spark = setup_spark()

Spark cluster is: spark://172.44.45.6:7077
spark_config is: <pyspark.conf.SparkConf object at 0x7fdf3643e0b8>
Creating Spark Session at cluster: spark://172.44.45.6:7077
hadoop is configured!


In [10]:
# Buckets that the comparison loop skips rather than validates
# (presumably those whose repartitioning did not succeed; confirm).
# One bucket name per line; split() turns the block into a list of names.
failed_buckets = """
    DH-KERBEROS-KDC-LOG-TMP
    DH-KEYSTONE-LOG-TMP
    DH-LSOF-TMP
    DH-MESSAGES-TMP
    DH-MULTIPATH--V4--LL-TMP
    DH-MULTIPATH-LL-TMP
    DH-NETSTAT-TMP
    DH-NEUTRON-L3-AGENT-LOG-TMP
    DH-NEUTRON-OVS-AGENT-LOG-TMP
    DH-NEUTRON-SERVER-TMP
    DH-NFS-EXPORTS-TMP
    DH-NOVA-API-LOG-TMP
    DH-NOVA-COMPUTE-LOG-TMP
    DH-OPENSTACK-ROUTER-LIST-TMP
    DH-OPENSTACK-SECURITY-GROUP-LIST-TMP
    DH-OSA-DISPATCHER-LOG-TMP
    DH-POSTGRESQL-LOG-TMP
    DH-PS-AUX-TMP
    DH-PS-AUXWW-TMP
    DH-RABBITMQ-REPORT-TMP
    DH-RHSM-LOG-TMP
    DH-ROUTE-TMP
    DH-SAMBA-TMP
    DH-SCSI-TMP
    DH-SECURE-TMP
    DH-SIMPLE-FILE-TMP
    DH-SS-TMP
    DH-SYSCONFIG-KDUMP-TMP
    DH-SYSCONFIG-VIRT-WHO-TMP
    DH-SYSCTL-TMP
    DH-UP2DATE-TMP
    DH-VDSM-LOG-TMP
    DH-VGDISPLAY-TMP
    DH-VMCORE-DMESG-TMP
    DH-VSFTPD-TMP
    DH-YUM-REPOS-D-TMP
    DH-JOURNAL-SINCE-BOOT-TMP
    DH-KDUMP-TMP
""".split()

In [11]:
# Compare every source bucket with its repartitioned copy. Buckets numbered
# 1-232 are skipped, so reporting starts at bucket number 233.
for bucket_number, bucket_name in enumerate(source_buckets, start=1):
    if bucket_number <= 232:
        continue
    print('Number: {}, Bucket: {}'.format(bucket_number, bucket_name))
    if bucket_name in failed_buckets:
        print('Not checking a failed bucket.')
        continue
    # Each call yields (row count, distinct row count, partition count); the
    # plain row counts are not used by the checks below.
    source_rows, source_distinct, source_partitions = read_dataframe_from_bucket(bucket_name)
    target_rows, target_distinct, target_partitions = read_dataframe_from_bucket(bucket_name, True)
    if source_distinct != target_distinct:
        print('Data Loss:')
        print(bucket_name, source_distinct, target_distinct)
    print('Partitions reduced by: {}'.format(source_partitions - target_partitions))

Number: 233, Bucket: DH-XINETD-CONF-TMP
Partitions reduced by: 603
Number: 234, Bucket: DH-YUM-CONF-TMP
Partitions reduced by: 636
Number: 235, Bucket: DH-YUM-LOG-TMP
Partitions reduced by: 616
Number: 236, Bucket: DH-YUM-REPOLIST-TMP
Partitions reduced by: 636
Number: 237, Bucket: DH-YUM-REPOS-D-TMP
Not checking a failed bucket.
Number: 238, Bucket: DH-ZIPL-CONF-TMP
Partitions reduced by: 0
Number: 239, Bucket: DH-JOURNAL-SINCE-BOOT-TMP
Not checking a failed bucket.
Number: 240, Bucket: DH-KDUMP-TMP
Not checking a failed bucket.
