In [1]:
from pyspark.sql import *
import pandas as pd
from time import time
from GetPath import GetPath

In [2]:
# Config
input_path='D:/Output/chenbingying/A1/'
output_path='D:/Output/chenbingying/Check/'

In [3]:
def Check_A1(input_path,output_path):
    t_start=time()
    spark = SparkSession.builder.appName('A1_Check').config('spark.dynamicAllocation.enabled','true').config('spark.debug.maxToStringFields', '100').config('spark.sql.execution.arrow.enable','true').config('spark.executor.memory','48g').config('spark.driver.memory', '48g').config('spark.core.connection.ack.wait.timeout','36000s').config('spark.executor.heartbeatInterval','36000s').config('spark.network.timeout', '50000s').config('spark.rpc.lookupTimeout', '5000s').config('spark.shuffle.io.connectionTimeout', '50000s').getOrCreate()
    start=time()
    namelist=[]
    resultlist=[]
    for file in GetPath(input_path,'csv'):
        name=file.split('/')[-1]
        print("正在处理：",name)
        spark.read.option('inferSchema','true').option('header','true').csv(file).createOrReplaceTempView('A1_Raw')
        print('读入数据耗时：%.2f秒'%(time()-start))
        start=time()
        df=spark.sql('''select 
            count(*) as TOTAL
            ,count(MEDICAL_INSTITUTION_CODE) as MEDICAL_INSTITUTION_CODE
            ,count(FILE_NUMBER) as FILE_NUMBER
            ,count(ID_NUMBER) as ID_NUMBER
            ,count(BIRTH_DATE) as BIRTH_DATE
            ,count(GENDER) as GENDER
            ,count(ADDRESS_OF_EMPLOYER) as ADDRESS_OF_EMPLOYER
            ,count(RESIDENCE_ADDRESS) as RESIDENCE_ADDRESS
            ,count(REGISTERED_ADDRESS) as REGISTERED_ADDRESS
            ,count(AREA) as AREA
            ,sum(case when ADDRESS_OF_EMPLOYER is not null or RESIDENCE_ADDRESS is not null or REGISTERED_ADDRESS is not null or AREA is not null then 1 else 0 end)
            ,sum(case when MEDICAL_INSTITUTION_CODE is not null and FILE_NUMBER is not null then 1 else 0 end) as MF
            ,sum(case when CARD_TYPE is not null and CARD_NUMBER is not null then 1 else 0 end) as CC
            ,count(id_birth_date) as id_birth_date
            ,count(id_gender) as id_gender
            from A1_Raw
        ''').toPandas()
        for i in df.columns:
            if(i!='TOTAL'):
                df.insert(df.columns.get_loc(i)+1,i+'_RATE',df[i]/df['TOTAL']*100)
        resultlist.append(df)
        namelist.append(name)
        print('处理数据耗时：%.2f秒'%(time()-start))
    result_df=pd.concat(resultlist)
    result_df.insert(0,'文件名',namelist)
    result_df.columns=['文件名','数据总量','MEDICAL_INSTITUTION_CODE非空数据量','MEDICAL_INSTITUTION_CODE完整率','FILE_NUMBER非空数据量','FILE_NUMBER完整率','ID_NUMBER非空数据量','ID_NUMBER完整率','BIRTH_DATE非空数据量','BIRTH_DATE完整率','GENDER非空数据量','GENDER完整率','ADDRESS_OF_EMPLOYER非空数据量','ADDRESS_OF_EMPLOYER完整率','RESIDENCE_ADDRESS非空数据量','RESIDENCE_ADDRESS完整率','REGISTERED_ADDRESS非空数据量','REGISTERED_ADDRESS完整率','AREA非空数据量','AREA完整率','任一ADDRESS非空数据量','任一ADDRESS非空占比','MEDICAL_INSTITUTION_CODE,FILE_NUMBER字段组合非空个数==2数据量','MEDICAL_INSTITUTION_CODE,FILE_NUMBER字段组合非空个数==2占比','CARD_TYPE,CARD_NUMBER字段组合非空个数==2数据量','CARD_TYPE,CARD_NUMBER字段组合非空个数==2占比','ID_BIRTH_DATE非空数据量','ID_BIRTH_DATE完整率','ID_GENDER非空数据量','ID_GENDER完整率']
    result_df.to_csv(output_path+'A1统计.csv',index=None)
    spark.catalog.dropTempView("A1_Raw")
    print(f'已输出至：{output_path}A1统计.csv')
    print('总耗时：%.2f秒'%(time()-t_start))

In [4]:
if __name__=="__main__":
    Check_A1(input_path,output_path)

正在处理： tb_patient_information.csv
读入数据耗时：53.59秒
处理数据耗时：72.21秒
正在处理： tb_patient_information_201807.csv
读入数据耗时：204.64秒
处理数据耗时：78.63秒
已输出至：D:/Output/zhuhao/Check/A1统计.csv
总耗时：340.58秒
