In [4]:
from pyspark.sql import *
import pandas as pd
from GetPath import *
from time import time
from functools import reduce

In [5]:
# Configuration: where the B2 CSV files are read from and where the
# aggregated B3 table is written.
# NOTE(review): the saved cell output in this notebook reports a different
# output path than the one configured here; re-run to refresh the output.
input_path = "D:/Output/chenbingying/B2/"
output_path = "D:/Output/chenbingying/B3.csv"

In [6]:
# Disease groups counted per day and district: (output column prefix, regular
# expression applied to ICD_CODE). The order here is the column order of the CSV.
_B3_DISEASE_PATTERNS = (
    ('Nonaccidental_disease', '[A-R][0-9][0-9]'),
    ('Circulatory_diseases', 'I[0-9][0-9]'),
    ('Respiratory_diseases', 'J[0-9][0-9]'),
    ('Cerebrovascular_disease', 'I6[0-8]'),
    ('Idiopathic_hypertension', 'I10'),
    ('Ischemic_heart_disease', 'I2[0-5]'),
    ('AMI', 'I2[1-2]'),
    ('COPD', 'J4[0-4]'),
    ('Acute_nasopharyngitis', 'J00'),
    ('Asthma', 'J4[5-6]'),
    ('Vasomotor_and_allergic_rhinitis', 'J30'),
    ('Alzheimer_disease', 'G30'),
    ('Malignant_tumour_disease', '(C[0-8][0-9]|C9[0-6])'),
    ('Lung_tumour_disease', 'C3[3-4]'),
)

# Breakdowns emitted for every disease group: (column suffix, SQL condition
# that is AND-ed with the ICD_CODE match).
_B3_STRATA = (
    ('_age_0_6', 'AGE>=0 and AGE<=6'),
    ('_age_7_17', 'AGE>=7 and AGE<=17'),
    ('_age_18_64', 'AGE>=18 and AGE<65'),
    ('_age_up65', 'AGE>=65'),
    ('_gender_male', 'GENDER=1'),
    ('_gender_female', 'GENDER=2'),
)

# Column names that deviate from the prefix+suffix scheme. They are kept
# unchanged so the header of the generated CSV stays the same.
# NOTE(review): this one looks like a naming slip (missing "_disease");
# confirm with consumers of the CSV before normalising it.
_B3_COLUMN_OVERRIDES = {
    'Malignant_tumour_disease_gender_female': 'Malignant_tumour_gender_female',
}


def _b3_aggregate_sql():
    """Build the aggregation query that is run against the B2_Clean_2 view.

    The query groups by VISITING_DATE and DISTRICT and emits the overall
    counts followed by, for every entry of _B3_DISEASE_PATTERNS, one total
    column plus one column per entry of _B3_STRATA.
    """
    # `count(<condition> or null)` counts only rows where the condition is
    # true: a false condition evaluates to NULL, which count() skips.
    select_list = [
        'VISITING_DATE',
        'DISTRICT',
        'count(*) as total_cnt',
        'count(GENDER=1 or null) as male_cnt',
        'count(GENDER=2 or null) as female_cnt',
        'count((AGE>=0 and AGE<=6) or null) as age0_6',
        'count((AGE>=7 and AGE<=17) or null) as age7_17',
        'count((AGE>=18 and AGE<65) or null) as age18_64',
        'count(AGE>=65 or null) as age_up65',
    ]
    for prefix, pattern in _B3_DISEASE_PATTERNS:
        # NOTE(review): the patterns are not anchored, so a code is presumably
        # matched anywhere inside ICD_CODE -- confirm that this is intended.
        icd_match = "REGEXP_LIKE(ICD_CODE,'%s')" % pattern
        select_list.append('count(%s or null) as %s' % (icd_match, prefix))
        for suffix, condition in _B3_STRATA:
            column = prefix + suffix
            column = _B3_COLUMN_OVERRIDES.get(column, column)
            select_list.append(
                'count((%s and %s) or null) as %s' % (icd_match, condition, column)
            )
    return '\n'.join([
        'select ' + '\n,'.join(select_list),
        'from B2_Clean_2',
        'group by VISITING_DATE,DISTRICT',
        'order by VISITING_DATE',
    ])


def Create_B3(input_path,output_path):
    """Aggregate the B2 visit records into the daily per-district B3 table.

    Reads every CSV file that GetPath(input_path, 'csv') returns, merges them
    by column name, keeps the rows whose ICD_CODE, GENDER, AGE, VISITING_DATE
    and CITY are all non-null, counts visits per visiting date and district
    (overall, by gender, by age band and by ICD-code disease group) and writes
    the result to a CSV file.

    Args:
        input_path: location handed to GetPath to find the input CSV files.
            GetPath is imported from outside this notebook; it is presumably
            a directory lookup -- verify against that module.
        output_path: path of the CSV file to write (GBK encoded, no index).

    Returns:
        None. The result is written to `output_path`; progress and timings
        are printed.

    Raises:
        FileNotFoundError: if GetPath returns no CSV files.
    """
    t_start=time()
    spark = (
        SparkSession.builder.appName('B3_Create')
        .config('spark.dynamicAllocation.enabled','true')
        .config('spark.debug.maxToStringFields', '100')
        # The previous key 'spark.sql.execution.arrow.enable' is not a Spark
        # setting, so Arrow was never switched on for toPandas(). This is the
        # Spark 3.x key.
        .config('spark.sql.execution.arrow.pyspark.enabled','true')
        .config('spark.executor.memory','48g')
        .config('spark.driver.memory', '48g')
        .config('spark.core.connection.ack.wait.timeout','36000s')
        .config('spark.executor.heartbeatInterval','36000s')
        .config('spark.network.timeout', '50000s')
        .config('spark.rpc.lookupTimeout', '5000s')
        .config('spark.shuffle.io.connectionTimeout', '50000s')
        .getOrCreate()
    )
    start=time()
    filelist=GetPath(input_path,'csv')
    if not filelist:
        # Without this guard reduce() below fails with an opaque TypeError.
        raise FileNotFoundError('No csv files found under: %s' % input_path)
    # Read the input files
    files=[]
    for file in filelist:
        print('开始处理',file.split('/')[-1].split('.csv')[0]+'.csv')
        files.append(spark.read.option('inferSchema','true').option('header','true').csv(file))
    print('读入数据耗时：%.2f秒'%(time()-start))
    try:
        # Merge the files; columns are matched by name
        reduce(lambda x,y:x.unionByName(y), files).createOrReplaceTempView('B2_Raw')
        # Keep only rows where ICD code, gender, age, visiting date and CITY
        # (described by the original author as the "is Guangzhou" field) are
        # all non-null
        spark.sql('select * from B2_Raw where ICD_CODE is not null and GENDER is not null and AGE is not null and VISITING_DATE is not null and CITY is not null').createOrReplaceTempView('B2_Clean_1')
        # Reduce VISITING_DATE to a date and DISTRICT to its first two
        # characters; a missing DISTRICT is replaced by a fixed label
        spark.sql('''
        select to_date(VISITING_DATE) as VISITING_DATE
        ,case when DISTRICT is null then '其他' else substr(DISTRICT,1,2) end as DISTRICT
        ,GENDER
        ,AGE
        ,ICD_CODE from B2_Clean_1
        ''').createOrReplaceTempView('B2_Clean_2')
        # Grouped statistics
        start=time()
        result=spark.sql(_b3_aggregate_sql()).toPandas()
        print('匹配数据耗时：%.2f秒'%(time()-start))
        result.to_csv(output_path,index=False,encoding='GBK')
    finally:
        # Release the temp views even when a step above failed, so a retry in
        # the same Spark session does not see stale views
        for view_name in ('B2_Raw','B2_Clean_1','B2_Clean_2'):
            spark.catalog.dropTempView(view_name)
    print(f'已输出至：{output_path}')
    print('总耗时：%.2f秒'%(time()-t_start))

In [7]:
# Run the aggregation with the configured paths when executed as a script.
if '__main__' == __name__:
    Create_B3(
        input_path,
        output_path,
    )

开始处理 tb_cis_op_visiting_record_201401_201806.csv
开始处理 tb_cis_op_visiting_record_201807_201912.csv
开始处理 tb_cis_op_visiting_record_202001_202012.csv
开始处理 tb_cis_op_visiting_record_202101_202112.csv
开始处理 tb_cis_op_visiting_record_202201_202205.csv
读入数据耗时：246.94秒
匹配数据耗时：402.48秒
已输出至：D:/Output/zhuhao/B3.csv
总耗时：655.91秒
