In [16]:
import numpy as np
import pandas as pd
import glob

# --- Configuration -----------------------------------------------------------
OUTPUT_CSV = 'all.csv'
HOSPITAL_LIST_CSV = '../三甲医院名单.csv'
# Columns that together identify one doctor record.
DEDUP_KEYS = ['name', 'address', 'level', 'service_price_img', 'service_price_video']
# Professional title -> ordinal rank. Titles not listed here get rank 0 and are dropped.
LEVEL_RANK = {
    '住院医师': 1, '医师': 1,
    '主治医师': 2, '主治中医师': 2,
    '副主任医师': 3, '副主任中医师': 3,
    '主任医师': 4, '主任中医师': 4,
}


def _parse_price(prices):
    """Convert a price column such as '￥30' to float; missing prices stay NaN."""
    cleaned = prices.astype(str)  # NaN becomes 'nan', which float() parses back to NaN
    # Strip both the full-width (U+FFE5) and the half-width (U+00A5) currency
    # sign so that either spelling converts instead of raising in astype(float).
    for symbol in ('￥', '¥'):
        cleaned = cleaned.str.replace(symbol, '', regex=False)
    return cleaned.astype(float)


# --- Merge the scraped files -------------------------------------------------
# Skip this cell's own output (it is written to the same directory, so a re-run
# would otherwise ingest it) and sort so that keep='first' below is deterministic.
input_files = sorted(path for path in glob.glob('*.csv') if path != OUTPUT_CSV)
if not input_files:
    raise FileNotFoundError("No input '*.csv' files found in the working directory")

df = pd.concat(
    [pd.read_csv(path, encoding='utf_8_sig') for path in input_files],
    ignore_index=True,
)

# Drop the stray index column written by an earlier to_csv (if present), keep
# only the first record of each doctor, and renumber the rows.
df = (
    df.drop(columns=['Unnamed: 0'], errors='ignore')
      .drop_duplicates(subset=DEDUP_KEYS, keep='first')
      .reset_index(drop=True)
)

# --- Prices and availability flags -------------------------------------------
# have_img / have_video are 1 when the corresponding price is present, else 0.
for price_col, flag_col in (('service_price_img', 'have_img'),
                            ('service_price_video', 'have_video')):
    df[price_col] = _parse_price(df[price_col])
    df[flag_col] = df[price_col].notna().astype('int64')

# --- Professional title ------------------------------------------------------
# Remove the double non-breaking spaces left by the scraper, then map the title
# to its rank. The .str accessor leaves missing titles as NaN instead of raising;
# they fall through to rank 0 like any other unrecognised title.
df['level'] = df['level'].str.replace('\xa0\xa0', '', regex=False)
df['level_transform'] = df['level'].map(LEVEL_RANK).fillna(0).astype('int64')

# Discard rows whose title was not recognised. The explicit copy makes the
# result an independent frame so the column assignment below is unambiguous.
df = df[df['level_transform'] != 0].copy()

# --- Top-tier hospital flag --------------------------------------------------
df_hospital = pd.read_csv(HOSPITAL_LIST_CSV, encoding='utf_8_sig')
top_hospital_list = df_hospital['hospital_name'].dropna().tolist()
# hospital_level is 1 when the address contains any listed hospital name.
# Non-string (missing) addresses are flagged 0 rather than raising TypeError.
df['hospital_level'] = df['address'].apply(
    lambda addr: 1 if isinstance(addr, str)
    and any(hospital in addr for hospital in top_hospital_list) else 0
)

# --- Save --------------------------------------------------------------------
# The index is still written, so the file layout matches earlier outputs.
df.to_csv(OUTPUT_CSV, encoding='utf_8_sig')

In [6]:
# Load the hospital-name list on its own so it can be inspected.
# NOTE(review): the rendered output shows two leftover index columns
# ('Unnamed: 0.1', 'Unnamed: 0') next to 'hospital_name' — presumably from
# earlier to_csv round-trips that wrote the index; confirm and clean the file.
df_hospital = pd.read_csv('../三甲医院名单.csv', encoding='utf_8_sig')
# Bare last expression: the notebook renders the DataFrame as the cell output.
df_hospital

Unnamed: 0.1,Unnamed: 0,hospital_name
0,0,鞍钢集团公司总医院
1,1,安徽省第二人民医院
2,2,安徽医科大学第二附属医院
3,3,安徽医科大学第一附属医院
4,4,安庆市立医院
...,...,...
215,215,葫芦岛市中心医院
216,216,湖南省儿童医院
217,217,湖南省脑科医院
218,218,湖南省直中医医院
