# POI Extractor

This code extracts the points of interest (POIs) from the Amap API based on the user's input.


In [14]:
API_KEY = "fe7011c045b61849c928b90bacf7dd3d"

In [1]:
import requests
import json
from collections import OrderedDict
from typing import Dict, List

class PrecisionAMapGeocoder:
    """精密版逆地理编码处理器"""
    
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://restapi.amap.com/v3/geocode/regeo"
        self.cache = OrderedDict()
        self.max_cache_size = 200
        self.road_hierarchy = {0: '主干道', 1: '次干道', 2: '支路'}

    def _get_cache_key(self, params: Dict) -> tuple:
        """生成精确缓存键"""
        return tuple((k, str(v)) for k, v in sorted(params.items()))

    def _cache_management(self, key: tuple, data: Dict):
        """智能缓存管理"""
        if key in self.cache:
            self.cache.move_to_end(key)
            return
        if len(self.cache) >= self.max_cache_size:
            self.cache.popitem(last=False)
        self.cache[key] = data

    def _enhance_poi(self, poi_data: List) -> List[Dict]:
        """POI数据精密化处理"""
        enhanced = []
        for p in poi_data:
            loc = p.get('location', '').split(',')
            enhanced.append({
                'id': p.get('id'),
                'name': p.get('name'),
                'type': p.get('type'),
                'distance': round(float(p.get('distance', 9999)), 3),
                'weight': float(p.get('poiweight', 0)),
                'address': p.get('address'),
                'lng': loc[0] if len(loc)==2 else None,
                'lat': loc[1] if len(loc)==2 else None,
                'business_area': p.get('businessarea'),
                'tel': p.get('tel')
            })
        return sorted(enhanced, key=lambda x: x['distance'])

    def _refine_roads(self, road_data: List) -> List[Dict]:
        """道路数据精细化处理"""
        refined = []
        for r in road_data:
            loc = r.get('location', '').split(',')
            refined.append({
                'id': r.get('id'),
                'name': r.get('name'),
                'level': int(r.get('level', 0)),
                'level_desc': self.road_hierarchy.get(int(r.get('level', 0)), '未知'),
                'distance': round(float(r.get('distance', 9999)), 3),
                'direction': r.get('direction'),
                'lng': loc[0] if len(loc)==2 else None,
                'lat': loc[1] if len(loc)==2 else None
            })
        return sorted(refined, key=lambda x: x['distance'])

    def get_location_info(self, lng: float, lat: float, **kwargs) -> Dict:
        """
        精确地理信息查询
        :param lng: 经度
        :param lat: 纬度
        :param kwargs: 扩展参数
          - radius: 搜索半径(米)
          - poitype: POI类型代码
          - roadlevel: 道路等级
        """
        params = {
            'key': self.api_key,
            'location': f"{lng},{lat}",
            'radius': kwargs.get('radius', 1000),
            'poitype': kwargs.get('poitype'),
            'roadlevel': kwargs.get('roadlevel', 0),
            'extensions': 'all',
            'output': 'json'
        }
        
        cache_key = self._get_cache_key(params)
        if cache_key in self.cache:
            return self.cache[cache_key]

        try:
            resp = requests.get(self.base_url, params=params, timeout=10)
            resp.raise_for_status()
            raw = resp.json()

            result = {
                'status': int(raw.get('status', 0)),
                'info': raw.get('info', ''),
                'infocode': raw.get('infocode', ''),
                'address': {},
                'pois': [],
                'roads': [],
                'quality': {}
            }

            if result['status'] == 1:
                regeocode = raw.get('regeocode', {})
                addr_comp = regeocode.get('addressComponent', {})
                
                # 精密地址处理
                result['address'] = {
                    'formatted': regeocode.get('formatted_address', ''),
                    'country': addr_comp.get('country'),
                    'province': addr_comp.get('province'),
                    'city': addr_comp.get('city') or addr_comp.get('citycode', ''),
                    'district': addr_comp.get('district'),
                    'street': addr_comp.get('streetNumber', {}).get('street'),
                    'number': addr_comp.get('streetNumber', {}).get('number'),
                    'township': addr_comp.get('township')
                }

                # POI精密处理
                result['pois'] = self._enhance_poi(regeocode.get('pois', []))
                
                # 道路精密处理
                result['roads'] = self._refine_roads(regeocode.get('roads', []))
                
                # 质量评估
                main_roads = [r for r in result['roads'] if r['level'] == 0]
                total_poi = len(result['pois'])
                search_area = (params['radius']/1000)**2  # 平方公里
                
                result['quality'] = {
                    'poi_density': total_poi/search_area if search_area>0 else 0,
                    'main_road_ratio': len(main_roads)/len(result['roads']) if result['roads'] else 0,
                    'poi_coverage': min(total_poi/50, 1.0)  # 标准化覆盖率
                }

            self._cache_management(cache_key, result)
            return result

        except requests.exceptions.RequestException as e:
            return {'status': 0, 'info': f"网络异常: {str(e)}"}
        except json.JSONDecodeError:
            return {'status': 0, 'info': '响应数据解析失败'}
        except Exception as e:
            return {'status': 0, 'info': f"系统错误: {str(e)}"}

# 验证测试
if __name__ == "__main__":
    # 需替换真实API Key
    # AMAP_KEY = "your_amap_key_here"
    
    # 初始化处理器
    processor = PrecisionAMapGeocoder(API_KEY)
    
    # 测试坐标（示例中的坐标）
    test_lng = 106.508308
    test_lat = 29.53811
    
    # 执行查询
    data = processor.get_location_info(
        lng=test_lng,
        lat=test_lat,
        poitype="加油站",
        radius=2000,
        roadlevel=0
    )
    
    # 结构验证
    assert data.keys() == {'status', 'info', 'infocode', 'address', 'pois', 'roads', 'quality'}, "结构验证失败"
    assert isinstance(data['pois'], list), "POI数据异常"
    assert isinstance(data['roads'], list), "道路数据异常"
    print("数据结构验证通过")
    
    # 打印示例数据
    import pprint
    pprint.pprint(data)


KeyboardInterrupt: 

In [27]:
import pandas as pd,time
from operator import itemgetter

def extract_poi_data(result: dict,poi_type: str) -> dict:
    """从逆地理编码结果中提取结构化POI信息"""
    output = {}
    
    # 提取加油站信息（按距离排序）
    pois = sorted(
        [p for p in result.get('pois', []) if poi_type in p['type']],
        key=itemgetter('distance')
    )
    for idx, poi in enumerate(pois[:5], 1):  # 保留前5个最近加油站
        output.update({
            # f'Distance{poi_type}_{idx}_id': poi['id'],
            # f'Distance{poi_type}_{idx}_name': poi['name'],
            f'{poi_type}_{idx}_distance': poi['distance']
        })
    
    # 提取主干道信息（按距离排序）
    roads = sorted(result.get('roads', []), key=itemgetter('distance'))
    for idx, road in enumerate(roads[:3], 1):  # 保留前3条最近道路
        output.update({
            # f'Road_{idx}_id': road['id'],
            # f'Road_{idx}_name': road['name'],
            f'Road_{idx}_distance': road['distance']
        })
    
    # 提取基础地理信息
    addr = result['address']
    output.update({
        'province': addr['province'],
        'city': addr['city'][0] if addr['city'] else '',
        'district': addr['district'],
        'street': addr['street'],
        # 'nearest_poi_name': result['nearest_poi']['name'],
        # 'nearest_road_distance': result['nearest_road']['distance']
        # f'nearest_{poi_type}_distance':result
    })
    
    # 提取其他POI元数据
    output['poi_count'] = len(result.get('pois', []))
    output['road_count'] = len(result.get('roads', []))
    
    return output

# 使用示例
import os
import pandas as pd
from tqdm import tqdm  # 进度条支持，可选

def process_poi_batch(input_df: pd.DataFrame, poi_type: str, batch_size=100):
    """带批次处理的POI数据增强"""
    # 创建存储目录
    output_dir = f"poi_output/{poi_type}"
    os.makedirs(output_dir, exist_ok=True)
    
    # 初始化缓冲
    buffer = []
    batch_num = 0
    
    # 处理进度条
    for _, row in tqdm(input_df.iterrows(), total=len(input_df)):
        # 调用地理编码接口
        time.sleep(0.4)  # 限制请求频率
        result = processor.get_location_info(
            lng=row['gcj02_lng'],
            lat=row['gcj02_lat'],
            poitype=poi_type,
            radius=3000
        )
        
        # 合并数据
        merged = {
            **row.to_dict(),
            'poi_count': len(result.get('pois', [])),
            'road_count': len(result.get('roads', []))
        }
        
        # 添加POI距离
        for i in range(5):
            key = f'{poi_type}_{i+1}_distance'
            merged[key] = result['pois'][i]['distance'] if len(result['pois'])>i else None
        
        # 添加道路距离
        for i in range(3):
            key = f'Road_{i+1}_distance'
            merged[key] = result['roads'][i]['distance'] if len(result['roads'])>i else None
        
        buffer.append(merged)
        
        # 批次写入
        if len(buffer) >= batch_size:
            save_batch(buffer, batch_num, output_dir, poi_type)
            batch_num += 1
            buffer = []
    
    # 写入剩余数据
    if buffer:
        save_batch(buffer, batch_num, output_dir, poi_type)

def save_batch(data: list, batch_num: int, output_dir: str, poi_type: str):
    """保存批次文件"""
    df = pd.DataFrame(data)
    file_path = f"{output_dir}/{poi_type}_batch_{batch_num}.csv"
    df.to_csv(file_path, index=False)



In [30]:
df = pd.read_csv('小区地理位置POI预处理版本.csv')
amuse_list = ["银行", "餐饮服务", "风景名胜", "体育休闲服务"]
for amuse_name in amuse_list:
    process_poi_batch(df,poi_type=amuse_name)

  0%|          | 0/3602 [00:00<?, ?it/s]

100%|██████████| 3602/3602 [33:38<00:00,  1.78it/s]
100%|██████████| 3602/3602 [33:49<00:00,  1.77it/s]
100%|██████████| 3602/3602 [33:44<00:00,  1.78it/s]
100%|██████████| 3602/3602 [34:35<00:00,  1.74it/s]


```json
{
    "交通": ["地铁站", "公交站"],
    "教育": ["幼儿园", "小学", "中学", "大学"],
    "医疗": ["医院", "药店"],
    "购物": ["商场", "超市", "市场"],
    "生活": ["银行", "ATM", "餐厅", "咖啡馆"],
    "娱乐": ["公园", "电影院", "健身房", "体育馆"]
}


In [34]:
# process_poi_batch(df,r"便民商店")
process_poi_batch(df,"政府机关")

100%|██████████| 3602/3602 [33:49<00:00,  1.77it/s]


In [31]:
edu_list = ["幼儿园", "中学", "小学"]
medic_list = ["综合医院", "诊所"]
shopping_list = ["超级市场", "商场", "便民商店/便利店"]

request_list = edu_list + medic_list + shopping_list

for request in request_list:
    print(f"Processing {request}...")
    process_poi_batch(df,request)

Processing 幼儿园...


100%|██████████| 3602/3602 [34:02<00:00,  1.76it/s]


Processing 中学...


100%|██████████| 3602/3602 [33:30<00:00,  1.79it/s]


Processing 小学...


100%|██████████| 3602/3602 [33:41<00:00,  1.78it/s]


Processing 综合医院...


100%|██████████| 3602/3602 [33:40<00:00,  1.78it/s]


Processing 诊所...


100%|██████████| 3602/3602 [34:03<00:00,  1.76it/s]


Processing 超级市场...


100%|██████████| 3602/3602 [33:42<00:00,  1.78it/s]


Processing 商场...


100%|██████████| 3602/3602 [33:42<00:00,  1.78it/s]


Processing 便民商店/便利店...


  3%|▎         | 99/3602 [00:56<33:34,  1.74it/s]


OSError: Cannot save file into a non-existent directory: 'poi_output\便民商店\便利店\便民商店'

In [None]:
# poi_dict = {
#     "交通": ["加油站", "充电站"],
#     "教育": ["幼儿园", "中学", "小学"],
#     "医疗": ["综合医院", "诊所"],
#     "购物": ["超级市场", "商场", "便民商店/便利店"],
#     "生活娱乐": ["银行", "餐饮服务", "风景名胜", "体育休闲服务"]
# }
['中学', '体育休闲服务', '便民商店', '商场', '地铁站', '小学', '幼儿园', '政府机关', '综合医院', '诊所', '超级市场', '银行', '风景名胜', '餐饮服务']

# Combine the Dataset

In [15]:
total_df = pd.read_csv(r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\poi_output\综合医院\综合医院_batch_23.csv")
print(total_df.columns)

Index(['input_name', 'input_address', 'output_address', 'gcj02_lng',
       'gcj02_lat', 'poi_count', 'road_count', '综合医院_1_distance',
       '综合医院_2_distance', '综合医院_3_distance', '综合医院_4_distance',
       '综合医院_5_distance', 'Road_1_distance', 'Road_2_distance',
       'Road_3_distance'],
      dtype='object')


In [None]:
import pandas as pd,os
file_path = r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\poi_output"
sample_cols = Index(['input_name', 'input_address', 'output_address', 'gcj02_lng',
       'gcj02_lat', 'poi_count', 'road_count', '综合医院_1_distance',
       '综合医院_2_distance', '综合医院_3_distance', '综合医院_4_distance',
       '综合医院_5_distance', 'Road_1_distance', 'Road_2_distance',
       'Road_3_distance'],
      dtype='object')
common_cols = ['input_name', 'input_address', 'output_address', 'gcj02_lng',
       'gcj02_lat', 'poi_count', 'road_count',  'Road_1_distance', 'Road_2_distance','Road_3_distance']
# 获取所有POI类型文件夹名称（需确保 poi_output 下只有文件夹）
poi_categories = [item for item in os.listdir(file_path) 
                  if os.path.isdir(os.path.join(file_path, item))]
print(poi_categories)
# 遍历每个POI类型（如"中学"、"加油站"）
for poi_name in poi_categories:
    # 构建POI文件夹路径（例如 .../poi_output/中学）
    combined_df = pd.DataFrame()
    poi_folder = os.path.join(file_path, poi_name)
    
    # 遍历该POI文件夹内的所有CSV文件
    for file in os.listdir(poi_folder):
        if file.endswith(".csv") and file.startswith(f"{poi_name}_batch_"):
            full_path = os.path.join(poi_folder, file)
            df = pd.read_csv(full_path)
            combined_df = pd.concat([combined_df, df], axis=0, ignore_index=True)
            



['中学', '体育休闲服务', '便民商店', '商场', '地铁站', '小学', '幼儿园', '政府机关', '综合医院', '诊所', '超级市场', '银行', '风景名胜', '餐饮服务']


In [22]:
import pandas as pd
import os

file_path = r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\poi_output"

# 公共字段用于横向合并
merge_keys = ['input_name', 'input_address', 'output_address', 'gcj02_lng', 'gcj02_lat','road_count',  'Road_1_distance', 'Road_2_distance','Road_3_distance']

# 获取所有 POI 类别的文件夹
poi_categories = [item for item in os.listdir(file_path) if os.path.isdir(os.path.join(file_path, item))]
print(f"检测到 {len(poi_categories)} 个类别：", poi_categories)

# 初始化主表（用于存放全部合并结果）
final_df = None

for poi_name in poi_categories:
    poi_folder = os.path.join(file_path, poi_name)
    combined_df = pd.DataFrame()

    # 纵向合并该类别下所有批次文件
    for file in os.listdir(poi_folder):
        if file.endswith(".csv") and file.startswith(f"{poi_name}_batch_"):
            df = pd.read_csv(os.path.join(poi_folder, file), encoding='utf-8')
            combined_df = pd.concat([combined_df, df], ignore_index=True)
    
    # 提取要合并的列
    if not set(merge_keys).issubset(set(combined_df.columns)):
        print(f"警告：{poi_name} 缺失合并关键列，跳过")
        continue

    # 提取 distance 相关列
    distance_cols = [col for col in combined_df.columns if 'distance' in col and col not in merge_keys]
    poi_df = combined_df[merge_keys + distance_cols]

    # 避免列名冲突：加上 POI 类别前缀
    # rename_dict = {col: f"{poi_name}_{col}" for col in distance_cols}
    
    # poi_df.rename(columns=rename_dict, inplace=True)

    # 横向合并（左连接）
    if final_df is None:
        final_df = poi_df
    else:
        final_df = pd.merge(final_df, poi_df, on=merge_keys, how='outer')

# 输出
# output_path = os.path.join(file_path, 'final_combined_POI_distances.csv')
os.makedirs("output", exist_ok=True)
output_path = os.path.join("output", 'final_combined_POI_distances.csv')
final_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"\n合并成功，文件保存至：{output_path}")


检测到 14 个类别： ['中学', '体育休闲服务', '便民商店', '商场', '地铁站', '小学', '幼儿园', '政府机关', '综合医院', '诊所', '超级市场', '银行', '风景名胜', '餐饮服务']

合并成功，文件保存至：output\final_combined_POI_distances.csv


In [25]:
file_path = r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\xiaoqu_data"
xiaoqu_df = pd.DataFrame()
for file in os.listdir(file_path):
    # os.listdir(file_path)
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(file_path, file))
        xiaoqu_df = pd.concat([xiaoqu_df, df],axis = 0)
xiaoqu_out = os.path.join("output","xiaoqu_data.csv")
xiaoqu_df.to_csv(xiaoqu_out,index=False)

In [29]:
xiaoqu_list = pd.read_csv(r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\community_list.csv")

poi_df = pd.read_csv(r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\output\final_combined_POI_distances.csv")

xiaoqu_df = pd.read_csv(r"C:\Users\Fisher Man\OneDrive\Desktop\Work Sheet\Py\Some Projects\Urban_Econ\Eassay3\HousingPriceCrawler\output\xiaoqu_data.csv")

print(f" shape for xiaoqu_list: {xiaoqu_list.shape}\t shape for poi_df: {poi_df.shape}\t shape for xiaoqu_df: {xiaoqu_df.shape} \n cols for xiaoqu_list: {xiaoqu_list.columns}\t cols for poi_df: {poi_df.columns}\t cols for xiaoqu_df: {xiaoqu_df.columns}")

 shape for xiaoqu_list: (3853, 5)	 shape for poi_df: (3602, 79)	 shape for xiaoqu_df: (3753, 23) 
 cols for xiaoqu_list: Index(['district', 'community', 'link', 'district_CN', 'location'], dtype='object')	 cols for poi_df: Index(['input_name', 'input_address', 'output_address', 'gcj02_lng',
       'gcj02_lat', 'road_count', 'Road_1_distance', 'Road_2_distance',
       'Road_3_distance', '中学_1_distance', '中学_2_distance', '中学_3_distance',
       '中学_4_distance', '中学_5_distance', '体育休闲服务_1_distance',
       '体育休闲服务_2_distance', '体育休闲服务_3_distance', '体育休闲服务_4_distance',
       '体育休闲服务_5_distance', '便民商店_1_distance', '便民商店_2_distance',
       '便民商店_3_distance', '便民商店_4_distance', '便民商店_5_distance',
       '商场_1_distance', '商场_2_distance', '商场_3_distance', '商场_4_distance',
       '商场_5_distance', '地铁站_1_distance', '地铁站_2_distance', '地铁站_3_distance',
       '地铁站_4_distance', '地铁站_5_distance', '小学_1_distance', '小学_2_distance',
       '小学_3_distance', '小学_4_distance', '小学_5_distance', '幼儿园_1_di

In [36]:
merged_1 = xiaoqu_list.merge(poi_df, left_on='location', right_on='input_name', how='left')
final_merged = merged_1.merge(xiaoqu_df, left_on='community', right_on='小区名称', how='left')
final_merged_path = os.path.join("output","final_merged.csv")   
# xiaoqu_out = os.path.join("output","xiaoqu_data.csv")
final_merged.to_csv(final_merged_path, index=False,encoding='utf-8-sig')
print(f" shape of merged df :{merged_1.shape} shape of final merged dataframe: {final_merged.shape} \n ")

 shape of merged df :(3853, 84) shape of final merged dataframe: (3859, 107) 
 


In [30]:
print(xiaoqu_list.head())
print(xiaoqu_df.head())
print(poi_df.head())

   district  community                                             link  \
0  jinjiang      皇经楼一期     https://cd.lianjia.com/xiaoqu/3011052976570/   
1  jinjiang       上东家园     https://cd.lianjia.com/xiaoqu/3011053437765/   
2  jinjiang  皇经楼新居二期B区     https://cd.lianjia.com/xiaoqu/1611061607677/   
3  jinjiang  绿地中心468星朗  https://cd.lianjia.com/xiaoqu/1620024208685305/   
4  jinjiang   皇经楼二街68号     https://cd.lianjia.com/xiaoqu/3011052642927/   

  district_CN      location  
0         锦江区      锦江区皇经楼一期  
1         锦江区       锦江区上东家园  
2         锦江区  锦江区皇经楼新居二期B区  
3         锦江区  锦江区绿地中心468星朗  
4         锦江区   锦江区皇经楼二街68号  
        小区名称      参考均价  在售房源数  关注人数              小区ID          经度         纬度  \
0      皇经楼一期   9819元/㎡    NaN    92     3011052976570  104.114738  30.599153   
1       上东家园  11030元/㎡    NaN   451     3011053437765  104.114559  30.605445   
2  皇经楼新居二期B区  11347元/㎡    NaN   158     1611061607677  104.114660  30.596701   
3  绿地中心468星朗  11365元/㎡    NaN   140  162002420868