In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# 加载数据
data_cleaned = pd.read_csv('./iBite_table_processed.csv')

# 特征和目标变量
# 直接使用经纬度作为特征
X = data_cleaned[['latitude', 'longitude']]

# 结合其他特征
X = pd.concat([X, data_cleaned[['head.w', 'head.h', 'head.l', 'th.w', 'body.l', 'wing.l']]], axis=1)
X

Unnamed: 0,latitude,longitude,head.w,head.h,head.l,th.w,body.l,wing.l
0,-1.235132,1.425063,-0.262986,-0.142677,-0.766951,-0.430514,0.043552,0.660228
1,-1.235132,1.425063,-0.262986,-0.142677,-0.766951,-0.430514,0.043552,0.660228
2,-1.235132,1.425063,-0.127646,-0.076605,-0.734680,-0.290133,0.348621,0.762769
3,-1.402920,1.462618,-0.184905,0.016674,-0.721772,-0.094451,-0.048041,0.687047
4,0.977303,-0.279847,-0.247370,-0.088265,-0.979938,0.224597,-0.319934,-1.155538
...,...,...,...,...,...,...,...,...
1350,0.979260,-0.279480,-1.210370,-1.157081,-0.534602,-1.213245,-0.751934,-0.413298
1351,0.979260,-0.279480,-1.184343,-1.133761,-0.528148,-1.208991,-0.769243,-0.442483
1352,0.979260,-0.279480,-1.231191,-1.125988,-0.534602,-1.196229,-0.810351,-0.458258
1353,0.979260,-0.279480,-1.278040,-1.129874,-0.612052,-1.145181,-0.824775,-0.516628


In [2]:
#使用Haversine公式计算地理距离
import numpy as np
from math import radians, sin, cos, sqrt, atan2

# Haversine公式计算球面距离
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # 地球平均半径，单位为千米
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

# 计算与参考点的距离（可以自定义参考点，如城市中心）
reference_point = (51.5074, -0.1278)  # 例如伦敦的经纬度
data_cleaned['distance_to_reference'] = data_cleaned.apply(
    lambda row: haversine(row['latitude'], row['longitude'], reference_point[0], reference_point[1]), axis=1)

# 将计算出的距离特征加入特征集
X = pd.concat([data_cleaned[['distance_to_reference']], data_cleaned[['head.w', 'head.h', 'body.l']]], axis=1)
X

Unnamed: 0,distance_to_reference,head.w,head.h,body.l
0,5866.531055,-0.262986,-0.142677,0.043552
1,5866.531055,-0.262986,-0.142677,0.043552
2,5866.531055,-0.127646,-0.076605,0.348621
3,5885.273403,-0.184905,0.016674,-0.048041
4,5618.708464,-0.247370,-0.088265,-0.319934
...,...,...,...,...
1350,5618.490812,-1.210370,-1.157081,-0.751934
1351,5618.490812,-1.184343,-1.133761,-0.769243
1352,5618.490812,-1.231191,-1.125988,-0.810351
1353,5618.490812,-1.278040,-1.129874,-0.824775


In [3]:
#克里金插值
import numpy as np
import geopandas as gpd
from pykrige.ok import OrdinaryKriging

# 创建GeoDataFrame
gdf = gpd.GeoDataFrame(data_cleaned, geometry=gpd.points_from_xy(data_cleaned.longitude, data_cleaned.latitude))

# 选择插值的特征
latitude = gdf['latitude'].values
longitude = gdf['longitude'].values
iBite = gdf['iBite'].values

# 创建Kriging插值模型
OK = OrdinaryKriging(longitude, latitude, iBite, variogram_model='linear')
gridx = np.linspace(min(longitude), max(longitude), 100)
gridy = np.linspace(min(latitude), max(latitude), 100)
z, ss = OK.execute('grid', gridx, gridy)

# 插值结果可以用作特征加入模型
data_cleaned['kriging_iBite'] = z.ravel()[:len(data_cleaned)]  # 将插值结果映射回原数据集

# 将克里金插值结果作为特征
X = pd.concat([data_cleaned[['kriging_iBite']], data_cleaned[['head.w', 'body.l']]], axis=1)
X

Unnamed: 0,kriging_iBite,head.w,body.l
0,1374.371956,-0.262986,0.043552
1,1374.371956,-0.262986,0.043552
2,1374.371956,-0.127646,0.348621
3,1374.371956,-0.184905,-0.048041
4,1374.371956,-0.247370,-0.319934
...,...,...,...
1350,1374.371956,-1.210370,-0.751934
1351,1374.371956,-1.184343,-0.769243
1352,1374.371956,-1.231191,-0.810351
1353,1374.371956,-1.278040,-0.824775


In [4]:
from sklearn.cluster import DBSCAN
import numpy as np

# 使用DBSCAN进行空间聚类
coords = data_cleaned[['latitude', 'longitude']].values
db = DBSCAN(eps=0.1, min_samples=5, metric='haversine').fit(np.radians(coords))

# 将聚类标签加入数据集
data_cleaned['cluster'] = db.labels_

# 使用聚类结果作为特征
X = pd.concat([data_cleaned[['cluster']], data_cleaned[['head.w', 'body.l']]], axis=1)
X

Unnamed: 0,cluster,head.w,body.l
0,0,-0.262986,0.043552
1,0,-0.262986,0.043552
2,0,-0.127646,0.348621
3,0,-0.184905,-0.048041
4,0,-0.247370,-0.319934
...,...,...,...
1350,0,-1.210370,-0.751934
1351,0,-1.184343,-0.769243
1352,0,-1.231191,-0.810351
1353,0,-1.278040,-0.824775


In [5]:
#将经纬度数据转换为栅格网格编码
import h3

# 定义转换函数
def latlon_to_h3(lat, lon, resolution=8):
    return h3.geo_to_h3(lat, lon, resolution)

# 将经纬度转换为H3网格编码
data_cleaned['h3_index'] = data_cleaned.apply(lambda row: latlon_to_h3(row['latitude'], row['longitude']), axis=1)

# 将H3编码作为特征
X = pd.concat([data_cleaned[['h3_index']], data_cleaned[['head.w', 'body.l']]], axis=1)
X

ModuleNotFoundError: No module named 'h3'