In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

fb_train = pd.read_csv('./FBlocation/train.csv')
# 将时间戳转换为年月日格式
fb_train['time'] = pd.to_datetime(fb_train['time'], unit='s')
# 提取星期、小时、分钟作为新特征
fb_train['weekday'] = fb_train['time'].dt.weekday  # 星期几(0-6)
fb_train['hour'] = fb_train['time'].dt.hour  # 小时(0-23)
fb_train['minute'] = fb_train['time'].dt.minute  # 分钟(0-59)
fb_train


Unnamed: 0,row_id,x,y,accuracy,time,place_id,weekday,hour,minute
0,0,0.7941,9.0809,54,1970-01-06 10:45:02,8523065625,1,10,45
1,1,5.9567,4.7968,13,1970-01-03 03:49:15,1757726713,5,3,49
2,2,8.3078,7.0407,74,1970-01-04 17:37:28,1137537235,6,17,37
3,3,7.3665,2.5165,65,1970-01-09 03:43:07,6567393236,4,3,43
4,4,4.0961,1.1307,31,1970-01-06 11:08:50,7440663949,1,11,8
...,...,...,...,...,...,...,...,...,...
29118016,29118016,6.5133,1.1435,67,1970-01-05 15:02:20,8671361106,0,15,2
29118017,29118017,5.9186,4.4134,67,1970-01-02 10:51:20,9077887898,4,10,51
29118018,29118018,2.9993,6.3680,67,1970-01-09 12:55:58,2838334300,4,12,55
29118019,29118019,4.0637,8.0061,70,1970-01-09 20:29:35,1007355847,4,20,29


In [3]:
# 显示数据集基本信息
print("数据集基本信息:")
print(fb_train.info())

print("\n数据集统计描述:")
print(fb_train.describe())

print("\n数据集前5行:")
print(fb_train.head())

print("\n数据集形状:")
print(f"行数: {fb_train.shape[0]}, 列数: {fb_train.shape[1]}")

print("\n检查缺失值:")
print(fb_train.isnull().sum())


数据集基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29118021 entries, 0 to 29118020
Data columns (total 9 columns):
 #   Column    Dtype         
---  ------    -----         
 0   row_id    int64         
 1   x         float64       
 2   y         float64       
 3   accuracy  int64         
 4   time      datetime64[ns]
 5   place_id  int64         
 6   weekday   int32         
 7   hour      int32         
 8   minute    int32         
dtypes: datetime64[ns](1), float64(2), int32(3), int64(3)
memory usage: 1.6 GB
None

数据集统计描述:
             row_id             x             y      accuracy  \
count  2.911802e+07  2.911802e+07  2.911802e+07  2.911802e+07   
mean   1.455901e+07  4.999770e+00  5.001814e+00  8.284912e+01   
min    0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00   
25%    7.279505e+06  2.534700e+00  2.496700e+00  2.700000e+01   
50%    1.455901e+07  5.009100e+00  4.988300e+00  6.200000e+01   
75%    2.183852e+07  7.461400e+00  7.510300e+00  7.500000e+01  

In [4]:
# 检查每列的缺失值数量和百分比
missing_values = fb_train.isnull().sum()
missing_percentage = (missing_values / len(fb_train)) * 100

missing_info = pd.DataFrame({
    '缺失值数量': missing_values,
    '缺失值百分比': missing_percentage
})

print("各列缺失值统计:")
print(missing_info)

# 检查是否存在完全重复的行
duplicates = fb_train.duplicated().sum()
print(f"\n完全重复的行数: {duplicates}")
print(f"重复行占总行数的百分比: {(duplicates/len(fb_train))*100:.2f}%")


各列缺失值统计:
          缺失值数量  缺失值百分比
row_id        0     0.0
x             0     0.0
y             0     0.0
accuracy      0     0.0
time          0     0.0
place_id      0     0.0
weekday       0     0.0
hour          0     0.0
minute        0     0.0

完全重复的行数: 0
重复行占总行数的百分比: 0.00%


In [5]:
# 统计每个place_id出现的频次
place_counts = fb_train['place_id'].value_counts()

print("每个place_id出现的次数统计:")
print(place_counts.head())

print(f"\n不同place_id的总数: {len(place_counts)}")
print(f"平均每个place_id出现的次数: {place_counts.mean():.2f}")
print(f"最多出现次数: {place_counts.max()}")
print(f"最少出现次数: {place_counts.min()}")


每个place_id出现的次数统计:
place_id
8772469670    1849
1623394281    1802
1308450003    1757
4823777529    1738
9586338177    1718
Name: count, dtype: int64

不同place_id的总数: 108390
平均每个place_id出现的次数: 268.64
最多出现次数: 1849
最少出现次数: 1


In [13]:
# 根据x和y特征列的数值筛选数据
filtered_fb_train=fb_train.query("x > 1.0 &  x < 1.25 & y > 2.5 & y < 2.75")

print(f"筛选后的样本数量: {filtered_fb_train.shape[0]}")
print(f"筛选前的样本数量: {fb_train.shape[0]}")
print(f"筛选比例: {filtered_fb_train.shape[0]/fb_train.shape[0]:.2%}")


# 获取出现次数大于3的place_id
frequent_places = place_counts[place_counts > 3].index

# 筛选数据
filtered_data = filtered_fb_train[filtered_fb_train['place_id'].isin(frequent_places)]

print(f"筛选前的样本数量: {filtered_fb_train.shape[0]}")
print(f"筛选后的样本数量: {filtered_data.shape[0]}")
print(f"筛选比例: {filtered_data.shape[0]/filtered_fb_train.shape[0]:.2%}")

# 统计筛选后不同place_id的数量
print(f"\n筛选后不同place_id的数量: {filtered_data['place_id'].nunique()}")
print(f"筛选前不同place_id的数量: {filtered_fb_train['place_id'].nunique()}")


筛选后的样本数量: 17707
筛选前的样本数量: 29116952
筛选比例: 0.06%
筛选前的样本数量: 17707
筛选后的样本数量: 17707
筛选比例: 100.00%

筛选后不同place_id的数量: 802
筛选前不同place_id的数量: 802


In [14]:
# 提取特征列
X = filtered_data[['x', 'y', 'accuracy', 'weekday', 'hour', 'minute']]

# 提取标签列
y = filtered_data['place_id']

print("特征数据形状:", X.shape)
print("标签数据形状:", y.shape)

# 查看前几行数据
print("\n特征数据前5行:")
print(X.head())
print("\n标签数据前5行:")
print(y.head())


特征数据形状: (17707, 6)
标签数据形状: (17707,)

特征数据前5行:
           x       y  accuracy  weekday  hour  minute
600   1.2214  2.7023        17        3    18       9
957   1.1832  2.6891        58        5     2      11
4345  1.1935  2.6550        11        0    15       8
4735  1.1452  2.6074        49        1    23       3
5580  1.0089  2.7287        19        4    11      26

标签数据前5行:
600     6683426742
957     6683426742
4345    6889790653
4735    6822359752
5580    1527921905
Name: place_id, dtype: int64


In [18]:
# 使用sklearn的train_test_split函数划分数据集
from sklearn.model_selection import train_test_split

# 划分训练集和验证集,测试集比例为20%,随机种子设置为42保证可复现性
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用StandardScaler进行特征标准化
from sklearn.preprocessing import StandardScaler

# 创建StandardScaler对象
scaler = StandardScaler()

# 使用训练集数据拟合标准化器并转换训练集
X_train_scaled = scaler.fit_transform(X_train)

# 使用已经拟合的标准化器转换测试集
X_test_scaled = scaler.transform(X_test)

print("标准化后训练集形状:", X_train_scaled.shape)
print("标准化后测试集形状:", X_test_scaled.shape)

# 查看标准化后的数据
print("\n标准化后训练集前5行:")
print(X_train_scaled[:5])

# 打印训练集和测试集的大小
print("\n数据集划分结果:")
print(f"训练集样本数: {X_train.shape[0]}")
print(f"测试集样本数: {X_test.shape[0]}")
print(f"训练集特征数: {X_train.shape[1]}")
print(f"测试集特征数: {X_test.shape[1]}")

# 计算并打印训练集和测试集的比例
train_ratio = X_train.shape[0] / X.shape[0] * 100
test_ratio = X_test.shape[0] / X.shape[0] * 100
print(f"\n训练集占比: {train_ratio:.2f}%")
print(f"测试集占比: {test_ratio:.2f}%")





标准化后训练集形状: (14165, 6)
标准化后测试集形状: (3542, 6)

标准化后训练集前5行:
[[ 1.96320394e-03 -1.23144112e+00 -7.35842462e-02  5.41864566e-01
   1.51169070e+00 -3.29036562e-01]
 [ 9.84166181e-01 -1.87844820e+00 -1.97100793e-01  5.41864566e-01
   1.22365181e+00  9.99354062e-01]
 [ 1.65408563e+00  1.30374111e+00 -1.00052078e-01 -5.19395171e-02
  -1.65673704e+00 -1.31089050e+00]
 [-1.12925474e+00  1.75406484e-01 -7.17634811e-01 -1.23954768e+00
   2.15515716e-01  8.83841834e-01]
 [-9.11563312e-01 -1.83702832e+00  3.75542870e+00  5.41864566e-01
   5.03554602e-01  7.10573492e-01]]

数据集划分结果:
训练集样本数: 14165
测试集样本数: 3542
训练集特征数: 6
测试集特征数: 6

训练集占比: 80.00%
测试集占比: 20.00%


In [20]:
# 导入所需的库
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# 定义参数网格
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # K值范围
    'weights': ['uniform', 'distance'], # 权重选项
    'metric': ['euclidean', 'manhattan']  # 距离度量方式
}

# 创建KNN分类器
knn = KNeighborsClassifier()

# 创建网格搜索对象,使用3折交叉验证
grid_search = GridSearchCV(
    knn, 
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# 在训练集上进行网格搜索
print("开始网格搜索...")
grid_search.fit(X_train_scaled, y_train)

# 输出最佳参数和得分
print("\n最佳参数:", grid_search.best_params_)
print("最佳交叉验证得分: {:.4f}".format(grid_search.best_score_))

# 使用最佳模型在测试集上进行预测
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test_scaled, y_test)
print("\n测试集得分: {:.4f}".format(test_score))

# 显示所有参数组合的结果
print("\n所有参数组合的详细结果:")
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results[['params', 'mean_test_score', 'std_test_score']]
print(cv_results.sort_values(by='mean_test_score', ascending=False).head())


开始网格搜索...
Fitting 3 folds for each of 20 candidates, totalling 60 fits





最佳参数: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
最佳交叉验证得分: 0.4582

测试集得分: 0.4712

所有参数组合的详细结果:
                                               params  mean_test_score  \
19  {'metric': 'manhattan', 'n_neighbors': 11, 'we...         0.458172   
17  {'metric': 'manhattan', 'n_neighbors': 9, 'wei...         0.453794   
15  {'metric': 'manhattan', 'n_neighbors': 7, 'wei...         0.448994   
13  {'metric': 'manhattan', 'n_neighbors': 5, 'wei...         0.443629   
18  {'metric': 'manhattan', 'n_neighbors': 11, 'we...         0.436710   

    std_test_score  
19        0.000677  
17        0.001382  
15        0.001863  
13        0.002905  
18        0.001783  
