# knn实战

In [37]:
import pandas as pd

#读取数据
df=pd.read_csv('data/FBlocation/train.csv')

#数据筛选
# filtered_df=df[(df['x']>1.0)&(df['x']<1.5)&(df['y']>2.5)&(df['y']<2.75)]
filtered_df=df.query('x>1.0 & x<1.5 & y>2.5 & y<2.75')

#time列以分钟为单位,转换为时间格式(假定从时间原起点)
filtered_df.loc[:,'datetime']=pd.to_datetime(filtered_df.loc[:,'time'],unit='m')
print(filtered_df.shape)
filtered_df.head()


(35189, 7)


Unnamed: 0,row_id,x,y,accuracy,time,place_id,datetime
600,600,1.2214,2.7023,17,65380,6683426742,1970-02-15 09:40:00
863,863,1.3828,2.6444,64,245591,5784939944,1970-06-20 13:11:00
957,957,1.1832,2.6891,58,785470,6683426742,1971-06-30 11:10:00
1693,1693,1.2583,2.6839,72,281783,3952821602,1970-07-15 16:23:00
4345,4345,1.1935,2.655,11,400082,6889790653,1970-10-05 20:02:00


In [38]:
from sklearn.model_selection import train_test_split
# 基于 datetime 提取 weekday/hour/day 并进行特征清理与数据集划分

# 复制 filtered_df，避免后续赋值产生 SettingWithCopyWarning
filtered_df = filtered_df.copy()

# 1) 从 datetime 提取星期几(0=周一..6=周日)、小时、天数(每月的日号)
filtered_df['weekday'] = filtered_df['datetime'].dt.weekday
filtered_df['hour'] = filtered_df['datetime'].dt.hour
filtered_df['day'] = filtered_df['datetime'].dt.day

# 只保留出现次数不少于3次的 place_id
valid_ids = filtered_df['place_id'].value_counts()
valid_ids = valid_ids[valid_ids >= 3].index
filtered_df = filtered_df[filtered_df['place_id'].isin(valid_ids)]

# 2) 移除不参与建模的列：row_id、time、datetime
features_df = filtered_df.drop(columns=['row_id','time','datetime'])

# 3) 构造特征矩阵 X 与标签 y（place_id 为分类目标）
X = features_df.drop(columns=['place_id'])
y = features_df['place_id']



In [39]:
#划分为训练集和测试集,加random_state=42是为了保证每次运行结果一致

# 4) 划分训练集与测试集，按 y 分层保证类别分布一致
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# 5) 进行优化, 标准化特征
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 查看切分后的数据形状，并预览处理后的特征
print('X_train:', X_train.shape, 'X_test:', X_test.shape, 'y_train:', y_train.shape, 'y_test:', y_test.shape)
features_df.head()


X_train: (27470, 6) X_test: (6868, 6) y_train: (27470,) y_test: (6868,)


Unnamed: 0,x,y,accuracy,place_id,weekday,hour,day
600,1.2214,2.7023,17,6683426742,6,9,15
863,1.3828,2.6444,64,5784939944,5,13,20
957,1.1832,2.6891,58,6683426742,2,11,30
1693,1.2583,2.6839,72,3952821602,2,16,15
4345,1.1935,2.655,11,6889790653,0,20,5


In [40]:
label=filtered_df['place_id']
len(label.unique())

512

In [41]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# 1. 使用 knn k=5
# 初始化KNN分类器，设置邻居数为5
knn = KNeighborsClassifier(n_neighbors=5)

# 2. 用训练集训练
# 使用训练集的特征和标签进行模型拟合
knn.fit(X_train_scaled, y_train)

# 3. 做出预测
# 使用训练好的模型对测试集进行预测
y_pred = knn.predict(X_test_scaled)

# 4. 计算正确率
# 比较预测结果和真实标签，计算准确率
accuracy = accuracy_score(y_test, y_pred)
print("KNN准确率:", accuracy)


KNN准确率: 0.45661036691904483


In [42]:
# 查看预测与实际值的对比
print("预测值:", y_pred[:10])
print("实际值:", y_test[:10].values)

预测值: [3333445626 4667077219 6424972551 7803770431 2327054745 6829001048
 4607594424 1059958036 3753336835 6875724035]
实际值: [1097200869 4667077219 2460093296 1999597424 6603539415 8048985799
 5496364827 8277918449 4223691526 7707808405]


# 网格搜索

交叉验证: 用于评估模型在不同数据集上的性能, 防止过拟合.
> 原理: 将数据集分为k个折叠, 每次使用k-1个折叠训练模型, 用剩余的1个折叠验证模型性能. 重复k次, 取平均性能作为模型的评估指标.

In [43]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# 1. 实例化knn分类器
# 这里的 n_neighbors 将通过网格搜索来确定，不需要预先设定
knn = KNeighborsClassifier()

# 2. 实例化网格搜索器
# 定义参数网格，这里我们搜索 k 值在 1 到 11 之间
param_grid = {'n_neighbors': [3,5,7,9,11]}

# 实例化 GridSearchCV，使用 5 折交叉验证
grid_search = GridSearchCV(knn, param_grid, cv=5,n_jobs=-1,verbose=1)#n_jobs=-1表示使用所有CPU核心, verbose=1表示显示搜索进度

# 3. 在训练集上进行拟合
grid_search.fit(X_train_scaled, y_train)

# 4. 输出最佳参数和最佳得分
print("最佳参数:", grid_search.best_params_)
print("最佳得分:", grid_search.best_score_)

# 5. 使用最佳模型进行预测和评估
best_knn = grid_search.best_estimator_
y_pred_grid = best_knn.predict(X_test_scaled)
print("网格搜索后KNN准确率:", accuracy_score(y_test, y_pred_grid))
print("最佳超参数仅在测试集上,还需在训练集上评估")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
最佳参数: {'n_neighbors': 11}
最佳得分: 0.4540953767746633
网格搜索后KNN准确率: 0.47058823529411764
最佳超参数仅在测试集上,还需在训练集上评估


In [44]:
print("最佳超参数为:",grid_search.best_params_)
# 使用最佳参数初始化knn
best_knn=KNeighborsClassifier(**grid_search.best_params_)# 初始化KNN分类器
#用训练集训练模型
best_knn.fit(X_train, y_train)

#在测试集上评估模型
best_y_pred=best_knn.predict(X_test)
best_acc=accuracy_score(y_test, best_y_pred)
print(f"使用最优参数KNN测试集 最佳准确率为:{best_acc:.4f}")

最佳超参数为: {'n_neighbors': 11}
使用最优参数KNN测试集 最佳准确率为:0.0767
