In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

info_user = pd.read_csv('test.csv', encoding='gbk')

# 提取info表的用户名和用餐时间，并按人名对用餐人数和金额进行分组求和
info_user1 = info_user['USER_ID'].value_counts()  # 按USER_ID统计出现的次数
info_user1 = info_user1.reset_index() # 将分组结果重置索引
info_user1.columns = ['USER_ID', 'frequence']  # 重命名列为USER_ID和frenqunce

# 求出每个人的消费总金额
info_user2 = info_user[['number_consumers', "expenditure"]].groupby(info_user['USER_ID']).sum()  # 按USER_ID分组，计算number_consumers和expenditure的总和
info_user2 = info_user2.reset_index()
info_user2.columns = ['USER_ID', 'numbers', 'amount']  # 重命名列

# 合并用户的用餐频次和总消费信息
info_user_new = pd.merge(info_user1, info_user2, left_on='USER_ID', right_on='USER_ID', how='left') 

# 对合并后的数据进行处理
info_user = info_user.iloc[:, :4]  # 选择前四列
info_user = info_user.groupby(['USER_ID']).last()  # 按USER_ID分组，选择每组的最后一个
info_user = info_user.reset_index()  # 重置索引

# 合并用餐频次、总消费信息和最近一次用餐信息
info_user_new = pd.merge(info_user_new, info_user, left_on='USER_ID', right_on='USER_ID', how='left')

print('合并后表中的空值数目：', info_user_new.isnull().sum().sum())
info_user_new = info_user_new.dropna(axis=0)  # 删除包含空值的行
info_user_new = info_user_new[info_user_new['numbers'] != 0] # 过滤掉用餐总人数为0的行
print(info_user_new.head())

# 计算每个用户的平均消费金额
info_user_new['average'] = info_user_new['amount']/info_user_new['numbers']
info_user_new['average'] = info_user_new['average'].apply(lambda x: '%.2f' % x)

# 计算每个客户最近一次点餐的时间距离观测窗口结束的天数
# 修改时间列，改为日期
info_user_new['LAST_VISITS'] = pd.to_datetime(info_user_new['LAST_VISITS'])
datefinally = pd.to_datetime('2016-7-31')  # 观测窗口结束时间
time = datefinally - info_user_new['LAST_VISITS']
info_user_new['recently'] = time.apply(lambda x: x.days)   # 计算时间差

# 选择需要输出的列并保存为新的csv文件
info_user_new = info_user_new.loc[:, ['USER_ID', 'ACCOUNT', 'frequence', 'amount', 'average', 'recently', 'type']]
info_user_new.to_csv('test-after.csv', index=False, encoding='gbk')
print(info_user_new.head())

合并后表中的空值数目： 9
   USER_ID  frequence  numbers   amount ACCOUNT      LAST_VISITS type
0     2361         41    237.0  34784.0     薛浩天  2016/7/30 13:29  非流失
1     3768         33    207.0  32699.0     易之赫  2016/7/28 12:24  非流失
2     3762         33    208.0  30394.0     许智蕴  2016/7/27 13:41  非流失
3     2147         32    192.0  27088.0     萧郁丁  2016/7/25 12:34  非流失
4     1131         25    116.0  18910.0      夏晴  2016/7/25 11:35  非流失
   USER_ID ACCOUNT  frequence   amount average  recently type
0     2361     薛浩天         41  34784.0  146.77         0  非流失
1     3768     易之赫         33  32699.0  157.97         2  非流失
2     3762     许智蕴         33  30394.0  146.12         3  非流失
3     2147     萧郁丁         32  27088.0  141.08         5  非流失
4     1131      夏晴         25  18910.0  163.02         5  非流失


In [80]:
# 数据预处理和特征构造
def preprocess_data(data):
    data['LAST_VISITS'] = pd.to_datetime(data['LAST_VISITS'])
    datefinally = pd.to_datetime('2016-7-31')
    data['recently'] = (datefinally - data['LAST_VISITS']).dt.days
    
    user_counts = data['USER_ID'].value_counts().reset_index()
    user_counts.columns = ['USER_ID', 'frequence']
    
    user_sums = data[['USER_ID', 'number_consumers', 'expenditure']].groupby('USER_ID').sum().reset_index()
    user_sums.columns = ['USER_ID', 'numbers', 'amount']
    
    data_new = pd.merge(user_counts, user_sums, on='USER_ID', how='left')
    data_new = pd.merge(data_new, data[['USER_ID', 'ACCOUNT', 'type', 'recently']].drop_duplicates(), on='USER_ID', how='left')
    
    data_new = data_new.dropna(axis=0)
    data_new = data_new[data_new['numbers'] != 0]
    
    data_new['average'] = data_new['amount'] / data_new['numbers']
    data_new['average'] = data_new['average'].apply(lambda x: float('%.2f' % x))
    
    return data_new

In [81]:
# 读取训练数据和测试数据
train_data = pd.read_csv('train.csv', encoding='gbk')
test_data = pd.read_csv('test.csv', encoding='gbk')

# 预处理训练数据和测试数据
train_data_new = preprocess_data(train_data)
test_data_new = preprocess_data(test_data)

# 准备训练数据和标签
X_train = train_data_new[['frequence', 'recently', 'average', 'amount']]
y_train = train_data_new['type']

# 准备测试数据和标签
X_test = test_data_new[['frequence', 'recently', 'average', 'amount']]
y_test = test_data_new['type']

# 数据标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [91]:
# 2、使用决策树模型做预测
from sklearn.tree import DecisionTreeClassifier

# 参数网格
param_grid_tree = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
}

# 决策树模型
tree = DecisionTreeClassifier()

# 网格搜索
grid_search_tree = GridSearchCV(estimator=tree, param_grid=param_grid_tree, cv=10)
grid_search_tree.fit(X_train_scaled, y_train)

# 最佳参数和准确率
best_params_tree = grid_search_tree.best_params_
best_score_tree = grid_search_tree.best_score_

print(f'Decision Tree - Best Parameters: {best_params_tree}')
print(f'Decision Tree - Best CV Accuracy: {best_score_tree}')

# 使用最佳参数进行预测
best_tree = grid_search_tree.best_estimator_
y_pred_train_best_tree = best_tree.predict(X_train_scaled)
y_pred_test_best_tree = best_tree.predict(X_test_scaled)

# 计算准确率
train_accuracy_best_tree = accuracy_score(y_train, y_pred_train_best_tree)
test_accuracy_best_tree = accuracy_score(y_test, y_pred_test_best_tree)

print(f'Decision Tree with Best Params - Train Accuracy: {train_accuracy_best_tree}, Test Accuracy: {test_accuracy_best_tree}')

Decision Tree - Best Parameters: {'max_depth': 40, 'min_samples_leaf': 4, 'min_samples_split': 2}
Decision Tree - Best CV Accuracy: 0.9105074462217321
Decision Tree with Best Params - Train Accuracy: 0.9749661705006766, Test Accuracy: 0.8986175115207373


In [83]:
# 3、使用KNN模型做预测
from sklearn.neighbors import KNeighborsClassifier

# 参数网格
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# KNN模型
knn = KNeighborsClassifier()

# 网格搜索
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=10)
grid_search_knn.fit(X_train_scaled, y_train)

# 最佳参数和准确率
best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_

print(f'KNN - Best Parameters: {best_params_knn}')
print(f'KNN - Best CV Accuracy: {best_score_knn}')

# 使用最佳参数进行预测
best_knn = grid_search_knn.best_estimator_
y_pred_train_best_knn = best_knn.predict(X_train_scaled)
y_pred_test_best_knn = best_knn.predict(X_test_scaled)

# 计算准确率
train_accuracy_best_knn = accuracy_score(y_train, y_pred_train_best_knn)
test_accuracy_best_knn = accuracy_score(y_test, y_pred_test_best_knn)

print(f'KNN with Best Params - Train Accuracy: {train_accuracy_best_knn}, Test Accuracy: {test_accuracy_best_knn}')

KNN - Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}
KNN - Best CV Accuracy: 0.9044493473064902
KNN with Best Params - Train Accuracy: 0.9546684709066305, Test Accuracy: 0.9055299539170507


In [84]:
# 4、使用支持向量机做预测

In [85]:
#（1）比较4种核函数的分类准确率
# 分别使用4种核函数（线性核、多项式核、高斯核和Sigmoid核）对该数据集进行分类，最后比较4种核函数的分类准确率
# 输出所有参数组合的准确率

# 核函数比较
def evaluate_kernel(kernel_type):
    svc = SVC(kernel=kernel_type, decision_function_shape='ovo')
    svc.fit(X_train, y_train)
    y_pred_train = svc.predict(X_train)
    y_pred_test = svc.predict(X_test)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    return train_accuracy, test_accuracy

kernels = ['linear', 'rbf', 'poly', 'sigmoid']

for kernel in kernels:
    train_acc, test_acc = evaluate_kernel(kernel)
    print(f'Kernel: {kernel} - Train Accuracy: {train_acc}, Test Accuracy: {test_acc}')

Kernel: linear - Train Accuracy: 0.9519621109607578, Test Accuracy: 0.923963133640553
Kernel: rbf - Train Accuracy: 0.922192151556157, Test Accuracy: 0.8963133640552995
Kernel: poly - Train Accuracy: 0.9147496617050067, Test Accuracy: 0.8870967741935484
Kernel: sigmoid - Train Accuracy: 0.7706359945872802, Test Accuracy: 0.7396313364055299


In [86]:
#（2）数据标准化对支持向量机分类准确率的影响
# 不进行标准化的数据集
svc = SVC(kernel='rbf', decision_function_shape='ovo')
svc.fit(X_train, y_train)
y_pred_train = svc.predict(X_train)
y_pred_test = svc.predict(X_test)
train_accuracy_no_scaling = accuracy_score(y_train, y_pred_train)
test_accuracy_no_scaling = accuracy_score(y_test, y_pred_test)

print(f'Without scaling - Train Accuracy: {train_accuracy_no_scaling}, Test Accuracy: {test_accuracy_no_scaling}')

# 进行标准化的数据集
svc.fit(X_train_scaled, y_train)
y_pred_train = svc.predict(X_train_scaled)
y_pred_test = svc.predict(X_test_scaled)
train_accuracy_scaling = accuracy_score(y_train, y_pred_train)
test_accuracy_scaling = accuracy_score(y_test, y_pred_test)

print(f'With scaling - Train Accuracy: {train_accuracy_scaling}, Test Accuracy: {test_accuracy_scaling}')

Without scaling - Train Accuracy: 0.922192151556157, Test Accuracy: 0.8963133640552995
With scaling - Train Accuracy: 0.9512855209742895, Test Accuracy: 0.9400921658986175


In [87]:
#（3）高斯核函数、多项式核函数的参数调节

In [88]:
#（3.1）高斯核函数的参数调节
# 高斯核函数只有一个参数γ的值可调节。

# 高斯核函数参数调节
param_grid_rbf = {
    'gamma': [0.01, 0.1, 1]
}

svc_rbf = SVC(kernel='rbf', decision_function_shape='ovo')
grid_search_rbf = GridSearchCV(estimator=svc_rbf, param_grid=param_grid_rbf, cv=10)
grid_search_rbf.fit(X_train_scaled, y_train)

print("RBF Kernel - Best Parameters:", grid_search_rbf.best_params_)
print("RBF Kernel - Best Accuracy:", grid_search_rbf.best_score_)

RBF Kernel - Best Parameters: {'gamma': 0.01}
RBF Kernel - Best Accuracy: 0.9057915057915059


In [89]:
#（3.2）多项式核函数参数的调节
# 对于多项式核函数来说，它有三个参数共同作用在一个公式上影响其分类准确率，因此只能使用网格搜索法来功能调整三个对多项式函数有影响的参数
param_grid_poly = {
    'gamma': [0.01, 0.1, 1],
    'degree': [2, 3, 4]
}

svc_poly = SVC(kernel='poly', decision_function_shape='ovo')
grid_search_poly = GridSearchCV(estimator=svc_poly, param_grid=param_grid_poly, cv=10)
grid_search_poly.fit(X_train_scaled, y_train)

print("Polynomial Kernel - Best Parameters:", grid_search_poly.best_params_)
print("Polynomial Kernel - Best Accuracy:", grid_search_poly.best_score_)

Polynomial Kernel - Best Parameters: {'degree': 3, 'gamma': 1}
Polynomial Kernel - Best Accuracy: 0.8660047802904944


In [90]:
#（4） 松弛系数惩罚项C的调整
#在实际应用种，松弛系数惩罚项C和核函数的相关参数（gamma,degree等）往往搭配一起调整，这是SVM调参的重点。可以使用学习曲线或者网格搜索来调整C值。

param_grid_C = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': [0.01, 0.1, 1],
    'degree': [2, 3, 4]  # 仅用于多项式核函数
}

svc = SVC(decision_function_shape='ovo')
grid_search_C = GridSearchCV(estimator=svc, param_grid=param_grid_C, cv=10)
grid_search_C.fit(X_train_scaled, y_train)

print("Best Parameters with C tuning:", grid_search_C.best_params_)
print("Best Accuracy with C tuning:", grid_search_C.best_score_)

Best Parameters with C tuning: {'C': 100, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
Best Accuracy with C tuning: 0.9281853281853282
