# 导入库

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [3]:
## 设置字符集，防止中文乱码
plt.rcParams['font.sans-serif']=['simHei']
plt.rcParams['axes.unicode_minus']=False

In [4]:
# jupyter展示图片，非内嵌显示; 这段代码不属于python代码； 相当于是jupyter的工具参数设置
# tk: 显示出来，inline：内嵌显示，默认为inline
%matplotlib inline

# 导入数据

In [5]:
dataset = pd.read_csv('./datas/data.csv',encoding='utf8')
dataset.head(1)

Unnamed: 0,label,from_163,from_tsinghua,from_126,from_yahoo,from_12,from_21,from_tom,from_cernet,from_sohu,...,20,21,22,23,24,25,26,27,28,29
0,1,0,0,0,0,0,0,0,0,0,...,-0.004307,-0.035336,-0.01433,0.006857,0.06397,-0.003622,0.033663,-0.011141,0.030228,-0.03324


# 模型选择

## 划分数据集

In [6]:
data = dataset.drop(columns=['label'])
ydata = dataset['label']

In [7]:
train_x,test_x,train_y,test_y = train_test_split(data, ydata, test_size=0.2, random_state=10)

In [8]:
print(train_x.shape,test_x.shape)
print(train_y.shape,test_y.shape)

(51696, 42) (12924, 42)
(51696,) (12924,)


## 导入库

In [9]:
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix # 精确率、召回率、f1、混淆矩阵

## KNN模型

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
%%time
knn = KNeighborsClassifier()
model = knn.fit(train_x,train_y)

Wall time: 1.74 s


In [12]:
y_hat = model.predict(test_x)

In [13]:
# 效果评估
print("精确率：%.4f"%precision_score(test_y,y_hat))
print("召回率：%.4f"%recall_score(test_y,y_hat))
print("f1值：%.4f"%f1_score(test_y,y_hat))
print('混淆矩阵：\n',confusion_matrix(test_y,y_hat))

精确率：0.9772
召回率：0.9857
f1值：0.9814
混淆矩阵：
 [[4155  197]
 [ 123 8449]]


## LR模型

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
%%time
lr = LogisticRegression(solver='liblinear')
model = lr.fit(train_x,train_y)

Wall time: 1.49 s


In [16]:
y_hat = model.predict(test_x)

In [17]:
# 效果评估
print("精确率：%.4f"%precision_score(test_y,y_hat))
print("召回率：%.4f"%recall_score(test_y,y_hat))
print("f1值：%.4f"%f1_score(test_y,y_hat))
print('混淆矩阵：\n',confusion_matrix(test_y,y_hat))

精确率：0.9715
召回率：0.9851
f1值：0.9782
混淆矩阵：
 [[4104  248]
 [ 128 8444]]


## bytes模型

In [18]:
from sklearn.naive_bayes import BernoulliNB

In [19]:
%%time
nb = BernoulliNB()
model = nb.fit(train_x,train_y)

Wall time: 311 ms


In [20]:
y_hat = model.predict(test_x)

In [21]:
# 效果评估
print("精确率：%.4f"%precision_score(test_y,y_hat))
print("召回率：%.4f"%recall_score(test_y,y_hat))
print("f1值：%.4f"%f1_score(test_y,y_hat))
print('混淆矩阵：\n',confusion_matrix(test_y,y_hat))

精确率：0.9592
召回率：0.9871
f1值：0.9729
混淆矩阵：
 [[3992  360]
 [ 111 8461]]


## SVM模型

In [22]:
from sklearn.svm import SVC

In [23]:
%%time
svc = SVC(gamma='scale')
model = svc.fit(train_x,train_y)

Wall time: 59.5 s


In [24]:
y_hat = model.predict(test_x)

In [25]:
# 效果评估
print("精确率：%.4f"%precision_score(test_y,y_hat))
print("召回率：%.4f"%recall_score(test_y,y_hat))
print("f1值：%.4f"%f1_score(test_y,y_hat))
print('混淆矩阵：\n',confusion_matrix(test_y,y_hat))

精确率：0.9602
召回率：0.9907
f1值：0.9752
混淆矩阵：
 [[4000  352]
 [  80 8492]]


# 模型调参

# 模型保存

In [28]:
import joblib
import os