## test

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings  
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline

In [None]:
# Load the raw data set; the path is relative to the notebook's working directory.
csv_path = '../input/security.csv'
df = pd.read_csv(csv_path)
# Show the column labels of the loaded frame.
df.columns

检查数据集中是否存在缺失值，并分析部分特征之间的相关性。

In [None]:
# Draw the missing-value mask as a heat map to check whether the data set is complete.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='jet', cbar=False, yticklabels=False)

In [None]:
# Preview the first 10 rows of the frame.
df.head(n=10)

In [None]:
# Print the frame summary produced by DataFrame.info().
df.info()

In [None]:
# Count the number of distinct values in each column.
df.nunique()

In [None]:
# Distinct values found in the WHOIS_COUNTRY column.
pd.unique(df['WHOIS_COUNTRY'])

In [None]:
# Distinct values found in the WHOIS_STATE_CITY column.
pd.unique(df['WHOIS_STATE_CITY'])

In [None]:
# Distinct values found in the WHOIS_REG_YEAR column.
pd.unique(df['WHOIS_REG_YEAR'])

In [None]:
# Number of records per WHOIS registration year, split by the TIPO label.
year_fig, year_ax = plt.subplots(figsize=(16, 8))
sns.countplot(x='WHOIS_REG_YEAR', hue='TIPO', data=df, ax=year_ax)

In [None]:
# Number of records per WHOIS country, split by the TIPO label.
country_fig, country_ax = plt.subplots(figsize=(16, 8))
sns.countplot(x='WHOIS_COUNTRY', hue='TIPO', data=df, ax=country_ax)

In [None]:
# List the column labels again before selecting the columns to encode.
df.columns

In [None]:
# Preview the first five rows (5 is the default row count of head()).
df.head()

In [None]:
# Number of distinct values in each of the string columns.
string_columns = ['URL', 'DOMAIN_NAME', 'CHARSET', 'SERVER',
                  'CACHE_CONTROL', 'WHOIS_COUNTRY', 'WHOIS_STATE_CITY']
df[string_columns].nunique()

In [None]:
# Encode the label: benign ("Benigna") becomes 0, anything else (malicious) becomes 1.
def target(tipo):
    """Encode a TIPO label as a binary class.

    Args:
        tipo: A raw label (e.g. "Benigna") or an already-encoded value.

    Returns:
        0 for "Benigna" or for a value that is already the encoded 0,
        and 1 for every other value.
    """
    # Accept the encoded 0 as well: without this, applying the function a second
    # time (e.g. when the cell is re-run) turns every benign row into 1,
    # because 0 != "Benigna".
    if tipo == "Benigna" or tipo == 0:
        return 0
    return 1
# Replace the TIPO labels with their numeric encoding, then preview the first 20 values.
df['TIPO'] = df['TIPO'].map(target)
df['TIPO'].head(n=20)

In [None]:
# Correlation heat map; the author's reading of it is that no column shows an
# obvious correlation with the target.
# Select the numeric/bool columns explicitly: recent pandas releases no longer drop
# string columns silently inside DataFrame.corr() and raise on them instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()
sns.heatmap(corr, cmap="Greens")

In [None]:
# One-hot encode the string columns so they can be used by the classifiers.
# NOTE(review): URL and DOMAIN_NAME look like per-row identifiers; if they are
# (near-)unique, their dummy columns add roughly one feature per row and carry no
# generalisable signal — confirm against their nunique() counts.
categorical_columns = ['URL', 'DOMAIN_NAME', 'CHARSET', 'SERVER',
                       'CACHE_CONTROL', 'WHOIS_COUNTRY', 'WHOIS_STATE_CITY']
moddf = pd.get_dummies(df, columns=categorical_columns)
moddf.columns

现在，数据集已经过处理，并基于字符串列的取值创建了新的特征，因此可以尝试使用机器学习（ML）模型进行分类。

In [None]:
# Helper used to split the moddf data set into training and test parts.
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its
# replacement. The old module is only tried on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Feature matrix: every column of the encoded frame except the TIPO target.
X = moddf.drop(columns=['TIPO'])
X.head()

In [None]:
# Target vector: the TIPO column of the encoded frame.
y = moddf.loc[:, 'TIPO']
y.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
raw_split = train_test_split(X, y, test_size=0.3, random_state=100)
X_train, X_test, y_train, y_test = raw_split

## 决策树

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Seed the tree: scikit-learn permutes the candidate features at every split, so an
# unseeded tree can differ from run to run and the reported scores would not be
# reproducible. The seed value matches the one used for the train/test split.
tree = DecisionTreeClassifier(random_state=100)
train_t = tree.fit(X_train, y_train)  # fit() returns the fitted estimator itself
pred_t = tree.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy on the test split.
print(classification_report(y_test, pred_t))
print('\n')
print(accuracy_score(y_test, pred_t))

## logistic回归

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier created with scikit-learn's default settings.
logr = LogisticRegression()

In [None]:
# Fit the logistic regression model on the training split.
logr.fit(X_train, y_train)

In [None]:
# Predict the class of every row in the test split.
pred = logr.predict(X_test)

In [None]:
# Import the metrics helpers used to evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Report the logistic regression results: per-class metrics, then overall accuracy.
logr_report = classification_report(y_test, pred)
logr_accuracy = accuracy_score(y_test, pred)
print(logr_report)
print('\n')
print(logr_accuracy)

## kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features with StandardScaler.
# The scaler is fitted on the training rows only, so that test-set statistics do not
# leak into the features the models are trained on. The row split below uses the same
# test_size/random_state as the train_test_split calls in this notebook; the shuffle
# depends only on the number of rows and the seed, so it selects the same rows.
scaler = StandardScaler()
sclr_train = X
train_rows, test_rows = train_test_split(
    np.arange(len(sclr_train)), test_size=0.30, random_state=100)
scaler.fit(sclr_train.iloc[train_rows])
# Transform every row (train and test) with the statistics learned from the training rows.
scaled_features = scaler.transform(sclr_train)

In [None]:
# Wrap the scaled array in a DataFrame carrying the original column names, for inspection.
df_features = pd.DataFrame(data=scaled_features, columns=sclr_train.columns)
df_features.head(n=5)

In [None]:
# Split the scaled features. test_size and random_state match the earlier split of
# the unscaled data.
# NOTE: this rebinds X_train/X_test/y_train/y_test for every cell that follows.
scaled_split = train_test_split(scaled_features, y, test_size=0.30, random_state=100)
X_train, X_test, y_train, y_test = scaled_split

In [None]:
# Test-split error rate of kNN for every k from 1 to 14.
error_rate = []

for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    pred_i = knn.fit(X_train, y_train).predict(X_test)
    # Fraction of test rows whose prediction differs from the true label.
    mismatches = pred_i != y_test
    error_rate.append(np.mean(mismatches))

In [None]:
# Plot the error rate against K.
err_fig, err_ax = plt.subplots(figsize=(10, 4))
err_ax.plot(range(1, 15), error_rate,
            color='b', linestyle='-', marker='*',
            markerfacecolor='red', markersize=10)
err_ax.set_xlabel('K')
err_ax.set_ylabel('Error')

In [None]:
# Use the k with the lowest measured error rate instead of hard-coding the value
# read off the plot (the author read n_neighbors=1 from it).
# error_rate[0] belongs to k=1, hence the +1; ties resolve to the smallest k.
# NOTE(review): k is chosen on the test split, so the kNN scores reported afterwards
# are optimistic; a separate validation split or cross-validation would avoid that.
best_k = int(np.argmin(error_rate)) + 1
knn = KNeighborsClassifier(n_neighbors=best_k)

In [None]:
# Fit the final kNN model on the training split.
knn.fit(X_train, y_train)

In [None]:
# Predict the class of every row in the test split.
knnpred = knn.predict(X_test)

In [None]:
# Evaluate kNN: per-class metrics, then overall accuracy.
knn_report = classification_report(y_test, knnpred)
knn_accuracy = accuracy_score(y_test, knnpred)
print(knn_report)
print('\n')
print(knn_accuracy)

## xgboost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with default hyper-parameters, trained on the current split.
xgb = XGBClassifier()
xgb_pred = xgb.fit(X_train, y_train).predict(X_test)

# Per-class metrics followed by the overall accuracy on the test split.
xgb_report = classification_report(y_test, xgb_pred)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
print(xgb_report)
print('\n')
print(xgb_accuracy)

## 结果对比

In [None]:
def show_model_report(title, y_true, y_pred, cmap):
    """Print the metrics of one classifier and draw its confusion matrix.

    Output order: title, classification report, confusion-matrix figure,
    accuracy score, with blank-line separators between and after them.

    Args:
        title: Heading printed before the report.
        y_true: True labels of the test rows.
        y_pred: Labels predicted for the same rows.
        cmap: Matplotlib colormap used to shade the confusion matrix.
    """
    print(title)
    print('\n')
    print(classification_report(y_true, y_pred))
    print('\n')

    confmat = confusion_matrix(y_true=y_true, y_pred=y_pred)
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=cmap, alpha=0.3)
    # Write each count into its cell: rows are true labels, columns are predictions.
    for (row, col), count in np.ndenumerate(confmat):
        ax.text(x=col, y=row, s=count, va='center', ha='center')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.tight_layout()
    plt.show()

    print(accuracy_score(y_true, y_pred))
    print('\n')
    print('\n')


# One entry per model: heading, test-set predictions, confusion-matrix colormap.
# pred_t and pred were produced before the features were re-split; both
# train_test_split calls in this notebook use the same test_size and random_state
# on the same number of rows, so the current y_test lines up with them as well.
model_predictions = [
    ("DecisionTree:", pred_t, plt.cm.Oranges),
    ("Logistic Regression:", pred, plt.cm.Blues),
    ("kNN:", knnpred, plt.cm.Reds),
    ("xgboost:", xgb_pred, plt.cm.Greens),
]

for title, predictions, cmap in model_predictions:
    show_model_report(title, y_test, predictions, cmap)

## 指标说明
TP – True Positives（真正例）；  
FP – False Positives（假正例）；  
Precision = TP/(TP + FP)；  
FN – False Negatives（假反例）；  
Recall = TP/(TP + FN)；  