In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from user_agents import parse

In [2]:
# Load the per-class training CSVs and stack them into one DataFrame
train_files = [
    './data/train/SQL注入.csv',
    './data/train/XSS跨站脚本.csv',
    './data/train/命令执行.csv',
    './data/train/白.csv',
    './data/train/目录遍历.csv',
    './data/train/远程代码执行.csv',
]
train = pd.concat(
    [pd.read_csv(path) for path in train_files],
    axis=0,
).reset_index(drop=True)
# Cast every column to str so all fields are handled as text downstream
train = train.astype(str)

In [3]:
# Load the test set as a DataFrame with every column cast to str
test = pd.read_csv('./data/test/test.csv').astype(str)

In [4]:
# Stack the training rows on top of the test rows and renumber the index from 0
data = pd.concat([train, test], ignore_index=True)

In [5]:
# Missing-value handling: replace every missing entry with the literal string 'NaN'
data = data.fillna(value='NaN')

In [6]:
# User-agent parsing: names of the columns derived from the parsed user agent
agent_cols = [
    'browser_family',
    'os_family',
    'device_family',
    'device_brand',
    'device_model',
]


def get_ua(a):
    """Parse the 'user_agent' field of a row into five descriptive strings.

    Parameters
    ----------
    a : mapping-like (e.g. a DataFrame row)
        Must provide a 'user_agent' entry holding the raw user-agent string.

    Returns
    -------
    tuple of str
        (browser family, OS family, device family, device brand, device model),
        each converted with ``str``.
    """
    ua = parse(a['user_agent'])
    device = ua.device
    parts = (
        ua.browser.family,
        ua.os.family,
        device.family,
        device.brand,
        device.model,
    )
    return tuple(str(part) for part in parts)


# Run get_ua on every row and spread the returned 5-tuple across the agent columns
data[agent_cols] = data.apply(
    get_ua,
    axis=1,
    result_type="expand",
)

In [7]:
# TF-IDF vectorisation of the raw user-agent strings, reduced with truncated SVD
n_components = 16
texts = data['user_agent'].tolist()

# Character n-grams of length 2-5 ('char_wb' analyzer), with document-frequency cut-offs
tf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 5),
    min_df=3,
    max_df=0.5,
)
X = tf.fit_transform(texts)  # sparse TF-IDF matrix

# Dimensionality reduction: project the TF-IDF matrix onto n_components SVD components
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_svd = svd.fit_transform(X)

df_tfidf = pd.DataFrame(
    X_svd,
    columns=[f'user_agent_name_tfidf_{i}' for i in range(n_components)],
)

In [8]:
# Append the TF-IDF/SVD feature columns to data (column-wise concatenation)
data = pd.concat([data, df_tfidf], axis='columns')
# Columns that will be integer-encoded: raw request fields plus the parsed agent fields
cate_cols = ['method', 'user_agent', 'url', 'refer', 'body', *agent_cols]

In [9]:
# Replace each categorical / text column with integer codes
for col in cate_cols:
    lbl = LabelEncoder()
    # Learn the distinct values of the column and map each one to an integer code
    data[col] = lbl.fit_transform(data[col])

In [10]:
# Split the combined frame back into its train and test parts.
# The row count is read once, before `train` is rebound, and explicit copies are
# taken so that later column assignments (e.g. train['lable'] = ...) write to
# independent frames instead of slices of `data`, which is what triggers
# pandas' SettingWithCopyWarning.
n_train = len(train)
train, test = data.iloc[:n_train].copy(), data.iloc[n_train:].copy()

In [11]:
# Feature selection: keep every column except the id, the label and the url
feature_names = [
    col for col in train.columns
    if col not in ('id', 'lable', 'url')
]

In [12]:
# Convert the label column to int, then display the distinct label values
train['lable'] = train['lable'].apply(int)
train['lable'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['lable']=train['lable'].apply(lambda i:int(i))


array([1, 5, 4, 0, 2, 3], dtype=int64)

In [13]:
# Stratified k-fold cross-validation with a LightGBM multiclass model
def lgb_model(train, target, test, k):
    """Train one LightGBM multiclass model per stratified fold.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features. Columns named 'lable' or 'url' are ignored.
    target : pandas.Series
        Class labels, positionally aligned with ``train``.
    test : pandas.DataFrame
        Rows to predict; must contain every feature column used from ``train``.
    k : int
        Number of stratified folds.

    Returns
    -------
    output_preds : numpy.ndarray
        Class probabilities for ``test``, averaged over the ``k`` fold models.
    oof_probs : numpy.ndarray
        Out-of-fold class probabilities for ``train``, one row per training
        row and one column per class.
    """
    feats = [f for f in train.columns if f not in ['lable', 'url']]

    print('Current num of features:', len(feats))

    # LightGBM parameters
    parameters = {
        'learning_rate': 0.05,       # shrinkage rate
        'boosting_type': 'gbdt',     # classic gradient-boosted decision trees
        'objective': 'multiclass',   # softmax objective
        'metric': 'multi_error',     # multiclass error rate
        'num_class': 6,              # number of classes
        'num_leaves': 31,            # leaves per tree
        'feature_fraction': 0.6,     # fraction of features sampled per tree
        # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
        # takes effect when bagging_freq is non-zero; bagging_freq is not set
        # here, so row subsampling is presumably inactive - confirm intent.
        'bagging_fraction': 0.8,     # fraction of rows sampled
        'min_data_in_leaf': 15,      # minimum rows per leaf (limits overfitting)
        'verbose': -1,               # silence LightGBM's own logging
        'nthread': 4,                # number of threads
        'max_depth': 7               # maximum tree depth
    }

    # The OOF matrix width follows the configured class count instead of a
    # second hard-coded 6.
    oof_probs = np.zeros((train.shape[0], parameters['num_class']))
    output_preds = 0
    score = []

    # Shuffled stratified folds with a fixed seed so the split is reproducible
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2022)

    train_feats = train[feats]
    test_feats = test[feats]

    for train_index, test_index in folds.split(train, target):
        train_X = train_feats.iloc[train_index, :]
        train_Y = target.iloc[train_index]
        val_X = train_feats.iloc[test_index, :]
        val_Y = target.iloc[test_index]
        dtrain = lgb.Dataset(train_X, label=train_Y)
        dval = lgb.Dataset(val_X, label=val_Y)

        # Train with early stopping on the held-out fold. The result is named
        # `booster` so it does not shadow this function's own name.
        booster = lgb.train(
            parameters,
            dtrain,
            num_boost_round=10000,
            valid_sets=[dval],
            callbacks=[early_stopping(100), log_evaluation(100)]
        )

        best_iter = booster.best_iteration
        oof_probs[test_index] = booster.predict(val_X, num_iteration=best_iter)
        score.append(booster.best_score['valid_0']['multi_error'])
        # Accumulate the fold's share of the averaged test prediction
        output_preds += booster.predict(test_feats, num_iteration=best_iter) / folds.n_splits
        # Running list of per-fold validation errors (the original printed an
        # undefined name here, which raised NameError after the first fold).
        print(score)

    return output_preds, oof_probs

In [14]:
# Run 10-fold training; returns averaged test probabilities and out-of-fold probabilities
print('开始训练模型BEGIN')
lgb_preds, lgb_oof = lgb_model(
    train=train[feature_names],
    target=train['lable'],
    test=test[feature_names],
    k=10,
)
print('训练模型结束END')

开始训练模型BEGIN
Current num of features: 25
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_error: 0.105057
[200]	valid_0's multi_error: 0.0972306
[300]	valid_0's multi_error: 0.0978326
Early stopping, best iteration is:
[234]	valid_0's multi_error: 0.0960265
[0.09602649006622517]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_error: 0.10897
[200]	valid_0's multi_error: 0.105057
[300]	valid_0's multi_error: 0.102348
[400]	valid_0's multi_error: 0.103552
Early stopping, best iteration is:
[340]	valid_0's multi_error: 0.101746
[0.09602649006622517, 0.10174593618302227]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_error: 0.10295
[200]	valid_0's multi_error: 0.0972306
Early stopping, best iteration is:
[197]	valid_0's multi_error: 0.0972306
[0.09602649006622517, 0.10174593618302227, 0.09723058398555087]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's mul

In [15]:
# Load the submission template
sub = pd.read_csv('data/submit_example.csv')

In [16]:
# Predicted label = index of the highest-probability class in each row
sub['predict'] = np.argmax(lgb_preds, axis=1)

In [17]:
# Save the submission file without writing the index column
sub.to_csv('data/sub.csv', index=False)

In [18]:
# Show the accuracy of the out-of-fold predictions against the training labels
accuracy_score(
    train['lable'],
    np.argmax(lgb_oof, axis=1),
)

0.9068304283693067