### 解压文件

In [1]:
import os
import gzip
import numpy as np
from scapy.all import *
import csv
import pandas as pd

In [9]:
# 解析后文件目标文件夹
target_file = '/Users/lfg/Downloads/恶意1pcap'
# 解析文件来源文件夹
source_file = '/Users/lfg/Downloads/恶意1'

In [7]:
def un_gz(file_name,target_file):
    names = file_name.split('/')
    f_name = os.path.join(target_file,names[5]+'.pcap')
    g_file = gzip.GzipFile(file_name)
    open(f_name,'wb').write(g_file.read())
    g_file.close()

In [10]:
for root,ds,files in os.walk(source_file):
    for f in files:
        path = os.path.join(root,f)
        if path.endswith('dump.gz'):
            un_gz(path,target_file)
print('finish')

finish


### 解析pcap文件并保存为csv文件

In [66]:
def analyse_pcap(pkts,fname):
    df = pd.DataFrame(columns=['srcIp','distIp','srcPort','distPort','time','length','TCP'])
    index = 0
    for pkt in pkts:
        m = []
        m.append(pkt['IP'].src)
        m.append(pkt['IP'].dst)
        m.append(pkt['IP'].sport)
        m.append(pkt['IP'].dport)
        m.append(pkt.time)
        m.append(pkt['IP'].len)
        try:
            if pkt['TCP']:
                m.append(1)
        except:
            m.append(0)
        df.loc[index] = m
        index += 1
    df.to_csv('/Users/lfg/Downloads/德宾流量/' + fname + '.csv')
    return df

In [4]:
###################################
# 示例：可以将pcap文件解析为下面这种格式
###################################

pkts = rdpcap('/Users/lfg/Downloads/恶意pcap/004f18054d3aca49bcb1faf6e515ac81.pcap')
analyse_pcap(pkts,'111')

Unnamed: 0,srcIp,distIp,srcPort,distPort,time,length,TCP
0,10.0.2.15,10.0.2.3,43304,53,1362773854.268346,58,0
1,10.0.2.3,10.0.2.15,53,43304,1362773855.561569,128,0
2,10.0.2.15,61.147.115.37,51794,80,1362773855.573770,60,1
3,61.147.115.37,10.0.2.15,80,51794,1362773856.007992,44,1
4,10.0.2.15,61.147.115.37,51794,80,1362773856.009029,40,1
...,...,...,...,...,...,...,...
1092,10.0.2.15,61.147.115.37,39383,80,1362773958.891449,40,1
1093,61.147.115.37,10.0.2.15,80,39383,1362773963.663704,41,1
1094,10.0.2.15,61.147.115.37,39383,80,1362773963.664607,40,1
1095,61.147.115.37,10.0.2.15,80,39383,1362773968.673496,41,1


In [68]:
# i用来控制一次解析的文件数量
i = 0
for root,dirs,files in os.walk(target):
    r = False
    for file in files:
        try:
            path = os.path.join(root,file)
            md5 = path.split('/')[-1].split('.')[0]
            pkts = rdpcap(path)
            analyse_pcap(pkts,md5)
            i += 1
            if i == 700:
                f = True
                break
        except:
            pass
    if r:
        break

### 提取特征

#### Average Packet Size

In [8]:
def Average_Packet_Size(df):
    try:
        return df['length'].sum() / df['length'].count()
    except:
        return np.nan

In [18]:
Average_Packet_Size(df)

400.85225505443236

#### Ratio of Incoming to Outgoing Bytes

In [9]:
def Ratio_of_Incoming_to_Outgoing_Bytes(df,Ip):
    try:
        srcs = df[df['srcIp'] == Ip]
        dists = df[df['distIp'] == Ip]
        return srcs['length'].sum() / dists['length'].sum()
    except:
        return np.nan

In [14]:
Ratio_of_Incoming_to_Outgoing_Bytes(df,'10.0.2.15')

0.2386966551326413

#### Average Number of Bytes Received per Second

In [10]:
def Average_Number_of_Bytes_Received_per_Second(df,distIp):
    try:
        group = df[df['distIp'] == distIp]
        group = group.sort_values(by = 'time',ascending = True,ignore_index = True)
        t = group.loc[len(group) - 1,'time'] - group.loc[0,'time']
        res = group['length'].sum() / t
        return res
    except:
        return np.nan

In [16]:
Average_Number_of_Bytes_Received_per_Second(df,'10.0.2.15')

896.6701897991172

#### Average Number of Packets Sent per Flow

In [11]:
def Average_Number_of_Packets_Sent_per_Flow(df,srcIp):
    try:
        group = df[df['srcIp'] == srcIp]
        count = 0

        for sPort in list(group['srcPort'].unique()):
            g = group[group['srcPort'] == sPort]
            for dIp in list(g['distIp'].unique()):
                d = g[g['distIp'] == dIp]

                count += len(list(d['distPort'].unique()))
        res = group['srcIp'].count() / count
        return res
    except:
        return np.nan

In [20]:
Average_Number_of_Packets_Sent_per_Flow(df,'10.0.2.15')

8.525

#### Average Number of Packets Received per Flow

In [12]:
def Average_Number_of_Packets_Received_per_Flow(df,distIp):
    try:
        group = df[df['distIp'] == distIp]
        count = 0
        l = len(list(group['distPort'].unique()))
        for dPort in list(group['distPort'].unique()):
            g = group[group['distPort'] == dPort]
            for sIp in list(g['srcIp'].unique()):
                d = g[g['srcIp'] == sIp]

                count += len(list(d['srcPort'].unique()))
        res = group['distIp'].count() / count
        return res
    except:
        return np.nan

In [22]:
Average_Number_of_Packets_Received_per_Flow(df,'10.0.2.15')

7.55

#### Average Number of Bytes Sent per Flow

In [13]:
def Average_Number_of_Packets_Sent_per_Flow(df,srcIp):
    try:
        group = df[df['srcIp'] == srcIp]
        count = 0
        l = len(list(group['srcPort'].unique()))
        for sPort in list(group['srcPort'].unique()):
            g = group[group['srcPort'] == sPort]
            for dIp in list(g['distIp'].unique()):
                d = g[g['distIp'] == dIp]

                count += len(list(d['distPort'].unique()))
        res = group['length'].sum() / count
        return res
    except:
        return np.nan

In [28]:
Average_Number_of_Packets_Sent_per_Flow(df,'10.0.2.15')

1241.7

#### Average Number of Bytes Received per Flow

In [14]:
def Average_Number_of_Bytes_Received_per_Flow(df,distIp):
    try:
        group = df[df['distIp'] == distIp]
        count = 0
        l = len(list(group['distPort'].unique()))
        for dPort in list(group['distPort'].unique()):
            g = group[group['distPort'] == dPort]
            for sIp in list(g['srcIp'].unique()):
                d = g[g['srcIp'] == sIp]

                count += len(list(d['srcPort'].unique()))
        res = group['length'].sum() / count
        return res
    except:
        return np.nan

In [31]:
Average_Number_of_Bytes_Received_per_Flow(df,'10.0.2.15')

5202.0

### 7个特征合并

In [65]:
column = ['Average_Packet_Size','Ratio_of_Incoming_to_Outgoing_Bytes','Average_Number_of_Bytes_Received_per_Second','Average_Number_of_Packets_Sent_per_Flow','Average_Number_of_Packets_Received_per_Flow','Average_Number_of_Bytes_Sent_per_Flow','Average_Number_of_Bytes_Received_per_Flow']

In [69]:
###################################################
# 分别从恶意和非恶意解析后的csv文件夹中读取数据，形成特征集
###################################################

# 控制数据量
index = 0
trait = pd.DataFrame(columns = column)
for root,dirs,files in os.walk(target_file):
    if index > 11000:
        break
    for file in files:
        path = os.path.join(root,file)
        c = pd.read_csv(path)
        if index > 11000:
            break
        try:
            for srcIp in list(c['srcIp'].unique()):

                mid = []
                mid.append(Average_Packet_Size(c))
                mid.append(Ratio_of_Incoming_to_Outgoing_Bytes(c,srcIp))
                mid.append(Average_Number_of_Bytes_Received_per_Second(c,srcIp))
                mid.append(Average_Number_of_Packets_Sent_per_Flow(c,srcIp))
                mid.append(Average_Number_of_Packets_Received_per_Flow(c,srcIp))
                mid.append(Average_Number_of_Packets_Sent_per_Flow(c,srcIp))
                mid.append(Average_Number_of_Bytes_Received_per_Flow(c,srcIp))
                
                trait.loc[index] = mid
                index += 1
                if index > 11000:
                    break
        except:
            pass

  """
  if sys.path[0] == '':


In [52]:
trait = pd.concat([trait_b,trait_m])

In [53]:
trait

Unnamed: 0,Average_Packet_Size,Ratio_of_Incoming_to_Outgoing_Bytes,Average_Number_of_Bytes_Received_per_Second,Average_Number_of_Packets_Sent_per_Flow,Average_Number_of_Packets_Received_per_Flow,Average_Number_of_Bytes_Sent_per_Flow,Average_Number_of_Bytes_Received_per_Flow,label
0,499.167702,0.104490,606.796107,691.181818,7.272727,691.181818,6614.818182,0
1,499.167702,4.230159,2.157807,266.500000,1.000000,266.500000,63.000000,0
2,499.167702,13.395373,100.927322,12739.000000,15.000000,12739.000000,951.000000,0
3,499.167702,10.028467,45.782289,10427.600000,10.200000,10427.600000,1039.800000,0
4,499.167702,5.678601,123.393282,6820.000000,11.000000,6820.000000,1201.000000,0
...,...,...,...,...,...,...,...,...
3636,465.863568,0.425721,808.756066,384.000000,6.000000,384.000000,902.000000,1
3637,465.863568,26.574747,85.564317,21642.916667,15.500000,21642.916667,814.416667,1
3638,610.942308,0.054580,792.477975,913.444444,15.000000,913.444444,16736.000000,1
3639,610.942308,2.780822,306.537937,203.000000,1.000000,203.000000,73.000000,1


In [54]:
trait.columns

Index(['Average_Packet_Size', 'Ratio_of_Incoming_to_Outgoing_Bytes',
       'Average_Number_of_Bytes_Received_per_Second',
       'Average_Number_of_Packets_Sent_per_Flow',
       'Average_Number_of_Packets_Received_per_Flow',
       'Average_Number_of_Bytes_Sent_per_Flow',
       'Average_Number_of_Bytes_Received_per_Flow', 'label'],
      dtype='object')

In [72]:
# 去除因为除0而产生的inf数据
trait_d[np.isinf(trait_d)] = np.nan
trait_d = trait_d.dropna(how = 'any')
trait_d

Unnamed: 0,Average_Packet_Size,Ratio_of_Incoming_to_Outgoing_Bytes,Average_Number_of_Bytes_Received_per_Second,Average_Number_of_Packets_Sent_per_Flow,Average_Number_of_Packets_Received_per_Flow,Average_Number_of_Bytes_Sent_per_Flow,Average_Number_of_Bytes_Received_per_Flow
0,69.625000,1.897269,67.766846,364.750000,4.200000,364.750000,307.600000
1,69.625000,0.829971,30.578858,576.000000,8.000000,576.000000,694.000000
2,69.625000,0.511111,485.381785,92.000000,4.000000,92.000000,180.000000
3,69.625000,0.860619,1314.569004,389.000000,5.500000,389.000000,452.000000
6,87.116883,2.552966,184.099967,321.333333,3.666667,321.333333,629.333333
...,...,...,...,...,...,...,...
248,538.000000,16.646648,70.305836,35757.000000,31.000000,35757.000000,2148.000000
249,121.555556,0.838655,619.862610,499.000000,4.000000,499.000000,595.000000
250,121.555556,1.192385,271.758089,595.000000,5.000000,595.000000,499.000000
251,101.833333,2.916667,750.573186,455.000000,3.000000,455.000000,156.000000


In [40]:
# 归一化
for c in column:
    zuixiao = trait[c].min()
    zuida = trait[c].max()
    trait[c] = (trait[c] - zuixiao) / (zuida - zuixiao)
# trait.to_csv('/Users/lfg/Downloads/良性11.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### 加入机器学习算法

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [51]:
trait

Unnamed: 0,Average_Packet_Size,Ratio_of_Incoming_to_Outgoing_Bytes,Average_Number_of_Bytes_Received_per_Second,Average_Number_of_Packets_Sent_per_Flow,Average_Number_of_Packets_Received_per_Flow,Average_Number_of_Bytes_Sent_per_Flow,Average_Number_of_Bytes_Received_per_Flow,label
118,0.257048,0.064696,1.164660e-05,0.003206,0.014758,0.003206,0.003744,1
1984,0.316869,0.136906,2.449566e-06,0.002047,0.009160,0.002047,0.001019,1
3230,0.241437,0.074754,4.965482e-07,0.002164,0.010687,0.002164,0.002126,0
944,0.165241,0.053810,7.131088e-06,0.002424,0.013740,0.002424,0.003393,1
1428,0.580530,0.358690,9.378684e-07,0.013827,0.027176,0.013827,0.002867,1
...,...,...,...,...,...,...,...,...
3330,0.154967,0.011596,3.645258e-07,0.000189,0.007634,0.000189,0.001246,0
2474,0.532721,0.008152,2.944196e-06,0.000325,0.009160,0.000325,0.003095,1
2117,0.654250,0.137320,5.557285e-06,0.011735,0.020420,0.011735,0.006556,0
4086,0.154671,0.041058,4.392771e-05,0.000821,0.008588,0.000821,0.001440,0


In [57]:
trait = trait.sample(frac = 1.0)

In [58]:
Y = trait['label']
X = trait.drop('label',axis=1)

In [59]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.1,random_state=12)

In [60]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,Y_train)
y_predict = dtc.predict(X_test)

In [61]:
dtc.score(X_test,Y_test)


0.7023370233702337