# Class 2 Lab Solution

### A nice Python function to help read data

We still need our helper function `read_traffic`.

In [1]:
import subprocess
import datetime
import pandas as pd

def read_traffic(filename, fields=None, display_filter="",
                 timeseries=False, strict=False):
    """Read a PCAP file into a pandas DataFrame using the tshark CLI.

    filename:       Name or full path of the PCAP file to read. It is
                    handed to tshark verbatim (no shell expansion).
    fields:         List of fields to include as columns (Default: none).
                    The list passed in is never modified.
    display_filter: Additional filter to restrict frames
    strict:         Only include frames that contain all given fields
                    (Default: false)
    timeseries:     Create DatetimeIndex from frame.time_epoch
                    (Default: false). Timestamps are converted with
                    datetime.datetime.fromtimestamp, i.e. local time.

    Returns the DataFrame parsed from tshark's tab-separated output.

    Syntax for fields and display_filter is specified in
    Wireshark's Display Filter Reference:

      http://www.wireshark.org/docs/dfref/
    """
    # Work on a private copy so neither the caller's list nor a shared
    # default is ever mutated.
    columns = list(fields) if fields else []
    if timeseries:
        columns.insert(0, "frame.time_epoch")

    # In strict mode every field name doubles as an "exists" filter term.
    filter_terms = list(columns) if strict else []
    if display_filter:
        # Parenthesise when combining, so an `or` inside the user's filter
        # cannot bind across the field-existence terms.
        if filter_terms:
            filter_terms.append("(%s)" % display_filter)
        else:
            filter_terms.append(display_filter)

    # Argument list instead of a shell string: no quoting problems with
    # spaces or quotes in the filename or the filter.
    cmd = ["tshark", "-r", filename, "-n", "-T", "fields", "-Eheader=y"]
    if filter_terms:
        cmd += ["-Y", " and ".join(filter_terms)]
    for column in columns:
        cmd += ["-e", column]

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        if timeseries:
            df = pd.read_table(proc.stdout, index_col="frame.time_epoch")
            # Convert the epoch seconds explicitly rather than through the
            # deprecated `date_parser` argument of read_table.
            stamps = [datetime.datetime.fromtimestamp(float(t))
                      for t in df.index]
            df.index = pd.DatetimeIndex(stamps, name="frame.time_epoch")
        else:
            df = pd.read_table(proc.stdout)
    finally:
        # Always release the pipe and reap the child process.
        proc.stdout.close()
        proc.wait()
    return df

## Analyze the trace at TCP level

Read the relevant information from the raw data. To recover the required stream information, we need at least the following information from each frame:

packet's capture timestamp, stream id, source ip, source port, destination ip, destination port, packet length

In [5]:
# Per-frame columns needed to rebuild the TCP streams.
fields = [
    "tcp.stream",
    "ip.src", "tcp.srcport",
    "ip.dst", "tcp.dstport",
    "tcp.len",
]
ts = read_traffic("traffic_dump.pcap", fields=fields,
                  timeseries=True, strict=True)
ts.head()

Unnamed: 0_level_0,tcp.stream,ip.src,tcp.srcport,ip.dst,tcp.dstport,tcp.len
frame.time_epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-07-21 21:51:07.095278,0,192.168.1.64,42760,74.125.19.83,80,0
2008-07-21 21:51:07.103728,0,74.125.19.83,80,192.168.1.64,42760,0
2008-07-21 21:51:07.114897,1,192.168.1.64,35011,74.125.19.19,80,1351
2008-07-21 21:51:07.139448,1,74.125.19.19,80,192.168.1.64,35011,0
2008-07-21 21:51:07.319680,1,74.125.19.19,80,192.168.1.64,35011,1214


In order to compute the starting and ending time of each stream, we need the capture time to be available as a column, not only as the index. We create a new column called "capture.time" and copy the timestamps from the index into it.

In [6]:
import numpy as np
# Copy the index timestamps into a regular column. pd.to_datetime is the
# replacement for Index.to_datetime(), which current pandas no longer provides.
ts['capture.time'] = pd.to_datetime(ts.index)
ts.head()


Unnamed: 0_level_0,tcp.stream,ip.src,tcp.srcport,ip.dst,tcp.dstport,tcp.len,capture.time
frame.time_epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-07-21 21:51:07.095278,0,192.168.1.64,42760,74.125.19.83,80,0,2008-07-21 21:51:07.095278
2008-07-21 21:51:07.103728,0,74.125.19.83,80,192.168.1.64,42760,0,2008-07-21 21:51:07.103728
2008-07-21 21:51:07.114897,1,192.168.1.64,35011,74.125.19.19,80,1351,2008-07-21 21:51:07.114897
2008-07-21 21:51:07.139448,1,74.125.19.19,80,192.168.1.64,35011,0,2008-07-21 21:51:07.139448
2008-07-21 21:51:07.319680,1,74.125.19.19,80,192.168.1.64,35011,1214,2008-07-21 21:51:07.319680


Now, we can group packets into streams.

In [7]:
# One group per TCP stream id.
grouped = ts.groupby(by="tcp.stream")


We then use the group object's `agg` function to generate a new data frame that is stream based, i.e. it has one row per stream instead of one row per frame.

Note that we assume each stream's source IP, source port, destination IP, and destination port are the same as those in the stream's first packet. This may not be 100% accurate in the real world.

In [10]:
# Aggregate the frames of each stream into one row.
# - The keys are listed in the same order as the columns of the displayed
#   table, so any positional relabeling of the result stays valid on
#   interpreters where dicts preserve insertion order.
# - `.iloc[0]` takes the first packet's value by position; plain `x[0]` on a
#   datetime-indexed Series relies on a deprecated positional fallback.
# - String aggregation names replace the NumPy callables, which pandas
#   deprecates as `agg` arguments.
streams = grouped.agg({
    'tcp.srcport': lambda col: col.iloc[0],
    'tcp.len': 'sum',
    'ip.dst': lambda col: col.iloc[0],
    'capture.time': ['min', 'max'],
    'tcp.dstport': lambda col: col.iloc[0],
    'ip.src': lambda col: col.iloc[0],
})
streams.head()

Unnamed: 0_level_0,tcp.srcport,tcp.len,ip.dst,capture.time,capture.time,tcp.dstport,ip.src
Unnamed: 0_level_1,<lambda>,sum,<lambda>,amin,amax,<lambda>,<lambda>
tcp.stream,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,42760,0,74.125.19.83,2008-07-21 21:51:07.095278,2008-07-21 21:51:07.436746,80,192.168.1.64
1,35011,2565,74.125.19.19,2008-07-21 21:51:07.114897,2008-07-21 21:51:07.558553,80,192.168.1.64
2,39153,5158,74.125.19.103,2008-07-21 21:51:07.423663,2008-07-21 21:51:07.655556,443,192.168.1.64
3,34050,8266,74.125.19.103,2008-07-21 21:51:07.659558,2008-07-21 21:52:27.168965,443,192.168.1.64
4,38913,5017,209.85.171.97,2008-07-21 21:51:08.404617,2008-07-21 21:53:29.160668,443,192.168.1.64


Change the column names and the index name

In [24]:
# Flat name(s) for every aggregated source column. Naming by source column
# instead of by position keeps the labels right whatever order the
# aggregation happened to emit its columns in.
flat_names = {
    'tcp.srcport': ['src_port'],
    'tcp.len': ['stream_len'],
    'ip.dst': ['dst_ip'],
    'capture.time': ['start_time', 'end_time'],  # (min, max)
    'tcp.dstport': ['dst_port'],
    'ip.src': ['src_ip'],
}
# Only flatten while the columns are still two-level, so re-running the
# cell is harmless.
if streams.columns.nlevels > 1:
    # pop(0) hands out start_time before end_time for the two capture.time columns.
    streams.columns = [flat_names[source].pop(0)
                       for source in streams.columns.get_level_values(0)]
streams.index.name = 'stream_id'
streams.head()

Unnamed: 0_level_0,src_port,stream_len,dst_ip,start_time,end_time,dst_port,src_ip
stream_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,42760,0,74.125.19.83,2008-07-21 21:51:07.095278,2008-07-21 21:51:07.436746,80,192.168.1.64
1,35011,2565,74.125.19.19,2008-07-21 21:51:07.114897,2008-07-21 21:51:07.558553,80,192.168.1.64
2,39153,5158,74.125.19.103,2008-07-21 21:51:07.423663,2008-07-21 21:51:07.655556,443,192.168.1.64
3,34050,8266,74.125.19.103,2008-07-21 21:51:07.659558,2008-07-21 21:52:27.168965,443,192.168.1.64
4,38913,5017,209.85.171.97,2008-07-21 21:51:08.404617,2008-07-21 21:53:29.160668,443,192.168.1.64


In [25]:
# Now define our function:
# select all streams that involve a given ip address
def findStreamsByIP(ip=None, stream_table=None):
    """Select all streams that involve a given IP address.

    ip:           Address to look for; it is compared for equality against
                  the 'src_ip' and 'dst_ip' columns.
    stream_table: DataFrame with 'src_ip' and 'dst_ip' columns to search.
                  (Default: the notebook-level `streams` table)

    Returns the matching rows as a DataFrame, or None when no ip is given.
    """
    if ip is None:
        return None

    if stream_table is None:
        # Backward-compatible fallback to the notebook's global table.
        stream_table = streams

    involves_ip = (stream_table['dst_ip'] == ip) | (stream_table['src_ip'] == ip)
    return stream_table[involves_ip]

In [29]:
# Number of streams per source address.
streams['src_ip'].value_counts()

192.168.15.4       1630
192.168.1.64        359
69.22.167.201         3
204.89.131.52         3
209.85.171.93         1
69.22.167.215         1
123.153.71.129        1
38.107.163.7          1
76.13.208.11          1
205.188.9.131         1
209.85.171.190        1
63.241.243.20         1
200.198.253.230       1
65.125.141.18         1
89.1.128.81           1
69.39.67.98           1
63.247.140.161        1
190.165.27.4          1
209.73.191.242        1
89.216.64.93          1
74.125.19.19          1
66.228.249.145        1
66.114.51.42          1
74.66.229.33          1
70.50.251.14          1
168.167.149.94        1
58.8.102.59           1
66.151.152.143        1
125.211.216.53        1
208.99.185.63         1
Name: src_ip, dtype: int64

In [27]:
# Streams with 192.168.1.64 as source or destination.
myStreams = findStreamsByIP(ip='192.168.1.64')
myStreams

Unnamed: 0_level_0,src_port,stream_len,dst_ip,start_time,end_time,dst_port,src_ip
stream_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,42760,0,74.125.19.83,2008-07-21 21:51:07.095278,2008-07-21 21:51:07.436746,80,192.168.1.64
1,35011,2565,74.125.19.19,2008-07-21 21:51:07.114897,2008-07-21 21:51:07.558553,80,192.168.1.64
2,39153,5158,74.125.19.103,2008-07-21 21:51:07.423663,2008-07-21 21:51:07.655556,443,192.168.1.64
3,34050,8266,74.125.19.103,2008-07-21 21:51:07.659558,2008-07-21 21:52:27.168965,443,192.168.1.64
4,38913,5017,209.85.171.97,2008-07-21 21:51:08.404617,2008-07-21 21:53:29.160668,443,192.168.1.64
5,46756,2521,72.14.223.191,2008-07-21 21:51:08.797425,2008-07-21 21:51:09.228901,443,192.168.1.64
6,46062,3067,209.85.171.97,2008-07-21 21:51:09.304315,2008-07-21 21:53:20.405529,443,192.168.1.64
7,42608,3369,74.125.19.19,2008-07-21 21:51:09.534025,2008-07-21 21:52:29.151718,443,192.168.1.64
8,44018,25630,209.3.183.2,2008-07-21 21:51:19.258123,2008-07-21 21:51:26.769411,80,192.168.1.64
9,34871,23989,209.3.183.2,2008-07-21 21:51:19.259024,2008-07-21 21:51:26.769867,80,192.168.1.64
