In [13]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

In [None]:
'''Files relevant to this demo: smallFlows.pcap, smallFlow_stats.csv'''

# Import Data

In [2]:
flow_file = "smallFlow_stats.csv"
flow_df = pd.read_csv(flow_file,names=['srcip','srcport','dstip','dstport','proto','total_fpackets','total_fvolume',
                                              'total_bpackets','total_bvolume','min_fpktl','mean_fpktl','max_fpktl','std_fpktl',
                                              'min_bpktl','mean_bpktl','max_bpktl','std_bpktl','min_fiat','mean_fiat','max_fiat',
                                              'std_fiat','min_biat','mean_biat','max_biat','std_biat','duration','min_active',
                                              'mean_active','max_active','std_active','min_idle','mean_idle','max_idle','std_idle',
                                              'sflow_fpackets','sflow_fbytes','sflow_bpackets','sflow_bbytes','fpsh_cnt','bpsh_cnt',
                                              'furg_cnt','burg_cnt','total_fhlen','total_bhlen','misc'])

In [3]:
flow_df

Unnamed: 0,srcip,srcport,dstip,dstport,proto,total_fpackets,total_fvolume,total_bpackets,total_bvolume,min_fpktl,...,sflow_fbytes,sflow_bpackets,sflow_bbytes,fpsh_cnt,bpsh_cnt,furg_cnt,burg_cnt,total_fhlen,total_bhlen,misc
0,192.168.3.131,55950,72.14.213.102,80,6,5,1156,3,619,40,...,1156,3,619,1,1,0,0,212,132,0
1,192.168.3.131,55955,207.46.148.38,80,6,5,660,3,407,40,...,660,3,407,1,1,0,0,212,128,0
2,192.168.3.131,55954,65.55.17.37,80,6,13,2369,19,25234,40,...,2369,19,25234,3,4,0,0,532,764,0
3,192.168.3.131,58264,208.82.236.129,80,6,5,765,4,388,40,...,765,4,388,1,1,0,0,212,172,0
4,192.168.3.131,58265,208.82.236.129,80,6,6,858,8,6980,40,...,858,8,6980,1,4,0,0,252,332,0
5,192.168.3.131,58272,208.82.236.129,80,6,7,856,8,6916,40,...,856,8,6916,1,4,0,0,292,332,0
6,192.168.3.131,55963,65.54.95.140,80,6,12,1685,13,13111,40,...,1685,13,13111,3,3,0,0,492,532,0
7,192.168.3.131,55973,65.54.95.142,80,6,16,1040,26,34256,40,...,1040,26,34256,1,1,0,0,652,1052,0
8,192.168.3.131,55960,206.108.207.139,80,6,6,763,4,694,40,...,763,4,694,1,1,0,0,252,172,0
9,192.168.3.131,52201,72.14.213.102,443,6,6,434,5,2255,40,...,434,5,2255,1,1,0,0,252,212,0


# Preprocess data 

In [5]:
#Identify which ports you want to try and predict, in this case we are picking ports 53,443,80
port_filter = [53,443,80]
flow_df = flow_df.loc[flow_df['dstport'].isin(port_filter) | flow_df['srcport'].isin(port_filter)]
del flow_df['misc']
flow_df


Unnamed: 0,srcip,srcport,dstip,dstport,proto,total_fpackets,total_fvolume,total_bpackets,total_bvolume,min_fpktl,...,sflow_fpackets,sflow_fbytes,sflow_bpackets,sflow_bbytes,fpsh_cnt,bpsh_cnt,furg_cnt,burg_cnt,total_fhlen,total_bhlen
0,192.168.3.131,55950,72.14.213.102,80,6,5,1156,3,619,40,...,5,1156,3,619,1,1,0,0,212,132
1,192.168.3.131,55955,207.46.148.38,80,6,5,660,3,407,40,...,5,660,3,407,1,1,0,0,212,128
2,192.168.3.131,55954,65.55.17.37,80,6,13,2369,19,25234,40,...,13,2369,19,25234,3,4,0,0,532,764
3,192.168.3.131,58264,208.82.236.129,80,6,5,765,4,388,40,...,5,765,4,388,1,1,0,0,212,172
4,192.168.3.131,58265,208.82.236.129,80,6,6,858,8,6980,40,...,6,858,8,6980,1,4,0,0,252,332
5,192.168.3.131,58272,208.82.236.129,80,6,7,856,8,6916,40,...,7,856,8,6916,1,4,0,0,292,332
6,192.168.3.131,55963,65.54.95.140,80,6,12,1685,13,13111,40,...,12,1685,13,13111,3,3,0,0,492,532
7,192.168.3.131,55973,65.54.95.142,80,6,16,1040,26,34256,40,...,16,1040,26,34256,1,1,0,0,652,1052
8,192.168.3.131,55960,206.108.207.139,80,6,6,763,4,694,40,...,6,763,4,694,1,1,0,0,252,172
9,192.168.3.131,52201,72.14.213.102,443,6,6,434,5,2255,40,...,6,434,5,2255,1,1,0,0,252,212


# Create training and testing data

In [7]:
stats = flow_df.ix[:,'total_fpackets':]
stats

Unnamed: 0,total_fpackets,total_fvolume,total_bpackets,total_bvolume,min_fpktl,mean_fpktl,max_fpktl,std_fpktl,min_bpktl,mean_bpktl,...,sflow_fpackets,sflow_fbytes,sflow_bpackets,sflow_bbytes,fpsh_cnt,bpsh_cnt,furg_cnt,burg_cnt,total_fhlen,total_bhlen
0,5,1156,3,619,40,231,984,420,40,206,...,5,1156,3,619,1,1,0,0,212,132
1,5,660,3,407,40,132,488,199,40,135,...,5,660,3,407,1,1,0,0,212,128
2,13,2369,19,25234,40,182,692,268,44,1328,...,13,2369,19,25234,3,4,0,0,532,764
3,5,765,4,388,40,153,593,246,40,97,...,5,765,4,388,1,1,0,0,212,172
4,6,858,8,6980,40,143,646,246,40,872,...,6,858,8,6980,1,4,0,0,252,332
5,7,856,8,6916,40,122,604,212,40,864,...,7,856,8,6916,1,4,0,0,292,332
6,12,1685,13,13111,40,140,438,179,40,1008,...,12,1685,13,13111,3,3,0,0,492,532
7,16,1040,26,34256,40,65,428,96,40,1317,...,16,1040,26,34256,1,1,0,0,652,1052
8,6,763,4,694,40,127,551,207,40,173,...,6,763,4,694,1,1,0,0,252,172
9,6,434,5,2255,40,72,222,73,40,451,...,6,434,5,2255,1,1,0,0,252,212


# Create data labels (by taking the min of src and dst port)

In [9]:
def port_func(row):
    return min(row['srcport'],row['dstport'])

ports = flow_df.apply(port_func,axis=1)
ports.values

array([ 80,  80,  80,  80,  80,  80,  80,  80,  80, 443, 443, 443, 443,
       443, 443, 443, 443, 443, 443, 443, 443, 443, 443,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
        80, 443,  80,  80, 443, 443, 443, 443, 443, 443, 443, 443, 443,
       443,  80,  80, 443,  80,  80, 443, 443,  80,  80, 443,  80,  80,
        80,  80, 443,  80, 443,  80,  80,  80,  80,  80,  80,  80, 443,
       443,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80, 44

# Scale data for use with models

In [10]:
stats = preprocessing.scale(stats)
stats

array([[-0.48048821, -0.25433735, -0.46770332, ...,  0.        ,
        -0.48398725, -0.48751471],
       [-0.48048821, -0.3396972 , -0.46770332, ...,  0.        ,
        -0.48398725, -0.49040161],
       [-0.02356618, -0.04558436, -0.00806669, ...,  0.        ,
        -0.0278163 , -0.03138372],
       ..., 
       [-0.42337296, -0.17001009, -0.46770332, ...,  0.        ,
        -0.42696588, -0.48751471],
       [-0.65183397, -0.43021993, -0.49643061, ...,  0.        ,
        -0.59517892,  0.02346748],
       [ 0.26201009,  0.99060839, -0.29533958, ...,  0.        ,
         0.25729055, -0.32007422]])

# Split data into test and train data 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(stats,
                                                    ports.values, test_size=0.2, random_state=41)

# Logistic Regression Model 

In [17]:
lgs = linear_model.LogisticRegression(C=1e5)
lgs_accuracy = lgs.fit(X_train, y_train).score(X_test, y_test)
print "Logistic regression accuracy:", lgs_accuracy
lgs_result = lgs.predict(X_test)
print classification_report(y_test, lgs_result)

Logistic regression accuracy: 0.842857142857
             precision    recall  f1-score   support

         53       1.00      1.00      1.00         3
         80       0.85      0.94      0.90        50
        443       0.75      0.53      0.62        17

avg / total       0.84      0.84      0.83        70



# Support Vector Machine (SVM) Model

In [19]:
clf = svm.SVC()
clf_accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
print "SVM accuracy:",clf_accuracy
clf_result = clf.predict(X_test)
print classification_report(y_test, clf_result)

SVM accuracy: 0.814285714286
             precision    recall  f1-score   support

         53       1.00      1.00      1.00         3
         80       0.79      1.00      0.88        50
        443       1.00      0.24      0.38        17

avg / total       0.85      0.81      0.77        70



# Naive Bayes Model 

In [20]:
gnb = GaussianNB()
gnb_accuracy = gnb.fit(X_train, y_train).score(X_test, y_test)
print "Naive Bayes Classifer", gnb_accuracy
gnb_result = gnb.predict(X_test)
print classification_report(y_test, gnb_result)

Naive Bayes Classifer 0.757142857143
             precision    recall  f1-score   support

         53       1.00      1.00      1.00         3
         80       0.81      0.86      0.83        50
        443       0.50      0.41      0.45        17

avg / total       0.74      0.76      0.75        70

