In [1]:
#Netflows are total count of occurence of a src_ip in a particular window
import pandas as pd
import numpy as np
#Edit path to the file for extracting features
def getFeatures(file_path):
    df=pd.read_csv(file_path)
    temp = pd.get_dummies(df.Tag)
    df['attack'] = temp.Attack
    df['normal'] = temp.Normal
    del temp, df['Tag']
    group = df.groupby(['bucket','source'])
    features = group.sourcePort.nunique()
    features = pd.DataFrame(features)
    features.rename(columns={'sourcePort':'usrc_port'},inplace=True)
    features['udest_ip'] = group.destination.nunique()
    features['udest_port'] = group.destinationPort.nunique()
    features['netflows'] = group.destination.count()
    features['bytes'] = group.totalSourceBytes.sum()
    features['packets'] = group.totalSourcePackets.sum()
    features['attack'] = group.attack.sum()
    features['normal'] = group.normal.sum()
    features.reset_index(inplace=True) 
    return features
    
def featurestoArray(features):
    #Converting DF to np array
    features_array = features.copy()
    del features_array['attack'],features_array['normal'],features_array['source'],features_array['bucket']
    features_array = features_array.as_matrix()
    return features_array

def getClusterFeatures(labeled_features):
    group = labeled_features.groupby(['label'])
    clusterfeatures = group.bucket.count()
    clusterfeatures = pd.DataFrame(clusterfeatures)
    clusterfeatures.rename(columns={'bucket':'instances'},inplace=True) 
    clusterfeatures['netflows']=group.netflows.sum()
    clusterfeatures['avgnetflows']=group.netflows.mean()
    clusterfeatures['stdnetflows']=group.netflows.std()
    clusterfeatures['usrc_ip']=group.source.nunique()
    clusterfeatures['avgsrc_port']=group.usrc_port.mean()
    clusterfeatures['stdsrc_port']=group.usrc_port.std()
    clusterfeatures['avgdest_ip']=group.udest_ip.mean()
    clusterfeatures['stddest_ip']=group.udest_ip.std()
    clusterfeatures['avgdest_port']=group.udest_port.mean()
    clusterfeatures['stddest_port']=group.udest_port.std()
    clusterfeatures['avgbytes']=group.bytes.mean()
    clusterfeatures['stdbytes']=group.bytes.std()
    clusterfeatures['avgpackets']=group.packets.mean()
    clusterfeatures['stdpackets']=group.packets.std()
    clusterfeatures['attack']=group.attack.sum()
    clusterfeatures['normal']=group.normal.sum()
    #True = Botnet    False = Normal
    clusterfeatures['label'] = (group.attack.sum()/group.normal.sum()>0.01)
    return clusterfeatures

In [2]:
def getLabeledFeaturesGMM(features_array):
    X_train = features_array
    from sklearn import mixture
    #Number of clusters = number of components
    # Four covariance Type ['spherical', 'diag', 'tied', 'full'] iterations can be changed
    clf = mixture.GMM(n_components=40, covariance_type='full', n_iter=100)
    label = clf.fit_predict(X_train)
    return label           

def addLabelTofeatures(features, label):
    labeled_features = features.copy()
    labeled_features['label'] = label
    return labeled_features

In [4]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer 

features = getFeatures(file_path="/home/ankit/Desktop/MTP/working_directory/ndf/ndf3.csv")
X_train, X_test = train_test_split(features, test_size=0.3,random_state=0)
X_train

Unnamed: 0,bucket,source,usrc_port,udest_ip,udest_port,netflows,bytes,packets,attack,normal
5832,315.0,192.168.2.112,34,13,4,34,19403.0,234.0,1.0,33.0
238,23.0,192.168.1.101,42,12,2,44,53006.0,751.0,0.0,44.0
7675,424.0,192.168.5.122,20,1,1,20,3016.0,32.0,0.0,20.0
613,41.0,192.168.5.122,6,1,1,6,564.0,6.0,0.0,6.0
2515,136.0,192.168.3.116,2,1,1,2,4112.0,52.0,0.0,2.0
9377,524.0,192.168.3.114,8,4,2,8,93634.0,1333.0,0.0,8.0
7885,437.0,192.168.2.110,2,2,1,2,1309.0,17.0,0.0,2.0
5357,287.0,192.168.2.111,2,2,2,3,5688.0,78.0,0.0,3.0
2615,141.0,192.168.3.115,11,5,2,11,6954.0,100.0,0.0,11.0
5621,303.0,192.168.2.107,11,10,3,11,2687.0,34.0,0.0,11.0


In [78]:
#Reading unlabelled test file
#X_test
# df=pd.read_csv("F:/MTP/working_directory/17Jan_df_bucket.csv")
# group = df.groupby(['bucket','SrcAddr'])
# X_test = group.Sport.nunique()
# X_test = pd.DataFrame(X_test)
# X_test['udest_ip'] = group.DstAddr.nunique()
# X_test['udest_port'] = group.Dport.nunique()
# X_test['netflows'] = group.DstAddr.count()
# X_test['bytes'] = group.TotBytes.sum()
# X_test['packets'] = group.sTos.sum()
# X_test['attack'] = group.sTos.sum() #dummy
# X_test['normal'] = group.sTos.sum() #dummy
# X_test.reset_index(inplace=True) 
# X_test.rename(columns={'Sport':'usrc_port'},inplace=True)
# X_test.rename(columns={'SrcAddr':'source'},inplace=True)
# X_test
#df

Converting Dataframe to numpy object

In [5]:
feature_array = featurestoArray(features=X_train)
label = getLabeledFeaturesGMM(features_array=feature_array)
X_train = addLabelTofeatures(features=X_train,label=label)
train_cluster_features = getClusterFeatures(labeled_features=X_train) 
train_labels = train_cluster_features['label']
del train_cluster_features['attack'],train_cluster_features['normal'],train_cluster_features['label']
train_cluster_features = Imputer.fit_transform(Imputer(strategy="most_frequent",axis=0),train_cluster_features)


feature_array = featurestoArray(features=X_test)
label = getLabeledFeaturesGMM(features_array=feature_array)
X_test = addLabelTofeatures(features=X_test,label=label)
test_cluster_features = getClusterFeatures(labeled_features=X_test)
test_labels = test_cluster_features['label']
del test_cluster_features['attack'],test_cluster_features['normal'],test_cluster_features['label'] 
test_cluster_features = Imputer.fit_transform(Imputer(strategy="most_frequent",axis=0),test_cluster_features)
del feature_array



In [6]:
train_cluster_features = train_cluster_features.astype(np.float32)
test_cluster_features = test_cluster_features.astype(np.float32)
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)
train_cluster_features

array([[  5.90000000e+01,   4.47000000e+02,   7.57627106e+00,
          2.72411585e+00,   1.70000000e+01,   7.13559341e+00,
          2.52202463e+00,   2.72881365e+00,   2.01571846e+00,
          5.18644047e+00,   2.07176900e+00,   6.09366113e+03,
          1.25897974e+03,   7.59322052e+01,   1.57237625e+01],
       [  1.30000000e+01,   2.22800000e+03,   1.71384613e+02,
          2.56565075e+01,   2.00000000e+00,   1.70076920e+02,
          2.55292702e+01,   1.34615383e+01,   5.47137070e+00,
          3.15384626e+00,   6.88737214e-01,   9.45829938e+05,
          1.03745531e+05,   1.01503848e+04,   1.20448206e+03],
       [  6.00000000e+00,   7.71000000e+02,   1.28500000e+02,
          3.34230461e+01,   3.00000000e+00,   1.28500000e+02,
          3.34230461e+01,   2.00000000e+00,   1.26491106e+00,
          2.00000000e+00,   1.26491106e+00,   1.03255481e+06,
          3.10748938e+05,   8.68916699e+03,   3.13657788e+03],
       [  4.00000000e+01,   1.81400000e+03,   4.53499985e+01,
     

In [7]:
#normalize cluster features
import sklearn.preprocessing
train_cluster_features= sklearn.preprocessing.normalize(train_cluster_features,axis=0)
test_cluster_features= sklearn.preprocessing.normalize(test_cluster_features,axis=0)

In [15]:
#from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_cluster_features,train_labels)
cluster_label_predict=clf.predict(test_cluster_features)
cluster_label_predict

array([ 0.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,
        1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.], dtype=float32)

In [18]:
print sum(abs(cluster_label_predict-test_labels))
temp = X_test.groupby(['label'])
t = temp.packets.count()
label_predicted = [-1 if e == 0 else e for e in cluster_label_predict]
test_labels = [-1 if e == 0 else e for e in test_labels]
#Confusion Matrix of Classification of clusters acccording to decision tree classifier
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels, label_predicted) #0 = normal

19.0


array([[16,  1],
       [ 1, 22]])

In [19]:
#Unique sourse IP's presetn in the test dataset
X_test['pred'] = [label_predicted[x[0]] for x in zip(X_test['label'])]
X_test['act_label'] = [test_labels[x[0]] for x in zip(X_test['label'])]
X_test.source.unique()

array(['192.168.2.106', '192.168.2.111', '192.168.3.115', '64.237.127.131',
       '192.168.1.105', '192.168.1.103', '192.168.4.119', '192.168.2.109',
       '192.168.2.113', '192.168.5.122', '192.168.2.110', '192.168.2.107',
       '0.0.0.0', '192.168.1.102', '192.168.1.104', '192.168.3.116',
       '192.168.4.120', '192.168.4.118', '192.168.2.112', '192.168.4.121',
       '192.168.3.114', '192.168.1.101', '192.168.2.108', '192.168.5.123',
       '131.202.240.209', '131.202.240.218', '131.202.243.84',
       '216.115.108.141', '192.168.5.124', '142.166.115.14',
       '192.168.3.117', '63.111.123.26', '217.76.44.243', '142.166.115.13'], dtype=object)

In [20]:
#Malicious IP's present in the test dataset
malicious_IP = X_test.groupby(['pred']).get_group(1)['source']
malicious_IP.unique()

array(['192.168.1.105', '192.168.2.109', '192.168.2.110', '192.168.4.118',
       '192.168.1.103', '192.168.2.113', '192.168.2.112', '192.168.4.120',
       '192.168.3.116', '192.168.3.114'], dtype=object)

In [21]:
#Malicious IP's predicted by Model in the test dataset
malicious_act = X_test.groupby(['act_label']).get_group(1)['source']
malicious_act.unique()

array(['192.168.1.105', '192.168.2.109', '192.168.2.110', '192.168.4.118',
       '192.168.1.103', '192.168.2.113', '192.168.2.112', '192.168.4.120',
       '192.168.3.116', '192.168.3.114'], dtype=object)