# 1. Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from tensorflow.keras import regularizers

In [None]:
from sklearn.model_selection import train_test_split

In [None]:

# Labeled set (L): loaded from the resampled 95/5 file.
XL = pd.read_csv('9_510_L_resample_95_5.csv')

# Unlabeled set (U): no resampling is done on it; the matching file name
# only denotes which L set it is paired with.
XU_train = pd.read_csv('9_510_U_resample_95_5.csv')


In [None]:
from sklearn.utils import shuffle
XU_train.head()

In [None]:
XL.head()

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler 

In [None]:
# Number of rows per ProtocolName in the labeled set, largest class first.
a = (
    XL.groupby('ProtocolName')['ProtocolName']
    .count()
    .sort_values(ascending=False)
)


In [None]:
a   # see resampled instances

In [None]:
def mask_noise(X, fraction_delete=0.2):
    """Randomly zero out entries of X (masking noise).

    Every entry is kept independently when a uniform draw in [0, 1) exceeds
    ``fraction_delete`` and is set to zero otherwise, so on average a
    ``fraction_delete`` share of the entries is removed.

    Input: array-like X of any shape, fraction_delete (between 0 and 1)

    Output: X multiplied element-wise by the random 0/1 mask
    """
    # One uniform draw per entry. Using the full shape (rather than only the
    # first two axes) lets 1-D and N-D inputs work as well as 2-D ones.
    draws = np.random.random_sample(np.shape(X))
    keep = 1 * (draws > fraction_delete)
    return keep * X

In [None]:
def Gaussian_noise(X, sigma=0.5):
    """Add zero-mean Gaussian noise to X.

    Input: numpy array X, sigma (standard deviation of the noise)

    Output: numpy array with one independent N(0, sigma) sample added to
    every entry
    """
    perturbation = np.random.normal(loc=0, scale=sigma, size=np.shape(X))
    return perturbation + X

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
import keras.backend as K
import tensorflow as tf
from tensorflow.keras import regularizers
from keras.models import load_model
import keras
from keras.layers import Dropout

In [None]:
'''
Sparcity Autoencoder's Sparcity Activity Regularization

'''
def kl_divergence(rho, rho_hat):
    """KL divergence between Bernoulli(rho) and Bernoulli(rho_hat), element-wise.

    rho is the target value and rho_hat the observed one; 1e-12 is added
    inside the logs of the rho_hat terms to avoid log(0).

    NOTE(review): the last term takes log(1 - rho_hat + 1e-12), which is not
    defined for rho_hat > 1 -- confirm the activations passed in stay in [0, 1].
    """
    target_on = rho * tf.math.log(rho)
    observed_on = rho * tf.math.log(rho_hat + 1e-12)
    target_off = (1 - rho) * tf.math.log(1 - rho)
    observed_off = (1 - rho) * tf.math.log(1 - rho_hat + 1e-12)
    return target_on - observed_on + target_off - observed_off

class SparseActivityRegularizer(tf.keras.regularizers.Regularizer):
    """Activity regularizer that penalises non-sparse activations.

    The penalty is ``sparsityBeta`` times the summed KL divergence between
    the target activation ``p`` and the mean activation of every unit
    (mean taken over axis 0 of the layer output).
    """

    def __init__(self, p=0.1, sparsityBeta=3):
        # p: target mean activation per unit.
        self.p = p
        # sparsityBeta: weight of the sparsity penalty in the loss.
        self.sparsityBeta = sparsityBeta

    def __call__(self, x):
        """Return the sparsity penalty for the activations ``x``."""
        # Mean activation of each unit over axis 0
        # (presumably the batch axis -- confirm against the layer output).
        p_hat = K.mean(x, axis=0)
        return self.sparsityBeta * K.sum(kl_divergence(self.p, p_hat))

    def get_config(self):
        """Return the constructor arguments so the object can be rebuilt.

        Keras re-creates a regularizer with ``cls(**config)``, so the config
        must hold exactly the ``__init__`` keyword arguments; a ``name`` key
        would make deserialization fail.
        """
        return {"p": self.p, "sparsityBeta": self.sparsityBeta}

In [None]:
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.optimizers import Adagrad
from keras.optimizers import SGD
from sklearn.preprocessing import LabelEncoder

In [None]:
# The last column of XL is taken as the label, all preceding columns as
# the features.
XL_train = XL.iloc[:, :-1]
yL_train = XL.iloc[:, -1]

In [None]:
# One-hot encode the labels (one indicator column per class).
yL_train = pd.get_dummies(yL_train)

# ---- Split into train, val and test ----

# Step 1: random split into train (80%) and a held-out part (20%).
XL_train, XL_val, yL_train, yL_val = train_test_split(
    XL_train, yL_train, test_size=0.2, random_state=0
)

# Step 2: halve the held-out part into val (10%) and test (10%).
XL_val, XL_test, yL_val, yL_test = train_test_split(
    XL_val, yL_val, test_size=0.5, random_state=0
)

In [None]:
# Persist the test split (features and one-hot labels) without the index
# column.
XL_test.to_csv('XL_test_95L-5U.csv', index=False)
yL_test.to_csv('yL_test_95L-5U.csv', index=False)


In [None]:
yL_test

In [None]:
# Read back the test labels written above; the file was saved with a
# '.csv' extension, so the path must include it.
pd.read_csv('yL_test_95L-5U.csv')


 Reverse the one-hot encoding of each set so that the labels can be used by the ML methods

In [None]:
# Reverse the one-hot encoding: keep the cells equal to 1, stack them into
# (row, column) pairs and read the column name back as the class label.
# `drop(columns=0)` replaces the positional `drop(0, 1)`, which pandas 2.x
# rejects (only `labels` may be passed positionally).
yL_train2 = yL_train[yL_train == 1].stack().reset_index().drop(columns=0)['level_1']

In [None]:
# Reverse the one-hot encoding: keep the cells equal to 1, stack them into
# (row, column) pairs and read the column name back as the class label.
# `drop(columns=0)` replaces the positional `drop(0, 1)`, which pandas 2.x
# rejects (only `labels` may be passed positionally).
yL_val2 = yL_val[yL_val == 1].stack().reset_index().drop(columns=0)['level_1']

In [None]:
# Reverse the one-hot encoding: keep the cells equal to 1, stack them into
# (row, column) pairs and read the column name back as the class label.
# `drop(columns=0)` replaces the positional `drop(0, 1)`, which pandas 2.x
# rejects (only `labels` may be passed positionally).
yL_test2 = yL_test[yL_test == 1].stack().reset_index().drop(columns=0)['level_1']

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

# Scaling: standardise every feature set with StandardScaler.
# NOTE(review): the same scaler object is fitted twice, so the unlabeled set
# is scaled with its own statistics and the scaler is then re-fitted on the
# labeled training split; val and test reuse the labeled-train statistics.
# Confirm that scaling U and L independently is intended.
scaler = StandardScaler()
XU_train = scaler.fit(XU_train).transform(XU_train)
XL_train = scaler.fit(XL_train).transform(XL_train)
XL_val = scaler.transform(XL_val)
XL_test = scaler.transform(XL_test)


# 2. Training the SDAE
#### Training the Encoder parts

In [None]:
from keras_sequential_ascii import keras2ascii
from tensorflow.keras import regularizers
from keras.models import Sequential

In [None]:
from keras import backend as K


In [None]:
def SDAE(X, layers, activations, rho, n_epochs, denoise, fraction_delete=0.2):
    """Train a stacked denoising autoencoder one layer at a time.

    For every hidden layer a single-hidden-layer autoencoder is built and
    fitted to reconstruct the clean input from a corrupted copy of it. The
    encoder half is kept, and its output on the clean data becomes the
    input of the next layer.

    Inputs:
        X                2-D numpy array (samples x features).
        layers           list of no. of hidden nodes in each hidden layer.
                         The list is not modified.
        activations      list of activations used in each encoding layer.
        rho              list of sparsity constraint parameters, one per
                         layer; a None entry disables the sparsity
                         regulariser for that layer.
        n_epochs         number of epochs each autoencoder is trained for.
        denoise          mode of adding noise: 'Gauss' adds Gaussian noise,
                         any other value applies masking noise.
        fraction_delete  noise strength: the fraction of entries zeroed by
                         masking noise; for 'Gauss' it is passed on as the
                         standard deviation.

    Output: a list of keras models (the trained encoders, in stacking order)
    """
    # Build the layer widths on a copy: inserting the input width into
    # `layers` itself would grow the caller's list on every call.
    sizes = [np.shape(X)[1]] + list(layers)

    ec = []
    for i, (n_in, encoding_dim) in enumerate(zip(sizes, sizes[1:])):
        # Input placeholder of this autoencoder.
        input_ = Input(shape=(n_in,))

        # Sparsity penalty on the hidden activations (weight fixed at 3).
        if rho[i] is not None:
            regulariser = SparseActivityRegularizer(rho[i], 3)
        else:
            regulariser = None

        # "encoded" is the encoded representation of the input.
        encoded = Dense(
            encoding_dim,
            activation=activations[i],
            bias_initializer='random_normal',
            kernel_initializer='he_uniform',
            activity_regularizer=regulariser,
            name='encoder' + str(i + 1),
        )(input_)

        # "decoded" is the lossy (linear) reconstruction of the input.
        decoded = Dense(n_in, activation=None, name='decoder' + str(i + 1))(encoded)

        # Full autoencoder (input -> reconstruction) and its encoder half
        # (input -> code); both share the same encoder layer.
        autoencoder = Model(input_, decoded)
        encoder = Model(input_, encoded)

        autoencoder.compile(
            optimizer=RMSprop(learning_rate=0.001, clipnorm=0.1), loss='mse'
        )

        # Corrupt the current layer input; the target stays the clean input.
        if denoise == 'Gauss':
            X_noisy = Gaussian_noise(X, fraction_delete)
        else:
            X_noisy = mask_noise(X, fraction_delete)

        print('Training Layer ' + str(i + 1) + ' : ')
        # NOTE(review): validation uses the clean training data itself, so
        # val_loss is not a held-out estimate.
        autoencoder.fit(
            X_noisy,
            X,
            epochs=n_epochs,
            batch_size=256,
            shuffle=True,
            validation_data=(X, X),
        )

        ec.append(encoder)
        # The clean codes of this layer are the training data of the next.
        X = encoder.predict(X)
    print('All Layers Done!')
    return ec

In [None]:
# Train the sparse stacked denoising autoencoder on the unlabeled data.
layers = [100, 200, 400, 50]      # hidden nodes per encoder layer
activations = ['relu'] * 4        # one activation per encoder layer
rho = [0.06] * 4                  # one sparsity parameter per encoder layer
traint0 = time.time()             # start of the training-time measurement
stacked_layers = SDAE(XU_train, layers, activations, rho, 10, 'mask', 0.02)

# 3. Training the SoftMax classifier

In [None]:
def SDAE_predict(stacked_layers, X):
    """Perform a forward pass through the SDAE.

    Input: stacked_layers (iterable of models exposing ``predict``), X

    Feeds X through every model in order, each model receiving the output
    of the previous one.

    Output: the output of the last model; X itself when ``stacked_layers``
    is empty
    """
    for layer in stacked_layers:
        X = layer.predict(X)
    return X

In [None]:
# Forward pass of every labeled split through the trained SDAE encoders.
XL_train1, XL_val1, XL_test1 = (
    SDAE_predict(stacked_layers, split)
    for split in (XL_train, XL_val, XL_test)
)

### Skip ahead to part 6 if only the ML methods are to be tested. The following section is the DL NN model.

In [None]:
# Build the classifier separately: a single softmax layer whose input is
# the last encoder's code and whose output has one unit per label column.
input1 = Input(shape=(layers[-1],))
L1 = Dense(len(yL_train.columns), activation='softmax')(input1)
classifier = Model(input1, L1)
classifier.compile(
    optimizer=SGD(learning_rate=0.1, momentum=0.9),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the softmax classifier on the SDAE codes of the labeled training set.
# batch_size is left at the Keras default (32); 256 was tried previously.
history = classifier.fit(
    XL_train1,
    yL_train,
    epochs=50,
    shuffle=True,
    validation_data=(XL_val1, yL_val),
)

# 4. Fine Tuning
#### Fine tuning involves removing the sparsity regularizer from all the layers, adding dropout, etc., and performing backpropagation to improve the overall model accuracy

In [None]:
# Change the regularisation settings: collect every trained model, in
# stacking order, with the sparsity penalty removed.
a = []
for i, encoder in enumerate(stacked_layers):
    # Remove the sparse activity regularizer from the encoder's last layer,
    # then round-trip the model through disk (presumably so the reloaded
    # model is rebuilt without the penalty -- the printed losses show it).
    encoder.layers[-1].activity_regularizer = None
    encoder.save('temp.h5')
    encoder = load_model('temp.h5')
    print(encoder.losses)
    a.append(encoder)

# Same save/load round trip for the softmax classifier, appended last.
classifier.save('temp.h5')
classifier = load_model('temp.h5')
print(classifier.losses)
a.append(classifier)

In [None]:
# Chain all sub-models into one network, adding dropout after every
# sub-model except the last encoder and the classifier.
model = Sequential()
for i in a:
    model.add(i)
    if not (i == classifier or i == encoder):
        model.add(Dropout(0.02))

In [None]:
# Save the assembled network to disk and load it back
# (same 'temp.h5' scratch file as used for the sub-models).
model.save('temp.h5')
model=load_model('temp.h5')

In [None]:
# Compile the full network for fine tuning with a small learning rate.
model.compile(
    optimizer=RMSprop(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fine-tune the whole model on the scaled labeled data for the number of
# epochs set below (silent: verbose=0).
history = model.fit(
    XL_train,
    yL_train,
    epochs=750,
    batch_size=256,
    shuffle=True,
    validation_data=(XL_val, yL_val),
    verbose=0,
)
# End of the training-time measurement started before the SDAE training.
traint1 = time.time()

In [None]:
# Wall-clock seconds between traint0 and traint1.
print(f"Train_time: {traint1 - traint0}")

In [None]:
# Evaluate the fine-tuned model on the test split and time the call.
test0 = time.time()
print(model.evaluate(XL_test, yL_test))
test1 = time.time()
print(f"Test_time: {test1 - test0}")

In [None]:
# Visualise convergence with the number of epochs:
# first curve = training accuracy, second curve = validation accuracy
# (both taken from the fine-tuning history).
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])


# 5. Results
#### Below are the results obtained on our NN model

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Model outputs for the test split; argmax over axis 1 is taken below to
# obtain the predicted class index of each row.
y_pred=model.predict(XL_test)

In [None]:
# One-hot encode the predicted class indices with the same columns as
# yL_test. The class count is read from yL_test instead of being hard-coded.
n_classes = len(yL_test.columns)
y_pred1 = y_pred.argmax(axis=1)
# Append one dummy prediction per class so that get_dummies creates a column
# for every class, including classes that were never predicted.
y_pred1 = np.append(y_pred1, np.arange(n_classes))
y_pred1 = pd.get_dummies(y_pred1)
y_pred1.columns = yL_test.columns
# Drop the dummy rows again.
y_pred1 = y_pred1.iloc[:-n_classes]

In [None]:
yL_test.sum()

In [None]:
# True class names: for every non-zero cell of the one-hot test labels, take
# the name of the column it sits in (np.where(...)[1] = column positions).
y_test1 = yL_test.columns[np.where(yL_test!=0)[1]]

In [None]:
# Predicted class names, recovered from the one-hot predictions the same
# way as for the true labels (column name of each non-zero cell).
y_pred1=y_pred1.columns[np.where(y_pred1!=0)[1]]
y_pred1

In [None]:
# Confusion matrix (rows = true class, columns = predicted class), with the
# classes ordered as the label columns of yL_test.
cm=confusion_matrix(y_test1,y_pred1,labels=yL_test.columns)

In [None]:
# Row-normalised confusion matrix: every row is divided by its own total.
# Rows with a zero total produce NaN, which is replaced by 0.
cm_pc = pd.DataFrame(
    cm / np.sum(cm, axis=1, keepdims=True),
    columns=yL_test.columns,
    index=yL_test.columns,
)
cm_pc = cm_pc.replace(np.nan, 0)


In [None]:
# Plot the row-normalised confusion matrix as a heatmap of percentages.
plt.figure(figsize=(35, 25))
# Raw counts as a labeled frame (not plotted here).
df_cm = pd.DataFrame(cm, columns=yL_test.columns, index=yL_test.columns)
sns.set(font_scale=1.0)  # font size
sns.heatmap(
    cm_pc,
    annot=True,
    fmt='.1%',
    cmap=sns.color_palette("Reds", 50),
)
plt.show()

In [None]:
# Overall accuracy of the fine-tuned model on the test split.
print(f'Test Accuracy : {accuracy_score(y_test1, y_pred1)}')

In [None]:
# scikit-learn metrics take (y_true, y_pred). The support-weighted average
# weights each class by its count in y_true, so the ground truth must come
# first for the weighted score to be correct.
print('weighted F1 Score : ' + str(f1_score(y_test1, y_pred1, average='weighted')))
print('Macro F1 Score : ' + str(f1_score(y_test1, y_pred1, average='macro')))
print('Micro F1 Score : ' + str(f1_score(y_test1, y_pred1, average='micro')))
print('F1 Scores : ')
# Per-class F1, one row per label column of yL_test.
pd.DataFrame(
    f1_score(y_test1, y_pred1, labels=yL_test.columns, average=None),
    columns=['F1-score'],
    index=yL_test.columns,
)

In [None]:
len(yL_train.columns)# No. of distinct labels

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
print('Weighted Precision : ' + str(precision_score(y_test1, y_pred1, average="weighted")))
print('Macro Precision : ' + str(precision_score(y_test1, y_pred1, average="macro")))
print('Micro Precision : ' + str(precision_score(y_test1, y_pred1, average="micro")))
print('Precision Scores : ')
# Per-class precision. The arguments must be (y_true, y_pred): with them
# swapped, the call returns per-class recall instead.
pd.DataFrame(
    precision_score(y_test1, y_pred1, labels=yL_test.columns, average=None),
    columns=['Precision-score'],
    index=yL_test.columns,
)

In [None]:
print('Weighted Recall : ' + str(recall_score(y_test1, y_pred1, average="weighted")))
print('Macro Recall : ' + str(recall_score(y_test1, y_pred1, average="macro")))
print('Micro Recall : ' + str(recall_score(y_test1, y_pred1, average="micro")))
print('Recall Scores : ')
# Per-class recall. The arguments must be (y_true, y_pred): with them
# swapped, the call returns per-class precision instead.
pd.DataFrame(
    recall_score(y_test1, y_pred1, labels=yL_test.columns, average=None),
    columns=['recall-score'],
    index=yL_test.columns,
)

#### Confusion Matrix with 10 chosen well-known classes
Classes : AMAZON, FACEBOOK, GMAIL, GOOGLE, HTTP, OFFICE_365, SKYPE, TWITTER, WHATSAPP, YOUTUBE

In [None]:
from sklearn.metrics import confusion_matrix
# Report on these well-known apps only.
apps = [
    'AMAZON', 'FACEBOOK', 'GMAIL', 'GOOGLE', 'HTTP',
    'OFFICE_365', 'SKYPE', 'TWITTER', 'WHATSAPP', 'YOUTUBE',
]
# Confusion matrix restricted to the test rows whose true class is one of
# `apps`; classes are ordered by first appearance among those rows.
cm = confusion_matrix(
    y_test1[y_test1.isin(apps)],
    y_pred1[y_test1.isin(apps)],
    labels=y_test1[y_test1.isin(apps)].unique(),
)

In [None]:
# Row-normalised version of the restricted confusion matrix; rows with a
# zero total yield NaN, which is replaced by 0.
cm_pc = pd.DataFrame(
    cm / np.sum(cm, axis=1, keepdims=True),
    columns=y_test1[y_test1.isin(apps)].unique(),
    index=y_test1[y_test1.isin(apps)].unique(),
)
cm_pc = cm_pc.replace(np.nan, 0)

In [None]:
# Heatmap of the row-normalised confusion matrix of the chosen apps.
plt.figure(figsize=(15, 10))
# Raw counts as a labeled frame (not plotted here).
df_cm = pd.DataFrame(
    cm,
    columns=y_test1[y_test1.isin(apps)].unique(),
    index=y_test1[y_test1.isin(apps)].unique(),
)
sns.set(font_scale=1.1)  # font size
sns.heatmap(
    cm_pc,
    annot=True,
    fmt='.1%',
    cmap=sns.color_palette("Reds", 150),
)
plt.show()

# 6. ML methods
#### Performing ML methods on Transformed data

In [None]:
# import sklearn libraries
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# RandomForest


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the SDAE codes, using the class-name labels.
clf = RandomForestClassifier(max_depth=60, random_state=0)
clf.fit(XL_train1, yL_train2)

In [None]:
# Mean accuracy of the fitted classifier on the SDAE codes.
print(clf.score(XL_train1,yL_train2))# train accuracy
print(clf.score(XL_test1,yL_test2))# test accuracy

In [None]:
# Predicted class names for the test split (overwrites the NN's y_pred).
y_pred=clf.predict(XL_test1)

In [None]:
# Results (weighted by class support in the true labels).
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# precision and recall trade places and the weights come from the predictions.
print('weighted F1 Score : ' + str(f1_score(yL_test2, y_pred, average='weighted')))
print('weighted Precision : ' + str(precision_score(yL_test2, y_pred, average='weighted')))
print('weighted Recall : ' + str(recall_score(yL_test2, y_pred, average='weighted')))

In [None]:
# Results (unweighted mean over classes).
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# the precision and recall values trade places.
print('macro F1 Score : ' + str(f1_score(yL_test2, y_pred, average='macro')))
print('macro Precision : ' + str(precision_score(yL_test2, y_pred, average='macro')))
print('macro Recall : ' + str(recall_score(yL_test2, y_pred, average='macro')))

# SVM

In [None]:
from sklearn.svm import SVC
# RBF-kernel SVM on the SDAE codes, using the class-name labels.
clf = SVC(kernel='rbf', random_state=0)
clf = clf.fit(XL_train1, yL_train2)

In [None]:
# Mean accuracy of the fitted classifier on the SDAE codes.
print(clf.score(XL_train1,yL_train2))# train accuracy
print(clf.score(XL_test1,yL_test2))# test accuracy

In [None]:
# Predicted class names for the test split.
y_pred=clf.predict(XL_test1)

In [None]:
# Results (weighted by class support in the true labels).
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# precision and recall trade places and the weights come from the predictions.
print('weighted F1 Score : ' + str(f1_score(yL_test2, y_pred, average='weighted')))
print('weighted Precision : ' + str(precision_score(yL_test2, y_pred, average='weighted')))
print('weighted Recall : ' + str(recall_score(yL_test2, y_pred, average='weighted')))

In [None]:
# Results (unweighted mean over classes).
# The precision line previously had its closing quote in the wrong place,
# which is a syntax error. Metrics are called as (y_true, y_pred); with the
# arguments swapped, the precision and recall values trade places.
print('macro F1 Score : ' + str(f1_score(yL_test2, y_pred, average='macro')))
print('macro Precision : ' + str(precision_score(yL_test2, y_pred, average='macro')))
print('macro Recall : ' + str(recall_score(yL_test2, y_pred, average='macro')))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree on the SDAE codes, using the class-name labels.
# NOTE(review): no random_state is set here (unlike the forest and the SVM),
# so results may vary between runs -- confirm whether that is intended.
clf = DecisionTreeClassifier(max_depth=60)
clf = clf.fit(XL_train1, yL_train2)

In [None]:
clf.score(XL_train1,yL_train2) #train accuracy

In [None]:
# Mean accuracy of the fitted tree on the SDAE codes of the test split.
print(clf.score(XL_test1,yL_test2))# test accuracy

In [None]:
# Results (weighted by class support in the true labels).
y_pred = clf.predict(XL_test1)
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# precision and recall trade places and the weights come from the predictions.
print('weighted F1 Score : ' + str(f1_score(yL_test2, y_pred, average='weighted')))
print('weighted Precision : ' + str(precision_score(yL_test2, y_pred, average='weighted')))
print('weighted Recall : ' + str(recall_score(yL_test2, y_pred, average='weighted')))

In [None]:
# Results (unweighted mean over classes).
y_pred = clf.predict(XL_test1)
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# the precision and recall values trade places.
print('macro F1 Score : ' + str(f1_score(yL_test2, y_pred, average='macro')))
print('macro Precision : ' + str(precision_score(yL_test2, y_pred, average='macro')))
print('macro Recall : ' + str(recall_score(yL_test2, y_pred, average='macro')))

### XGBOOST

In [None]:
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# One-vs-rest XGBoost on the SDAE codes; train0/train1 bracket the
# construction and fitting of the classifier.
train0 = time.time()
clf = OneVsRestClassifier(XGBClassifier(verbosity=0))
clf = clf.fit(XL_train1, yL_train2)
train1 = time.time()

In [None]:
# train accuracy
print(f'Train-score: {clf.score(XL_train1, yL_train2)}')

In [None]:
# test accuracy
print(f'Test-score: {clf.score(XL_test1, yL_test2)}')

In [None]:
# Results (weighted by class support in the true labels).
y_pred = clf.predict(XL_test1)
# scikit-learn metrics take (y_true, y_pred): with the arguments swapped,
# precision and recall trade places and the weights come from the predictions.
print('weighted F1 Score : ' + str(f1_score(yL_test2, y_pred, average='weighted')))
print('weighted Precision : ' + str(precision_score(yL_test2, y_pred, average='weighted')))
print('weighted Recall : ' + str(recall_score(yL_test2, y_pred, average='weighted')))

In [None]:
# Results (unweighted mean over classes).
y_pred = clf.predict(XL_test1)
# average='maco' was a typo that makes precision_score raise ValueError.
# Metrics are called as (y_true, y_pred); with the arguments swapped, the
# precision and recall values trade places.
print('macro F1 Score : ' + str(f1_score(yL_test2, y_pred, average='macro')))
print('macro Precision : ' + str(precision_score(yL_test2, y_pred, average='macro')))
print('macro Recall : ' + str(recall_score(yL_test2, y_pred, average='macro')))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Decision tree fitted directly on the one-hot label frame instead of the
# class names.
clf = DecisionTreeClassifier(max_depth=60)
clf = clf.fit(XL_train1, yL_train)
print(clf.score(XL_train1, yL_train))  # train accuracy
ypred = clf.predict(XL_test1)
print(accuracy_score(ypred, yL_test))  # test accuracy

In [None]:
len(XL)