In [1]:
"""
File presents operations performed in order to preprocess data to use in machine learning algorithms. 
Dataset downloaded from: https://www.kaggle.com/jsrojas/ip-network-traffic-flows-labeled-with-87-apps
"""

'\nFile presents operations performed in order to preprocess data to use in machine learning algorithms. \nDataset downloaded from: https://www.kaggle.com/jsrojas/ip-network-traffic-flows-labeled-with-87-apps\n'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import socket, struct

import imblearn
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import NearMiss

In [None]:
# Load the flow records from the local copy of the Kaggle CSV into a DataFrame
filepath = "./kaggleDataset/Dataset-Unicauca-Version2-87Atts.csv"
df = pd.read_csv(filepath)
# Display the first rows as a quick sanity check of the load
df.head()

In [None]:
# Display the last rows of the loaded frame
df.tail()

In [None]:
# Function that changes IP addresses into numbers

def ip2int(ip):
    """Convert a dotted IPv4 address string into its unsigned 32-bit integer.

    The address is packed into four bytes by ``socket.inet_aton`` and those
    bytes are interpreted in network (big-endian) byte order.

    Args:
        ip: IPv4 address as text, e.g. ``"192.168.1.1"``.

    Returns:
        The address as a plain Python ``int``.

    Raises:
        OSError: if ``socket.inet_aton`` rejects the string.
    """
    return int.from_bytes(socket.inet_aton(ip), byteorder="big")

In [None]:
# Replace the textual source/destination addresses with their integer form.
# Note: ip2int expects strings, so re-running this cell on columns that were
# already converted will raise.
df['Source.IP'] = df['Source.IP'].map(ip2int)
df['Destination.IP'] = df['Destination.IP'].map(ip2int)

In [None]:
# True if at least one cell anywhere in the dataframe is missing
df.isna().values.any()

In [None]:
# Print the dtype of every column
print(df.dtypes)

In [None]:
# Columns holding at most one distinct value carry no information for a model
df.columns[df.nunique().le(1)]

In [None]:
# Number of records per application, most frequent first
df.ProtocolName.value_counts()

In [None]:
# Applications to drop from the dataset: value_counts() orders by descending
# frequency, so the last 25 positions are the 25 rarest applications.
feats_toDelete = df['ProtocolName'].value_counts().iloc[-25:].index
feats_toDelete

In [None]:
# Bar chart of the record count per application (before any removal)
target_count = df['ProtocolName'].value_counts()
plt.figure(figsize=(16, 10))
target_count.plot.bar(title='Occurance');

In [None]:
# Keep only the rows whose application is NOT listed in feats_toDelete; those
# applications have too few records to matter for the model.
df = df.loc[~df['ProtocolName'].isin(feats_toDelete)]

In [None]:
# Same bar chart again, now for the applications that remain after the removal
target_count2 = df['ProtocolName'].value_counts()
plt.figure(figsize=(16, 10))
target_count2.plot.bar(title='Occurance');

In [None]:
# Split the frame into the feature matrix X and the target Y.
# `feats` was used here without being defined anywhere in the notebook, so a
# fresh "Restart & Run All" stopped with a NameError. Define it explicitly as
# every numeric column other than the target.
# NOTE(review): this keeps any numeric column that merely encodes the label or
# an identifier -- confirm the intended feature list and exclude such columns
# here if the dataset has them.
feats = [col for col in df.select_dtypes(include='number').columns
         if col != 'ProtocolName']
X = df[feats].astype(float)
Y = df['ProtocolName']

In [None]:
# Map every application name to an integer class id (fit and transform in one step)
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [None]:
# Print (application name, record count) pairs sorted by application name
print(sorted(Counter(Y).items()))

In [None]:
# Print (encoded class id, record count) pairs sorted by class id
print(sorted(Counter(encoded_Y).items()))

In [None]:
# Per-class sample counts handed to NearMiss as its sampling_strategy:
# every application listed here is mapped to 10000 records.
dict_nearMiss = dict.fromkeys(
    [
        "GOOGLE",
        "HTTP",
        "HTTP_PROXY",
        "SSL",
        "HTTP_CONNECT",
        "YOUTUBE",
        "AMAZON",
        "MICROSOFT",
        "GMAIL",
        "WINDOWS_UPDATE",
        "SKYPE",
        "FACEBOOK",
        "DROPBOX",
        "YAHOO",
        "TWITTER",
        "CLOUDFLARE",
        "MSN",
    ],
    10000,
)

In [None]:
# Per-class sample counts handed to SMOTE as its sampling_strategy:
# every application listed here is mapped to 10000 records.
dict_smote = dict.fromkeys(
    [
        "CONTENT_FLASH",
        "APPLE",
        "OFFICE_365",
        "WHATSAPP",
        "INSTAGRAM",
        "WIKIPEDIA",
        "MS_ONE_DRIVE",
        "DNS",
        "IP_ICMP",
        "NETFLIX",
        "APPLE_ITUNES",
        "SPOTIFY",
        "APPLE_ICLOUD",
        "EBAY",
        "SSL_NO_CERT",
        "GOOGLE_MAPS",
        "EASYTAXI",
        "TEAMVIEWER",
        "HTTP_DOWNLOAD",
        "MQTT",
        "TOR",
        "FTP_DATA",
        "UBUNTUONE",
        "NTP",
        "SSH",
    ],
    10000,
)
# The original printed `dict_smote2`, a name that is never defined (NameError);
# print the dictionary that was just built.
print(dict_smote)

In [None]:
# Rebalance the dataset: SMOTE handles the classes listed in dict_smote, then
# NearMiss handles the classes listed in dict_nearMiss; each class is brought
# to the size given for it in its dictionary.
# SMOTE draws random neighbours when it synthesises samples, so its seed is
# pinned to make the resampled dataset reproducible between runs.
pipe = make_pipeline(
    SMOTE(sampling_strategy=dict_smote, random_state=42),
    NearMiss(sampling_strategy=dict_nearMiss),
)

In [None]:
# Run the resampling pipeline on the features/labels and keep the rebalanced pair
X_resampled, y_resampled = pipe.fit_resample(X, Y)

In [None]:
# Compare the sizes before and after resampling.
# ("pierwotnego pliku" is Polish for "original file"; df here is the frame
# left after the rare applications were dropped.)
print(f"Shape pierwotnego pliku {df.shape}")
print(f"Shape X {X.shape}")
print(f"Shape X_resampled{X_resampled.shape}")
print(f"Shape Y {Y.shape}")
print(f"Shape y_resampled{y_resampled.shape}")

In [None]:
# Wrap the resampled feature matrix in a DataFrame labelled with the feature names
new_dataframe = pd.DataFrame(X_resampled, columns=feats)

In [None]:
# Attach the resampled labels as the ProtocolName column of the new dataset
new_dataframe['ProtocolName'] = y_resampled

In [None]:
# Summary statistics of the resampled dataset
new_dataframe.describe()

In [None]:
# Write the resampled dataset to CSV without the index column.
# NOTE(review): the file name says "Imbalanced" although this frame is the
# output of the resampling step -- confirm the name is intended.
new_dataframe.to_csv('KaggleImbalanced.csv', index=False)