In [47]:
!pip install nltk



In [48]:
import pandas as pd
import random
from azureml.core import Workspace, Datastore, Dataset
from nltk.tokenize import WhitespaceTokenizer


In [49]:
# Importing everyone's datasets.
# Every CSV is read the same way: its own header row (row 0) is discarded and
# the two columns are named "Data" and "Label".
_COLUMN_NAMES = ["Data", "Label"]


def _read_labelled_csv(path):
    """Read one dataset CSV into a two-column (Data, Label) DataFrame."""
    return pd.read_csv(path, names=_COLUMN_NAMES, header=0)


usernames_df = _read_labelled_csv("../Dataset_CSVs/tweetsv2.csv")
hostnames_df = _read_labelled_csv("../Dataset_CSVs/hostname_stringsv2.csv")
filenames_df = _read_labelled_csv("../Dataset_CSVs/filenamesv2.csv")
ip_df = _read_labelled_csv("../Dataset_CSVs/ip_datapointsv2.csv")
servernames_df = _read_labelled_csv("../Dataset_CSVs/servernamesv2.csv")


In [50]:
# Label distribution of the IP dataset (number of rows per label value).
ip_df['Label'].value_counts()

IP         2527
nothing    2473
Name: Label, dtype: int64

In [51]:
# Print the label distribution of each source dataset, in the order
# usernames, hostnames, filenames, IPs, servernames.
l = [usernames_df,hostnames_df,filenames_df,ip_df,servernames_df]
for d in l:
    print(d['Label'].value_counts())

username    7319
nothing     3009
Name: Label, dtype: int64
hostname    5025
nothing     4975
Name: Label, dtype: int64
filename    8741
Name: Label, dtype: int64
IP         2527
nothing    2473
Name: Label, dtype: int64
servername    186
Name: Label, dtype: int64


In [52]:
# Combining datasets and randomizing resulting list

# Fixed seed so that Restart & Run All reproduces the same row order.
SHUFFLE_SEED = 42

# Each DataFrame becomes a list of [Data, Label] rows; the lists are concatenated.
dataset = (
    usernames_df.values.tolist()
    + hostnames_df.values.tolist()
    + filenames_df.values.tolist()
    + ip_df.values.tolist()
    + servernames_df.values.tolist()
)
# A dedicated seeded generator makes the shuffle deterministic without
# touching the state of the global `random` module.
random.Random(SHUFFLE_SEED).shuffle(dataset)
print(len(dataset))

34255


In [53]:
# Features

# Username specific features
def LowerUnderscoreUpper(s):
    """Return True if ``s`` contains lowercase, '_', uppercase in a row.

    Example: ``"john_Doe"`` matches because of ``n_D``.

    The argument is coerced with ``str()`` so non-string values (numbers, NaN)
    are handled the same way as in the other feature functions instead of
    raising TypeError.
    """
    s = str(s)
    # Slide a three-character window over the string.
    return any(
        first.islower() and second == '_' and third.isupper()
        for first, second, third in zip(s, s[1:], s[2:])
    )

def HasUnderscore(s):
    """Return True if the string form of ``s`` contains an underscore."""
    return '_' in str(s)

def LowerUpperLower(s):
    """Return True if ``s`` has a lowercase-uppercase-lowercase run of three
    consecutive characters (e.g. the ``nDo`` in ``"johnDoe"``)."""
    text = str(s)
    triples = zip(text, text[1:], text[2:])
    return any(
        left.islower() and middle.isupper() and right.islower()
        for left, middle, right in triples
    )

def MultipleLowerUpperLower(s):
    """Return True if ``s`` contains at least two lowercase-uppercase-lowercase
    runs of three consecutive characters (the runs may overlap)."""
    text = str(s)
    hits = 0
    for left, middle, right in zip(text, text[1:], text[2:]):
        if left.islower() and middle.isupper() and right.islower():
            hits += 1
            # Stop as soon as the second occurrence is seen.
            if hits == 2:
                return True
    return False

def ExactlyTwoUppercase(s):
    """Return True if ``s`` contains exactly two uppercase characters.

    The argument is coerced with ``str()`` so non-string values are handled
    the same way as in the other feature functions instead of raising
    TypeError.
    """
    s = str(s)
    return sum(1 for c in s if c.isupper()) == 2

def AllLowerMoreThan(bound, s):
    """Return True if ``s`` has at least ``bound`` characters and none of them
    is uppercase.

    Note that digits and punctuation are allowed: the check is "no uppercase
    character", not "every character is lowercase".
    """
    text = str(s)
    long_enough = len(text) >= bound
    return long_enough and not any(ch.isupper() for ch in text)

def AdjacentUppers(s):
    """Return True if ``s`` contains two uppercase characters side by side."""
    text = str(s)
    return any(
        current.isupper() and following.isupper()
        for current, following in zip(text, text[1:])
    )

def StartLetterEndNonLetter(s):
    """Return True if ``s`` starts with a letter and ends with a non-letter
    (e.g. ``"user1"``).

    An empty string returns False instead of raising IndexError.
    """
    s = str(s)
    if not s:
        return False
    return s[0].isalpha() and not s[-1].isalpha()

def IsUrl(s):
    """Return True if the string form of ``s`` starts with ``"http"``.

    This single prefix test also covers ``"https"``; a separate check for it
    can never change the result.

    NOTE(review): the test is deliberately loose and also matches non-URL
    tokens such as ``"httpd.conf"``; tighten to ``"http://"`` / ``"https://"``
    only if that is confirmed to be the intent.
    """
    return str(s).startswith("http")

# Hostname specific features
def LengthGTLT(gt, ls, s):
    """Return True if the length of ``str(s)`` lies in the inclusive range
    ``[gt, ls]``."""
    return gt <= len(str(s)) <= ls

def IllegalHostnameChars(s):
    """Return True if ``s`` contains a character from the illegal-hostname set.

    The backslash and the forward slash are separate, correctly escaped
    entries. Previously they were fused into one multi-character string
    literal, so neither slash was ever detected.
    """
    s = str(s)
    illegal_chars = {
        ".", "\\", "/", "*", "?", "\"", "<", ">", "|", ",", "~", ":", "!",
        "@", "#", "$", "%", "^", "&", "'", "(", ")", "{", "}", " ",
    }
    return any(c in illegal_chars for c in s)

def AlphaOrDigit(s):
    """Return True if ``s`` consists only of letters or only of digits
    (a mix of both, or an empty string, gives False)."""
    text = str(s)
    return text.isalpha() or text.isdigit()

def HostIllegalEnding(s):
    """Return True if ``s`` ends with a hyphen or a period.

    An empty string returns False instead of raising IndexError.
    """
    # str.endswith accepts a tuple of suffixes and is safe on empty strings.
    return str(s).endswith(('-', '.'))

# Filename specific features

def ContainsPeriod(s):
    """Return True if the string form of ``s`` contains a period."""
    return '.' in str(s)


def HasSlash(s):
    """Return True if ``s`` contains a forward slash or a backslash."""
    text = str(s)
    return '/' in text or '\\' in text

def HasMultipleSlash(s):
    """Return True if ``s`` contains at least two slash characters, counting
    forward slashes and backslashes together."""
    text = str(s)
    slash_total = text.count('/') + text.count('\\')
    return slash_total >= 2

def HasPossibleExtension(s):
    """Return True if some period in ``s`` is followed by between two and
    four characters up to the end of the string (e.g. ``".py"``, ``".json"``).

    Every period is considered, so ``"archive.tar.gz"`` matches on ``".gz"``.
    A single-character suffix such as ``".c"`` does not match.
    """
    text = str(s)
    last_index = len(text) - 1
    return any(
        # last_index - position == number of characters after this period
        char == '.' and 2 <= last_index - position <= 4
        for position, char in enumerate(text)
    )

# IP Features
def NumbersThenPeriod(s):
    """Return True if ``s`` contains exactly three places where an ASCII digit
    is immediately followed by a period (as in a dotted-quad IPv4 address)."""
    text = str(s)
    digit_dot_pairs = sum(
        1
        for current, following in zip(text, text[1:])
        if '0' <= current <= '9' and following == '.'
    )
    return digit_dot_pairs == 3

def AtLeastFourDigits(s):
    """Return True if ``s`` contains four or more numeric characters
    (as judged by ``str.isnumeric``), in any positions."""
    text = str(s)
    numeric_total = sum(1 for ch in text if ch.isnumeric())
    return numeric_total >= 4

# Servername Features

def HasPeriodAndSlash(s):
    """Return True if ``s`` contains both a period and a forward slash.

    The slash test looks at the characters of the string. Previously it
    compared the whole string to '/', so the function could never return True.
    """
    s = str(s)
    return '.' in s and '/' in s

def HasInstanceNumPeriod(s):
    """Return True if ``s`` contains a numeric character immediately followed
    by a period (e.g. the ``1.`` in ``"srv1.example"``)."""
    text = str(s)
    return any(
        current.isnumeric() and following == '.'
        for current, following in zip(text, text[1:])
    )

def HasMultipleNumPeriod(s):
    """Return True if ``s`` contains at least two places where a numeric
    character is immediately followed by a period.

    Occurrences are counted. Previously the flag set by the first match was
    tested again in the same iteration, so one match was enough to return True.
    """
    s = str(s)
    occurrences = 0
    for current, following in zip(s, s[1:]):
        if current.isnumeric() and following == '.':
            occurrences += 1
            if occurrences == 2:
                return True
    return False

# Column names for the resulting csv file: one per feature, in the order the
# features are computed, followed by the label column.
feature_names = [
    # Username features
    "LowerUnderscoreUpper",
    "HasUnderscore",
    "LowerUpperLower",
    "MultipleLowerUpperLower",
    "ExactlyTwoUppercase",
    "AllLowerMoreThan",
    "AdjacentUppers",
    "StartLetterEndNonLetter",
    # Hostname features
    "LengthGTLT",
    "IllegalHostnameChars",
    "AlphaOrDigit",
    "HostIllegalEnding",
    # Filename features
    "ContainsPeriod",
    "HasSlash",
    "HasMultipleSlash",
    "HasPossibleExtension",
    # IP features
    "NumbersThenPeriod",
    "AtLeastFourDigits",
    # Servername features
    "HasPeriodAndSlash",
    "HasInstanceNumPeriod",
    "HasMultipleNumPeriod",
    # Target column
    "Label",
]

In [54]:
# extracting features and labels
#
# Each datapoint becomes one row: a boolean per feature followed by the label.
# A feature is True when it holds for at least one whitespace-separated token
# of the datapoint's text; tokens recognised by IsUrl are dropped first.
feature_list = []

tk = WhitespaceTokenizer()

# Per-token predicates, in the same order as the feature columns listed in
# feature_names (every column except the trailing "Label").
token_features = [
    LowerUnderscoreUpper,
    HasUnderscore,
    LowerUpperLower,
    MultipleLowerUpperLower,
    ExactlyTwoUppercase,
    lambda x: AllLowerMoreThan(10, x),   # at least 10 characters, no uppercase
    AdjacentUppers,
    StartLetterEndNonLetter,
    lambda x: LengthGTLT(1, 15, x),      # token length between 1 and 15 inclusive
    IllegalHostnameChars,
    AlphaOrDigit,
    HostIllegalEnding,
    ContainsPeriod,
    HasSlash,
    HasMultipleSlash,
    HasPossibleExtension,
    NumbersThenPeriod,
    AtLeastFourDigits,
    HasPeriodAndSlash,
    HasInstanceNumPeriod,
    HasMultipleNumPeriod,
]

for dp in dataset:
    d = str(dp[0])
    label = dp[1]
    tokens = [token for token in tk.tokenize(d) if not IsUrl(token)]
    # any() over an empty token list is False, matching "no token satisfies it".
    dp_list = [any(feature(token) for token in tokens) for feature in token_features]
    dp_list.append(label)
    feature_list.append(dp_list)


In [55]:
# Writing featurized + labeled dataset to a csv file

labeled_features_df = pd.DataFrame(feature_list, columns=feature_names)
labeled_features_df.to_csv("./Data/Alpha_Featurizedv2.csv", index=False)
# Preview goes last: only the final expression of a cell is displayed.
labeled_features_df.head()

In [59]:
# Registering the featurized + labeled dataset
#
# Steps: connect to the Azure ML workspace, upload the local Data/ directory to
# the workspace's default datastore, build a tabular dataset from the uploaded
# CSV and register it under the name 'Alpha_Featurizedv2'.

# NOTE(review): workspace identifiers are hard-coded in the notebook; consider
# reading them from environment variables or a workspace config file instead.
subscription_id = '9d0dfa04-d2f8-4521-b945-b3a7dbf43946'
resource_group = 'CougsInAzure'
workspace_name = 'CougsInAzure2'

workspace = Workspace(subscription_id, resource_group, workspace_name)
datastore = workspace.get_default_datastore()
# src_dir is the whole Data/ directory, so every file in it is uploaded (not
# only Alpha_Featurizedv2.csv); overwrite=True is passed for existing targets.
datastore.upload(src_dir='Data/', target_path='Data/',overwrite=True)
# The path is a (datastore, relative path) pair pointing at the uploaded CSV.
alpha_featurized_ds = Dataset.Tabular.from_delimited_files(path=[(datastore, ('Data/Alpha_Featurizedv2.csv'))])
featurized_registered_dataset = alpha_featurized_ds.register(workspace=workspace,
                                 name='Alpha_Featurizedv2',
                                 description='Alpha Prototype Training Featurization Data V2')

Uploading an estimated of 3 files
Uploading Data/Alpha_Featurized.csv
Uploaded Data/Alpha_Featurized.csv, 1 files out of an estimated total of 3
Uploading Data/Alpha_Dataset.csv
Uploaded Data/Alpha_Dataset.csv, 2 files out of an estimated total of 3
Uploading Data/Alpha_Featurizedv2.csv
Uploaded Data/Alpha_Featurizedv2.csv, 3 files out of an estimated total of 3
Uploaded 3 files
