In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Assigning data to Pandas DataFrame

Creates a string variable with the folder path to current data storage.

In [None]:
# Folder that holds the raw dataset text files. The trailing separator lets
# file names be appended to this string directly.
folder_path = "D:\\AMLI Capstone\\"

### Creating the Pandas DataFrame and naming the columns

Inputs the auth.txt file into a DataFrame.

In [None]:
# Load auth.txt into a DataFrame. Column names are supplied explicitly, so
# pandas treats every line of the file as data.
auth_df = pd.read_csv(
    folder_path + 'auth.txt',
    names=[
        'time',
        'source_user@domain',
        'destination_user@domain',
        'source_computer',
        'destination_computer',
        'authentication_type',
        'logon_type',
        'authentication_orientation',
        'success/failure',
    ],
)

In [None]:
# Show the authentication DataFrame as this cell's output.
auth_df

Inputs the redteam.txt file into a DataFrame.

In [None]:
# Load redteam.txt into a DataFrame. Column names are supplied explicitly, so
# pandas treats every line of the file as data.
redteam_df = pd.read_csv(
    folder_path + 'redteam.txt',
    names=[
        'time',
        'user@domain',
        'source_computer',
        'destination_computer',
    ],
)

In [None]:
# Show the red-team DataFrame as this cell's output.
redteam_df

## Labeling and splitting the data

Creates the `threat` label column: 0 means non-threat, 1 means threat. Every row starts as 0.

In [None]:
# Add the label column with every row initially marked 0 (non-threat).
auth_df['threat'] = 0

Creates the threats DataFrame, to which the matched red-team rows are appended later.

In [None]:
# Empty DataFrame that will collect the auth_df rows matched to red-team
# events. The columns are passed to the constructor: calling
# `rename(columns=[...])` with a list on an empty frame changes nothing, and
# its return value was discarded, so the frame never received these names.
threats = pd.DataFrame(
    columns=[
        'time',
        'source_user@domain',
        'destination_user@domain',
        'source_computer',
        'destination_computer',
        'authentication_type',
        'logon_type',
        'authentication_orientation',
        'success/failure',
    ]
)

threats

This cell iterates through each element in the redteam_df DataFrame.

For each red-team event, its time value is read and the rows of the auth_df DataFrame with the same time are selected.

The same process happens with the source_user@domain column with the time matched list.

The same process happens with the source_computer column with the source_user@domain matched list.

The same process happens with the destination_computer column with source_computer matched list.

Once a redteam_df event is matched to one or more auth_df rows, the matching rows are appended to the threats DataFrame, and after the loop every collected row is labeled threat = 1. The index of the threats DataFrame is used in the next cell to flag the same rows in auth_df, so the label can be used for stratified splitting later.

In [None]:
# Match every red-team event against auth_df and collect the matching rows.
#
# An auth_df row matches an event when its time, source user, source computer
# and destination computer all equal the event's values. Matching rows are
# gathered in a list and concatenated once after the loop, instead of growing
# `threats` with pd.concat on every iteration.
matched_frames = []

# Iterate over *all* red-team events; a bound of `shape[0] - 1` would skip the
# last event.
for i in range(redteam_df.shape[0]):
    print('Redteam event num: ' + str(i))

    print('Finding matches')
    event = redteam_df.iloc[i]

    # Filter on time first so the remaining comparisons only touch the rows
    # that share the event's timestamp.
    time_df = auth_df.loc[auth_df['time'] == event['time']]
    user_df = time_df.loc[time_df['source_user@domain'] == event['user@domain']]
    comp_df = user_df.loc[user_df['source_computer'] == event['source_computer']]
    final_df = comp_df.loc[comp_df['destination_computer'] == event['destination_computer']]

    index = final_df.index.tolist()

    if index:
        print('Index: ', index, end='\n\n')
        print('Marking event as a threat')
        matched_frames.append(final_df)
    else:
        print('No match found in auth_df', end='\n\n')

# Keep whatever `threats` already holds and add the new matches in one concat.
# An empty `threats` is left out so it cannot influence the result's dtypes.
if matched_frames:
    threats = pd.concat(matched_frames if threats.empty else [threats, *matched_frames])

# Every collected row is a red-team (threat) event.
threats['threat'] = 1

In [None]:
# Flag in auth_df every row whose index label appears in `threats`.
auth_df.loc[threats.index, 'threat'] = 1

In [None]:
# Print the auth_df row at integer position 29627158.
# NOTE(review): this hard-coded position is presumably one of the matched
# red-team rows — confirm. `.iloc` raises IndexError if auth_df has fewer rows.
print(auth_df.iloc[29627158])

In [None]:
# (rows, columns) of auth_df, which now includes the 'threat' column.
auth_df.shape

## Data Preprocessing: String data to ID

Each cell gathers a sorted list of the unique categories in its corresponding column, enumerates the categories, and divides each position by the total number of unique categories to scale the values into the range [0, 1).

### source_user@domain

In [None]:
# Encode source_user@domain: each distinct value, taken in sorted order, is
# assigned position / number_of_distinct_values.
source_user_domain = list(pd.unique(auth_df['source_user@domain'].values.ravel()))
source_user_domain.sort()

source_user_domain_to_id = [
    position / len(source_user_domain)
    for position in range(len(source_user_domain))
]

# value -> scaled id lookup
source_user_domain_dictionary = dict(zip(source_user_domain, source_user_domain_to_id))

### destination_user@domain

In [None]:
# Encode destination_user@domain: each distinct value, taken in sorted order,
# is assigned position / number_of_distinct_values.
destination_user_domain = list(pd.unique(auth_df['destination_user@domain'].values.ravel()))
destination_user_domain.sort()

destination_user_domain_to_id = [
    position / len(destination_user_domain)
    for position in range(len(destination_user_domain))
]

# value -> scaled id lookup
destination_user_domain_dictionary = dict(zip(destination_user_domain, destination_user_domain_to_id))

### source_computer

In [None]:
# Encode source_computer: each distinct value, taken in sorted order, is
# assigned position / number_of_distinct_values.
source_computer = list(pd.unique(auth_df['source_computer'].values.ravel()))
source_computer.sort()

source_computer_to_id = [
    position / len(source_computer)
    for position in range(len(source_computer))
]

# value -> scaled id lookup
source_computer_dictionary = dict(zip(source_computer, source_computer_to_id))

### destination_computer

In [None]:
# Encode destination_computer: each distinct value, taken in sorted order, is
# assigned position / number_of_distinct_values.
destination_computer = list(pd.unique(auth_df['destination_computer'].values.ravel()))
destination_computer.sort()

destination_computer_to_id = [
    position / len(destination_computer)
    for position in range(len(destination_computer))
]

# value -> scaled id lookup
destination_computer_dictionary = dict(zip(destination_computer, destination_computer_to_id))

### authentication_type

In [None]:
# Encode authentication_type: each distinct value, taken in sorted order, is
# assigned position / number_of_distinct_values.
authentication_type = list(pd.unique(auth_df['authentication_type'].values.ravel()))
authentication_type.sort()

authentication_type_to_id = [
    position / len(authentication_type)
    for position in range(len(authentication_type))
]

# value -> scaled id lookup
authentication_type_dictionary = dict(zip(authentication_type, authentication_type_to_id))

### logon_type

In [None]:
# Encode logon_type: each distinct value, taken in sorted order, is assigned
# position / number_of_distinct_values.
logon_type = list(pd.unique(auth_df['logon_type'].values.ravel()))
logon_type.sort()

logon_type_to_id = [
    position / len(logon_type)
    for position in range(len(logon_type))
]

# value -> scaled id lookup
logon_type_dictionary = dict(zip(logon_type, logon_type_to_id))

### authentication_orientation

In [None]:
# Encode authentication_orientation: each distinct value, taken in sorted
# order, is assigned position / number_of_distinct_values.
authentication_orientation = list(pd.unique(auth_df['authentication_orientation'].values.ravel()))
authentication_orientation.sort()

authentication_orientation_to_id = [
    position / len(authentication_orientation)
    for position in range(len(authentication_orientation))
]

# value -> scaled id lookup
authentication_orientation_dictionary = dict(
    zip(authentication_orientation, authentication_orientation_to_id)
)

### success/failure

In [None]:
# Distinct values found in the success/failure column (unsorted).
success_failure = [*pd.unique(auth_df['success/failure'].values.ravel())]

success_failure

### Data Preprocessing and Scaling

In [None]:
# Encode train_set into a purely numeric frame:
#   * each categorical column is replaced by its scaled id from the lookup
#     dictionaries built in the cells above;
#   * 'success/failure' becomes 1 for 'Success' and 0 for anything else;
#   * 'time' and 'threat' are carried over unchanged.
#
# The columns are mapped in bulk instead of building one row at a time with
# DataFrame.append: that method was removed in pandas 2.0, and growing a frame
# row by row is quadratic. Unlike the row-by-row version, which gave every row
# index label 0, the result keeps train_set's index.
#
# NOTE(review): `train_set` is not defined in any visible cell of this
# notebook; it must be created before this cell runs — confirm where it
# comes from.
category_lookups = {
    'source_user@domain': source_user_domain_dictionary,
    'destination_user@domain': destination_user_domain_dictionary,
    'source_computer': source_computer_dictionary,
    'destination_computer': destination_computer_dictionary,
    'authentication_type': authentication_type_dictionary,
    'logon_type': logon_type_dictionary,
    'authentication_orientation': authentication_orientation_dictionary,
}

encoded_columns = {}
for column, lookup in category_lookups.items():
    encoded = train_set[column].map(lookup)
    # Series.map yields NaN for values missing from the lookup; fail loudly,
    # as a direct dictionary access would.
    if encoded.isna().any():
        raise KeyError(f'unmapped value(s) in column {column!r}')
    encoded_columns[column] = encoded

encoded_columns['success/failure'] = (train_set['success/failure'] == 'Success').astype(int)

processed_train_set = train_set.assign(**encoded_columns)

processed_train_set.head()

Splits the processed data into a train set and a test set, holding out 20% of the rows (`test_size = 0.2`) for testing and stratifying on the `threat` label.

In [None]:
# Feature / target column names. They are defined here because this cell runs
# before the "Features and targets classification" cell further down, so on a
# fresh kernel FEATURES and TARGETS would not exist yet.
FEATURES = [x for x in auth_df.columns if x != 'threat' and x != 'time']
TARGETS = 'threat'

# Hold out 20% of the rows for testing, keeping the threat/non-threat ratio
# the same in both parts. The fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    processed_train_set[FEATURES],
    processed_train_set[TARGETS],
    test_size=0.2,
    stratify=processed_train_set['threat'],
    random_state=42,
)

# Cast both label vectors to int, not just the training one, so the labels
# passed to the metric functions have the same integer dtype as the
# classifier's predictions.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

## Building the model

Features and targets classification

In [None]:
# Label column, and the model inputs: every auth_df column except the label
# and the timestamp.
TARGETS = 'threat'

FEATURES = [column for column in auth_df.columns if column not in ('threat', 'time')]

FEATURES, TARGETS

### K-nearest neighbor

In [None]:
# KNN with k set to the integer part of the square root of the number of
# training rows, scored with 5-fold cross-validation on the training split.
estimator = KNeighborsClassifier(
    n_neighbors=int(len(x_train) ** 0.5),
    p=2,
    metric='euclidean',
)

scores = cross_val_score(estimator, x_train, y_train, cv=5, verbose=1)

In [None]:
# Average score across the cross-validation folds.
np.mean(scores)

In [None]:
# Fit a fresh KNN (same hyper-parameters as the cross-validated estimator) on
# the whole training split, then predict labels for the test split.
KNN = KNeighborsClassifier(
    n_neighbors=int(len(x_train) ** 0.5),
    p=2,
    metric='euclidean',
).fit(x_train, y_train)

y_pred = KNN.predict(x_test)

In [None]:
# Distinct labels the model predicted on the test split; a single value here
# means the model never predicted the other class.
np.unique(y_pred)

In [None]:
# Evaluate the test-split predictions. confusion_matrix, f1_score and
# accuracy_score are imported in the notebook's first (imports) cell rather
# than in the middle of the notebook.
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('The confusion matrix is', cm)
print('The F1 score is', f1)
print('The accuracy score is', accuracy)

## Redteam Exploratory Data Analysis

In [None]:
# Number of red-team events per source computer, drawn as a bar chart.
source_unique = redteam_df.groupby('source_computer')['destination_computer'].count()

plt.bar(x=source_unique.index, height=source_unique)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Correlation heatmap of auth_df's numeric columns.
# Drawn with matplotlib because `sns` is never imported in this notebook, so
# calling it raises NameError. Non-numeric columns are dropped first because
# DataFrame.corr() raises on string columns in pandas >= 2.0.
numeric_corr = auth_df.select_dtypes(include='number').corr()

fig, ax = plt.subplots()
heatmap = ax.imshow(numeric_corr, cmap='inferno')
ax.set_xticks(range(len(numeric_corr.columns)))
ax.set_xticklabels(numeric_corr.columns, rotation=90)
ax.set_yticks(range(len(numeric_corr.index)))
ax.set_yticklabels(numeric_corr.index)
ax.set_title('Correlation of numeric auth_df columns')
fig.colorbar(heatmap, ax=ax)
plt.show()

## DataFrames for other data

In [None]:
# Load proc.txt into a DataFrame. Column names are supplied explicitly, so
# pandas treats every line of the file as data.
proc_df = pd.read_csv(
    folder_path + 'proc.txt',
    names=[
        'time',
        'user@domain',
        'computer',
        'process_name',
        'start/end',
    ],
)

In [None]:
# Load flows.txt into a DataFrame. Column names are supplied explicitly, so
# pandas treats every line of the file as data.
flows_df = pd.read_csv(
    folder_path + 'flows.txt',
    names=[
        'time',
        'duration',
        'source_computer',
        'source_port',
        'destination_computer',
        'destination_port',
        'protocol',
        'packet_count',
        'byte_count',
    ],
)

In [None]:
# Load dns.txt into a DataFrame. Column names are supplied explicitly, so
# pandas treats every line of the file as data.
dns_df = pd.read_csv(
    folder_path + 'dns.txt',
    names=[
        'time',
        'source_computer',
        'computer_resolved',
    ],
)