In [171]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [210]:
class clust():
    """Train/test harness comparing a classifier with and without k-means features.

    The constructor splits the supplied frame into features and the
    ``'insiderthreat'`` target, then into train and test partitions.
    ``kmeans`` optionally adds (or substitutes) a cluster-id feature, and
    ``classify`` fits a model on the scaled features and prints its accuracy
    on the test partition.
    """

    def _load_data(self, sklearn_load_ds):
        """Split ``sklearn_load_ds`` into train/test features and labels.

        Parameters
        ----------
        sklearn_load_ds : pandas.DataFrame
            Frame containing an ``'insiderthreat'`` column (used as the
            target); every other column is used as a feature.

        Sets ``self.X_train``, ``self.X_test``, ``self.Y_train`` and
        ``self.Y_test`` using a 30% test split with a fixed random state.
        """
        data = sklearn_load_ds
        X = data.drop('insiderthreat', axis=1)
        Y = data['insiderthreat']

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=42)

    def __init__(self, sklearn_load_ds):
        """Build the train/test partitions from ``sklearn_load_ds``."""
        self._load_data(sklearn_load_ds)

    def classify(self, model=None):
        """Fit ``model`` on the standardised training data and print test accuracy.

        Parameters
        ----------
        model : estimator with ``fit`` / ``predict``, optional
            Classifier to train. When omitted, a fresh L1-penalised
            ``LogisticRegression`` (``C=1.0``, ``liblinear`` solver) is
            created for this call, so no estimator instance is shared
            between calls.
        """
        if model is None:
            model = LogisticRegression(penalty='l1', C=1.0, solver='liblinear')

        scaler = StandardScaler()
        X_train = scaler.fit_transform(self.X_train)
        # Reuse the statistics learned from the training split: re-fitting on
        # the test split would scale the two sets differently and leak
        # test-set information into preprocessing.
        X_test = scaler.transform(self.X_test)

        model.fit(X_train, self.Y_train)
        y_pred = model.predict(X_test)
        print('Accuracy: {}'.format(accuracy_score(self.Y_test, y_pred)))

    def kmeans(self, output='add'):
        """Cluster the training features and expose the cluster ids as a feature.

        The number of clusters equals the number of distinct labels in
        ``self.Y_train``. KMeans is fitted on the training split only; test
        rows are assigned to the nearest learned centroid.

        Parameters
        ----------
        output : {'add', 'replace'}
            ``'add'`` appends a ``'km_clust'`` column to the existing feature
            frames; ``'replace'`` substitutes the features with a single
            column of cluster ids (a 2-D NumPy array).

        Returns
        -------
        clust
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``output`` is neither ``'add'`` nor ``'replace'``.
        """
        # Validate before doing any work so a typo does not cost a full fit.
        if output not in ('add', 'replace'):
            raise ValueError('output should be either add or replace')

        n_clusters = len(np.unique(self.Y_train))
        clf = KMeans(n_clusters=n_clusters, random_state=42)
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.predict(self.X_test)

        if output == 'add':
            # assign() builds new frames instead of writing into the objects
            # returned by train_test_split (avoids SettingWithCopyWarning).
            self.X_train = self.X_train.assign(km_clust=y_labels_train)
            self.X_test = self.X_test.assign(km_clust=y_labels_test)
        else:
            self.X_train = y_labels_train[:, np.newaxis]
            self.X_test = y_labels_test[:, np.newaxis]
        return self



In [211]:
# Load the transformed training set and turn the categorical columns into
# integer codes. A single LabelEncoder is re-fitted per column, so after this
# cell it only retains the mapping of the last column encoded ('action').
df = pd.read_csv("scenario2-training-dataset-transformed-tf.csv")
label_encoding = preprocessing.LabelEncoder()
for column, cast_to_str in (('user', False), ('source', False), ('action', True)):
    # 'action' is cast to str before encoding; the other columns are encoded as read.
    raw_values = df[column].astype(str) if cast_to_str else df[column]
    df[column] = label_encoding.fit_transform(raw_values)


In [212]:
# Cluster-augmented run: append the k-means cluster id as an extra
# 'km_clust' feature, then fit the default classifier and print test accuracy.
clust(df).kmeans(output='add').classify()

Accuracy: 1.0


In [213]:
# Baseline run: default classifier on the original features only (no cluster feature).
# NOTE(review): the recorded output is a perfect accuracy of 1.0 — worth
# confirming that no feature column encodes the 'insiderthreat' label.
clust(df).classify()

Accuracy: 1.0
