In [None]:
# 0: No anomalies
# 1: Separation of Duty (SoD) violation - single subject with read and write access to sensitive data
# 2: Overly permissive access
# 3: Improper privilege assignment
# 4: Critical system file modification
# 5: Contradictory allow rules for same subject-object-permission combination
# 6: Missing necessary port access for network services
# 7: Incorrect type usage
# 8: Domain transition issues
# 9: Mislabeled files or processes
# 10: Unauthorized network access
# 11: Separation of Duty (SoD) violation - single subject with access to multiple mutually exclusive roles
# 12: Contradictory allow and deny rules for same subject-object-permission combination
# 13: Contradictory type transitions for the same process
# 14: Missing necessary file access for system processes
# 15: Missing necessary directory access for system processes
# 16: Missing necessary network access for system processes

In [None]:
!pip install networkx node2vec scikit-learn pandas



In [None]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the three CSV inputs in one pass: object classes, attributes, and the
# labelled rules that are classified later in the notebook.
objectclass_df, attributes_df, violations_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Data/objectclass.csv',
        '/content/Data/attributes.csv',
        '/content/Data/filtered_separated_permissions.csv',
    )
)

In [None]:
# Build the undirected graph that the node embeddings are learned from.
G = nx.Graph()

# attributes_df: one attribute per row plus a comma-separated list of types.
# Each type becomes a node tagged with its attribute and is linked to it.
for attribute, type_list in zip(attributes_df['attribute'], attributes_df['types']):
    for type_name in type_list.split(','):
        G.add_node(type_name, type=attribute)
        G.add_edge(attribute, type_name)

# objectclass_df: one object class per row plus a comma-separated list of
# permissions. Each permission becomes a node linked to its object class.
for object_class, perm_list in zip(objectclass_df['object_class'], objectclass_df['permissions']):
    for perm in perm_list.split(','):
        G.add_node(perm, type='permission')
        G.add_edge(object_class, perm)


In [None]:
# Extend the graph with the entities referenced by every labelled rule.
rule_fields = zip(
    violations_df['source_type'],
    violations_df['target_type'],
    violations_df['object_class'],
    violations_df['permissions'],
)
for source, target, obj_class, perm_list in rule_fields:
    perms = perm_list.split(',')

    # Tag each endpoint with the role it plays in this rule. add_node on a
    # node that already exists updates its `type` attribute.
    G.add_node(source, type='source_type')
    G.add_node(target, type='target_type')
    G.add_node(obj_class, type='object_class')

    # Connect source to target and to the object class, then the object class
    # to each permission named by the rule.
    G.add_edge(source, target)
    G.add_edge(source, obj_class)
    G.add_edges_from((obj_class, perm) for perm in perms)


In [None]:
# Generate node embeddings using Node2Vec.
# The walks and the skip-gram training are both stochastic, so they are seeded
# like the classifiers below (42). node2vec documents deterministic results
# only when a seed is set AND workers=1, hence the single worker.
node2vec = Node2Vec(G, dimensions=64, walk_length=15, num_walks=200, workers=1, seed=42)
# fit() forwards its keyword arguments to gensim's Word2Vec; seed it as well.
model = node2vec.fit(window=5, min_count=1, batch_words=4, seed=42)


Computing transition probabilities:   0%|          | 0/149 [00:00<?, ?it/s]

In [None]:
# Create embeddings for the source, target, and object classes
def get_embedding(row):
    """Return the average of the three node vectors referenced by a rule row.

    The row's ``source_type``, ``target_type`` and ``object_class`` values are
    looked up (in that order) in the module-level ``model.wv`` and the three
    vectors are averaged element-wise.
    """
    node_keys = ('source_type', 'target_type', 'object_class')
    vectors = [model.wv[row[key]] for key in node_keys]
    return (vectors[0] + vectors[1] + vectors[2]) / 3

# Compute one averaged embedding per rule (row-wise apply), then store the
# result as a new `embedding` column on the rules frame.
row_embeddings = violations_df.apply(get_embedding, axis=1)
violations_df['embedding'] = row_embeddings


In [None]:
# Assemble the classifier inputs: X is a plain list with one embedding per
# rule, y holds the violation class of each rule cast to int.
X = violations_df['embedding'].tolist()
y = violations_df['violation_class'].astype(int)


In [None]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The three classifiers compared on the same embeddings.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
)
svm_clf = SVC(
    kernel='rbf',
    gamma='scale',
    random_state=42,
)
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(64, 32, 16),
    max_iter=500,
    random_state=42,
)


In [None]:
# Fit every classifier on the training split, in the same order as before:
# random forest, SVM, MLP.
classifiers = (rf_clf, svm_clf, mlp_clf)
for clf in classifiers:
    clf.fit(X_train, y_train)

# Predict the held-out split with each fitted classifier.
y_pred_rf, y_pred_svm, y_pred_mlp = [clf.predict(X_test) for clf in classifiers]

In [None]:
# Evaluate the models
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and a confusion matrix for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Display name used in the heading line.
    """
    print(f"{model_name} Evaluation:")
    report = classification_report(y_true, y_pred)
    print(report)
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)
    # Trailing blank lines separate the output of consecutive models.
    print("\n")

# Report each model on the same held-out labels, in a fixed order.
for display_name, predictions in (
    ("Random Forest", y_pred_rf),
    ("SVM", y_pred_svm),
    ("MLP Neural Network", y_pred_mlp),
):
    evaluate_model(y_test, predictions, display_name)

Random Forest Evaluation:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        38
           2       0.80      1.00      0.89        53
           3       1.00      1.00      1.00        46
           4       1.00      1.00      1.00        26
           7       1.00      1.00      1.00        17
           8       1.00      1.00      1.00        14
           9       1.00      1.00      1.00        17
          10       1.00      1.00      1.00        18
          11       1.00      1.00      1.00        14
          13       1.00      1.00      1.00        22
          14       1.00      0.25      0.40        16
          15       1.00      0.95      0.98        22

    accuracy                           0.96       303
   macro avg       0.98      0.93      0.94       303
weighted avg       0.97      0.96      0.95       303

Confusion Matrix:
[[38  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 53  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 46 

0: No anomalies
1: Separation of Duty (SoD) violation - single subject with read and write access to sensitive data
2: Improper privilege assignment
3: Critical system file modification
4: Incorrect type usage
5: Domain transition issues
6: Mislabeled files or processes
7: Unauthorized network access
8: Separation of Duty (SoD) violation - single subject with access to multiple mutually exclusive roles
9: Contradictory type transitions for the same process
10: Missing necessary file access for system processes

In [None]:
# Read the inputs for the second experiment: the same object-class and
# attribute tables, plus a different labelled rules file.
objectclass_df, attributes_df, violations_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/Data/objectclass.csv',
        '/content/Data/attributes.csv',
        '/content/Data/new_filtered_separated_rules.csv',
    )
)

In [None]:
# Build a fresh undirected graph that the node embeddings are learned from.
G = nx.Graph()

# attributes_df: one attribute per row plus a comma-separated list of types.
# Each type becomes a node tagged with its attribute and is linked to it.
for attribute, type_list in zip(attributes_df['attribute'], attributes_df['types']):
    for type_name in type_list.split(','):
        G.add_node(type_name, type=attribute)
        G.add_edge(attribute, type_name)

# objectclass_df: one object class per row plus a comma-separated list of
# permissions. Each permission becomes a node linked to its object class.
for object_class, perm_list in zip(objectclass_df['object_class'], objectclass_df['permissions']):
    for perm in perm_list.split(','):
        G.add_node(perm, type='permission')
        G.add_edge(object_class, perm)


In [None]:
# Extend the graph with the entities referenced by every labelled rule.
rule_fields = zip(
    violations_df['source_type'],
    violations_df['target_type'],
    violations_df['object_class'],
    violations_df['permissions'],
)
for source, target, obj_class, perm_list in rule_fields:
    perms = perm_list.split(',')

    # Tag each endpoint with the role it plays in this rule. add_node on a
    # node that already exists updates its `type` attribute.
    G.add_node(source, type='source_type')
    G.add_node(target, type='target_type')
    G.add_node(obj_class, type='object_class')

    # Connect source to target and to the object class, then the object class
    # to each permission named by the rule.
    G.add_edge(source, target)
    G.add_edge(source, obj_class)
    G.add_edges_from((obj_class, perm) for perm in perms)


In [None]:
# Generate node embeddings using Node2Vec.
# The walks and the skip-gram training are both stochastic, so they are seeded
# like the classifiers below (42). node2vec documents deterministic results
# only when a seed is set AND workers=1, hence the single worker.
node2vec = Node2Vec(G, dimensions=64, walk_length=15, num_walks=200, workers=1, seed=42)
# fit() forwards its keyword arguments to gensim's Word2Vec; seed it as well.
model = node2vec.fit(window=5, min_count=1, batch_words=4, seed=42)


Computing transition probabilities:   0%|          | 0/144 [00:00<?, ?it/s]

In [None]:
# Create embeddings for the source, target, and object classes
def get_embedding(row):
    """Return the average of the three node vectors referenced by a rule row.

    The row's ``source_type``, ``target_type`` and ``object_class`` values are
    looked up (in that order) in the module-level ``model.wv`` and the three
    vectors are averaged element-wise.
    """
    node_keys = ('source_type', 'target_type', 'object_class')
    vectors = [model.wv[row[key]] for key in node_keys]
    return (vectors[0] + vectors[1] + vectors[2]) / 3

# Compute one averaged embedding per rule (row-wise apply), then store the
# result as a new `embedding` column on the rules frame.
row_embeddings = violations_df.apply(get_embedding, axis=1)
violations_df['embedding'] = row_embeddings


In [None]:
# Assemble the classifier inputs: X is a plain list with one embedding per
# rule, y holds the violation class of each rule cast to int.
X = violations_df['embedding'].tolist()
y = violations_df['violation_class'].astype(int)


In [None]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The three classifiers compared on the same embeddings.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
)
svm_clf = SVC(
    kernel='rbf',
    gamma='scale',
    random_state=42,
)
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(64, 32, 16),
    max_iter=500,
    random_state=42,
)


In [None]:
# Fit every classifier on the training split, in the same order as before:
# random forest, SVM, MLP.
classifiers = (rf_clf, svm_clf, mlp_clf)
for clf in classifiers:
    clf.fit(X_train, y_train)

# Predict the held-out split with each fitted classifier.
y_pred_rf, y_pred_svm, y_pred_mlp = [clf.predict(X_test) for clf in classifiers]

In [None]:
# Evaluate the models
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and a confusion matrix for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Display name used in the heading line.
    """
    print(f"{model_name} Evaluation:")
    report = classification_report(y_true, y_pred)
    print(report)
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)
    # Trailing blank lines separate the output of consecutive models.
    print("\n")

# Report each model on the same held-out labels, in a fixed order.
for display_name, predictions in (
    ("Random Forest", y_pred_rf),
    ("SVM", y_pred_svm),
    ("MLP Neural Network", y_pred_mlp),
):
    evaluate_model(y_test, predictions, display_name)

Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91       126
           1       1.00      1.00      1.00        28
           2       1.00      1.00      1.00        45
           3       1.00      0.97      0.98        33
           4       1.00      1.00      1.00        16
           5       0.80      0.86      0.83        14
           6       0.95      1.00      0.97        19
           7       1.00      0.83      0.91        12
           8       1.00      1.00      1.00        20
           9       1.00      1.00      1.00        23
          10       0.57      0.63      0.60        19

    accuracy                           0.93       355
   macro avg       0.93      0.93      0.93       355
weighted avg       0.94      0.93      0.93       355

Confusion Matrix:
[[114   0   0   0   0   3   0   0   0   0   9]
 [  0  28   0   0   0   0   0   0   0   0   0]
 [  0   0  45   0   0   0   0   0   0   0   0]
 [