In [None]:
# Violation classes:
# 0: No anomalies
# 1: Separation of Duty (SoD) violation
# 2: Overly permissive access
# 3: Improper privilege assignment
# 4: Critical system file modification
# 5: Contradictory rules
# 6: Missing necessary rules
# 7: Incorrect type usage
# 8: Domain transition issues
# 9: Mislabeled files or processes
# 10: Unauthorized network access

In [None]:
!pip install networkx node2vec scikit-learn pandas



In [None]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the data
# Single place to repoint the notebook when it is not run from /content (Colab).
DATA_DIR = '/content/Data'
objectclass_df = pd.read_csv(f'{DATA_DIR}/objectclass.csv')
attributes_df = pd.read_csv(f'{DATA_DIR}/attributes.csv')
violations_df = pd.read_csv(f'{DATA_DIR}/rules_separated_permissions.csv')

In [None]:
# Create a graph
G = nx.Graph()

# Link every attribute to each of its member types.
for attribute, types_field in zip(attributes_df['attribute'], attributes_df['types']):
    # An empty CSV cell is read by pandas as NaN; such a row contributes no types.
    if pd.isna(types_field):
        continue
    # Strip padding and drop empty tokens so "a, b," yields the nodes "a" and "b".
    for t in filter(None, map(str.strip, types_field.split(','))):
        G.add_node(t, type=attribute)
        G.add_edge(attribute, t)

# Link every object class to each permission it defines.
for object_class, perms_field in zip(objectclass_df['object_class'], objectclass_df['permissions']):
    if pd.isna(perms_field):
        continue
    for perm in filter(None, map(str.strip, perms_field.split(','))):
        G.add_node(perm, type='permission')
        G.add_edge(object_class, perm)


In [None]:
# Extend the graph with the nodes and edges described by each rule row
rule_columns = ['source_type', 'target_type', 'object_class', 'permissions']
for source, target, obj_class, perm_field in violations_df[rule_columns].itertuples(index=False, name=None):
    # Tag each endpoint with the role it plays in this rule
    # (a later tag replaces an earlier 'type' attribute on the same node).
    for node, role in ((source, 'source_type'), (target, 'target_type'), (obj_class, 'object_class')):
        G.add_node(node, type=role)

    # source -- target, source -- object class, object class -- each permission
    G.add_edge(source, target)
    G.add_edge(source, obj_class)
    for perm in perm_field.split(','):
        G.add_edge(obj_class, perm)


In [None]:
# Generate node embeddings using Node2Vec.
# Seed both the random walks and the skip-gram training, matching the
# random_state=42 used by the classifiers in this notebook.
# NOTE(review): with workers > 1 the run may still not be bit-for-bit
# repeatable; set workers=1 if exact reproducibility is required.
node2vec = Node2Vec(G, dimensions=64, walk_length=15, num_walks=200, workers=2, seed=42)
model = node2vec.fit(window=5, min_count=1, batch_words=4, seed=42)


Computing transition probabilities:   0%|          | 0/407 [00:00<?, ?it/s]

In [None]:
# Create embeddings for the source, target, and object classes
def get_embedding(row, wv=None):
    """Return the mean of the source, target and object-class node vectors.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'source_type', 'target_type' and 'object_class'.
        wv: Optional key -> vector lookup. Defaults to the ``wv`` attribute
            of the notebook-level ``model`` fitted by Node2Vec.

    Returns:
        The element-wise average of the three looked-up vectors.

    Raises:
        KeyError: if one of the three node names has no embedding.
    """
    if wv is None:
        wv = model.wv
    # node2vec trains on stringified node ids, so look keys up as strings
    # (a no-op for values that are already str).
    source_emb, target_emb, obj_class_emb = (
        wv[str(row[col])] for col in ('source_type', 'target_type', 'object_class')
    )
    return (source_emb + target_emb + obj_class_emb) / 3

# Call get_embedding once per row (axis=1) and keep each returned vector
# in a new 'embedding' column of violations_df.
violations_df['embedding'] = violations_df.apply(get_embedding, axis=1)


In [None]:
# Assemble the classifier inputs: one embedding vector per rule as the
# features, and the integer violation class as the label.
X = violations_df['embedding'].tolist()
y = violations_df['violation_class'].astype(int)


In [None]:
# Split data into train and test sets.
# stratify=y keeps the class proportions equal in both splits; the recorded
# test report shows very uneven class supports (from 7 up to 125 rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Initialize the models (training happens in the next cell)
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
svm_clf = SVC(kernel='rbf', gamma='scale', random_state=42)
mlp_clf = MLPClassifier(hidden_layer_sizes=(64, 32, 16), max_iter=500, random_state=42)


In [None]:
# Fit each classifier on the training split
for clf in (rf_clf, svm_clf, mlp_clf):
    clf.fit(X_train, y_train)

# Predict the held-out split with every fitted classifier
y_pred_rf, y_pred_svm, y_pred_mlp = (
    clf.predict(X_test) for clf in (rf_clf, svm_clf, mlp_clf)
)

In [None]:
# Evaluate the models
def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and confusion matrix for one model.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        model_name: Label used in the printed heading.
    """
    print(f"{model_name} Evaluation:")
    # zero_division=0 reports 0.0 for a class that is never predicted (the
    # same value as before) without emitting the undefined-metric warnings
    # seen in the recorded output.
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\n")

# Report every base model against the same held-out labels
for predictions, label in (
    (y_pred_rf, "Random Forest"),
    (y_pred_svm, "SVM"),
    (y_pred_mlp, "MLP Neural Network"),
):
    evaluate_model(y_test, predictions, label)

Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.78      0.82       125
           1       0.63      0.71      0.67        34
           2       0.95      1.00      0.98        83
           3       1.00      0.97      0.99        40
           4       1.00      0.88      0.93        24
           5       0.32      0.73      0.44        11
           6       0.00      0.00      0.00         7
           7       1.00      0.86      0.92        14
           8       1.00      1.00      1.00        19
           9       1.00      0.70      0.82        10
          10       1.00      0.88      0.93         8

    accuracy                           0.85       375
   macro avg       0.80      0.77      0.77       375
weighted avg       0.87      0.85      0.85       375

Confusion Matrix:
[[98 12  0  0  0 12  3  0  0  0  0]
 [ 8 24  2  0  0  0  0  0  0  0  0]
 [ 0  0 83  0  0  0  0  0  0  0  0]
 [ 1  0  0 39  0  0  0  0  0  0  0]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
!pip install dask[dataframe]

Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.11-py3-none-any.whl.metadata (2.5 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.10-py3-none-any.whl.metadata (2.5 kB)
  Downloading dask_expr-1.1.9-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.9-py3-none-any.whl (241 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.9/241.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.9


In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

In [None]:
# Load the data
# Single place to repoint the notebook when it is not run from /content (Colab).
DATA_DIR = '/content/Data'
objectclass_df = pd.read_csv(f'{DATA_DIR}/objectclass.csv')
attributes_df = pd.read_csv(f'{DATA_DIR}/attributes.csv')
violations_df = pd.read_csv(f'{DATA_DIR}/rules_separated_permissions.csv')

In [None]:
# Create a graph
G = nx.Graph()

# Link every attribute to each of its member types.
for attribute, types_field in zip(attributes_df['attribute'], attributes_df['types']):
    # An empty CSV cell is read by pandas as NaN; such a row contributes no types.
    if pd.isna(types_field):
        continue
    # Strip padding and drop empty tokens so "a, b," yields the nodes "a" and "b".
    for t in filter(None, map(str.strip, types_field.split(','))):
        G.add_node(t, type=attribute)
        G.add_edge(attribute, t)

# Link every object class to each permission it defines.
for object_class, perms_field in zip(objectclass_df['object_class'], objectclass_df['permissions']):
    if pd.isna(perms_field):
        continue
    for perm in filter(None, map(str.strip, perms_field.split(','))):
        G.add_node(perm, type='permission')
        G.add_edge(object_class, perm)


In [None]:
# Extend the graph with the nodes and edges described by each rule row
rule_columns = ['source_type', 'target_type', 'object_class', 'permissions']
for source, target, obj_class, perm_field in violations_df[rule_columns].itertuples(index=False, name=None):
    # Tag each endpoint with the role it plays in this rule
    # (a later tag replaces an earlier 'type' attribute on the same node).
    for node, role in ((source, 'source_type'), (target, 'target_type'), (obj_class, 'object_class')):
        G.add_node(node, type=role)

    # source -- target, source -- object class, object class -- each permission
    G.add_edge(source, target)
    G.add_edge(source, obj_class)
    for perm in perm_field.split(','):
        G.add_edge(obj_class, perm)


In [None]:
# Generate node embeddings using Node2Vec.
# Seed both the random walks and the skip-gram training, matching the
# random_state=42 used by the classifiers in this notebook.
# NOTE(review): with workers > 1 the run may still not be bit-for-bit
# repeatable; set workers=1 if exact reproducibility is required.
node2vec = Node2Vec(G, dimensions=64, walk_length=15, num_walks=200, workers=4, seed=42)
model = node2vec.fit(window=5, min_count=1, batch_words=4, seed=42)


Computing transition probabilities:   0%|          | 0/407 [00:00<?, ?it/s]

In [None]:
# Create embeddings for the source, target, and object classes
def get_embedding(row, wv=None):
    """Return the mean of the source, target and object-class node vectors.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'source_type', 'target_type' and 'object_class'.
        wv: Optional key -> vector lookup. Defaults to the ``wv`` attribute
            of the notebook-level ``model`` fitted by Node2Vec.

    Returns:
        The element-wise average of the three looked-up vectors.

    Raises:
        KeyError: if one of the three node names has no embedding.
    """
    if wv is None:
        wv = model.wv
    # node2vec trains on stringified node ids, so look keys up as strings
    # (a no-op for values that are already str).
    source_emb, target_emb, obj_class_emb = (
        wv[str(row[col])] for col in ('source_type', 'target_type', 'object_class')
    )
    return (source_emb + target_emb + obj_class_emb) / 3

# Call get_embedding once per row (axis=1) and keep each returned vector
# in a new 'embedding' column of violations_df.
violations_df['embedding'] = violations_df.apply(get_embedding, axis=1)


In [None]:
# Stack the per-rule embedding vectors into a 2-D feature matrix
X = np.array(list(violations_df['embedding']))
y = violations_df['violation_class'].astype(int)

# stratify=y keeps the class proportions equal in both splits; the recorded
# test reports show very uneven class supports (from 7 up to 125 rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
# Build the (unfitted) estimators; every one is seeded with random_state=42
rf_clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
)
svm_clf = SVC(
    kernel='rbf',
    gamma='scale',
    C=10,
    probability=True,
    random_state=42,
)
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    max_iter=600,
    random_state=42,
)
xgb_clf = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
)


In [None]:
# Mean 5-fold cross-validated accuracy of each model on the training split
cv_rf, cv_svm, cv_mlp, cv_xgb = (
    cross_val_score(estimator, X_train, y_train, cv=5, scoring='accuracy').mean()
    for estimator in (rf_clf, svm_clf, mlp_clf, xgb_clf)
)

print("Cross-Validation Results:")
print(f"Random Forest: {cv_rf}")
print(f"SVM: {cv_svm}")
print(f"MLP Neural Network: {cv_mlp}")
print(f"XGBoost: {cv_xgb}")

Cross-Validation Results:
Random Forest: 0.8222200328407224
SVM: 0.7970114942528735
MLP Neural Network: 0.8268374384236454
XGBoost: 0.8268111658456487


In [None]:
# Fit every individual model on the training split
for clf in (rf_clf, svm_clf, mlp_clf, xgb_clf):
    clf.fit(X_train, y_train)

# Stacking ensemble: RF, SVM and MLP as base estimators, XGBoost as the
# final estimator, with 5-fold cross-validation
estimators = [('rf', rf_clf), ('svm', svm_clf), ('mlp', mlp_clf)]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=xgb_clf,
    cv=5,
)
stacking_clf.fit(X_train, y_train)

def evaluate_model(y_true, y_pred, model_name):
    """Print a classification report and confusion matrix for one model.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        model_name: Label used in the printed heading.
    """
    print(f"{model_name} Evaluation:")
    # zero_division=0 reports 0.0 for a class that is never predicted
    # without emitting undefined-metric warnings.
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\n")

# Score the stacked ensemble on the held-out split
y_pred_stacking = stacking_clf.predict(X_test)
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_stacking,
    model_name="Stacking Ensemble",
)

Stacking Ensemble Evaluation:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       125
           1       0.51      0.71      0.59        34
           2       0.97      0.93      0.95        83
           3       1.00      0.97      0.99        40
           4       1.00      0.88      0.93        24
           5       0.36      0.45      0.40        11
           6       0.12      0.14      0.13         7
           7       1.00      0.86      0.92        14
           8       1.00      0.89      0.94        19
           9       0.80      0.80      0.80        10
          10       0.89      1.00      0.94         8

    accuracy                           0.83       375
   macro avg       0.77      0.77      0.77       375
weighted avg       0.85      0.83      0.84       375

Confusion Matrix:
[[99 13  0  0  0  7  5  0  0  0  1]
 [10 24  0  0  0  0  0  0  0  0  0]
 [ 0  6 77  0  0  0  0  0  0  0  0]
 [ 0  0  0 39  0  0  0  0  0  1

In [None]:
# detect and categorize violations using the stacking ensemble model
def detect_and_categorize_violations_with_stacking(df):
    """Predict a violation class for every row of ``df`` with the stacking model.

    Reads the 'embedding' column, runs the notebook-level ``stacking_clf``
    on it, and stores the result in a new 'predicted_violation_class'
    column. Note that ``df`` itself is modified in place.

    Returns:
        A frame holding the rule columns plus the predicted class.
    """
    report_columns = [
        'source_type',
        'target_type',
        'object_class',
        'permissions',
        'predicted_violation_class',
    ]
    features = np.array(df['embedding'].tolist())
    df['predicted_violation_class'] = stacking_clf.predict(features)
    return df[report_columns]

# Classify every rule in violations_df and show the resulting table
categorized_violations_df = detect_and_categorize_violations_with_stacking(violations_df)
print("Categorized Violations with Stacking Ensemble:", categorized_violations_df, sep="\n")

Categorized Violations with Stacking Ensemble:
     source_type   target_type object_class   permissions  \
0         user_t   user_home_t          dir          read   
1         user_t   user_home_t          dir         write   
2         user_t   user_home_t          dir        search   
3        staff_t  staff_home_t          dir          read   
4        staff_t  staff_home_t          dir         write   
...          ...           ...          ...           ...   
1242       gdm_t    dns_port_t   udp_socket  name_connect   
1243  iptables_t    ntp_port_t   udp_socket  name_connect   
1244  kerberos_t   http_port_t   tcp_socket  name_connect   
1245      ldap_t    ftp_port_t   tcp_socket  name_connect   
1246   openvpn_t  mysql_port_t   tcp_socket  name_connect   

      predicted_violation_class  
0                             0  
1                             0  
2                             0  
3                             0  
4                             0  
...             