In [41]:
import pandas as pd
from faker import Faker
import random
import os
import numpy as np # Needed for checking NaN
import json

fake = Faker()

## Variables

In [42]:
num_workspaces = 5
max_semantic_models_per_workspace = 5
max_reports_per_semantic_model = 5
num_crew_members = 20
user_groups = ["Captain", "Commander", "Lieutenant Commander", "Lieutenant", "Ensign", "Science Officer", "Engineering Officer", "Medical Officer", "Security Officer", "Away Team", "Bridge Crew"]
workspace_permissions = ["Viewer", "Contributor", "Member", "Admin"]
semantic_model_permissions = ["Read", "ReadWriteExplore", "ReadReshare", "ReadWriteReshareExplore"]
report_permissions = ["Read", "ReadWrite", "Owner"]
all_permissions = {
    "Workspace": workspace_permissions,
    "Semantic Model": semantic_model_permissions,
    "Report": report_permissions
}

## Helper

In [43]:
def generate_starfleet_name(item_type=""):
    prefixes = ["USS", "NCC", "DS", "Starbase"]
    suffixes = ["Enterprise", "Voyager", "Defiant", "Discovery", "Excelsior", "Intrepid", "Odyssey", "Phoenix", "Reliant", "Yorktown"]
    descriptors = ["Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Prime", "Auxiliary"]
    if item_type == "Workspace":
        return f"{random.choice(prefixes)}-{random.choice(suffixes)}"
    elif item_type == "Semantic Model":
        return f"{random.choice(suffixes)} {random.choice(descriptors)}"
    elif item_type == "Report":
        return f"{random.choice(suffixes)} {random.randint(1, 10)}"
    else:
        return fake.word().capitalize()

def generate_item_id(item_type):
    return f"{item_type[:3].upper()}-{fake.uuid4()[:8]}"

def generate_user():
    first_name = fake.first_name()
    last_name = fake.last_name()
    user_id = f"SF-{fake.uuid4()[:6].upper()}"
    return {
        "userId": user_id,
        "fullName": f"{first_name} {last_name}"
    }

def generate_user_group_id(group_name):
    group_id = f"UG-{fake.uuid4()[:6].upper()}"
    return {
        "groupId": group_id,
        "groupName": group_name
    }

def generate_access_to_item_group_id(item_id, permission):
    return f"ATG-{item_id}-{permission.replace(' ', '')}"

## Generate Dimensions

### Generate Users and Groups

In [44]:
# Generate Users and assign to groups
users = [generate_user() for _ in range(num_crew_members)]
groups = [generate_user_group_id(group) for group in user_groups]
for group in groups:
    assigned_users = random.sample(users, random.randint(1, 5))
    group["assignedUsers"] = assigned_users

print(json.dumps(groups, indent= 4 ))

[
    {
        "groupId": "UG-3CC272",
        "groupName": "Captain",
        "assignedUsers": [
            {
                "userId": "SF-4E1026",
                "fullName": "Aaron Gonzalez"
            },
            {
                "userId": "SF-FD292A",
                "fullName": "Gina Webster"
            },
            {
                "userId": "SF-6D835A",
                "fullName": "Tyler Miles"
            }
        ]
    },
    {
        "groupId": "UG-2C73C1",
        "groupName": "Commander",
        "assignedUsers": [
            {
                "userId": "SF-94F330",
                "fullName": "Peter Williams"
            },
            {
                "userId": "SF-9DE5A9",
                "fullName": "Sandra Sanchez"
            },
            {
                "userId": "SF-D1D5EF",
                "fullName": "Vincent Bates"
            },
            {
                "userId": "SF-85B4C5",
                "fullName": "Kimberly Williams"
            }

### Workspaces

In [45]:
workspaces_data = []
for i in range(num_workspaces):
    workspace_id = generate_item_id("WKS")
    workspaces_data.append({
        "workspaceId": workspace_id,
        "workspaceName": generate_starfleet_name("Workspace"),
    })
df_workspaces = pd.DataFrame(workspaces_data)
display(df_workspaces)

Unnamed: 0,workspaceId,workspaceName
0,WKS-5b6962ef,USS-Odyssey
1,WKS-d6a39ae5,Starbase-Intrepid
2,WKS-a8083ba1,USS-Odyssey
3,WKS-bf7914a3,NCC-Yorktown
4,WKS-179874b1,NCC-Discovery


### Semantic Models

In [46]:
semantic_models_data = []
for workspace_id in df_workspaces['workspaceId']:
    num_semantic_models = random.randint(1, max_semantic_models_per_workspace)
    for i in range(num_semantic_models):
        semantic_model_id = generate_item_id("SEM")
        semantic_models_data.append({
            "semanticModelId": semantic_model_id,
            "workspaceId": workspace_id,
            "semanticModelName": generate_starfleet_name("Semantic Model")
        })
df_semantic_models = pd.DataFrame(semantic_models_data)
display(df_semantic_models)

Unnamed: 0,semanticModelId,workspaceId,semanticModelName
0,SEM-1821cc6c,WKS-5b6962ef,Phoenix Beta
1,SEM-18ba0dae,WKS-5b6962ef,Reliant Gamma
2,SEM-9bc1467b,WKS-5b6962ef,Odyssey Epsilon
3,SEM-e72cdcc2,WKS-5b6962ef,Intrepid Prime
4,SEM-1de02fa3,WKS-d6a39ae5,Reliant Gamma
5,SEM-5231095a,WKS-d6a39ae5,Enterprise Prime
6,SEM-90557bc9,WKS-d6a39ae5,Excelsior Zeta
7,SEM-a92061c8,WKS-d6a39ae5,Excelsior Prime
8,SEM-35e88b64,WKS-d6a39ae5,Reliant Prime
9,SEM-cdb96f07,WKS-a8083ba1,Excelsior Epsilon


### Reports

In [47]:
reports_data = []
for index, sm_row in df_semantic_models.iterrows():
    workspace_id = sm_row['workspaceId']
    semantic_model_id = sm_row['semanticModelId']
    num_reports = random.randint(1, max_reports_per_semantic_model)
    for i in range(num_reports):
        report_id = generate_item_id("REP")
        reports_data.append({
            "reportId": report_id,
            "workspaceId": workspace_id,
            "semanticModelId": semantic_model_id,
            "reportName": sm_row['semanticModelName'] + generate_starfleet_name("Report")
        })
df_reports = pd.DataFrame(reports_data)
display(df_reports)

Unnamed: 0,reportId,workspaceId,semanticModelId,reportName
0,REP-a63a7063,WKS-5b6962ef,SEM-1821cc6c,Phoenix BetaYorktown 1
1,REP-10bd7fab,WKS-5b6962ef,SEM-1821cc6c,Phoenix BetaEnterprise 5
2,REP-68f2b0a1,WKS-5b6962ef,SEM-1821cc6c,Phoenix BetaOdyssey 6
3,REP-0fe5c64f,WKS-5b6962ef,SEM-1821cc6c,Phoenix BetaDiscovery 2
4,REP-e2e2649f,WKS-5b6962ef,SEM-18ba0dae,Reliant GammaDiscovery 4
5,REP-bcc5c7ee,WKS-5b6962ef,SEM-18ba0dae,Reliant GammaPhoenix 4
6,REP-640d2234,WKS-5b6962ef,SEM-9bc1467b,Odyssey EpsilonPhoenix 6
7,REP-1720d3a1,WKS-5b6962ef,SEM-9bc1467b,Odyssey EpsilonDefiant 9
8,REP-26d555aa,WKS-5b6962ef,SEM-e72cdcc2,Intrepid PrimeReliant 9
9,REP-50b3a470,WKS-5b6962ef,SEM-e72cdcc2,Intrepid PrimeDiscovery 3


### Items

In [48]:
items_data = []
for index, row in df_workspaces.iterrows():
    items_data.append({
        "itemId": row['workspaceId'], "itemName": row['workspaceName'], "itemType": "Workspace",
        "workspaceId": row['workspaceId'], "semanticModelId": None
    })
for index, row in df_semantic_models.iterrows():
    items_data.append({
        "itemId": row['semanticModelId'], "itemName": row['semanticModelName'], "itemType": "Semantic Model",
        "workspaceId": row['workspaceId'], "semanticModelId": row['semanticModelId']
    })
for index, row in df_reports.iterrows():
    items_data.append({
        "itemId": row['reportId'], "itemName": row['reportName'], "itemType": "Report",
        "workspaceId": row['workspaceId'], "semanticModelId": row['semanticModelId']
    })
df_items = pd.DataFrame(items_data)
display(df_items)

Unnamed: 0,itemId,itemName,itemType,workspaceId,semanticModelId
0,WKS-5b6962ef,USS-Odyssey,Workspace,WKS-5b6962ef,
1,WKS-d6a39ae5,Starbase-Intrepid,Workspace,WKS-d6a39ae5,
2,WKS-a8083ba1,USS-Odyssey,Workspace,WKS-a8083ba1,
3,WKS-bf7914a3,NCC-Yorktown,Workspace,WKS-bf7914a3,
4,WKS-179874b1,NCC-Discovery,Workspace,WKS-179874b1,
...,...,...,...,...,...
66,REP-211df42d,Phoenix AuxiliaryDiscovery 9,Report,WKS-179874b1,SEM-f61ae4f9
67,REP-2c91b609,Voyager BetaIntrepid 1,Report,WKS-179874b1,SEM-ff0c559f
68,REP-99733d93,Voyager BetaDefiant 10,Report,WKS-179874b1,SEM-ff0c559f
69,REP-674b98e9,Voyager BetaDefiant 1,Report,WKS-179874b1,SEM-ff0c559f


### Access Grants

In [49]:
principal_access_data = []

for index, item in df_items.iterrows():
    item_id = item['itemId']
    item_type = item['itemType']
    permissions_in_scope = all_permissions.get(item_type, [])

    if permissions_in_scope:
        num_to_assign = random.randint(1, 3)

        # Direct Groups
        sampled_groups = random.sample(groups, num_to_assign)
        for group in sampled_groups:
            permission = random.choice(permissions_in_scope)
            principal_access_data.append({
                "itemId": item_id, 
                "principalId": group['groupId'], 
                "principalType": "Group",
                "principalName": group['groupName'], 
                "permission": permission, 
                "accessType": "Direct",
                "accessToItemGroupId": generate_access_to_item_group_id(item_id, permission)
            })

            # Indirect Users
            for user in group['assignedUsers']:
                principal_access_data.append({
                    "itemId": item_id, 
                    "principalId": user['userId'], 
                    "principalType": "User",
                    "principalName": user['fullName'],
                    "permission": permission,
                    "accessType": "indirect",
                    "accessToItemGroupId": generate_access_to_item_group_id(item_id, permission)
                })

        # Direct Users
        sampled_users = random.sample(users, num_to_assign)
        for user in sampled_users:
            permission = random.choice(permissions_in_scope)
            principal_access_data.append({
                "itemId": item_id, 
                "principalId": user['userId'], 
                "principalType": "User",
                "principalName": user['fullName'],
                "permission": permission, 
                "accessType": "Direct",
                "accessToItemGroupId": generate_access_to_item_group_id(item_id, permission)
            })

df_access_to_item_direct_principal = pd.DataFrame(principal_access_data)
display(df_access_to_item_direct_principal)

Unnamed: 0,itemId,principalId,principalType,principalName,permission,accessType,accessToItemGroupId
0,WKS-5b6962ef,UG-D5CE28,Group,Medical Officer,Contributor,Direct,ATG-WKS-5b6962ef-Contributor
1,WKS-5b6962ef,SF-D2B541,User,Sarah Harris,Contributor,indirect,ATG-WKS-5b6962ef-Contributor
2,WKS-5b6962ef,SF-00276C,User,Lori Taylor,Contributor,indirect,ATG-WKS-5b6962ef-Contributor
3,WKS-5b6962ef,UG-2C73C1,Group,Commander,Contributor,Direct,ATG-WKS-5b6962ef-Contributor
4,WKS-5b6962ef,SF-94F330,User,Peter Williams,Contributor,indirect,ATG-WKS-5b6962ef-Contributor
...,...,...,...,...,...,...,...
676,REP-fa00d97f,SF-BF4214,User,Melissa Lee,Read,indirect,ATG-REP-fa00d97f-Read
677,REP-fa00d97f,SF-6D835A,User,Tyler Miles,Read,indirect,ATG-REP-fa00d97f-Read
678,REP-fa00d97f,SF-9DE5A9,User,Sandra Sanchez,Read,Direct,ATG-REP-fa00d97f-Read
679,REP-fa00d97f,SF-4BE42E,User,Sabrina Martin,Owner,Direct,ATG-REP-fa00d97f-Owner


### Edges

In [50]:
edges_data = []

# Create direct edges
for index, access_row in df_access_to_item_direct_principal[df_access_to_item_direct_principal["accessType"] == 'Direct'].iterrows():
    item_id = access_row['itemId']
    principal_id = access_row['principalId']
    principal_type = access_row['principalType']
    access_to_item_group_id = access_row['accessToItemGroupId']

    item_info = df_items[df_items['itemId'] == item_id]
    if item_info.empty: continue
    item = item_info.iloc[0]

    principal_name = access_row['principalName']
    edges_data.append({
        "accessToItemGroupId": access_to_item_group_id,
        "srcId": item_id, "srcType": item['itemType'], "srcName": item['itemName'],
        "dstId": principal_id, "dstType": principal_type, "dstName": principal_name,
    })

# group -> user edges
for index, access_row in df_access_to_item_direct_principal.loc[(df_access_to_item_direct_principal["accessType"] == 'Direct') & (df_access_to_item_direct_principal["principalType"] == 'Group')].iterrows():
    for group in groups:
        if group['groupId'] == access_row['principalId']:
            for user in group['assignedUsers']:
                edges_data.append({
                    "accessToItemGroupId": access_row['accessToItemGroupId'],
                    "srcId": group['groupId'], 
                    "srcType": 'Group', 
                    "srcName": access_row['principalName'],
                    "dstId": user['userId'], 
                    "dstType": 'User', 
                    "dstName": user['fullName']
                })

df_access_to_item_edges = pd.DataFrame(edges_data)
df_access_to_item_edges = df_access_to_item_edges.drop_duplicates().reset_index(drop=True)

display(df_access_to_item_edges)

Unnamed: 0,accessToItemGroupId,srcId,srcType,srcName,dstId,dstType,dstName
0,ATG-WKS-5b6962ef-Contributor,WKS-5b6962ef,Workspace,USS-Odyssey,UG-D5CE28,Group,Medical Officer
1,ATG-WKS-5b6962ef-Contributor,WKS-5b6962ef,Workspace,USS-Odyssey,UG-2C73C1,Group,Commander
2,ATG-WKS-5b6962ef-Admin,WKS-5b6962ef,Workspace,USS-Odyssey,UG-5A5753,Group,Security Officer
3,ATG-WKS-5b6962ef-Viewer,WKS-5b6962ef,Workspace,USS-Odyssey,SF-4E1026,User,Aaron Gonzalez
4,ATG-WKS-5b6962ef-Viewer,WKS-5b6962ef,Workspace,USS-Odyssey,SF-6FD6FF,User,Kylie Coleman
...,...,...,...,...,...,...,...
676,ATG-REP-fa00d97f-ReadWrite,UG-756300,Group,Engineering Officer,SF-6D835A,User,Tyler Miles
677,ATG-REP-fa00d97f-ReadWrite,UG-756300,Group,Engineering Officer,SF-85B4C5,User,Kimberly Williams
678,ATG-REP-fa00d97f-ReadWrite,UG-756300,Group,Engineering Officer,SF-924A68,User,Gabriel Jacobs
679,ATG-REP-fa00d97f-Read,UG-CFB98C,Group,Lieutenant Commander,SF-BF4214,User,Melissa Lee


## Save

In [51]:
output_path = "./data"

df_workspaces.to_csv(os.path.join(output_path, "workspaces.csv"), index=False)
df_semantic_models.to_csv(os.path.join(output_path, "semantic_models.csv"), index=False)
df_reports.to_csv(os.path.join(output_path, "reports.csv"), index=False)
df_items.to_csv(os.path.join(output_path, "items.csv"), index=False)
df_access_to_item_direct_principal.to_csv(os.path.join(output_path, "access_to_item.csv"), index=False)
df_access_to_item_edges.to_csv(os.path.join(output_path, "access_to_item_edges.csv"), index=False)

print(f"\nCSV files successfully saved to directory: {output_path}")


CSV files successfully saved to directory: ./data
