In [1]:
# Summarize the provided dataset to get a sense of the cybersecurity culture in organizations working on NGS.
# This includes structural equation modeling (SEM) to understand the relationships between the different
# elements of cybersecurity culture and their impact on organizational behavior and user awareness.
import pandas as pd
from semopy import Model, Optimizer

In [2]:
# Path of the CSV input, read into the notebook-wide working DataFrame `df`.
file_path = "cleaneddata_NGS.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Full survey question texts of the columns used in this analysis.
# Each entry must match, character for character, a key of the rename
# mapping applied in the following cell.
columns_to_analyze = [
    # Q40
    'In the last year, how often has cybersecurity been discussed in your meetings or other organizational communications',
    'How often do you handle sensitive NGS data that could be targeted in a cybersecurity attack',
    'How often do you open emails from unknown senders as part of your work',
    'Have you ever used a personal USB drive or other external storage devices to transfer NGS data',
    # Q37
    'How would you describe the overall culture of cybersecurity within your organization',
    'In your opinion, what are the biggest barriers to implementing effective cybersecurity training in your organization (Select all that apply)',
    'Has your organization ever experienced a cybersecurity breach that affected NGS or any other sensitive data',
    'How often does your organization provide cybersecurity training specifically tailored to NGS operations',
    'Does your organization have clear and accessible cybersecurity policies specific to NGS operations',
    # Q41 -- written without a '?' before the parenthesis so that it agrees
    # with the key used when the columns are renamed.
    'What methods does your organization use to communicate about cybersecurity issues (Select all that apply)',
]

In [4]:
# Rename columns for easier reference: full survey question -> short name.
column_renames = {
    'In the last year, how often has cybersecurity been discussed in your meetings or other organizational communications': 'cybersecurity_discussions',
    'How often do you handle sensitive NGS data that could be targeted in a cybersecurity attack': 'handle_sensitive_data',
    'How often do you open emails from unknown senders as part of your work': 'open_unknown_emails',
    'Have you ever used a personal USB drive or other external storage devices to transfer NGS data': 'used_usb_transfer',
    'How would you describe the overall culture of cybersecurity within your organization': 'cybersecurity_culture',
    'In your opinion, what are the biggest barriers to implementing effective cybersecurity training in your organization (Select all that apply)': 'cybersecurity_barriers',
    'Has your organization ever experienced a cybersecurity breach that affected NGS or any other sensitive data': 'cybersecurity_breach',
    'How often does your organization provide cybersecurity training specifically tailored to NGS operations': 'cybersecurity_training_frequency',
    'Does your organization have clear and accessible cybersecurity policies specific to NGS operations': 'cybersecurity_policies',
    'What methods does your organization use to communicate about cybersecurity issues (Select all that apply)': 'cybersecurity_communication_methods',
}

# Rebind `df` to the renamed frame instead of mutating it with inplace=True.
df = df.rename(columns=column_renames)

# Derive the analysis columns from the mapping itself so the two can never
# drift apart (dict order is the order the short names were listed in before).
columns_to_analyze = list(column_renames.values())

# rename() silently skips keys that are not present in the frame, so a typo in
# a question text would otherwise only surface later as an obscure KeyError.
# The check is on the short names, which keeps this cell safe to re-run.
missing_columns = [col for col in columns_to_analyze if col not in df.columns]
if missing_columns:
    raise KeyError(
        "Columns missing after rename (check the question texts): {}".format(missing_columns)
    )

In [5]:
# Encoding categorical variables to numerical values
def create_frequency_mapping(df, columns):
    """Build one shared ``label -> integer code`` lookup for several columns.

    The distinct non-null values of all ``columns`` are pooled, sorted, and
    numbered consecutively from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    columns : iterable of str
        Names of the columns whose values are pooled.

    Returns
    -------
    dict
        Maps every distinct non-null value to its position in the sorted
        pool. Empty when ``columns`` is empty or holds only missing values.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.

    Notes
    -----
    NOTE(review): codes follow the sort order of the pooled labels (alphabetical
    for strings), shared across all columns. They therefore need not reflect
    any ordinal meaning of the answers -- confirm this is acceptable for the
    downstream model.
    """
    unique_values = set()
    for col in columns:
        unique_values.update(df[col].dropna().unique())

    try:
        ordered_values = sorted(unique_values)
    except TypeError:
        # Labels of different types (e.g. str and int) cannot be compared with
        # each other; fall back to a deterministic order grouped by type name.
        ordered_values = sorted(
            unique_values, key=lambda value: (type(value).__name__, str(value))
        )

    return {value: index for index, value in enumerate(ordered_values)}

In [6]:
# Build the shared label -> integer code lookup over all analysis columns
frequency_mapping = create_frequency_mapping(df=df, columns=columns_to_analyze)

In [7]:
# Replace each answer in the analysis columns by its integer code; values
# without an entry in frequency_mapping (and missing answers) become NaN.
# Caution: `df` is overwritten, so executing this cell a second time looks the
# codes themselves up in the mapping.
df[columns_to_analyze] = df[columns_to_analyze].apply(
    lambda answers: answers.map(frequency_mapping)
)

In [8]:
# Keep only the rows that have a value in every analysis column
df_cleaned = df.dropna(axis="index", how="any", subset=columns_to_analyze)

In [65]:
# Define the SEM model
# The specification is a plain string: the `=~` line declares
# cybersecurity_culture as measured by five indicator columns, and the `~`
# lines regress cybersecurity_discussions and cybersecurity_culture on the
# same four predictors.
# NOTE(review): `cybersecurity_culture` is also the name of a data column
# after the rename step; here it sits on the left of `=~` -- confirm whether
# the latent is meant to replace that survey item or the item should be used.
# NOTE(review): `cybersecurity_discussions` is both an indicator in the
# measurement line and the outcome of its own regression -- confirm intent.
model_desc = """
# Measurement model
cybersecurity_culture =~ cybersecurity_discussions + cybersecurity_barriers + cybersecurity_breach + cybersecurity_policies + cybersecurity_communication_methods

# Structural model
cybersecurity_discussions ~ handle_sensitive_data + open_unknown_emails + used_usb_transfer + cybersecurity_training_frequency
cybersecurity_culture ~ handle_sensitive_data + open_unknown_emails + used_usb_transfer + cybersecurity_training_frequency
"""

In [66]:
# Create the SEM model
# Builds the semopy Model object from the specification string in model_desc.
model = Model(model_desc)

In [67]:
# Load dataset into the model
# df_cleaned holds only rows with no missing value in the analysis columns.
# The data is attached here because the later fit() call passes none itself.
model.load_dataset(df_cleaned)


In [68]:
# Fit the SEM model
# Called without arguments, so it relies on the dataset loaded beforehand.
# As the last expression of the cell, its return value (a SolverResult) is
# echoed as the cell output.
model.fit()

SolverResult(fun=0.31909596438058685, success=True, n_it=125, x=array([-4.66822910e-01, -1.65906387e+00, -8.58416327e-01,  6.38768111e-02,
        5.12791587e-02,  1.86580341e-01, -2.44316371e-01,  2.82102050e-01,
       -2.35096942e-02, -1.10806657e-01, -1.02737520e-01, -5.28981315e-02,
        4.42506842e+01,  4.40431921e+00,  1.24445790e+02,  9.25639510e+00,
        7.58239040e+01,  5.52897260e+01]), message='Optimization terminated successfully', name_method='SLSQP', name_obj='MLW')

In [69]:
# Extract results
# inspect() returns the table of estimated parameters; the printed output
# shows columns such as lval, op, rval and Estimate.
res = model.inspect()

# Display the results
# print() gives a plain-text rendering, which wraps wide tables into blocks.
print(res)

                                   lval  op  \
0                 cybersecurity_culture   ~   
1                 cybersecurity_culture   ~   
2                 cybersecurity_culture   ~   
3                 cybersecurity_culture   ~   
4             cybersecurity_discussions   ~   
5                cybersecurity_barriers   ~   
6                  cybersecurity_breach   ~   
7                cybersecurity_policies   ~   
8   cybersecurity_communication_methods   ~   
9             cybersecurity_discussions   ~   
10            cybersecurity_discussions   ~   
11            cybersecurity_discussions   ~   
12            cybersecurity_discussions   ~   
13                cybersecurity_culture  ~~   
14               cybersecurity_barriers  ~~   
15                 cybersecurity_breach  ~~   
16  cybersecurity_communication_methods  ~~   
17            cybersecurity_discussions  ~~   
18               cybersecurity_policies  ~~   

                                   rval    Estimate   Std. 