In [17]:
# Summarize the survey data to get a sense of users' behaviour and awareness regarding cybersecurity in NGS.
# This includes a frequency analysis: count the responses in each category of the relevant questions
# to identify common behaviours and awareness levels in next-generation sequencing (NGS).
import pandas as pd

In [18]:
# Path of the survey CSV, relative to the notebook's working directory.
file_path = "cleaneddata_NGS.csv"
# Parse the CSV into the DataFrame that the cells below operate on.
df = pd.read_csv(filepath_or_buffer=file_path)

In [19]:
# Relevant columns to analyze.
# Each entry is the full survey-question text used as a column header. The same
# strings serve as the keys of the rename step further down, so every entry must
# match its header character for character (including the doubled spaces that
# some questions contain).
# NOTE(review): the "#n" markers look like questionnaire item numbers — confirm.
columns_to_analyze = [
    #1
    'Please indicate your role within the organization:',
    #2
    'How long have you been working in the NGS',
    #3
    'How often do you handle sensitive NGS data that could be targeted in a cybersecurity attack',
    #4
    'How often do you open emails from unknown senders as part of your work',
    #5
    # No trailing '?': the rename step keys this header without one, and a
    # mismatched string would not be found among the frame's columns.
    'Have you ever used a personal USB drive or other external storage devices to transfer NGS data',
    #6
    'Do you follow a process to verify the security of websites or links before accessing them, especially when related to NGS research',
    #7
    'Have you ever shared your login credentials with a colleague, even if you believed it was for a legitimate purpose',
    #8
    'In the event of receiving a suspicious email, what actions do you typically take (Select all that apply)',
    #9
    'Has there been an instance where you accessed NGS data from a public Wi-Fi network',
    #10
    'How frequently do you engage in non-work-related activities (e.g., browsing social media, personal emails) on devices used to access NGS data systems',
    #11
    'Do you regularly update your computer, smartphone, and other devices with the latest security patches and software updates',
    #12
    'Have you ever fallen victim to a phishing email or scam',
    #13
    'Are you aware of the potential cybersecurity risks associated with next-generation sequencing (NGS) technologies',
    #14
    'Do you understand the importance of securing genomic data generated by NGS technologies to protect patient privacy and confidentiality',
    #15
    'Are you familiar with common cybersecurity threats and vulnerabilities that can affect NGS data, such as data breaches, ransomware attacks, unauthorized access, fishing attack,  targeted attacks on genomic data or manipulation of sequencing results',
    #16
    'Have you implemented data backup and disaster recovery plans to ensure the resilience of NGS data in the event of cyberattacks or system failures',
    #18
    'How confident are you in identifying a cybersecurity threat specific to NGS operations',
    #22
    'When handling sensitive NGS data, which of the following practices do you regularly follow (Select all that apply)',
    #23
    'Have you or anyone in your team ever detected a potential cybersecurity threat',
    #29
    'Have you ever received suspicious emails or messages that you suspect may target NGS data or systems',
    #31
    'Do you feel that cybersecurity should be a priority in your NGS relates tasks or experiments',
    #32
    'Do you use encryption techniques to secure NGS data during transmission and storage',
    #33
    'Do you collaborate with cybersecurity experts or professionals to enhance the security posture of NGS systems and data',
    #34
    'Do you feel that the  careless behaviors could lead to potential cybersecurity risks to the sensitive and private data in NGS',
    #35
    'Has there been a cybersecurity incident within your organization such as hacking, password stealing etc.,  that led to changes or enhancements in the training program or policy change',
    #39
    'What is your age group',
    #40
    'Have you heard about the term Bio-Cyber Security'
]

In [20]:
# Map each verbose survey-question header to a short, formula-friendly alias.
# Keys must equal the existing column headers exactly; with pandas' default
# settings a key that matches no column is skipped without an error.
column_aliases = {
    "Please indicate your role within the organization:": "role_in_organization",
    "How long have you been working in the NGS": "time_duration",
    "How often do you handle sensitive NGS data that could be targeted in a cybersecurity attack": "handle_sensitive_data",
    "How often do you open emails from unknown senders as part of your work": "open_emails",
    "Have you ever used a personal USB drive or other external storage devices to transfer NGS data": "usb_for_datatransfer",
    "Do you follow a process to verify the security of websites or links before accessing them, especially when related to NGS research": "website_verification",
    "Have you ever shared your login credentials with a colleague, even if you believed it was for a legitimate purpose": "sharing_credentials",
    "In the event of receiving a suspicious email, what actions do you typically take (Select all that apply)": "suspicious_email_actions",
    "Has there been an instance where you accessed NGS data from a public Wi-Fi network": "public_wifi",
    "How frequently do you engage in non-work-related activities (e.g., browsing social media, personal emails) on devices used to access NGS data systems": "activities",
    "Do you regularly update your computer, smartphone, and other devices with the latest security patches and software updates": "Updates",
    "Have you ever fallen victim to a phishing email or scam": "phishing",
    "Are you aware of the potential cybersecurity risks associated with next-generation sequencing (NGS) technologies": "aware_attack",
    "Do you understand the importance of securing genomic data generated by NGS technologies to protect patient privacy and confidentiality": "securing_genomic_data",
    "Are you familiar with common cybersecurity threats and vulnerabilities that can affect NGS data, such as data breaches, ransomware attacks, unauthorized access, fishing attack,  targeted attacks on genomic data or manipulation of sequencing results": "familiarity_cs",
    "Have you implemented data backup and disaster recovery plans to ensure the resilience of NGS data in the event of cyberattacks or system failures": "Backup",
    "How confident are you in identifying a cybersecurity threat specific to NGS operations": "threat_identification",
    "When handling sensitive NGS data, which of the following practices do you regularly follow (Select all that apply)": "handling",
    "Have you or anyone in your team ever detected a potential cybersecurity threat": "threat_detection",
    "Have you ever received suspicious emails or messages that you suspect may target NGS data or systems": "suspicious_emails",
    "Do you feel that cybersecurity should be a priority in your NGS relates tasks or experiments": "cs_as_priority",
    "Do you use encryption techniques to secure NGS data during transmission and storage": "encryption_techniques",
    "Do you collaborate with cybersecurity experts or professionals to enhance the security posture of NGS systems and data": "collaboration",
    "Do you feel that the  careless behaviors could lead to potential cybersecurity risks to the sensitive and private data in NGS": "careless_behaviour",
    "Has there been a cybersecurity incident within your organization such as hacking, password stealing etc.,  that led to changes or enhancements in the training program or policy change": "policy change",
    "What is your age group": "age_group",
    "Have you heard about the term Bio-Cyber Security": "Bio-Cyber_Security_familiarity",
}
# Applied in place, so the existing `df` object itself carries the new headers.
df.rename(column_aliases, axis="columns", inplace=True)

In [21]:
# Point columns_to_analyze at the short aliases, in the same order as before.
columns_to_analyze = [
    "role_in_organization", "time_duration", "handle_sensitive_data",
    "open_emails", "usb_for_datatransfer", "website_verification",
    "sharing_credentials", "suspicious_email_actions", "public_wifi",
    "activities", "Updates", "phishing",
    "aware_attack", "securing_genomic_data", "familiarity_cs",
    "Backup", "threat_identification", "handling",
    "threat_detection", "suspicious_emails", "cs_as_priority",
    "encryption_techniques", "collaboration", "careless_behaviour",
    "policy change",  # this alias contains a space, exactly as the rename step defines it
    "age_group", "Bio-Cyber_Security_familiarity",
]

In [22]:
# Frequency analysis: for each selected column, count how often every distinct
# answer occurs. value_counts() runs with its defaults, so missing answers are
# left out of the tallies.
# NOTE(review): the "(Select all that apply)" questions are tallied per whole
# cell value; whether one cell can hold several options depends on how the CSV
# encodes them — confirm.
frequency_analysis = {
    column: df[column].value_counts() for column in columns_to_analyze
}

In [23]:
# Gather the per-question counts into one table: one column per question and
# one row per distinct answer label seen in any question. A cell is NaN where
# that label does not occur for that question.
frequency_analysis_df = pd.DataFrame(data=frequency_analysis)

In [24]:
# Lift the column display limit so wide frames are shown without eliding columns.
pd.options.display.max_columns = None
# Bare last expression: rendered as this cell's output.
frequency_analysis_df

Unnamed: 0,role_in_organization,time_duration,handle_sensitive_data,open_emails,usb_for_datatransfer,website_verification,sharing_credentials,suspicious_email_actions,public_wifi,activities,Updates,phishing,aware_attack,securing_genomic_data,familiarity_cs,Backup,threat_identification,handling,threat_detection,suspicious_emails,cs_as_priority,encryption_techniques,collaboration,careless_behaviour,policy change,age_group,Bio-Cyber_Security_familiarity
1-3 years,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,
18-24,,,,,,,,,,,,,,,,,,,,,,,,,,47.0,
25-34,,,,,,,,,,,,,,,,,,,,,,,,,,10.0,
3-5 years,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,
35-44,,,,,,,,,,,,,,,,,,,,,,,,,,7.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Yes, sometimes",,,,,,,,,,,29.0,,,,,,,,,,,,,,,,
"Yes, very aware",,,,,,,,,,,,,66.0,,,,,,,,,,,,,,
"Yes, very familiar",,,,,,,,,,,,,,,13.0,,,,,,,,,,,,
more than 5 year,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,
