### Importing the necessary libraries

In [206]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [207]:
df = pd.read_csv("Questionnaire analysis.csv")

In [208]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 37 columns):
 #   Column                                                                                                                                                                             Non-Null Count  Dtype  
---  ------                                                                                                                                                                             --------------  -----  
 0   ID                                                                                                                                                                                 199 non-null    int64  
 1   Start time                                                                                                                                                                         199 non-null    object 
 2   Completion time                                           

### Making the column names simple and concise (Renaming)

In [209]:
# Dictionary to map old column names to new short names
rename_columns = {
    'ID': 'ID',
    'Start time': 'StartTime',
    'Completion time': 'CompletionTime',
    'Email': 'Email',
    'Name': 'Name',
    'Last modified time': 'LastModifiedTime',
    'Age': 'Age',
    'Gender': 'Gender',
    'Level of Education': 'Education',
    'Job Category': 'JobCategory',
    'Income range': 'Income',
    'Openness: This involves trying new things, learning, and exploring different ideas and perspectives.': 'Openness',
    'Conscientiousness: This refers about how organized and responsible someone is.': 'Conscientiousness',
    'Extraversion: This is about how outgoing and social someone is.': 'Extraversion',
    'Agreeableness: This is about how friendly and kind someone is toward others.': 'Agreeableness',
    'Neuroticism (or Emotional Stability): This is about how easily someone feels negative emotions like worry, sadness, or stress.': 'Neuroticism',
    ' Checking Social Media': 'SocialMedia',
    'Online Shopping': 'Shopping',
    'Watching Online Videos (e.g., YouTube, Netflix)': 'Videos',
    'Gaming or Online Entertainment': 'Gaming',
    'Online Learning or Skill Development': 'Learning',
    ' Communicating Online (e.g., Email, Messaging Apps)': 'Communicating',
    'I tend to rely on gut feelings or intuition when making decisions.': 'GutFeelings',
    'I often use simple rules or shortcuts to assess the legitimacy of emails or messages without deep analysis.': 'SimpleRules',
    'I prefer to carefully analyze information before making decisions.': 'CarefulAnalysis',
    'I take time to consider multiple factors and perspectives when evaluating the authenticity of emails or messages.': 'ConsiderFactors',
    'How confident are you in identifying phishing attempts?': 'PhishingConfidence',
    'Have you ever received an email or message that appeared to be from a legitimate source (e.g., your bank, a social media platform) but later turned out to be a phishing attempt?': 'PhishingExperience',
    'What types of messages do you consider to be potential phishing attempts? (Select all that apply)': 'PhishingTypes',
    'How often do you check the sender\'s email address or URL to verify the legitimacy of an email or message before clicking on links or providing personal information?': 'CheckSender',
    'Have you ever fallen victim to a phishing scam or had your personal information compromised as a result of a phishing attack?': 'PhishingVictim',
    'Which of the following actions would you take if you received a suspicious email or message that you suspect may be a phishing attempt? (Select all that apply)': 'SuspiciousActions',
    'How often do you receive cybersecurity awareness training or education in your academic or professional environment?': 'CyberTraining',
    'What format would you prefer for receiving phishing awareness training (e.g., online courses, workshops, informational materials)?': 'TrainingFormat',
    'Would you be interested in receiving additional training or resources to help you recognize and respond to phishing attacks more effectively?': 'AdditionalTraining',
    'Do you consider a detection system as a possible solution to combatting Phishing attacks on websites?': 'DetectionSystem',
    'Do you have any recommendations for improving the effectiveness of cybersecurity training or awareness programs?': 'TrainingRecommendations'
}

# Rename the columns in the DataFrame
df.rename(columns=rename_columns, inplace=True)

# Print new column names to verify
print(df.columns.tolist())

['ID', 'StartTime', 'CompletionTime', 'Email', 'Name', 'LastModifiedTime', 'Age', 'Gender', 'Education', 'JobCategory', 'Income', 'Openness:\xa0This involves trying new things, learning, and exploring different ideas and perspectives.', 'Conscientiousness:\xa0This refers about how organized and responsible someone is.', 'Extraversion: This\xa0is about how outgoing and social someone is.', 'Agreeableness: This\xa0is about how friendly and kind someone is toward others.', 'Neuroticism', 'SocialMedia', 'Shopping', 'Videos', 'Gaming', 'Learning', 'Communicating', 'GutFeelings', 'SimpleRules', 'CarefulAnalysis', 'ConsiderFactors', 'PhishingConfidence', 'PhishingExperience', 'PhishingTypes', 'CheckSender', 'PhishingVictim', 'SuspiciousActions', 'CyberTraining', 'TrainingFormat', 'AdditionalTraining', 'DetectionSystem', 'TrainingRecommendations']


#### Certain field names remained the same due to the presence of non-breaking spaces. Replacing non-breaking spaces with regular spaces and striping any leading/trailing spaces

In [210]:
df.columns = df.columns.str.replace('\xa0', ' ').str.strip()

# Define renaming dictionary
rename_dict = {
    'Openness: This involves trying new things, learning, and exploring different ideas and perspectives.': 'Openness',
    'Conscientiousness: This refers about how organized and responsible someone is.': 'Conscientiousness',
    'Extraversion: This is about how outgoing and social someone is.': 'Extraversion',
    'Agreeableness: This is about how friendly and kind someone is toward others.': 'Agreeableness',
    'Neuroticism': 'Neuroticism'
}

# Rename columns
df = df.rename(columns=rename_dict)

# Print new column names to verify
print(df.columns.tolist())

['ID', 'StartTime', 'CompletionTime', 'Email', 'Name', 'LastModifiedTime', 'Age', 'Gender', 'Education', 'JobCategory', 'Income', 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism', 'SocialMedia', 'Shopping', 'Videos', 'Gaming', 'Learning', 'Communicating', 'GutFeelings', 'SimpleRules', 'CarefulAnalysis', 'ConsiderFactors', 'PhishingConfidence', 'PhishingExperience', 'PhishingTypes', 'CheckSender', 'PhishingVictim', 'SuspiciousActions', 'CyberTraining', 'TrainingFormat', 'AdditionalTraining', 'DetectionSystem', 'TrainingRecommendations']


In [211]:
df.head()

Unnamed: 0,ID,StartTime,CompletionTime,Email,Name,LastModifiedTime,Age,Gender,Education,JobCategory,...,PhishingExperience,PhishingTypes,CheckSender,PhishingVictim,SuspiciousActions,CyberTraining,TrainingFormat,AdditionalTraining,DetectionSystem,TrainingRecommendations
0,1,4/15/24 19:02:53,4/15/24 20:25:54,anonymous,,,60 above,Prefer not to say,Graduate,Teacher,...,Yes,Emails containing suspicious links or attachme...,Always,Prefer not to disclose,Do not click on any links or download any atta...,Rarely,Simulated phishing exercises,Maybe,,
1,2,4/18/24 19:57:07,4/18/24 20:06:36,anonymous,,,45-60,Male,Graduate,Administrator,...,Yes,Emails asking for personal information such as...,Very often,Yes,Delete the email or message immediately;Do not...,Never,Online courses,Yes,,Yes dere should be a consistent awareness
2,3,4/25/24 19:39:53,4/25/24 19:45:50,anonymous,,,45-60,Male,Graduate,Teacher,...,Yes,Emails asking for personal information such as...,Sometimes,Yes,Do not click on any links or download any atta...,Rarely,Online courses,Yes,Strongly agree,
3,4,4/26/24 15:46:03,4/26/24 15:46:46,anonymous,,,15-25,Female,Undergraduate,Student,...,Yes,Emails containing suspicious links or attachme...,Always,No,Delete the email or message immediately;Do not...,Rarely,Workshops,Yes,Strongly agree,
4,5,4/26/24 15:39:41,4/26/24 15:47:56,anonymous,,,15-25,Female,Undergraduate,Student,...,Yes,Emails asking for personal information such as...,Always,Yes,Delete the email or message immediately;Do not...,Rarely,Online courses,No,Strongly agree,No


### Dropping columns that are irrelevant to model building

In [212]:
columns_to_drop = [
    'ID', 'StartTime', 'CompletionTime', 'Email', 'Name', 
    'LastModifiedTime', 'AdditionalTraining', 'TrainingFormat', 
    'TrainingRecommendations'
]

# Drop the specified columns
df.drop(columns=columns_to_drop, inplace=True)
print(df.shape)

(199, 28)


In [213]:
df.head()

Unnamed: 0,Age,Gender,Education,JobCategory,Income,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,...,CarefulAnalysis,ConsiderFactors,PhishingConfidence,PhishingExperience,PhishingTypes,CheckSender,PhishingVictim,SuspiciousActions,CyberTraining,DetectionSystem
0,60 above,Prefer not to say,Graduate,Teacher,200k above,Strongly Agree,Agree,Strongly Agree,Strongly Agree,Strongly Agree,...,Strongly Agree,Strongly Agree,Very Confident,Yes,Emails containing suspicious links or attachme...,Always,Prefer not to disclose,Do not click on any links or download any atta...,Rarely,
1,45-60,Male,Graduate,Administrator,100k - 200k,Strongly Agree,Agree,Strongly disagree,Agree,Agree,...,Strongly Agree,Strongly Agree,Somewhat confident,Yes,Emails asking for personal information such as...,Very often,Yes,Delete the email or message immediately;Do not...,Never,
2,45-60,Male,Graduate,Teacher,200k above,Agree,Agree,Agree,Agree,Agree,...,Strongly Agree,Strongly Agree,Very Confident,Yes,Emails asking for personal information such as...,Sometimes,Yes,Do not click on any links or download any atta...,Rarely,Strongly agree
3,15-25,Female,Undergraduate,Student,20k - 50k,Strongly Agree,Agree,Agree,Agree,Agree,...,Strongly Agree,Strongly Agree,Extremely confident,Yes,Emails containing suspicious links or attachme...,Always,No,Delete the email or message immediately;Do not...,Rarely,Strongly agree
4,15-25,Female,Undergraduate,Student,20k - 50k,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,...,Strongly Agree,Strongly Agree,Very Confident,Yes,Emails asking for personal information such as...,Always,Yes,Delete the email or message immediately;Do not...,Rarely,Strongly agree


### Checking for null values

In [214]:
print("Missing values per column:")
print(df.isnull().sum())

Missing values per column:
Age                   0
Gender                0
Education             0
JobCategory           0
Income                0
Openness              0
Conscientiousness     0
Extraversion          0
Agreeableness         0
Neuroticism           0
SocialMedia           0
Shopping              0
Videos                0
Gaming                0
Learning              0
Communicating         0
GutFeelings           0
SimpleRules           0
CarefulAnalysis       0
ConsiderFactors       0
PhishingConfidence    0
PhishingExperience    0
PhishingTypes         0
CheckSender           0
PhishingVictim        0
SuspiciousActions     0
CyberTraining         0
DetectionSystem       4
dtype: int64


### Applying Imputation to address the identified Null values

In [215]:
if 'DetectionSystem' in df.columns:
    mode_value = df['DetectionSystem'].mode()[0]
    df['DetectionSystem'].fillna(mode_value, inplace=True)
    print(f"Filled missing values in 'DetectionSystem' with mode: {mode_value}")


Filled missing values in 'DetectionSystem' with mode: Strongly agree


In [216]:
df.head()

Unnamed: 0,Age,Gender,Education,JobCategory,Income,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,...,CarefulAnalysis,ConsiderFactors,PhishingConfidence,PhishingExperience,PhishingTypes,CheckSender,PhishingVictim,SuspiciousActions,CyberTraining,DetectionSystem
0,60 above,Prefer not to say,Graduate,Teacher,200k above,Strongly Agree,Agree,Strongly Agree,Strongly Agree,Strongly Agree,...,Strongly Agree,Strongly Agree,Very Confident,Yes,Emails containing suspicious links or attachme...,Always,Prefer not to disclose,Do not click on any links or download any atta...,Rarely,Strongly agree
1,45-60,Male,Graduate,Administrator,100k - 200k,Strongly Agree,Agree,Strongly disagree,Agree,Agree,...,Strongly Agree,Strongly Agree,Somewhat confident,Yes,Emails asking for personal information such as...,Very often,Yes,Delete the email or message immediately;Do not...,Never,Strongly agree
2,45-60,Male,Graduate,Teacher,200k above,Agree,Agree,Agree,Agree,Agree,...,Strongly Agree,Strongly Agree,Very Confident,Yes,Emails asking for personal information such as...,Sometimes,Yes,Do not click on any links or download any atta...,Rarely,Strongly agree
3,15-25,Female,Undergraduate,Student,20k - 50k,Strongly Agree,Agree,Agree,Agree,Agree,...,Strongly Agree,Strongly Agree,Extremely confident,Yes,Emails containing suspicious links or attachme...,Always,No,Delete the email or message immediately;Do not...,Rarely,Strongly agree
4,15-25,Female,Undergraduate,Student,20k - 50k,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,...,Strongly Agree,Strongly Agree,Very Confident,Yes,Emails asking for personal information such as...,Always,Yes,Delete the email or message immediately;Do not...,Rarely,Strongly agree


In [217]:
df.isnull().sum()

Age                   0
Gender                0
Education             0
JobCategory           0
Income                0
Openness              0
Conscientiousness     0
Extraversion          0
Agreeableness         0
Neuroticism           0
SocialMedia           0
Shopping              0
Videos                0
Gaming                0
Learning              0
Communicating         0
GutFeelings           0
SimpleRules           0
CarefulAnalysis       0
ConsiderFactors       0
PhishingConfidence    0
PhishingExperience    0
PhishingTypes         0
CheckSender           0
PhishingVictim        0
SuspiciousActions     0
CyberTraining         0
DetectionSystem       0
dtype: int64

### Converting Categorical Values to Numerical Values

In [218]:
def encode_categorical_data(df):
    # Columns to encode using ordinal encoding
    ordinal_cols = {
        'Age': ['15-25', '25-35', '35-45', '45-60', '60 above'],
        'Income': ['20k - 50k', '50k - 100k', '100k - 200k', '200k above'],
        'Openness': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'Conscientiousness': ['Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'Extraversion': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'Agreeableness': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'Neuroticism': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'PhishingConfidence': ['Extremely not confident', 'Somewhat confident', 'Neutral','Very Confident', 'Extremely confident'],
        'GutFeelings': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'SimpleRules': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'CarefulAnalysis': ['Disagree', 'Neutral', 'Agree', 'Strongly Agree'],
        'ConsiderFactors': ['Strongly disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree']
    }

    # Ordinal encoding
    ordinal_encoder = OrdinalEncoder(categories=list(ordinal_cols.values()))
    ordinal_encoded = pd.DataFrame(ordinal_encoder.fit_transform(df[list(ordinal_cols.keys())]))
    ordinal_encoded.columns = list(ordinal_cols.keys())

    # Columns to encode using one-hot encoding
    one_hot_cols = [
        'Gender', 'Education', 'JobCategory', 'SocialMedia', 'Shopping','Videos', 'Gaming', 'Learning', 'Communicating', 'PhishingExperience','CheckSender', 'PhishingVictim', 'DetectionSystem'
    ]

    # One-hot encoding
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')  # Use drop='first' to avoid multicollinearity
    one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(df[one_hot_cols]))
    one_hot_encoded.columns = one_hot_encoder.get_feature_names_out(one_hot_cols)

    # Initialize MultiLabelBinarizer and convert the strings to lists of values
    mlb = MultiLabelBinarizer()
    df['PhishingTypes'] = df['PhishingTypes'].astype(str).str.split(';')
    df['SuspiciousActions'] = df['SuspiciousActions'].astype(str).str.split(';')

    # Fit and transform the columns
    phishing_types_encoded = pd.DataFrame(mlb.fit_transform(df['PhishingTypes']), columns=mlb.classes_, index=df.index)
    suspicious_actions_encoded = pd.DataFrame(mlb.fit_transform(df['SuspiciousActions']), columns=mlb.classes_, index=df.index)

    return ordinal_encoded, one_hot_encoded, phishing_types_encoded, suspicious_actions_encoded



In [219]:
ordinal_encoded, one_hot_encoded, phishing_types_encoded, suspicious_actions_encoded = encode_categorical_data(df)

# Concatenate the encoded features into a single DataFrame
encoded_features = pd.concat([ordinal_encoded, one_hot_encoded, phishing_types_encoded, suspicious_actions_encoded], axis=1)

# Calculate correlations
correlation_matrix = encoded_features.corr()

# Extract correlations with 'PhishingVictim_Yes' and 'PhishingVictim_No'
phishing_victim_correlation_yes = correlation_matrix['PhishingVictim_Yes'].sort_values(ascending=False)
phishing_victim_correlation_no = correlation_matrix['PhishingVictim_No'].sort_values(ascending=False)

# Print the top correlated features for PhishingVictim_Yes
print("Top correlated features for 'PhishingVictim_Yes':")
print(phishing_victim_correlation_yes)

# Print the top correlated features for PhishingVictim_No
print("\nTop correlated features for 'PhishingVictim_No':")
print(phishing_victim_correlation_no)

# Include the target variable
encoded_features['PhishingVictim'] = df['PhishingVictim']

# Separate features and target
X = encoded_features.drop(columns=['PhishingVictim'])
y = encoded_features['PhishingVictim']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Top correlated features for 'PhishingVictim_Yes':
PhishingVictim_Yes                1.000000
Communicating_Seasonal            0.202731
CheckSender_Never                 0.190197
DetectionSystem_Strongly agree    0.172414
PhishingExperience_Yes            0.171417
                                    ...   
PhishingExperience_No            -0.120160
CheckSender_Rarely               -0.128814
PhishingVictim_No                -0.760078
                                       NaN
                                       NaN
Name: PhishingVictim_Yes, Length: 78, dtype: float64

Top correlated features for 'PhishingVictim_No':
PhishingVictim_No          1.000000
PhishingExperience_No      0.171654
GutFeelings                0.121744
Shopping_Yearly            0.116904
Education_Undergraduate    0.109258
                             ...   
CheckSender_Never         -0.181943
Age                       -0.200035
PhishingVictim_Yes        -0.760078
                                NaN
              

In [220]:
# Calculate the correlation matrix
encoded_features.corr()

Unnamed: 0,Age,Income,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,PhishingConfidence,GutFeelings,SimpleRules,...,Emails containing suspicious links or attachments,Messages claiming that you have won a prize or lottery,Messages from unfamiliar senders requesting sensitive information or login credentials,Unnamed: 15,Delete the email or message immediately,Do not click on any links or download any attachments,None of the above,Notify your organization's IT department or security team,Report the email or message as spam,Verify the sender's identity through a separate communication channel
Age,1.000000,0.458243,-0.085826,-0.046385,0.037811,0.038987,-0.083800,-0.016256,-0.072266,-0.036218,...,-0.021724,-0.004905,-0.008129,,0.044005,0.017078,-0.059623,-0.019981,-0.163146,0.001177
Income,0.458243,1.000000,0.029362,-0.000386,0.060197,-0.020518,0.005536,0.152184,0.007508,-0.009406,...,0.043787,0.153957,0.054465,,-0.004249,0.088294,0.135225,0.029069,-0.088915,0.002937
Openness,-0.085826,0.029362,1.000000,0.392354,0.264528,0.141690,0.267868,0.040122,0.103212,0.125566,...,0.038234,-0.005353,-0.029458,,-0.140931,0.004097,-0.073819,-0.057524,0.003887,-0.040827
Conscientiousness,-0.046385,-0.000386,0.392354,1.000000,0.224554,0.305976,0.122059,0.078566,0.054943,0.164510,...,0.021213,-0.042790,-0.064646,,-0.035626,-0.075993,-0.041557,0.036659,0.040600,0.094324
Extraversion,0.037811,0.060197,0.264528,0.224554,1.000000,0.263962,0.421707,0.150654,0.143557,0.247898,...,-0.006960,0.014762,-0.082709,,0.015020,-0.107827,-0.000641,0.007617,-0.110801,0.051154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Do not click on any links or download any attachments,0.017078,0.088294,0.004097,-0.075993,-0.107827,0.026341,-0.046058,-0.078117,-0.087624,-0.083319,...,0.313220,0.249658,0.338102,,0.093465,1.000000,-0.029986,0.245061,0.056671,0.264529
None of the above,-0.059623,0.135225,-0.073819,-0.041557,-0.000641,0.010009,0.034619,0.043822,-0.022604,0.035684,...,0.006056,0.013616,-0.080029,,-0.057235,-0.029986,1.000000,0.004455,-0.059908,-0.000877
Notify your organization's IT department or security team,-0.019981,0.029069,-0.057524,0.036659,0.007617,0.064215,0.022361,0.050011,-0.045794,0.041287,...,0.371593,0.207992,0.371593,,0.229613,0.245061,0.004455,1.000000,0.325624,0.543831
Report the email or message as spam,-0.163146,-0.088915,0.003887,0.040600,-0.110801,-0.069855,-0.025333,-0.097546,-0.014601,-0.030399,...,0.311162,0.294313,0.289956,,0.160544,0.056671,-0.059908,0.325624,1.000000,0.271244


In [221]:
# Check for empty columns
empty_columns = encoded_features.columns[encoded_features.isnull().all()]

# Print empty columns, if any
if len(empty_columns) > 0:
    print("Empty columns found:")
    print(empty_columns)
else:
    print("No empty columns found.")


No empty columns found.


In [222]:
# Check for null values
null_values = encoded_features.isnull().sum()

# Print columns with null values, if any
columns_with_null = null_values[null_values > 0]
if not columns_with_null.empty:
    print("Columns with null values:")
    print(columns_with_null)
else:
    print("No null values found in any column.")


No null values found in any column.


In [223]:
# Check information
print("\nInfo of one-hot encoded data:")
one_hot_encoded.info()


Info of one-hot encoded data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 53 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Gender_Male                                 199 non-null    float64
 1   Gender_Prefer not to say                    199 non-null    float64
 2   Education_High School                       199 non-null    float64
 3   Education_Undergraduate                     199 non-null    float64
 4   JobCategory_Care Provider                   199 non-null    float64
 5   JobCategory_Customer service officer        199 non-null    float64
 6   JobCategory_Engineering                     199 non-null    float64
 7   JobCategory_Entertainer                     199 non-null    float64
 8   JobCategory_Entrepreneur                    199 non-null    float64
 9   JobCategory_Lecturer                        199 non-null

In [224]:
# Check info
print("\nInformation of ordinal encoded data:")
ordinal_encoded.info()


Information of ordinal encoded data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 199 non-null    float64
 1   Income              199 non-null    float64
 2   Openness            199 non-null    float64
 3   Conscientiousness   199 non-null    float64
 4   Extraversion        199 non-null    float64
 5   Agreeableness       199 non-null    float64
 6   Neuroticism         199 non-null    float64
 7   PhishingConfidence  199 non-null    float64
 8   GutFeelings         199 non-null    float64
 9   SimpleRules         199 non-null    float64
 10  CarefulAnalysis     199 non-null    float64
 11  ConsiderFactors     199 non-null    float64
dtypes: float64(12)
memory usage: 18.8 KB


### Saving the encoders

In [225]:
joblib.dump(ordinal_encoded, 'ordinal_encoder.pkl')

['ordinal_encoder.pkl']

In [226]:
joblib.dump(one_hot_encoded, 'one_hot_encoder.pkl')


['one_hot_encoder.pkl']

In [227]:
joblib.dump(phishing_types_encoded, 'mlb_phishing_types.pkl')

['mlb_phishing_types.pkl']

In [228]:
joblib.dump(suspicious_actions_encoded, 'mlb_suspicious_actions.pkl')

['mlb_suspicious_actions.pkl']

### Training the model - Logistic Regression Model

In [229]:
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [230]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

I am not sure       1.00      0.50      0.67         4
           No       0.92      1.00      0.96        23
          Yes       1.00      1.00      1.00        13

     accuracy                           0.95        40
    macro avg       0.97      0.83      0.88        40
 weighted avg       0.95      0.95      0.94        40



### Saving the Model

In [231]:
joblib_file = "Phishing_Susceptibility_Prediction_model.pkl"  
joblib.dump(model, joblib_file)

['Phishing_Susceptibility_Prediction_model.pkl']

## Models Considererd Comparison and Conclusion


### Random Forest Classifier Performance 
* first model utilized 

### Logistic Regression Performance

* model that was finally utilized for this project

#### Based on the above performance comparison, Logistic Regression performs better overall than Random Forest, especially in handling the "I am not sure" class with significantly improved recall and F1-score.
- Class Imbalance: Both models handle the "Yes" and "No" classes well, but Logistic Regression shows better handling of the minority class ("I am not sure").
- Interpretability: Logistic Regression is simpler and more interpretable, making it easier to understand which features contribute to the predictions.
- Accuracy: Logistic Regression achieves higher accuracy (0.95 vs. 0.93).
- Given these results, Logistic Regression is the preferred model due to its higher overall performance, better handling of class imbalance, and ease of interpretation.

### Studying model features to aid app development

In [232]:
# View the resulting encoded DataFrames
print("Phishing Types Encoded:")
print(phishing_types_encoded)
print("\nSuspicious Actions Encoded:")
print(suspicious_actions_encoded)

Phishing Types Encoded:
        \
0    1   
1    1   
2    1   
3    1   
4    1   
..  ..   
194  1   
195  1   
196  1   
197  1   
198  1   

     Communications requesting urgent action such as clicking on a link or transferring money  \
0                                                    0                                          
1                                                    1                                          
2                                                    0                                          
3                                                    1                                          
4                                                    1                                          
..                                                 ...                                          
194                                                  0                                          
195                                                  0                         

In [233]:
# View the column headers as a list
print("Phishing Types Encoded Columns:")
print(list(phishing_types_encoded.columns))
print("\nSuspicious Actions Encoded Columns:")
print(list(suspicious_actions_encoded.columns))

Phishing Types Encoded Columns:
['', 'Communications requesting urgent action such as clicking on a link or transferring money', 'Emails asking for personal information such as passwords or Social Security numbers', 'Emails containing suspicious links or attachments', 'Messages claiming that you have won a prize or lottery', 'Messages from unfamiliar senders requesting sensitive information or login credentials']

Suspicious Actions Encoded Columns:
['', 'Delete the email or message immediately', 'Do not click on any links or download any attachments', 'None of the above', "Notify your organization's IT department or security team", 'Report the email or message as spam', "Verify the sender's identity through a separate communication channel"]


In [234]:
# Function to print unique values for each column
def print_unique_values(df, columns):
    for column in columns:
        unique_values = df[column].unique()
        print(f"Column: {column}")
        print(f"Unique values: {unique_values}")
        print()

# Columns to check
check_columns = ['Gender', 'Education', 'JobCategory', 'SocialMedia', 'Shopping', 'Videos', 'Gaming', 'Learning', 'Communicating', 'PhishingExperience', 'CheckSender', 'PhishingVictim', 'DetectionSystem']

# Call the function
print_unique_values(df, check_columns)


Column: Gender
Unique values: ['Prefer not to say' 'Male' 'Female']

Column: Education
Unique values: ['Graduate' 'Undergraduate' 'High School']

Column: JobCategory
Unique values: ['Teacher' 'Administrator' 'Student' 'Parent' 'Customer service officer '
 'Entertainer' 'Technician ' 'Care Provider' 'Engineering'
 'Self employed & undergraduate' 'Lecturer ' 'Entrepreneur ']

Column: SocialMedia
Unique values: ['Daily' 'Weekly' 'Monthly']

Column: Shopping
Unique values: ['Weekly' 'Monthly' 'Seasonal' 'Daily' 'Never' 'Yearly']

Column: Videos
Unique values: ['Daily' 'Seasonal' 'Weekly' 'Never' 'Monthly' 'Yearly']

Column: Gaming
Unique values: ['Daily' 'Seasonal' 'Monthly' 'Weekly' 'Never' 'Yearly']

Column: Learning
Unique values: ['Daily' 'Seasonal' 'Weekly' 'Monthly' 'Never']

Column: Communicating
Unique values: ['Daily' 'Monthly' 'Weekly' 'Seasonal' 'Yearly' 'Never']

Column: PhishingExperience
Unique values: ['Yes' 'Maybe' 'No']

Column: CheckSender
Unique values: ['Always' 'Very o