In [19]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('/content/cybersecurity_dataset_with_target (1).csv')
df.head()

Unnamed: 0,industry,threats,vulnerabilities,information_value,risk_level
0,transportation,Man_in_the_middle_attack,Weak_passwords,priority,High_Risk
1,financial,Man_in_the_middle_attack,Missing_Encryption,not_important,Low_Risk
2,educational,DDoS,Missing_Encryption,critical,Low_Risk
3,transportation,ransomware,Missing_security_policies,dormant,Low_Risk
4,financial,malware,Unsecured_endpoints,critical,High_Risk


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [5]:
df['industry'].unique

<bound method Series.unique of 0      transportation
1           financial
2         educational
3      transportation
4           financial
            ...      
495         financial
496         financial
497    transportation
498       educational
499        healthcare
Name: industry, Length: 500, dtype: object>

In [6]:
le_industry = LabelEncoder()
df['industry'] = le_industry.fit_transform(df['industry'])
df['industry'].unique()


array([4, 1, 0, 2, 3])

In [7]:
le_threats = LabelEncoder()
df['threats'] = le_threats.fit_transform(df['threats'])
df['threats'].unique()


array([2, 0, 4, 3, 1])

In [8]:
le_vulnerabilities = LabelEncoder()
df['vulnerabilities'] = le_vulnerabilities.fit_transform(df['vulnerabilities'])
df['vulnerabilities'].unique()

array([4, 0, 1, 3, 2])

In [9]:
le_information_value = LabelEncoder()
df['information_value'] = le_vulnerabilities.fit_transform(df['information_value'])
df['information_value'].unique()

array([3, 2, 0, 1, 4])

In [12]:
X = df.drop('risk_level', axis = 1)
y = df['risk_level']

In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Create and train a Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [15]:
# Make predictions
y_pred = model.predict(X_test)

In [16]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [17]:
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_rep)

Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

   High_Risk       0.75      0.36      0.49        25
    Low_Risk       0.77      0.97      0.86        69
 Medium_Risk       0.00      0.00      0.00         6

    accuracy                           0.76       100
   macro avg       0.51      0.44      0.45       100
weighted avg       0.72      0.76      0.71       100



In [18]:
df.head()

Unnamed: 0,industry,threats,vulnerabilities,information_value,risk_level
0,4,2,4,3,High_Risk
1,1,2,0,2,Low_Risk
2,0,0,0,0,Low_Risk
3,4,4,1,1,Low_Risk
4,1,3,3,0,High_Risk


In [23]:
import pickle

In [24]:
data = {'model':model, 'le_industry':le_industry, 'le_threats':le_threats, 'le_vulnerabilities':le_vulnerabilities , 'le_information_value': le_information_value }
with open('saved_steps.pkl', 'wb') as file:
  pickle.dump(data, file)

In [25]:
with open('saved_steps.pkl', 'rb') as file:
  data = pickle.load(file)

model_loaded = data['model']
le_industry = data['le_industry']
le_threats = data['le_threats']
le_vulnerabilities = data['le_vulnerabilities']
le_information_value = data['le_information_value']


In [29]:
y_pred = model_loaded.predict(X_test)
y_pred

array(['Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'High_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Medium_Risk', 'High_Risk', 'Low_Risk', 'Low_Risk', 'High_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'High_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'High_Risk',
       'Low_Risk', 'Low_Risk', 'High_Risk', 'High_Risk', 'Low_Risk',
       'High_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'High_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk', 'Low_Risk',
       'High_Risk', 'Low_Risk', 'Low_Risk', 'Low_R