In [21]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [22]:
# Read the CSV file
df = pd.read_csv("Dataset.csv", encoding="ISO-8859-1")

In [23]:
df.head()

Unnamed: 0,ï»¿Category,Messages
0,Data_Leak,"Yeah, it's a goldmine waiting to happen."
1,Data_Leak,Let's discuss our strategy for this data breach.
2,Data_Leak,"Right, data privacy be damned, we need to be ..."
3,Data_Leak,Data security is tight; we'll need a plan to ...
4,Data_Leak,I've got a contact who specializes in cybercr...


In [24]:
df.rename(columns={'ï»¿Category': 'Category'}, inplace=True)

In [25]:
df.head()

Unnamed: 0,Category,Messages
0,Data_Leak,"Yeah, it's a goldmine waiting to happen."
1,Data_Leak,Let's discuss our strategy for this data breach.
2,Data_Leak,"Right, data privacy be damned, we need to be ..."
3,Data_Leak,Data security is tight; we'll need a plan to ...
4,Data_Leak,I've got a contact who specializes in cybercr...


In [26]:
# Split the comma-separated categories into a list
df['Category'] = df['Category'].str.split(',')


In [27]:
# Use MultiLabelBinarizer to convert the list of categories into binary labels
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df['Category'])

In [28]:
# Split data
x_train, x_test, Y_train, Y_test = train_test_split(df['Messages'], binary_labels, test_size=0.2, random_state=0, shuffle=True)

In [29]:

# Create a pipeline with TF-IDF vectorization and a multi-output classifier
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultiOutputClassifier(SVC(C=1000, gamma='auto')))
])

In [30]:
# Fit the model
model.fit(x_train, Y_train)


In [31]:
# Make predictions
Y_pred = model.predict(x_test)


In [32]:
# Model evaluation
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8146690518783543


In [33]:
# Convert binary labels back to the original category format
predicted_categories = mlb.inverse_transform(Y_pred)
true_categories = mlb.inverse_transform(Y_test)

In [34]:
# Print classification report for each category
for i, category in enumerate(mlb.classes_):
    print(f"Classification Report for {category}:")
    print(classification_report([label[i] for label in Y_test], [label[i] for label in Y_pred]))


Classification Report for Cyberspace:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1676
           1       0.87      0.85      0.86      1119

    accuracy                           0.89      2795
   macro avg       0.89      0.88      0.88      2795
weighted avg       0.89      0.89      0.89      2795

Classification Report for Data_Leak:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2701
           1       0.00      0.00      0.00        94

    accuracy                           0.97      2795
   macro avg       0.48      0.50      0.49      2795
weighted avg       0.93      0.97      0.95      2795

Classification Report for Normal:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1277
           1       0.89      0.89      0.89      1518

    accuracy                           0.88      2795
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
# Save the model
joblib.dump(model, "MultiLabelCategorySVC_model.pkl")

['MultiLabelCategorySVC_model.pkl']

In [40]:
# Load the trained model
loaded_model = joblib.load("MultiLabelCategorySVC_model.pkl")

# Input a new message
new_message = ["There are always casualties in war, my feudal friend! Fidayeen sacrifices for the greater cause!"]

# Make predictions for the new message
new_message_predictions = loaded_model.predict(new_message)

# Convert binary labels back to the original category format
predicted_categories = mlb.inverse_transform(new_message_predictions)

# Print the predicted categories for the new message
print("Predicted Categories for the New Message:", predicted_categories)


Predicted Categories for the New Message: [('Normal',)]
