### Creating test set

In [3]:
import os
import re
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import spacy
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

### Merging Test Set Files 

In [12]:
#Input relevant files to create test set
file1 = "forum_nulled_20230123_20230207_61_annotations_(test_set).jsonl"
file2 = "forum_breached_20230123_20230207_102_annotations_(test_set).jsonl"
file3 = "forum_xss_20230123_20230207_105_annotations_(test_set).jsonl"

files = [file1,file2,file3]

def merge_jsonl_files(files):
    curr_path = os.getcwd()
    df_list = []

    for file in files:
        file_path = os.path.join(curr_path,"prodigy","annotation_output", file)
        df = pd.read_json(file_path,lines= True)
        df_list.append(df)

    merged_df = pd.concat(df_list)

    return merged_df

def url_remover(text):
    remove = "http\S+"
    text = re.sub(remove, " ", text)
    return text

df_dummy = merge_jsonl_files(files)
df_dummy_dummy = df_dummy[df_dummy.answer == "accept"]
df = df_dummy_dummy.drop(columns=["_input_hash","_session_id","_task_hash","_view_id","options","config", "answer"])
df["text"] = df["text"].apply(url_remover)

del df["meta"]
df

Unnamed: 0,text,accept
0,x2000 Steam Accounts with Games #4 \n\nThis l...,"[DATA LEAKS, CREDENTIALS OR ACCOUNTS, OFFERING..."
1,332K Combolist EDU OFFICE 332K Combolist EDU O...,"[DATA LEAKS, COMPANY OR ORG INFORMATION, OFFER..."
2,Mycanal ACCOUNTS PREMIUM diariatouaidara1999@g...,"[CREDENTIALS OR ACCOUNTS, DATA LEAKS, OFFERING..."
4,Connecting to shoutbox Anyone have solution to...,[ADVICE]
5,BWW - Free Food - Accounts with Over 1000 Pts ...,"[OFFERING OF SERVICE OR PRODUCT, CREDENTIALS O..."
...,...,...
101,Looking for stealer Hello please what’s the la...,"[REQUEST FOR SERVICE OR PRODUCT, ADVICE, MALWA..."
102,Android vulnerability to install a silent payl...,"[VULNERABILITY, MALWARE TOOLS AND EXPLOITS, AD..."
103,Email/Phone leads - USA banks I have variously...,"[OFFERING OF SERVICE OR PRODUCT, MONEY INVOLVE..."
104,110K Malaysian Online Casino Customers [ Depos...,"[OFFERING OF SERVICE OR PRODUCT, DATA LEAKS, P..."


In [13]:
cwd = os.getcwd()
label_path = os.path.join(cwd,"labels.txt")
label_data = open(label_path,"r").read()
labels = label_data.split("\n")
mlb = MultiLabelBinarizer(classes=labels)
mlb.classes

['REQUEST FOR SERVICE OR PRODUCT',
 'OFFERING OF SERVICE OR PRODUCT',
 'MONEY INVOLVED',
 'ADVICE',
 'NETWORK OR PANEL ACCESS',
 'CREDENTIALS OR ACCOUNTS',
 'CARDING',
 'INFRASTRUCTURE AND HOSTING',
 'DATA LEAKS',
 'PERSONAL INFORMATION',
 'COMPANY OR ORG INFORMATION',
 'ADULT',
 'MALWARE TOOLS AND EXPLOITS',
 'VULNERABILITY',
 'RECRUITMENT',
 'DEFACEMENT',
 'PHISHING',
 'SPAMMING',
 'HACKING',
 'SCAM PAGE',
 'LOGS',
 'SMS OR EMAIL MAILER',
 'GOOD REVIEW',
 'BAD REVIEW']

In [14]:
# Because the output of labels are a probability distribution, need to decide a threshold to accept it as a label
def standardize_tags(doc, threshold):
    tags = doc.cats
    for k in tags:
        if tags[k] >= threshold:
            tags[k] = 1
        else:
            tags[k] = 0
    return tags

# Using model to predict each text.
def nlp_predict(text, nlp, threshold):
    doc = nlp(text)
    tags = standardize_tags(doc, threshold)
    tags_list = []
    for k,v in tags.items():
        if v == 1:
            tags_list.append(k)
    return tags_list

### Model Last

In [15]:
nlp = spacy.load("models/v4/model-last")
df["predicted_output"] = df["text"].apply(lambda text: nlp_predict(text, nlp,0.8))

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


In [27]:
df

Unnamed: 0,text,accept,predicted_output
0,x2000 Steam Accounts with Games #4 \n\nThis l...,"[DATA LEAKS, CREDENTIALS OR ACCOUNTS, OFFERING...","[CREDENTIALS OR ACCOUNTS, DATA LEAKS]"
1,332K Combolist EDU OFFICE 332K Combolist EDU O...,"[DATA LEAKS, COMPANY OR ORG INFORMATION, OFFER...","[OFFERING OF SERVICE OR PRODUCT, CREDENTIALS O..."
2,Mycanal ACCOUNTS PREMIUM diariatouaidara1999@g...,"[CREDENTIALS OR ACCOUNTS, DATA LEAKS, OFFERING...","[CREDENTIALS OR ACCOUNTS, DATA LEAKS]"
4,Connecting to shoutbox Anyone have solution to...,[ADVICE],[ADVICE]
5,BWW - Free Food - Accounts with Over 1000 Pts ...,"[OFFERING OF SERVICE OR PRODUCT, CREDENTIALS O...","[CREDENTIALS OR ACCOUNTS, DATA LEAKS]"
...,...,...,...
101,Looking for stealer Hello please what’s the la...,"[REQUEST FOR SERVICE OR PRODUCT, ADVICE, MALWA...","[REQUEST FOR SERVICE OR PRODUCT, ADVICE, MALWA..."
102,Android vulnerability to install a silent payl...,"[VULNERABILITY, MALWARE TOOLS AND EXPLOITS, AD...",[OFFERING OF SERVICE OR PRODUCT]
103,Email/Phone leads - USA banks I have variously...,"[OFFERING OF SERVICE OR PRODUCT, MONEY INVOLVE...","[DATA LEAKS, PERSONAL INFORMATION]"
104,110K Malaysian Online Casino Customers [ Depos...,"[OFFERING OF SERVICE OR PRODUCT, DATA LEAKS, P...","[OFFERING OF SERVICE OR PRODUCT, CREDENTIALS O..."


In [17]:
y_pred_set = mlb.fit_transform(df["predicted_output"])
y_test_set = mlb.fit_transform(df["accept"])

y_pred_set.shape == y_test_set.shape


True

In [18]:

confusion_matrix_= multilabel_confusion_matrix(y_test_set, y_pred_set)
cls_report = classification_report(y_test_set, y_pred_set)
f1 = f1_score(y_test_set, y_pred_set, average = "micro")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
print(cls_report)

              precision    recall  f1-score   support

           0       0.77      0.90      0.83        40
           1       0.82      0.70      0.76        93
           2       0.86      0.92      0.89        91
           3       0.90      0.76      0.82        71
           4       0.89      0.47      0.62        17
           5       0.72      0.67      0.70        46
           6       0.75      0.50      0.60         6
           7       0.00      0.00      0.00         2
           8       0.85      0.70      0.77        83
           9       0.87      0.74      0.80        27
          10       0.31      0.16      0.21        31
          11       0.67      0.40      0.50         5
          12       0.93      0.53      0.67        51
          13       1.00      0.33      0.50         6
          14       0.50      0.67      0.57         9
          15       0.00      0.00      0.00         1
          16       0.67      0.67      0.67         3
          17       0.75    

In [20]:
print(f1)

0.7382198952879581


### Model Best

In [26]:
nlp = spacy.load("models/v4/model-best")
df["predicted_output"] = df["text"].apply(lambda text: nlp_predict(text, nlp,0.8))

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


In [28]:
y_pred_set = mlb.fit_transform(df["predicted_output"])
y_test_set = mlb.fit_transform(df["accept"])

y_pred_set.shape == y_test_set.shape

True

In [29]:
confusion_matrix_= multilabel_confusion_matrix(y_test_set, y_pred_set)
cls_report = classification_report(y_test_set, y_pred_set)
f1 = f1_score(y_test_set, y_pred_set, average = "micro")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
print(cls_report)

              precision    recall  f1-score   support

           0       0.80      0.88      0.83        40
           1       0.71      0.82      0.76        93
           2       0.91      0.88      0.89        91
           3       0.92      0.68      0.78        71
           4       0.83      0.59      0.69        17
           5       0.71      0.76      0.74        46
           6       1.00      0.33      0.50         6
           7       0.00      0.00      0.00         2
           8       0.89      0.67      0.77        83
           9       0.75      0.89      0.81        27
          10       0.43      0.29      0.35        31
          11       1.00      0.40      0.57         5
          12       0.91      0.59      0.71        51
          13       1.00      0.17      0.29         6
          14       0.71      0.56      0.63         9
          15       0.00      0.00      0.00         1
          16       1.00      0.67      0.80         3
          17       0.75    

In [31]:
print(f1)

0.743103448275862
