### Install the required packages (skip if already installed)

In [None]:
!pip install -U spacy
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

In [None]:
import os
import pandas as pd
import re
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load the English transformer pipeline. In the cells visible in this notebook
# it is only used through nlp.make_doc(...) when building the DocBin files.
nlp = spacy.load("en_core_web_trf")

### Importing relevant training files

In [None]:
# JSONL annotation files, one variable per file
file1 = "forum_breached_20221115_20221201_155_annotations.jsonl"
file2 = "forum_exploit_20220101_20220201_300_posts_set1_226_annotations.jsonl"
file3 = "forum_exploit_20220301_20220401_251_annotations.jsonl"
file4 = "forum_exploit_20220801_20220815_163_annotations.jsonl"
file5 = "forum_nulled_20220801_20220815_147_annotations.jsonl"
file6 = "forum_xss_posts_20220801_20220815_157_annotations.jsonl"
file7 = "popular_forums_20221101_20221104_500_posts_set1_486_annotations.jsonl"
phishing = "phishing.jsonl"
company_orginfo = "company_orginfo.jsonl"
vuln = "vulnerability.jsonl"

# Every file that gets merged into the training dataframe, in load order
files = [
    file1,
    file2,
    file3,
    file4,
    file5,
    file6,
    file7,
    phishing,
    company_orginfo,
    vuln,
]

#Importing relevant files

def merge_jsonl_files(files):
    """Read each JSON Lines file in ``files`` and stack them into one dataframe.

    File names are resolved against ``../prodigy/annotation_output`` relative
    to the current working directory. Each name and its resolved path are
    printed before the file is read.

    Args:
        files: Iterable of JSONL file names.

    Returns:
        The concatenation of the per-file dataframes. The row index of each
        file is kept as read, so index values can repeat across files.
    """
    annotation_dir = os.path.join(os.getcwd(), "..", "prodigy", "annotation_output")
    frames = []

    for name in files:
        source = os.path.join(annotation_dir, name)
        print(name)
        print(source)
        frames.append(pd.read_json(source, lines=True))

    return pd.concat(frames)

#Removing special characters
def sp_char_remove(review):
    """Replace bracketed spans and all non-letter characters with spaces.

    Args:
        review: Text to clean.

    Returns:
        ``review`` where every ``[...]`` span has first been replaced by a
        single space, and then every character outside ``a-z``/``A-Z`` has
        been replaced by a space (so the length shrinks only for bracketed
        spans).
    """
    # Raw strings keep the regex backslashes from being parsed as (invalid)
    # string escape sequences.
    review = re.sub(r'\[[^]]*\]', ' ', review)
    review = re.sub(r'[^a-zA-Z]', ' ', review)
    return review

#Removing stopwords
def stopword_remover(text):
    """Split ``text`` on whitespace and drop the English stopwords.

    Args:
        text: String to filter.

    Returns:
        List of the whitespace-separated words of ``text`` that are not in
        ``stopwords.words('english')``, in their original order.
    """
    # NOTE(review): `stopwords` is not imported in the visible part of this
    # file (presumably nltk.corpus.stopwords) -- confirm it is in scope
    # before calling this helper.
    words = text.split()    #splitting into individual words
    if not words:
        # Nothing to filter, so the stopword list is not needed at all.
        return []
    # Fetch the stopword list once instead of once per word; a set gives
    # constant-time membership checks.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def url_remover(text):
    """Replace every ``http...`` token in ``text`` with a single space.

    A token is ``http`` followed by one or more non-whitespace characters,
    so both ``http://`` and ``https://`` links are covered (as is any other
    word that merely starts with ``http``).

    Args:
        text: String to clean.

    Returns:
        ``text`` with each matching token replaced by one space.
    """
    # Raw string so that \S reaches the regex engine instead of being treated
    # as an (invalid) string escape.
    return re.sub(r"http\S+", " ", text)

#Total dataframe
df_dummy = merge_jsonl_files(files)

# Keep only the rows whose annotation was accepted
df_dummy_dummy = df_dummy.loc[df_dummy["answer"] == "accept"]

# Columns that are not used by the cells further down
unused_columns = ["_input_hash","_session_id","_task_hash","_view_id","options","config", "answer"]
df = df_dummy_dummy.drop(columns=unused_columns)
#df["accept"] = df["accept"].apply(lambda x: x if x else ["EMPTY"])

# Strip URLs from every text
df["text"] = df["text"].apply(url_remover)
#df["text"] = df["text"].apply(lambda x: x.lower())
del df["meta"]

# Display the resulting dataframe
df

In [None]:
# Display the text column after URL removal
df["text"]

### Converting labelled outputs to a binary matrix

In [None]:
cwd = os.getcwd()
label_path = os.path.join(cwd, "..","labels.txt")
# Read the label file inside a context manager so the handle is always closed
with open(label_path, "r") as label_file:
    label_data = label_file.read()
# One label per line. Blank lines (e.g. from a trailing newline at the end of
# the file) are skipped so that an empty string never becomes a category.
labels = [line for line in label_data.split("\n") if line.strip()]
# In the cells visible in this notebook only mlb.classes is read; the
# binarizer itself is never fitted.
mlb = MultiLabelBinarizer(classes=labels)

In [None]:
# Display the label list that was passed to the binarizer
mlb.classes

### Splitting df into training and validation sets


In [None]:
#Splitting df into training (80%) and validation (20%)
# A fixed random_state makes the split -- and therefore the .spacy files
# written from it -- reproducible across notebook runs.
train, validation = train_test_split(df, test_size=0.2, random_state=42)

print("size of training data:",len(train))
# The message says "test data", but the value printed is the validation split
print("size of test data:", len(validation))

### Creating DocBin

In [None]:
def convert_text_to_bin_format(nlp, row, categories):
    """Build a tokenized doc for one row and attach its category flags.

    Args:
        nlp: Object providing ``make_doc(text)``.
        row: Mapping with a ``"text"`` entry and an ``"accept"`` entry that
            holds the labels selected for that text.
        categories: Iterable of all category names.

    Returns:
        The doc returned by ``nlp.make_doc`` with ``doc.cats`` set to a dict
        that maps every category to 0 and every label in ``row["accept"]``
        to 1.
    """
    doc = nlp.make_doc(row["text"])

    # Start with every category switched off ...
    flags = dict.fromkeys(categories, 0)
    # ... then switch on the labels chosen for this row
    flags.update((label, 1) for label in row["accept"])

    doc.cats = flags
    return doc

In [None]:
#Creating a DocBin - train
num_of_rows = len(train)
docs = []
categories = mlb.classes

# One doc per training row, with its category flags attached
for _, row in train.iterrows():
    doc = convert_text_to_bin_format(nlp, row, categories)
    docs.append(doc)

train_doc_bin = DocBin(docs=docs)
curr_path = os.getcwd()
path = os.path.join(curr_path,"..","data","transformer_data_no_url","training.spacy")

# Make sure the output directory exists so that writing the file does not
# fail on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)
train_doc_bin.to_disk(path)

In [None]:
#Creating a DocBin - validation
num_of_rows = len(validation)
docs = []
categories = mlb.classes

# One doc per validation row, with its category flags attached
for _, row in validation.iterrows():
    doc = convert_text_to_bin_format(nlp, row, categories)
    docs.append(doc)

test_doc_bin = DocBin(docs=docs)
curr_path = os.getcwd()
path = os.path.join(curr_path,"..","data","transformer_data_no_url","validation.spacy")

# Make sure the output directory exists so that writing the file does not
# fail on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)
test_doc_bin.to_disk(path)

### Creating specific spacy config files for training


!python -m spacy init fill-config <path/to/input/base/config/file>  <output/config/path>

In [None]:
# Expand the base text-classification config into a full training config.
# NOTE(review): this writes configs/txt_classification_config_batch128_raw.cfg
# (relative to the current directory), while the training cell uses
# ../configs/txt_classification_config.cfg -- confirm which file and directory
# are intended.
!python -m spacy init fill-config configs/base_config_textcat.cfg configs/txt_classification_config_batch128_raw.cfg

### Training the model using spaCy

!python -m spacy train <path/of/config/file> --output <path/to/save/model> --paths.train <training/data/path> --paths.dev <validation/data/path>

In [None]:
# Train with the training/validation .spacy files and save the model under ../models/v4.
# NOTE(review): the comparison cells load "models/trf_output/model-best", not
# ../models/v4 -- confirm which model directory should be evaluated.
!python -m spacy train ../configs/txt_classification_config.cfg --output ../models/v4 --paths.train ./../data/transformer_data_no_url/training.spacy --paths.dev ./../data/transformer_data_no_url/validation.spacy

### Comparing models

In [None]:
def standardize_tags(doc, threshold):
    """Turn the category scores of ``doc`` into 0/1 flags.

    Args:
        doc: Object whose ``cats`` attribute maps category name to score.
        threshold: Scores greater than or equal to this value become 1,
            all others become 0.

    Returns:
        A new dict mapping every category in ``doc.cats`` to 0 or 1.
        ``doc.cats`` itself is left untouched, so the original scores stay
        available (e.g. for trying another threshold on the same doc).
    """
    return {
        label: 1 if score >= threshold else 0
        for label, score in doc.cats.items()
    }


In [None]:
# Load a trained pipeline and run it on one sample post.
# NOTE(review): the training cell saves to ../models/v4, while this loads
# models/trf_output/model-best -- confirm this is the model to compare.
trained_nlp = spacy.load("models/trf_output/model-best")
text = " 685K HQ Private Combolist Email:Pass [Netflix,Minecraft,Uplay,Steam,Paypal,Hulu,Vpn,Spotify,Etc....]  PLZ REPLY THIS THREAD FOR MOR COMBO  Download Link......   https://rosefile.net/2m9km2ui7g/685K.zip.html 685K HQ Private Combolist Email:Pass [Netflix,Minecraft,Uplay,Steam,Paypal,Hulu,Vpn,Spotify,Etc....]\n\nPLZ REPLY THIS THREAD FOR MOR COMBO\n\nDownload Link......\n\nHidden Content\n\nReply  to this topic to view hidden content or  Update your account. (https://crackingall.com/index.php?/subscriptions/)"
doc = trained_nlp(text)
# Display the per-category values the pipeline assigned to this text
doc.cats
#print(standardize_tags(doc, 0.9))
#doc.cats

In [None]:
# Second sample post: run the trained pipeline and print the categories as
# 0/1 flags using a 0.9 threshold.
text = "Invicti Vulnerability Scanner Professional version. Windows version. I tested it and it works fine. \n\nNot spywares, not backdoors or any type of bullshit. Now, as always, you should try this in your virtual machine and all that.\n\nInstruction to use this:\n\nYou just have to find the file files in the folder and the main one is\u00a0Netsparker.exe. Its pretty easy to use but if anything, you should try googling etc. I is important that you allow this in your firewall list of programs etc. This is a version for windows.\n\nI tasted it against some vulnerable and non vulnerable web app that I setup on my machine\u00a0and it works.\n\nDisclaimer: This is for educational purposes only and you should always have permission to perform the scanning etc. I don't take any responsibility for stupid skidoos\n\nand Mr robot fans.\n\nI wanted to share this without credits but I feel that leachers are dangerous and usually they use the tools to cause damage and do not contribute at all.\n\nEnjoy\n\nHidden Content\n\nYou must reply to this thread to view this content.\n\nHidden Content\n\nYou must reply to this thread to view this content."
doc = trained_nlp(text)
print(standardize_tags(doc, 0.9))