In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # matrix construction
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

import pandas as pd
import json
import os

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval

import sklearn.metrics
import numpy as np

In [None]:
# Load the small English spaCy pipeline with the components named in `disable` switched off.
# ('parser' was previously listed twice; each component only needs to be named once.)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])

# Opening Files: 

In [None]:
# Processing datasets retrieved from MITRE's Github and our webscraping:
# each JSON file is loaded and turned into a DataFrame with one row per top-level JSON key.

SCRAPED_DATA_DIR = "../src/scraping/scraped_data"

dataset1 = "dataset_full_text.json"
path_data1 = f"{SCRAPED_DATA_DIR}/{dataset1}"

dataset2 = "dataset_malware_text.json"
path_data2 = f"{SCRAPED_DATA_DIR}/{dataset2}"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the platform's
# locale encoding, so the same file could load differently (or fail) on another machine.
with open(path_data1, encoding="utf-8") as file:
    open_data = json.load(file)

with open(path_data2, encoding="utf-8") as file:
    open_data2 = json.load(file)

# The top-level keys arrive as columns, hence the transpose.
# For df1 those keys are discarded (drop=True) ...
df1 = pd.DataFrame(open_data).transpose().reset_index(drop=True)

# ... while for df2 they are kept as the first column, which is named 'url' below.
df2 = pd.DataFrame(open_data2).transpose().reset_index(drop=False)
# NOTE(review): this positional rename assumes each record has exactly these five
# fields in this order after the key column; confirm against the scraper output.
df2.columns = ['url', 'mitre_domain', 'tech_name', 'tech_id', 'software_id', 'text']



# Merging our datasets:

In [None]:
# Create one DataFrame with both datasets, stacked row-wise.
# df1 and df2 each carry their own 0..n index, so without ignore_index=True the result
# would contain duplicate row labels; renumbering keeps every label unique.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

# Cleaning: 

In [None]:
# Cleaning NAs in the list-valued columns:
# a missing value becomes "" and list("") is [], so every cell ends up holding a list.
for list_col in ('tactic_name', 'software_id'):
    df[list_col] = df[list_col].fillna("").apply(list)

In [None]:
# Keep only the rows whose text is something other than a lone newline character.
df = df.loc[df['text'].ne('\n')]

In [None]:
# Cleaning duplicates:
# flag every row whose text already appeared in an earlier row, and set those rows aside.
is_repeated_text = df.duplicated(subset='text')
dup = df.loc[is_repeated_text]

In [None]:
# Keep the first occurrence of each text and renumber the rows from 0.
first_occurrence = ~df.duplicated(subset='text')
df_no_dup = df.loc[first_occurrence].reset_index(drop=True)

In [None]:
# Fold the labels of every dropped duplicate into the row that was kept for the same text,
# so no label is lost by de-duplication. The lists stored in df_no_dup are extended in place.

# Map each text to the label of its surviving row once, instead of re-scanning the whole
# frame for every duplicate. Texts are unique in df_no_dup, so each maps to one row.
kept_row_by_text = {text: row_label for row_label, text in df_no_dup['text'].items()}

for _, row in dup.iterrows():
    row_id = kept_row_by_text[row['text']]
    for col in ['mitre_domain', 'tech_id', 'tech_name', 'software_id', 'tactic_name']:
        # .loc hands back the list object held in the cell; appending to it updates the frame.
        merged_list = df_no_dup.loc[row_id, col]
        for item in row[col]:
            if item not in merged_list:
                merged_list.append(item)
        

# Filtering URLs: 

In [None]:
# Continue with the de-duplicated frame. This is a plain rebinding: `df` and `df_no_dup`
# now refer to the same DataFrame object, no copy is made.
df = df_no_dup

In [None]:
# A URL containing any of these substrings is treated as not relevant.
_IRRELEVANT_URL_WORDS = (
    'microsoft', 'apple', 'github', 'wikipedia',
    'support.office', 'amazon', 'gitlab', 'capec', 'docker', 'youtube', 'google', 'mitre', 'zip',
    'twitter',
)


def is_url_relevant(url, blocked_words=_IRRELEVANT_URL_WORDS):
    """Return False if `url` contains any blocked word, True otherwise.

    Matching is a plain substring test anywhere in the URL (host, path or query) and
    ignores letter case, so 'https://www.Microsoft.com' is rejected like its lowercase form.

    Parameters
    ----------
    url : str
        The URL to check. A non-string value (for example the NaN pandas uses for a
        missing cell) cannot be matched against the list and is reported as relevant
        instead of raising.
    blocked_words : iterable of str, optional
        Substrings that mark a URL as not relevant. Defaults to the notebook's list.

    Returns
    -------
    bool
    """
    if not isinstance(url, str):
        return True
    url_lower = url.lower()
    return not any(word.lower() in url_lower for word in blocked_words)

# Evaluate every URL once, then keep only the rows that passed the check.
url_is_relevant = df['url'].map(is_url_relevant)
df = df.loc[url_is_relevant]
df

In [None]:
# Print the URL and text of the first few remaining rows as a sanity check.
# The cut-off is positional (head) rather than a comparison on the index label: after the
# filtering above the labels are no longer contiguous, so a label test shows an arbitrary
# number of rows.
PREVIEW_ROWS = 12  # what the previous `i > 10` test printed on an unfiltered 0..n index

for _, row in df.head(PREVIEW_ROWS).iterrows():
    print('--------------')
    print(row['url'])
    print('--------------')
    print(row['text'])
    

In [None]:
# Write the merged and cleaned dataset to CSV, leaving the row index out of the file.
merged_csv_path = "../data/merged_dataset.csv"
df.to_csv(merged_csv_path, index=False)


In [None]:
# One row per (document, technique id): each element of a 'tech_id' list gets its own row,
# and the rows are then renumbered from 0.
exploded_by_tech = df.explode('tech_id')
df_tech = exploded_by_tech.reset_index(drop=True)

# Add Tactic to dataset:

In [None]:
# Replace `df` with the contents of 'merged_dataset_noMalwareNames.csv', read from the
# current working directory.
# NOTE(review): the save cell earlier wrote '../data/merged_dataset.csv'; nothing visible in
# this notebook creates the file read here. Presumably it comes from a separate step —
# confirm it exists (and where) before running on a fresh kernel.
df = pd.read_csv('merged_dataset_noMalwareNames.csv')

In [None]:
# These columns come back from the CSV as text; literal_eval parses each cell as a Python
# literal (presumably the list that was written out — verify against the file).
stringified_cols = ('mitre_domain', 'tech_name', 'tech_id', 'software_id')
for list_col in stringified_cols:
    df[list_col] = df[list_col].map(literal_eval)

In [None]:
# Load the tactic dataset from JSON.
tactic_dataset = "tactic_dataset.json"
path_dataset = f"../src/{tactic_dataset}"

# Decode explicitly as UTF-8 so the result does not depend on the platform's locale default.
# NOTE: this rebinds `open_data`, which the dataset-loading cell near the top also assigns;
# `tactic_list` reads this name when it is called, so this cell must have run first.
with open(path_dataset, encoding="utf-8") as file:
    open_data = json.load(file)

In [None]:
def tactic_list(tech_id, tactic_data=None):
    """Return the tactic IDs that share at least one technique ID with `tech_id`.

    Parameters
    ----------
    tech_id : iterable of str
        Technique IDs attached to one document.
    tactic_data : dict, optional
        Mapping of tactic ID to a record whose ``'Technique_ID'`` entry holds, at
        position 0, the collection of technique IDs for that tactic. When omitted,
        the notebook-level ``open_data`` is used, as before.
        NOTE(review): position 0 is assumed to be a list of IDs; if it were a single
        string the comparison would be made character by character. Confirm against
        tactic_dataset.json.

    Returns
    -------
    list of str
        Matching tactic IDs, in the iteration order of `tactic_data`. Empty when
        nothing matches or `tech_id` is empty.
    """
    if tactic_data is None:
        # Looked up at call time, so the cell that loads the tactic JSON must have run.
        tactic_data = open_data
    tech_id_set = set(tech_id)
    # isdisjoint stops at the first shared ID instead of building the full intersection.
    return [
        tactic_id
        for tactic_id, record in tactic_data.items()
        if not tech_id_set.isdisjoint(record['Technique_ID'][0])
    ]

In [None]:
# Derive each row's tactic IDs from its technique IDs and store them in a new column.
tactic_ids_per_row = df['tech_id'].map(tactic_list)
df['tactic_id'] = tactic_ids_per_row

# Export Cleaned Dataset: 

In [None]:
# Export the new dataset for training.
# The row index is written as the first, unnamed column here (no index=False).
training_csv_path = '../data/training_dataset_full.csv'
df.to_csv(training_csv_path)