In [1]:
import os, re
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
!pip install transformers tika yake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/MyDrive/CYsyphus/indexed_corpus

/content/drive/.shortcut-targets-by-id/1X1APgHBDiD-6ucORNx5eM_SWKZYOgyvz/indexed_corpus


In [5]:
def remove_punc(pdf_content):
    punc = ['• ', '· ', '&', '~', ' o ', '\uf0a7', '\uf03c', '\uf0b7', 
            '–', '()', '[…]', '| ', '© ', '(Insert Scale)', '_', '%', '[', ']', 'Ü ']
    for p in punc:
        pdf_content = pdf_content.replace(p, '')
    return pdf_content

def remove_bulleted_points(pdf_content):
    pdf_content = re.sub(r'\.+ [0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+[0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)

    pdf_content = re.sub(r'\([0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+.', '', pdf_content)
    pdf_content = re.sub(r'\([a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r' [a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r'\(i+\)', '', pdf_content)
    pdf_content = re.sub(r' i+\)', '', pdf_content)

    pdf_content = re.sub('\s\s+', ' ', pdf_content)
    return pdf_content

def remove_url(pdf_content):
    url = re.findall('http[s]?://\S+', pdf_content)
    for u in url:
        pdf_content = pdf_content.replace(u, '')
    url = re.findall('www.\S+', pdf_content)
    for u in url:
        pdf_content = pdf_content.replace(u, '')
    pdf_content = re.sub(r'http[s]?://', '', pdf_content)
    return pdf_content

def filter_sentences_by_length(pdf_sentence):
    return [s for s in pdf_sentence if len(word_tokenize(s)) > 4 and len(word_tokenize(s)) < 200]

In [6]:
from tika import parser

sentences = []

for i in range(1, 16):
    
    pdf_path = str(i) + ".pdf"
    parsed_pdf = parser.from_file(pdf_path)

    pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
    pdf_content = remove_punc(pdf_content)
    pdf_content = remove_bulleted_points(pdf_content)
    pdf_content = remove_url(pdf_content)
    pdf_content = remove_punc(pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)
    pdf_content = re.sub(r'\s\s+', ' ', pdf_content)
    
    pdf_sentence = sent_tokenize(pdf_content)
    filtered_sentence = filter_sentences_by_length(pdf_sentence)
    sentences += filtered_sentence

len(sentences)

2023-05-12 02:36:54,438 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar to /tmp/tika-server.jar.
2023-05-12 02:36:58,639 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
INFO:tika.tika:Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server-standard/2.6.0/tika-server-standard-2.6.0.jar.md5 to /tmp/tika-server.jar.md5.
2023-05-12 02:36:59,752 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


13667

In [7]:
# Extract Keywords

import yake

language = "en"
max_ngram_size = 2 
deduplication_threshold = 0.3
num_keywords = 50

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=num_keywords, features=None)
keywords_tuple = custom_kw_extractor.extract_keywords(' '.join(sentences))
keywords_list = [word.lower() for word, score in keywords_tuple]

In [8]:
eval_doc_idx = 13
doc_for_eval = 'DHS_Common_Cybersecurity_Vulnerabilities_ICS_2010.pdf'

sentences_for_eval = []

pdf_path = str(eval_doc_idx) + ".pdf"
parsed_pdf = parser.from_file(pdf_path)

pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
pdf_content = remove_punc(pdf_content)
pdf_content = remove_bulleted_points(pdf_content)
pdf_content = remove_url(pdf_content)
pdf_content = remove_punc(pdf_content)
pdf_content = re.sub(r'\.+', '.', pdf_content)
pdf_content = re.sub(r'\s\s+', ' ', pdf_content)

pdf_sentence = sent_tokenize(pdf_content)
filtered_sentence = filter_sentences_by_length(pdf_sentence)
sentences_for_eval += filtered_sentence

len(sentences_for_eval)

1412

In [9]:
# Filter Sentence with KWs

lem = WordNetLemmatizer()
sentences_for_eval_with_kw = []

for sent in sentences_for_eval:
    lem_sent = lem.lemmatize(sent)
    lem_sent = lem_sent.lower()
    if any(kw in lem_sent for kw in keywords_list):
        sentences_for_eval_with_kw.append(sent)

len(sentences_for_eval_with_kw)

888

In [10]:
from transformers import BertTokenizer, TFBertForMaskedLM, BertConfig

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


num_hidden_layers = 12
hidden_size = 768

model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')

# model.bert.embeddings.trainable = False
# unfreeze_layer_count = 2
# for layer in model.bert.encoder.layer[:-unfreeze_layer_count]:
#     layer.trainable = False

# model.load_weights(f'/content/drive/MyDrive/frz_layer_{unfreeze_layer_count}.h5')
# print('Model Loaded!')


# num_hidden_layers = 6
# hidden_size = 768

# model_config = BertConfig.from_pretrained('bert-base-uncased')
# model_config.num_attention_heads = 6   # Default is 12
# model_config.num_hidden_layers = num_hidden_layers     # Default is 12
# model_config.hidden_size = hidden_size         # Default is 768
# model_config.hidden_dropout_prob = 0.2  # Default is 0.1

# model = TFBertForMaskedLM(model_config)


input_ids = tokenizer("Hello, my dog is cute", return_tensors="tf")["input_ids"]
outputs = model(input_ids, labels=input_ids, output_hidden_states=True)

# Load model weights
model.load_weights('/content/drive/MyDrive/baseline_kw.h5')
print('Model Loaded!')

outputs[2][num_hidden_layers].shape

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Model Loaded!


TensorShape([1, 8, 768])

In [11]:
recomm_df = pd.read_csv('https://raw.githubusercontent.com/ColumbiaCysyphus/ml-dev/shubham-dev/summer_2022/cleaned_recs.csv', usecols=[1, 2])
recomm_df = recomm_df.dropna(how='any')
recomm_df.head(3)

Unnamed: 0,Document File Name,Recommendation text
0,GOVPUB-C13-e1840672bcc4d823d5a2b11dcb45d280.pdf,When evaluating IT products and services that ...
1,GOVPUB-C13-e1840672bcc4d823d5a2b11dcb45d280.pdf,Organizations developing their own custom secu...
2,GOVPUB-C13-e1840672bcc4d823d5a2b11dcb45d280.pdf,"Using CVE, CCE, and CPE names supports interop..."


In [12]:
recomm_df_for_clst = recomm_df[recomm_df['Document File Name '] != doc_for_eval]
recomm_df_for_eval = recomm_df[recomm_df['Document File Name '] == doc_for_eval]

recomm_sentences_for_clst = []
for item in list(recomm_df_for_clst['Recommendation text']):
  item = item.replace('\n', ' ').strip()
  item = re.sub("\s\s+", " ", item)
  item = re.sub(r'\.+', ".", item)
  item_sentence = sent_tokenize(item)
  recomm_sentences_for_clst += item_sentence

recomm_sentences_for_eval = []
for item in list(recomm_df_for_eval['Recommendation text']):
  item = item.replace('\n', ' ').strip()
  item = re.sub("\s\s+", " ", item)
  item = re.sub(r'\.+', ".", item)
  item_sentence = sent_tokenize(item)
  recomm_sentences_for_eval += item_sentence

len(recomm_sentences_for_clst), len(recomm_sentences_for_eval)

(87, 26)

In [13]:
def get_CLS_embedding(model, sent):
  input_ids = tokenizer(sent, return_tensors="tf")["input_ids"]
  outputs = model(input_ids, labels=input_ids, output_hidden_states=True)
  sentence_embedding_every_token = outputs[2][num_hidden_layers][0]  # hidden states, final layer embedd, squeeze dim
  CLS_embedding = sentence_embedding_every_token[0]  # CLS as the first token
  return CLS_embedding

In [14]:
def get_sentence_embedding(model, sent):
  input_ids = tokenizer(sent, return_tensors="tf")["input_ids"]
  outputs = model(input_ids, labels=input_ids, output_hidden_states=True)
  sentence_embedding_every_token = outputs[2][num_hidden_layers][0]
  sentence_embedding = tf.reduce_mean(sentence_embedding_every_token, 0)
  return sentence_embedding

In [15]:
# def get_batch_sentence_embedding(model, sent):
#   input_ids = tokenizer(sent, return_tensors="tf", padding='max_length', max_length=100, truncation=True)["input_ids"]
#   outputs = model(input_ids, labels=input_ids, output_hidden_states=True)
#   sentence_embedding_every_token = outputs[2][num_hidden_layers]
#   sentence_embedding = tf.reduce_mean(sentence_embedding_every_token, 1)
#   return sentence_embedding

In [16]:
recomm_len = len(recomm_sentences_for_clst)
recomm_mat = np.zeros((recomm_len, hidden_size))

for i in tqdm(range(recomm_len)):
  recomm_mat[i, :] = get_sentence_embedding(model, recomm_sentences_for_clst[i])

100%|██████████| 87/87 [00:45<00:00,  1.91it/s]


In [17]:
eval_doc_len = len(sentences_for_eval_with_kw)
eval_mat = np.zeros((eval_doc_len, hidden_size))

for i in tqdm(range(eval_doc_len)):
  eval_mat[i, :] = get_sentence_embedding(model, sentences_for_eval_with_kw[i])

# eval_mat = get_batch_sentence_embedding(model, sentences_for_eval_with_kw)

100%|██████████| 888/888 [07:16<00:00,  2.03it/s]


In [18]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

num_clst = 3
kmeans = KMeans(n_clusters=num_clst, n_init='auto').fit(recomm_mat)
recomm_centroid = kmeans.cluster_centers_
recomm_centroid.shape

(3, 768)

In [19]:
def nearest_neighbors(candidate, centroid):
  dist = np.array([[np.linalg.norm(c-x, 2) for c in centroid] for x in candidate])
  dist_reduce = dist.min(axis=1)
  # dist = cosine_similarity(candidate, centroid)
  # dist_reduce = dist.max(axis=1)
  dist_sort, idx_sort = np.sort(dist_reduce), np.argsort(dist_reduce)
  return dist_sort, idx_sort

In [20]:
top_k = 25

dist, nn_idx = nearest_neighbors(eval_mat, recomm_centroid)
top_sentences = [sentences_for_eval[idx] for idx in nn_idx[:top_k]]
top_sentences

['Recommendation: Because of the complexity of ICS software and possible modifications to the underlying operating system, changes must undergo comprehensive regression testing.',
 'Examples of these services are as telnet, FTP, and rsh.',
 'Any communication can be “tunneled” through SSH.',
 'Usage of common administrative passwords should be discouraged.',
 'One should keep this in mind when interpreting common vulnerability data.',
 'Some had newer versions available just for security fixes.',
 'In every case, the lack of protection of user credentials may lead to the attacker gaining increased privileges on the ICS and thus being able to more effectively advance the attack.',
 'Although differences in these systems exist, their similarities enable a common framework for discussing and defining security controls.',
 'Correlated and compiled in this report are vulnerabilities from general knowledge gained from DHS CSSP assessments and Industrial Control Systems Cyber Emergency Respon

In [21]:
len(recomm_sentences_for_eval)

26

In [22]:
recomm_sentences_for_eval

['All code should be written to validate input data.',
 'All input should be validated, not just those proven to cause buffer overflows.',
 'Every programmer should be trained in secure coding practices.',
 'All code should be reviewed and tested for input functions that could be susceptible to buffer overflow attacks.',
 'All input should be validated, not just those proven to cause buffer overflows.',
 'Input values should be validated.',
 'Network data value and integrity checking should be implemented.',
 'Use an “accept known good” input validation strategy, i.e., use a whitelist of acceptable inputs that strictly conform to specifications.',
 'Reject any input that does not strictly conform to specifications, or transform it into something that does.',
 'If possible, use library calls rather than external processes to recreate the desired functionality.',
 'Otherwise, ensure that all external commands called from the program arestatically created if possible.',
 'Perform input va