In [38]:
import os, re
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
!pip install transformers tika yake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [40]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
cd drive/MyDrive/CYsyphus/indexed_corpus

[Errno 2] No such file or directory: 'drive/MyDrive/CYsyphus/indexed_corpus'
/content/drive/.shortcut-targets-by-id/1X1APgHBDiD-6ucORNx5eM_SWKZYOgyvz/indexed_corpus


In [42]:
def remove_punc(pdf_content):
    punc = ['• ', '· ', '&', '~', ' o ', '\uf0a7', '\uf03c', '\uf0b7', 
            '–', '()', '[…]', '| ', '© ', '(Insert Scale)', '_', '%', '[', ']', 'Ü ']
    for p in punc:
        pdf_content = pdf_content.replace(p, '')
    return pdf_content

def remove_bulleted_points(pdf_content):
    pdf_content = re.sub(r'\.+ [0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+[0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)

    pdf_content = re.sub(r'\([0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+.', '', pdf_content)
    pdf_content = re.sub(r'\([a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r' [a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r'\(i+\)', '', pdf_content)
    pdf_content = re.sub(r' i+\)', '', pdf_content)

    pdf_content = re.sub('\s\s+', ' ', pdf_content)
    return pdf_content

def remove_url(pdf_content):
    url = re.findall('http[s]?://\S+', pdf_content)
    for u in url:
        pdf_content = pdf_content.replace(u, '')
    url = re.findall('www.\S+', pdf_content)
    for u in url:
        pdf_content = pdf_content.replace(u, '')
    pdf_content = re.sub(r'http[s]?://', '', pdf_content)
    return pdf_content

def filter_sentences_by_length(pdf_sentence):
    return [s for s in pdf_sentence if len(word_tokenize(s)) > 4 and len(word_tokenize(s)) < 200]

In [43]:
from tika import parser

sentences = []

for i in range(1, 16):
    
    pdf_path = str(i) + ".pdf"
    parsed_pdf = parser.from_file(pdf_path)

    pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
    pdf_content = remove_punc(pdf_content)
    pdf_content = remove_bulleted_points(pdf_content)
    pdf_content = remove_url(pdf_content)
    pdf_content = remove_punc(pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)
    pdf_content = re.sub(r'\s\s+', ' ', pdf_content)
    
    pdf_sentence = sent_tokenize(pdf_content)
    filtered_sentence = filter_sentences_by_length(pdf_sentence)
    sentences += filtered_sentence

len(sentences)

13667

In [44]:
# Extract Keywords

import yake

language = "en"
max_ngram_size = 2 
deduplication_threshold = 0.3
num_keywords = 50

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=num_keywords, features=None)
keywords_tuple = custom_kw_extractor.extract_keywords(' '.join(sentences))
keywords_list = [word.lower() for word, score in keywords_tuple]

In [45]:
eval_doc_idx = 13
doc_for_eval = 'DHS_Common_Cybersecurity_Vulnerabilities_ICS_2010.pdf'

sentences_for_eval = []

pdf_path = str(eval_doc_idx) + ".pdf"
parsed_pdf = parser.from_file(pdf_path)

pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
pdf_content = remove_punc(pdf_content)
pdf_content = remove_bulleted_points(pdf_content)
pdf_content = remove_url(pdf_content)
pdf_content = remove_punc(pdf_content)
pdf_content = re.sub(r'\.+', '.', pdf_content)
pdf_content = re.sub(r'\s\s+', ' ', pdf_content)

pdf_sentence = sent_tokenize(pdf_content)
filtered_sentence = filter_sentences_by_length(pdf_sentence)
sentences_for_eval += filtered_sentence

len(sentences_for_eval)

1412

In [46]:
# Filter Sentence with KWs

lem = WordNetLemmatizer()
sentences_for_eval_with_kw = []

for sent in sentences_for_eval:
    lem_sent = lem.lemmatize(sent)
    lem_sent = lem_sent.lower()
    if any(kw in lem_sent for kw in keywords_list):
        sentences_for_eval_with_kw.append(sent)

len(sentences_for_eval_with_kw)

888

In [73]:
from transformers import BertTokenizer, TFBertForMaskedLM, BertConfig

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# num_hidden_layers = 12
# hidden_size = 768

# model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')

# model.bert.embeddings.trainable = False
# unfreeze_layer_count = 2
# for layer in model.bert.encoder.layer[:-unfreeze_layer_count]:
#     layer.trainable = False

# model.load_weights(f'/content/drive/MyDrive/frz_layer_{unfreeze_layer_count}.h5')
# print('Model Loaded!')


num_hidden_layers = 6
hidden_size = 384

model_config = BertConfig.from_pretrained('bert-base-uncased')
model_config.num_attention_heads = 6   # Default is 12
model_config.num_hidden_layers = num_hidden_layers     # Default is 12
model_config.hidden_size = hidden_size         # Default is 768
model_config.hidden_dropout_prob = 0.2  # Default is 0.1

model = TFBertForMaskedLM(model_config)


input_ids = tokenizer("Hello, my dog is cute", return_tensors="tf")["input_ids"]
outputs = model(input_ids, labels=input_ids, output_hidden_states=True)

# Load model weights
model.load_weights(f'/content/drive/MyDrive/low_capacity_{hidden_size}.h5')
print('Model Loaded!')

outputs[2][num_hidden_layers].shape

Model Loaded!


TensorShape([1, 8, 384])

In [74]:
recomm_df = pd.read_csv('https://raw.githubusercontent.com/ColumbiaCysyphus/ml-dev/shubham-dev/summer_2022/cleaned_recs.csv', usecols=[1, 2])
recomm_df = recomm_df.dropna(how='any')
recomm_df.head(3)

Unnamed: 0,Document File Name,Recommendation text
0,GOVPUB-C13-e1840672bcc4d823d5a2b11dcb45d280.pdf,When evaluating IT products and services that ...
1,GOVPUB-C13-e1840672bcc4d823d5a2b11dcb45d280.pdf,Organizations developing their own custom secu...
2,GOVPUB-C13-e1840672bcc4d823d5a2b11dcb45d280.pdf,"Using CVE, CCE, and CPE names supports interop..."


In [75]:
recomm_df_for_clst = recomm_df[recomm_df['Document File Name '] != doc_for_eval]
recomm_df_for_eval = recomm_df[recomm_df['Document File Name '] == doc_for_eval]

recomm_sentences_for_clst = []
for item in list(recomm_df_for_clst['Recommendation text']):
  item = item.replace('\n', ' ').strip()
  item = re.sub("\s\s+", " ", item)
  item = re.sub(r'\.+', ".", item)
  item_sentence = sent_tokenize(item)
  recomm_sentences_for_clst += item_sentence

recomm_sentences_for_eval = []
for item in list(recomm_df_for_eval['Recommendation text']):
  item = item.replace('\n', ' ').strip()
  item = re.sub("\s\s+", " ", item)
  item = re.sub(r'\.+', ".", item)
  item_sentence = sent_tokenize(item)
  recomm_sentences_for_eval += item_sentence

len(recomm_sentences_for_clst), len(recomm_sentences_for_eval)

(87, 26)

In [76]:
def get_CLS_embedding(model, sent):
  input_ids = tokenizer(sent, return_tensors="tf")["input_ids"]
  outputs = model(input_ids, labels=input_ids, output_hidden_states=True)
  sentence_embedding_every_token = outputs[2][num_hidden_layers][0]  # hidden states, final layer embedd, squeeze dim
  CLS_embedding = sentence_embedding_every_token[0]  # CLS as the first token
  return CLS_embedding

In [77]:
def get_sentence_embedding(model, sent):
  input_ids = tokenizer(sent, return_tensors="tf")["input_ids"]
  outputs = model(input_ids, labels=input_ids, output_hidden_states=True)
  sentence_embedding_every_token = outputs[2][num_hidden_layers][0]
  sentence_embedding = tf.reduce_mean(sentence_embedding_every_token, 0)
  return sentence_embedding

In [78]:
# def get_batch_sentence_embedding(model, sent):
#   input_ids = tokenizer(sent, return_tensors="tf", padding='max_length', max_length=100, truncation=True)["input_ids"]
#   outputs = model(input_ids, labels=input_ids, output_hidden_states=True)
#   sentence_embedding_every_token = outputs[2][num_hidden_layers]
#   sentence_embedding = tf.reduce_mean(sentence_embedding_every_token, 1)
#   return sentence_embedding

In [79]:
recomm_len = len(recomm_sentences_for_clst)
recomm_mat = np.zeros((recomm_len, hidden_size))

for i in tqdm(range(recomm_len)):
  recomm_mat[i, :] = get_sentence_embedding(model, recomm_sentences_for_clst[i])

100%|██████████| 87/87 [00:20<00:00,  4.35it/s]


In [80]:
eval_doc_len = len(sentences_for_eval_with_kw)
eval_mat = np.zeros((eval_doc_len, hidden_size))

for i in tqdm(range(eval_doc_len)):
  eval_mat[i, :] = get_sentence_embedding(model, sentences_for_eval_with_kw[i])

# eval_mat = get_batch_sentence_embedding(model, sentences_for_eval_with_kw)

100%|██████████| 888/888 [03:18<00:00,  4.47it/s]


In [81]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

num_clst = 3
kmeans = KMeans(n_clusters=num_clst, n_init='auto').fit(recomm_mat)
recomm_centroid = kmeans.cluster_centers_
recomm_centroid.shape

(3, 384)

In [82]:
def nearest_neighbors(candidate, centroid):
  dist = np.array([[np.linalg.norm(c-x, 2) for c in centroid] for x in candidate])
  dist_reduce = dist.min(axis=1)
  # dist = cosine_similarity(candidate, centroid)
  # dist_reduce = dist.max(axis=1)
  dist_sort, idx_sort = np.sort(dist_reduce), np.argsort(dist_reduce)
  return dist_sort, idx_sort

In [83]:
top_k = 25

dist, nn_idx = nearest_neighbors(eval_mat, recomm_centroid)
top_sentences = [sentences_for_eval[idx] for idx in nn_idx[:top_k]]
top_sentences

['Examples of these services are as telnet, FTP, and rsh.',
 'Recommendation: The first step in implementing a cybersecurity program for ICS is to develop a compelling business case for the unique needs of the organization.',
 'Without sufficient removal or quoting of SQL syntax in user-controllable inputs, the generated SQL query can cause those inputs to be interpreted as SQL instead of ordinary user data.',
 'Patches are additional pieces of code that have been developed to address specific problems or flaws in existing software.',
 'Any communication can be “tunneled” through SSH.',
 'Recommendation: The vendor bears responsibility to incorporate the latest versions of third-party (and operating system) software into the current version of the ICS product before delivery.',
 'Some had newer versions available just for security fixes.',
 'Many scenarios allow for this behavior, but they are caused by a lack of data sanitization.',
 'Improper Input Validation Buffer Overflow Input va

In [84]:
len(recomm_sentences_for_eval)

26

In [85]:
recomm_sentences_for_eval

['All code should be written to validate input data.',
 'All input should be validated, not just those proven to cause buffer overflows.',
 'Every programmer should be trained in secure coding practices.',
 'All code should be reviewed and tested for input functions that could be susceptible to buffer overflow attacks.',
 'All input should be validated, not just those proven to cause buffer overflows.',
 'Input values should be validated.',
 'Network data value and integrity checking should be implemented.',
 'Use an “accept known good” input validation strategy, i.e., use a whitelist of acceptable inputs that strictly conform to specifications.',
 'Reject any input that does not strictly conform to specifications, or transform it into something that does.',
 'If possible, use library calls rather than external processes to recreate the desired functionality.',
 'Otherwise, ensure that all external commands called from the program arestatically created if possible.',
 'Perform input va