In [1]:
import pandas as pd 
import numpy as np
import spacy as sp

nlp = sp.load('en_core_web_sm')
nlp_lg = sp.load('en_core_web_lg')

In [2]:
import matplotlib.pyplot as plt
import seaborn as sn

## Import data

In [6]:
full_data_clustered = pd.read_csv('full_data_clustered.csv')
full_data_clustered = full_data_clustered.replace(np.nan,'None')

In [4]:
full_data_clustered.head(2)

Unnamed: 0,CVE ID,Publish Date,Update Date,Number Of Related Vulnerabilities,index,CWE ID,Vulnerability Type(s),Score,Access,Complexity,Authentication,Conf.,Integ.,Avail,cluster,CVE Summary,Vulnerability Name,Vulnerability Description,CLEAN TEXT,cluster_2
0,CVE-2019-1020019,2019-07-29,2019-07-31,22413.0,1,79,XSS,4.3,Remote,Medium,Not required,,Partial,,0,invenio-previewer before 1.0.0a12 allows XSS.,Failure to Preserve Web Page Structure ('Cross...,The software does not sufficiently valida...,failure preserve web page structure cross site...,1
1,CVE-2019-1020018,2019-07-29,2022-04-18,3011.0,2,287,Unknown,7.5,Remote,Low,Not required,Partial,Partial,Partial,2,Discourse before 2.3.0 and 2.4.x before 2.4.0....,Improper Authentication,When an actor claims to have a given iden...,improper authentication actor claim give ident...,2


## Test Similarity

- To classify text based on similarity to the term "critical infrastructure," you can use word embeddings to measure the semantic similarity between the text and the target term. One common way to do this is by using pre-trained word embeddings like Word2Vec or GloVe
- spaCy provides a Word2Vec-based word embeddings model called "en_core_web_lg," which is trained on large corpora and includes word vectors for over 1 million unique words.
- Sentence embeddings are useful when you want to compare the similarity or meaning of entire sentences or documents.

In [5]:
s1 = full_data_clustered['CLEAN TEXT'][1]
s2 = 'critical infrastructure'

doc1 = nlp(s1)
doc2 = nlp(s2)

doc1.similarity(doc2) 

  doc1.similarity(doc2)


0.42953072988705027

In [6]:
doc1 = nlp_lg(s1)
doc2 = nlp_lg(s2)

doc1.similarity(doc2)

0.6251714899710745

In [7]:

def similarity(text):
    docc = nlp_lg(text)
    ci = nlp_lg('critical infrastructure')
    return docc.similarity(ci)


In [8]:
sim = full_data_clustered.sample(5)
sim['CI SCORE'] = sim['CLEAN TEXT'].apply(similarity)
sim[['CVE ID','index','CLEAN TEXT','CI SCORE']]

Unnamed: 0,CVE ID,index,CLEAN TEXT,CI SCORE
19498,CVE-2020-8236,19499,improper authentication actor claim give ident...,0.590032
9330,CVE-2019-0180,9331,insufficiently protected credential weakness o...,0.74761
15468,CVE-2020-16269,15469,improper input validation product validate inc...,0.732623
44367,CVE-2022-40944,44368,improper sanitization special elements sql com...,0.718948
32472,CVE-2021-31231,32473,improper input validation product validate inc...,0.682499


## Apply to all data
- Word embeddings capture the semantic meaning of words by representing them as dense, low-dimensional vectors. Popular word embedding models include Word2Vec, GloVe, and FastText.
- Word embeddings capture semantic relationships, but they require pre-training on large corpora and may not work well for rare or out-of-vocabulary words.
- Docs : https://spacy.io/usage/linguistic-features#vectors-similarity

In [9]:

def similarity(text):
    docc = nlp_lg(text)
    ci = nlp_lg('critical infrastructure')
    return docc.similarity(ci)


def chunks(csv_input_name, input_column, output_column, csv_output_name, func, chunksize=10000):
    i = 1
    df = pd.DataFrame()
    for reader in pd.read_csv(csv_input_name, chunksize=chunksize):
        reader[output_column] = reader[input_column].apply(func)
        print('chunk',i,'\n ',reader,'\n=============================\n')
        df = pd.concat([df,reader] , ignore_index=True)
        i = i+1
    return df.to_csv(csv_output_name , index=False)


In [11]:
# chunks(csv_input_name = 'full_data_clustered.csv',
#        input_column = 'CLEAN TEXT',
#        output_column = 'CI SCORE',
#        csv_output_name = 'full_data_clustered_ssim.csv',
#        func = similarity,
#        chunksize = 10000
#       )
# =~30 min

## Final data

In [12]:
full_data_clustered_ssim = pd.read_csv('full_data_clustered_ssim.csv')

In [17]:
full_data_clustered_ssim.head(2)

Unnamed: 0,CVE ID,Publish Date,Update Date,Number Of Related Vulnerabilities,index,CWE ID,Vulnerability Type(s),Score,Access,Complexity,...,Conf.,Integ.,Avail,cluster,CVE Summary,Vulnerability Name,Vulnerability Description,CLEAN TEXT,cluster_2,CI SCORE
0,CVE-2019-1020019,2019-07-29,2019-07-31,22413.0,1,79,XSS,4.3,Remote,Medium,...,,Partial,,0,invenio-previewer before 1.0.0a12 allows XSS.,Failure to Preserve Web Page Structure ('Cross...,The software does not sufficiently valida...,failure preserve web page structure cross site...,1,0.666677
1,CVE-2019-1020018,2019-07-29,2022-04-18,3011.0,2,287,Unknown,7.5,Remote,Low,...,Partial,Partial,Partial,2,Discourse before 2.3.0 and 2.4.x before 2.4.0....,Improper Authentication,When an actor claims to have a given iden...,improper authentication actor claim give ident...,2,0.625171


## Plots

In [14]:
ci_score = full_data_clustered_ssim['CI SCORE']

In [15]:
np.mean(ci_score)

0.7046297998870444

In [16]:
ci_score.nunique()

52905

In [19]:
round(ci_score,2).value_counts()

CI SCORE
0.67    5495
0.72    5354
0.74    5251
0.71    5138
0.73    4925
0.70    4788
0.68    4121
0.75    4094
0.66    3979
0.69    3941
0.76    3042
0.65    2071
0.77    1771
0.64    1484
0.78     984
0.79     937
0.63     838
0.62     572
0.80     503
0.61     460
0.60     449
0.59     366
0.58     286
0.81     232
0.57     179
0.56     145
0.55      87
0.54      60
0.82      52
0.53      22
0.51      12
0.52      12
0.83       8
0.50       8
0.49       6
0.47       4
0.41       3
0.34       3
0.46       3
0.39       2
0.36       1
0.33       1
0.32       1
0.44       1
0.29       1
0.43       1
0.30       1
0.31       1
0.84       1
0.37       1
0.45       1
Name: count, dtype: int64

In [20]:
sel = ['CVE ID','index','Vulnerability Type(s)','CLEAN TEXT','CI SCORE']

threshold = 0.8
full_data_clustered_ssim[sel][full_data_clustered_ssim['CI SCORE'] >= 0.78 ]

Unnamed: 0,CVE ID,index,Vulnerability Type(s),CLEAN TEXT,CI SCORE
35,CVE-2019-1010283,36,+Info,information exposure information exposure inte...,0.801313
38,CVE-2019-1010266,39,DoS,allocation resources limits throttling softwar...,0.806116
40,CVE-2019-1010262,41,DoS,expose dangerous method function software prov...,0.790834
49,CVE-2019-1010249,50,Overflow,integer overflow wraparound software perform c...,0.784686
87,CVE-2019-1010172,88,DoS,uncontrolle resource consumption resource exha...,0.796503
...,...,...,...,...,...
61674,CVE-2014-0068,61675,Unknown,incorrect permission assignment critical resou...,0.793969
61675,CVE-2013-20004,61676,DoS,uncontrolle resource consumption resource exha...,0.787583
61677,CVE-2013-10004,61678,Unknown,improper restriction excessive authentication ...,0.792994
61694,CVE-2007-20001,61695,DoS,uncontrolle resource consumption resource exha...,0.789261


# Testing Anothers Pretrained models

## Hugging face

In [21]:
# !pip install -U sentence-transformers
# !pip install sentence-transformers

In [18]:
# from sentence_transformers import SentenceTransformer, util

# s1 = full_data_clustered['CLEAN TEXT'][1]
# s2 = 'critical infrastructure'

# #Compute embedding for both lists
# embedding_1= model.encode(s1, convert_to_tensor=True)
# embedding_2 = model.encode(s2, convert_to_tensor=True)

# util.pytorch_cos_sim(embedding_1, embedding_2)
# ## tensor([[0.6003]])

## Transformers
- you can use various techniques to measure semantic similarity between a given text and a target term. Here, I'll show you how to do it using the transformers library with the pre-trained RoBERTa model. First, make sure you have the library installed:


In [23]:
# from sentence_transformers import SentenceTransformer, util

# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# def similarity_trfs(text):
#     target = "critical infrastructure"
#     embedding_text = model.encode(text, convert_to_tensor=True)
#     embedding_target = model.encode(target, convert_to_tensor=True)
#     similarity = util.pytorch_cos_sim(embedding_text, embedding_target)
#     return similarity.item()

# sim_trfs = full_data_clustered.sample(50)
# sim_trfs['CI SCORE TRFS'] = sim_trfs['CLEAN TEXT'].apply(similarity_trfs)
# sim_trfs[['CVE ID','index','CLEAN TEXT','CI SCORE TRFS']]

Unnamed: 0,CVE ID,index,CLEAN TEXT,CI SCORE TRFS
390,CVE-2019-19719,391,failure preserve web page structure cross site...,0.10102
744,CVE-2019-18664,745,failure preserve web page structure cross site...,0.101078
29686,CVE-2021-38604,29687,null pointer dereference null pointer derefere...,0.131254
56967,CVE-2022-0676,56968,bound write software write datum past end begi...,0.158615
34112,CVE-2021-27330,34113,failure preserve web page structure cross site...,0.068188
39895,CVE-2020-27862,39896,improper sanitization special elements command...,0.238997
39314,CVE-2020-36370,39315,bound write software write datum past end begi...,0.19581
13911,CVE-2020-27758,13912,integer overflow wraparound software perform c...,0.092211
53132,CVE-2022-22274,53133,bound write software write datum past end begi...,0.199048
33334,CVE-2021-29297,33335,buffer copy check size input classic buffer ov...,0.188252


## BERT

In [89]:
import transformers
import numpy as np
import torch

# Load the RoBERTa model and tokenizer
model_name = 'roberta-base'
model = transformers.RobertaModel.from_pretrained(model_name)
tokenizer = transformers.RobertaTokenizer.from_pretrained(model_name)

# Define your texts
text1 = full_data_clustered_ssim['CLEAN TEXT'][1]
text2 = "Critical infrastructure"

# Tokenize and encode the texts
inputs1 = tokenizer(text1, return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs2 = tokenizer(text2, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Get the contextual embeddings
with torch.no_grad():
    embeddings1 = model(**inputs1).last_hidden_state.mean(dim=1)
    embeddings2 = model(**inputs2).last_hidden_state.mean(dim=1)

# Calculate the cosine similarity
similarity = np.dot(embeddings1, embeddings2.T) / (np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2))
print(similarity)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[0.8344635]]


## BERT 2

In [24]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define text and target term
text = full_data_clustered['CLEAN TEXT'][1]
target = "critical infrastructure"

# Tokenize and encode the text and target
inputs = tokenizer(text, target, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    text_output = model(**inputs)['last_hidden_state'][:, 0]  # Taking the [CLS] token embedding for text
    target_output = model(**inputs)['last_hidden_state'][:, 1]  # Taking the [CLS] token embedding for target

# Calculate cosine similarity between embeddings
similarity_score = cosine_similarity(text_output, target_output)[0][0]
print("Semantic Similarity:", similarity_score)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Semantic Similarity: 0.6475706


## GloVe

In [25]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# glove-wiki-gigaword-50 : Semantic Similarity: 0.67194796
# glove-wiki-gigaword-200 : Semantic Similarity: 0.55132633
# glove-wiki-gigaword-100 : Semantic Similarity: 0.6053121

glove_vectors2 = api.load("glove-wiki-gigaword-100")

# Define text and target term
text = "This is an example text that you want to measure similarity for."
target = "critical infrastructure"

# Tokenize and process text and target embeddings
text_words = text.lower().split()
target_words = target.lower().split()

text_embedding = np.mean([glove_vectors2[word] for word in text_words if word in glove_vectors2], axis=0)
target_embedding = np.mean([glove_vectors2[word] for word in target_words if word in glove_vectors2], axis=0)

# Calculate cosine similarity between embeddings
similarity_score = cosine_similarity([text_embedding], [target_embedding])[0][0]
print("Semantic Similarity:", similarity_score)


Semantic Similarity: 0.6053121


In [97]:
# glove_vectors3 = api.load("glove-wiki-gigaword-100")



# SELECTION 1 - BERT (bert-base-uncased)

In [None]:
full_data_clustered['CLEAN TEXT'][1]

In [19]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
target = "critical infrastructure"


def similarity_bert(text):
    
    # Tokenize and encode the text and target
    inputs = tokenizer(text, target, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        text_output = model(**inputs)['last_hidden_state'][:, 0]  # Taking the [CLS] token embedding for text
        target_output = model(**inputs)['last_hidden_state'][:, 1]  # Taking the [CLS] token embedding for target
    
    similarity_score = cosine_similarity(text_output, target_output)[0][0]
    return similarity_score



similarity_bert(full_data_clustered['CLEAN TEXT'][1])
sim_bert = full_data_clustered.sample(5)
sim_bert['CI SCORE BERT'] = sim_bert['CLEAN TEXT'].apply(similarity_bert)
sim_bert[['CVE ID','index','CLEAN TEXT','CI SCORE BERT']]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,CVE ID,index,CLEAN TEXT,CI SCORE BERT
23713,CVE-2020-0667,23714,improper privilege management software properl...,0.576609
48664,CVE-2022-30668,48665,bound read software read datum past end beginn...,0.067523
27803,CVE-2021-45261,27804,release invalid pointer reference application ...,0.373779
10807,CVE-2018-13104,10808,failure preserve web page structure cross site...,0.287724
43806,CVE-2022-42206,43807,failure preserve web page structure cross site...,0.326014


In [30]:

def chunks_scores(csv_input_name, input_column, output_column, csv_output_name, func, chunksize=10000):
    i = 1
    df = pd.DataFrame()
    for reader in pd.read_csv(csv_input_name, usecols=['index','CLEAN TEXT','CI SCORE'], chunksize=chunksize):
        reader[output_column] = reader[input_column].apply(func)
        print('chunk',i,'\n ',reader,'\n=============================\n')
        df = pd.concat([df,reader] , ignore_index=True)
        i = i+1
    return df.to_csv(csv_output_name , index=False)


# chunks_scores(
#     csv_input_name = 'full_data_clustered_ssim.csv',
#     input_column = 'CLEAN TEXT',
#     output_column = 'CI SCORE BERT',
#     csv_output_name = 'full_data_clustered_ssim_V2.csv',
#     func = similarity_bert,
#     chunksize = 1000
# )



chunk 1 
       index                                         CLEAN TEXT  CI SCORE  \
0        1  failure preserve web page structure cross site...  0.666677   
1        2  improper authentication actor claim give ident...  0.625171   
2        3  url redirection untrusted site open redirect w...  0.636100   
3        4  improper input validation product validate inc...  0.719175   
4        5  double free product call free twice memory add...  0.707726   
..     ...                                                ...       ...   
995    996  access control bypass user control key system ...  0.723874   
996    997  deserialization untrusted data application des...  0.735792   
997    998  failure sanitize datum different plane injecti...  0.742766   
998    999  deserialization untrusted data application des...  0.708179   
999   1000  improper input validation product validate inc...  0.720398   

     CI SCORE BERT  
0         0.468787  
1         0.647571  
2         0.061754  
3   

chunk 8 
        index                                         CLEAN TEXT  CI SCORE  \
7000   7001  bound read software read datum past end beginn...  0.698663   
7001   7002  improper input validation product validate inc...  0.750934   
7002   7003  bound write software write datum past end begi...  0.715184   
7003   7004  bound write software write datum past end begi...  0.715836   
7004   7005  bound write software write datum past end begi...  0.715836   
...     ...                                                ...       ...   
7995   7996  information exposure discrepancy product behav...  0.806276   
7996   7997  miss required cryptographic step software impl...  0.732232   
7997   7998  improper limitation pathname restricted direct...  0.736240   
7998   7999  use broken risky cryptographic algorithm use b...  0.732156   
7999   8000  improper privilege management software properl...  0.735039   

      CI SCORE BERT  
7000       0.102631  
7001       0.391553  
7002      

chunk 15 
         index                                         CLEAN TEXT  CI SCORE  \
14000  14001  failure preserve web page structure cross site...  0.652731   
14001  14002  improper restriction excessive authentication ...  0.716871   
14002  14003  insufficient session expiration accord wasc in...  0.579040   
14003  14004  failure preserve web page structure cross site...  0.682268   
14004  14005  inadequate encryption strength software store ...  0.762081   
...      ...                                                ...       ...   
14995  14996  bound write software write datum past end begi...  0.731030   
14996  14997  improper sanitization special elements os comm...  0.757871   
14997  14998  improper limitation pathname restricted direct...  0.754012   
14998  14999  unrestricted recursive entity references dtd x...  0.692493   
14999  15000  improper authentication actor claim give ident...  0.758497   

       CI SCORE BERT  
14000       0.330424  
14001       0.296

chunk 22 
         index                                         CLEAN TEXT  CI SCORE  \
21000  21001  uncontrolle resource consumption resource exha...  0.771510   
21001  21002  information leak xml external entity file disc...  0.712156   
21002  21003  failure sanitize datum different plane injecti...  0.748866   
21003  21004  uncontrolle resource consumption resource exha...  0.797565   
21004  21005  failure sanitize datum different plane injecti...  0.760305   
...      ...                                                ...       ...   
21995  21996  failure preserve web page structure cross site...  0.687170   
21996  21997  improper input validation product validate inc...  0.741394   
21997  21998  improper initialization software initialize in...  0.739513   
21998  21999  failure release memory remove reference memory...  0.768634   
21999  22000  improper input validation product validate inc...  0.745803   

       CI SCORE BERT  
21000       0.097283  
21001       0.244

chunk 29 
         index                                         CLEAN TEXT  CI SCORE  \
28000  28001  information leak xml external entity file disc...  0.689251   
28001  28002  improper sanitization special elements sql com...  0.716599   
28002  28003  failure preserve web page structure cross site...  0.671879   
28003  28004  improper link resolution file access link foll...  0.694005   
28004  28005  reachable assertion product contain assert sim...  0.762124   
...      ...                                                ...       ...   
28995  28996  null pointer dereference null pointer derefere...  0.657720   
28996  28997  use broken risky cryptographic algorithm use b...  0.688083   
28997  28998  use broken risky cryptographic algorithm use b...  0.686901   
28998  28999  buffer copy check size input classic buffer ov...  0.655662   
28999  29000  improper sanitization special elements sql com...  0.711058   

       CI SCORE BERT  
28000       0.309614  
28001       0.329

KeyboardInterrupt: 

# SELECTION 2 - GolVe (glove-wiki-gigaword-100 & -200)

In [5]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# glove-wiki-gigaword-50 : Semantic Similarity: 0.67194796
# glove-wiki-gigaword-200 : Semantic Similarity: 0.55
# glove-wiki-gigaword-100 : Semantic Similarity: 0.63277894

glove_vectors3 = api.load("glove-wiki-gigaword-50")
target = "critical infrastructure"
target_words = target.lower().split()


def similarity_glove(text):
    
    # Tokenize and process text and target embeddings
    text_words = text.lower().split()
    text_embedding = np.mean([glove_vectors3[word] for word in text_words if word in glove_vectors3], axis=0)
    target_embedding = np.mean([glove_vectors3[word] for word in target_words if word in glove_vectors3], axis=0)
    
    similarity_score = cosine_similarity([text_embedding], [target_embedding])[0][0]
    return similarity_score



print(similarity_glove(full_data_clustered['CLEAN TEXT'][1]))

sim_glove = full_data_clustered.sample(5)
sim_glove['CI SCORE GLOVE'] = sim_glove['CLEAN TEXT'].apply(similarity_glove)
sim_glove[['CVE ID','index','CLEAN TEXT','CI SCORE GLOVE']]

In [38]:

def chunks_scores(csv_input_name, input_column, output_column, csv_output_name, func, chunksize=10000):
    i = 1
    df = pd.DataFrame()
    for reader in pd.read_csv(csv_input_name,  chunksize=chunksize):
        reader[output_column] = reader[input_column].apply(func)
        print('chunk',i,'\n ',reader,'\n=============================\n')
        df = pd.concat([df,reader] , ignore_index=True)
        i = i+1
    return df.to_csv(csv_output_name , index=False)


chunks_scores(
    csv_input_name = 'full_data_clustered_ssim_V3.csv',
    input_column = 'CLEAN TEXT',
    output_column = 'CI SCORE GLOVE 2',
    csv_output_name = 'full_data_clustered_ssim_V4.csv',
    func = similarity_glove,
    chunksize = 10000
)



chunk 1 
                  CVE ID Publish Date Update Date  \
0     CVE-2019-1020019   2019-07-29  2019-07-31   
1     CVE-2019-1020018   2019-07-29  2022-04-18   
2     CVE-2019-1020016   2019-07-29  2019-08-01   
3     CVE-2019-1020015   2019-07-29  2021-07-21   
4     CVE-2019-1020014   2019-07-29  2022-10-06   
...                ...          ...         ...   
9995    CVE-2018-19365   2019-03-21  2023-01-20   
9996    CVE-2018-19362   2019-01-02  2020-08-31   
9997    CVE-2018-19361   2019-01-02  2020-08-31   
9998    CVE-2018-19360   2019-01-02  2020-08-31   
9999    CVE-2018-19300   2019-04-11  2023-04-26   

      Number Of Related Vulnerabilities  index  CWE ID Vulnerability Type(s)  \
0                               22413.0      1      79                   XSS   
1                                3011.0      2     287               Unknown   
2                                 754.0      3     601               Unknown   
3                                9380.0      4      20  

chunk 3 
                 CVE ID Publish Date Update Date  \
20000   CVE-2020-7322   2020-09-09  2022-05-03   
20001   CVE-2020-7319   2020-09-09  2022-01-01   
20002   CVE-2020-7318   2020-10-14  2020-12-23   
20003   CVE-2020-7317   2020-10-14  2020-10-19   
20004   CVE-2020-7316   2020-10-07  2020-10-16   
...               ...          ...         ...   
29995  CVE-2021-37993   2021-11-02  2022-02-18   
29996  CVE-2021-37992   2021-11-02  2022-02-18   
29997  CVE-2021-37991   2021-11-02  2022-02-18   
29998  CVE-2021-37988   2021-11-02  2022-02-12   
29999  CVE-2021-37987   2021-11-02  2022-02-12   

       Number Of Related Vulnerabilities  index  CWE ID Vulnerability Type(s)  \
20000                              485.0  20001     532               Unknown   
20001                              893.0  20002      59               Unknown   
20002                            22413.0  20003      79                   XSS   
20003                            22413.0  20004      79         

chunk 5 
                 CVE ID Publish Date Update Date  \
40000  CVE-2020-27262   2021-01-08  2021-01-14   
40001  CVE-2020-27261   2021-02-09  2021-02-11   
40002  CVE-2020-27260   2021-01-08  2021-01-14   
40003  CVE-2020-27258   2021-01-19  2021-01-22   
40004  CVE-2020-27256   2021-01-19  2021-01-23   
...               ...          ...         ...   
49995  CVE-2022-28126   2022-11-11  2022-11-16   
49996  CVE-2022-28120   2022-05-05  2022-05-13   
49997  CVE-2022-28116   2022-04-05  2022-04-12   
49998  CVE-2022-28115   2022-04-05  2022-04-12   
49999  CVE-2022-28113   2022-04-15  2022-04-25   

       Number Of Related Vulnerabilities  index  CWE ID Vulnerability Type(s)  \
40000                            22413.0  40001      79                   XSS   
40001                             7611.0  40002     787    Exec Code Overflow   
40002                              891.0  40003      74               Unknown   
40003                              855.0  40004     522         

chunk 7 
                 CVE ID Publish Date Update Date  \
60000  CVE-2021-32040   2022-04-12  2023-02-03   
60001  CVE-2021-32039   2022-01-20  2022-01-26   
60002  CVE-2021-32036   2022-02-04  2022-02-09   
60003  CVE-2021-32010   2022-05-04  2022-05-11   
60004  CVE-2021-32009   2022-03-11  2022-03-18   
...               ...          ...         ...   
61693  CVE-2008-10001   2022-03-28  2022-04-08   
61694  CVE-2007-20001   2022-02-06  2022-09-01   
61695  CVE-2005-10001   2022-03-28  2022-04-08   
61696   CVE-2003-5003   2022-03-28  2022-04-06   
61697   CVE-2003-5002   2022-03-28  2022-04-06   

       Number Of Related Vulnerabilities  index  CWE ID Vulnerability Type(s)  \
60000                             7611.0  60001     787              Overflow   
60001                              855.0  60002     522               Unknown   
60002                              530.0  60003     770                   DoS   
60003                              323.0  60004     326         

## Final data

In [4]:
full_data_clustered_ssim_V3 = pd.read_csv('full_data_clustered_ssim_V3.csv')
full_data_clustered_ssim_V3 = full_data_clustered_ssim_V3.replace(np.nan,'None')

In [5]:
full_data_clustered_ssim_V3 

Unnamed: 0,CVE ID,Publish Date,Update Date,Number Of Related Vulnerabilities,index,CWE ID,Vulnerability Type(s),Score,Access,Complexity,...,Avail,cluster,CVE Summary,Vulnerability Name,Vulnerability Description,CLEAN TEXT,cluster_2,CI SCORE,CI SCORE GLOVE,CI SCORE GLOVE 2
0,CVE-2019-1020019,2019-07-29,2019-07-31,22413.0,1,79,XSS,4.3,Remote,Medium,...,,0,invenio-previewer before 1.0.0a12 allows XSS.,Failure to Preserve Web Page Structure ('Cross...,The software does not sufficiently valida...,failure preserve web page structure cross site...,1,0.666677,0.698982,0.620250
1,CVE-2019-1020018,2019-07-29,2022-04-18,3011.0,2,287,Unknown,7.5,Remote,Low,...,Partial,2,Discourse before 2.3.0 and 2.4.x before 2.4.0....,Improper Authentication,When an actor claims to have a given iden...,improper authentication actor claim give ident...,2,0.625171,0.632779,0.567375
2,CVE-2019-1020016,2019-07-29,2019-08-01,754.0,3,601,Unknown,5.8,Remote,Medium,...,,0,ASH-AIO before 2.0.0.3 allows an open redirect.,URL Redirection to Untrusted Site ('Open Redir...,A web application accepts a user-controll...,url redirection untrusted site open redirect w...,0,0.636100,0.627934,0.573071
3,CVE-2019-1020015,2019-07-29,2021-07-21,9380.0,4,20,Unknown,5.0,Remote,Low,...,,0,graphql-engine (aka Hasura GraphQL Engine) bef...,Improper Input Validation,The product does not validate or incorrec...,improper input validation product validate inc...,0,0.719175,0.676609,0.614739
4,CVE-2019-1020014,2019-07-29,2022-10-06,394.0,5,415,Unknown,2.1,Local,Low,...,,0,docker-credential-helpers before 0.6.3 has a d...,Double Free,The product calls free() twice on the sam...,double free product call free twice memory add...,2,0.707726,0.695226,0.627228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61693,CVE-2008-10001,2022-03-28,2022-04-08,22413.0,61694,79,XSS,4.3,Remote,Medium,...,,0,** UNSUPPORTED WHEN ASSIGNED ** A vulnerabilit...,Failure to Preserve Web Page Structure ('Cross...,The software does not sufficiently valida...,failure preserve web page structure cross site...,1,0.706891,0.713690,0.645094
61694,CVE-2007-20001,2022-02-06,2022-09-01,1320.0,61695,400,DoS,5.0,Remote,Low,...,Partial,0,A flaw was found in StarWind iSCSI target. An ...,Uncontrolled Resource Consumption ('Resource E...,The software does not properly restrict t...,uncontrolle resource consumption resource exha...,1,0.789261,0.747740,0.679483
61695,CVE-2005-10001,2022-03-28,2022-04-08,754.0,61696,601,Unknown,5.8,Remote,Medium,...,,0,** UNSUPPORTED WHEN ASSIGNED ** A vulnerabilit...,URL Redirection to Untrusted Site ('Open Redir...,A web application accepts a user-controll...,url redirection untrusted site open redirect w...,0,0.696738,0.670062,0.621229
61696,CVE-2003-5003,2022-03-28,2022-04-06,22413.0,61697,79,XSS,4.3,Remote,Medium,...,,0,** UNSUPPORTED WHEN ASSIGNED ** A vulnerabilit...,Failure to Preserve Web Page Structure ('Cross...,The software does not sufficiently valida...,failure preserve web page structure cross site...,1,0.711290,0.719332,0.651999
