In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn
import re
import spacy as sp
nlp = sp.load('en_core_web_sm')

## Read data

In [51]:
data = pd.read_csv('full_data_clustered_ssim.csv')
data = data.replace(np.nan,'None')

In [52]:
df1 = data[['CVE ID','index','CLEAN TEXT','CI SCORE']]
df1.head()

Unnamed: 0,CVE ID,index,CLEAN TEXT,CI SCORE
0,CVE-2019-1020019,1,failure preserve web page structure cross site...,0.666677
1,CVE-2019-1020018,2,improper authentication actor claim give ident...,0.625171
2,CVE-2019-1020016,3,url redirection untrusted site open redirect w...,0.6361
3,CVE-2019-1020015,4,improper input validation product validate inc...,0.719175
4,CVE-2019-1020014,5,double free product call free twice memory add...,0.707726


In [53]:
def text(i):
    return data['CLEAN TEXT'][i] +' '+ data['Vulnerability Type(s)'][i]
    
text(0)

'failure preserve web page structure cross site scripting software sufficiently validate filter escape encode user controllable input place output web page serve user cross site scripting xss vulnerability occur origin policy state browser limit resource accessible script run give web site origin resource associate web site client client resource site origin goal prevent site able modify read content unrelated site world wide web involve interaction site policy important browser enforce invenio previewer allow xss XSS'

## CountVectorizer

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_df=0.95,min_df=2,stop_words='english')
text_cv = cv.fit_transform(df1['CLEAN TEXT']) # Create a Document-Term Matrix (DTM)


In [55]:
# text_cv.toarray() dont use this
len(cv.get_feature_names_out())

15784

## TFIDFVectorizer

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
text_tfidf = tfidf.fit_transform(df1['CLEAN TEXT']) # (DTM)

In [57]:
len(tfidf.get_feature_names_out())

32606

## Latent Dirichlet Allocation - LDA

In [58]:
# Apply LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

num_topics = 2
lda_model = LDA(n_components=num_topics)
lda_model.fit(text_cv)

In [60]:
for i,topic in enumerate(lda_model.components_):
    print(i,':',len(topic),topic)

0 : 15784 [ 1.59855    12.40438022  1.77194231 ...  8.46679465  9.49440157
  2.48462786]
1 : 15784 [1.40145    1.59561978 1.22805769 ... 1.53320535 0.50559843 0.51537214]


## List Topics

In [61]:
# Get the most significant words for each topic
def get_top_words(model, feature_names, n_top_words):
    top_words = []
    for topic_idx, topic in enumerate(model.components_):
        top_words.append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
    return top_words

n_top_words = 5 # number of top words to display for each topic
feature_names = cv.get_feature_names_out()
top_words_per_topic = get_top_words(lda_model, feature_names, n_top_words)

In [62]:
feature_names

array(['aa', 'aaa', 'aaaa', ..., 'zzzcms', 'zzzphp', 'âš'], dtype=object)

In [63]:
top_words_per_topic

[['buffer', 'input', 'command', 'attacker', 'code'],
 ['site', 'web', 'resource', 'user', 'origin']]

In [64]:
# Print the most significant words for each topic
for i, words in enumerate(top_words_per_topic):
    print(f"Topic {i}: {', '.join(words)}")

Topic 0: buffer, input, command, attacker, code
Topic 1: site, web, resource, user, origin


In [65]:
import random

len(lda_model.components_[0])

for i in range(10):
    random_word_id = random.randint(0,15784)
    print(cv.get_feature_names_out()[random_word_id])

stuck
ckeditorfuncnum
libimaging
icap
fpga
mechanism
activepath
readily
risky
vfprintf


In [66]:
len(lda_model.components_)

2

In [67]:
len(lda_model.components_[0])

15784

In [68]:
single_topic = lda_model.components_[0]
single_topic

array([ 1.59855   , 12.40438022,  1.77194231, ...,  8.46679465,
        9.49440157,  2.48462786])

In [69]:
top_word_indices = single_topic.argsort()[-10:]
top_word_indices

array([ 4622, 12879, 12688,  8177,  2981,  2214,   889,  2295,  6435,
        1543], dtype=int64)

In [70]:
for index in top_word_indices:
    print(cv.get_feature_names_out()[index])

file
sql
software
memory
datum
code
attacker
command
input
buffer


In [71]:
def topics(topic_num):
    single_topic = lda_model.components_[topic_num]
    top_word_indices = single_topic.argsort()[-10:]
    for index in top_word_indices:
        print(cv.get_feature_names_out()[index])
        
topics(1)

client
page
vulnerability
scripting
cross
origin
user
resource
web
site


In [72]:
topics(0)

file
sql
software
memory
datum
code
attacker
command
input
buffer


In [73]:

for index, topic in enumerate(lda_model.components_):
    print(f'The top 15 words for topic #{index}')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['bound', 'control', 'vulnerability', 'execution', 'result', 'file', 'sql', 'software', 'memory', 'datum', 'code', 'attacker', 'command', 'input', 'buffer']


The top 15 words for topic #1
['xss', 'information', 'policy', 'software', 'browser', 'client', 'page', 'vulnerability', 'scripting', 'cross', 'origin', 'user', 'resource', 'web', 'site']




In [44]:
# topic #0 : CI
# topic #1 : Non CI

## Transform

In [74]:
topic_results = lda_model.transform(text_cv)
topic_results

array([[0.00710179, 0.99289821],
       [0.03295312, 0.96704688],
       [0.73691442, 0.26308558],
       ...,
       [0.74917097, 0.25082903],
       [0.00583261, 0.99416739],
       [0.10762087, 0.89237913]])

In [75]:
df1['Topic'] = topic_results.argmax(axis=1)
df1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Topic'] = topic_results.argmax(axis=1)


Unnamed: 0,CVE ID,index,CLEAN TEXT,CI SCORE,Topic
0,CVE-2019-1020019,1,failure preserve web page structure cross site...,0.666677,1
1,CVE-2019-1020018,2,improper authentication actor claim give ident...,0.625171,1
2,CVE-2019-1020016,3,url redirection untrusted site open redirect w...,0.636100,0
3,CVE-2019-1020015,4,improper input validation product validate inc...,0.719175,0
4,CVE-2019-1020014,5,double free product call free twice memory add...,0.707726,0
...,...,...,...,...,...
61693,CVE-2008-10001,61694,failure preserve web page structure cross site...,0.706891,1
61694,CVE-2007-20001,61695,uncontrolle resource consumption resource exha...,0.789261,1
61695,CVE-2005-10001,61696,url redirection untrusted site open redirect w...,0.696738,0
61696,CVE-2003-5003,61697,failure preserve web page structure cross site...,0.711290,1


In [84]:
text_cv.shape

(61698, 15784)

In [86]:
df1['Topic'].value_counts()

Topic
0    34670
1    27028
Name: count, dtype: int64

In [89]:
len(text_cv.data)

2960876

In [95]:
df1

Unnamed: 0,CVE ID,index,CLEAN TEXT,CI SCORE,Topic
0,CVE-2019-1020019,1,failure preserve web page structure cross site...,0.666677,1
1,CVE-2019-1020018,2,improper authentication actor claim give ident...,0.625171,1
2,CVE-2019-1020016,3,url redirection untrusted site open redirect w...,0.636100,0
3,CVE-2019-1020015,4,improper input validation product validate inc...,0.719175,0
4,CVE-2019-1020014,5,double free product call free twice memory add...,0.707726,0
...,...,...,...,...,...
61693,CVE-2008-10001,61694,failure preserve web page structure cross site...,0.706891,1
61694,CVE-2007-20001,61695,uncontrolle resource consumption resource exha...,0.789261,1
61695,CVE-2005-10001,61696,url redirection untrusted site open redirect w...,0.696738,0
61696,CVE-2003-5003,61697,failure preserve web page structure cross site...,0.711290,1


In [97]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler

# Sample non-text data (numeric)
data = {
    'Feature1': [1.5, 2.3, 0.7, 3.1, 1.0],
    'Feature2': [0.8, 1.2, 0.5, 1.9, 2.5],
    'Feature3': [3.0, 2.8, 1.2, 2.5, 3.2]
}

df = pd.DataFrame(data)

# Step 1: Preprocess the data (if needed, in this example we'll just scale the data)
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)

# Step 2: Convert data to DTM-like format (optional, depending on the structure of your data)
dtm = pd.DataFrame(scaled_data, columns=df.columns)

# Step 3: Apply LDA
n_topics = 2
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
topic_dist = lda.fit_transform(dtm)

# Print the topic distribution for each sample
print(topic_dist)


[[0.43428121 0.56571879]
 [0.5550228  0.4449772 ]
 [0.5        0.5       ]
 [0.62386576 0.37613424]
 [0.2229213  0.7770787 ]]


## Metrics

In [None]:
# from sklearn.metrics import silhouette_score
# from sklearn.metrics import davies_bouldin_score
# from sklearn.metrics import calinski_harabasz_score



# # Calculate the silhouette score (ranges from -1 to 1, higher is better)
# silhouette_avg = silhouette_score(ohe, kmeans_clusters)
# print("Silhouette Score:", silhouette_avg)

# # Calculate the Davies-Bouldin index (lower is better)
# davies_bouldin_avg = davies_bouldin_score(ohe, kmeans_clusters)
# print("Davies-Bouldin Index:", davies_bouldin_avg)