In [None]:
# This might take some time.
!pip install bertopic
from IPython.display import clear_output
clear_output()

In [None]:
import bertopic
import pandas as pd
import numpy as np
from bertopic import BERTopic
from IPython.display import clear_output

In [None]:
import os

# Switch off parallelism in the Hugging Face tokenizers library for this
# process (and anything it spawns) via its environment variable.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# Load the corpus CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/wikipedia-corpus-on-security-domain/wikidatav1.csv",
)

In [None]:
# Preview the first few rows to check which columns were loaded.
data.head()

Unnamed: 0,title,content
0,Backup,verb form referring process back whereas noun ...
1,Backup,different types data storage devices used copy...
2,Backup,limitations human factors involved backup sche...
3,Backup,rule rule aid backup process.states least copi...
4,Backup,backup methods unstructured unstructured repos...


In [None]:
# (number of rows, number of columns) of the full corpus.
data.shape

(173315, 2)

In [None]:
# How many rows each article title contributes, most frequent first.
data["title"].value_counts()

Information security        1258
Computer security           1200
Transport Layer Security    1131
Mobile security              949
National Security Agency     810
                            ... 
KSV-21                         1
MPEG Common Encryption         1
Client-side encryption         1
Session key                    1
Cabinet of Brazil              1
Name: title, Length: 6736, dtype: int64

In [None]:
# Keep only the first 50,000 rows and turn their "content" column into a
# plain list of documents (the input handed to BERTopic.fit below).
N_DOCS = 50_000
data = data.iloc[:N_DOCS]
docs = data["content"].tolist()

In [None]:
# Fit BERTopic on the documents using the "all-mpnet-base-v2" embedding model.
# NOTE(review): no random seed is fixed here, so topic ids and counts will
# presumably differ between runs — confirm and consider seeding the model.
model = BERTopic(embedding_model="all-mpnet-base-v2")
model.fit(docs)
# Remove the output displayed while fitting from the cell.
clear_output()

In [None]:
# Number of documents assigned to each topic (topic -1 is BERTopic's outlier bucket).
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,10173
1,0,280
2,1,279
3,2,278
4,3,269
...,...,...
950,955,10
949,956,10
947,958,10
946,952,10


In [None]:
def compute(text, top_n=5):
    """Print the topic predicted for ``text`` and the topics most similar to it.

    The fitted module-level ``model`` assigns ``text`` to a topic. That
    topic's name is then used as the search term for ``model.find_topics``,
    and the names of the returned topics are printed as suggestions.

    Parameters
    ----------
    text : str
        Document to classify (a single string, as in the calls in this
        notebook).
    top_n : int, default 5
        Number of suggested topics requested from ``find_topics``.

    Returns
    -------
    None
        The report is printed to the cell output.
    """
    separator = "-----------------------------------------------------------------------------------"

    # Only the topic assignment is needed; the probabilities are discarded.
    topics, _ = model.transform(text)
    # Clear whatever the cell has displayed so far (presumably output emitted
    # during transform) so that only the report below remains.
    clear_output()

    print(separator)
    print("Topic id: ", topics)
    topic_name = model.topic_names[topics[0]]
    print("Topic name: ", topic_name)
    print(separator)
    print("Suggested Topic: \n")

    # find_topics returns (topic ids, similarity scores); only the ids are used.
    similar_ids, _ = model.find_topics(topic_name, top_n=top_n)
    for topic_id in similar_ids:
        print(model.topic_names[topic_id], "\n")

In [None]:
# Sample query: a short definition of malware.
text = (
    "Malware is intrusive software that is designed to damage and destroy computers and computer systems. "
    "Malware is a contraction for “malicious software.” "
    "Examples of common malware includes viruses, worms, Trojan viruses, spyware, adware, and ransomware."
)

compute(text)
# Drop the temporary so it does not linger in the kernel namespace.
del text

-----------------------------------------------------------------------------------
Topic id:  [210]
Topic name:  210_virus_viruses_infected_programs
-----------------------------------------------------------------------------------
Suggested Topic: 

210_virus_viruses_infected_programs 

681_malware_antivirus_antimalware_fileless 

824_malwarebytes_kleczynski_malware_pii 

593_av_onecare_essentials_morro 

443_norton_symantec_antivirus_mb 



In [None]:
# Sample query: a short definition of mobile security (the leading space and
# trailing newline of the original literal are kept).
text = (
    " Mobile security is the protection of smartphones, tablets, laptops "
    "and other portable computing devices, and the networks they connect to, "
    "from threats and vulnerabilities associated with wireless computing.\n"
)

compute(text)
# Drop the temporary so it does not linger in the kernel namespace.
del text

-----------------------------------------------------------------------------------
Topic id:  [371]
Topic name:  371_smartphone_smartphones_malicious_mobile
-----------------------------------------------------------------------------------
Suggested Topic: 

371_smartphone_smartphones_malicious_mobile 

824_malwarebytes_kleczynski_malware_pii 

681_malware_antivirus_antimalware_fileless 

400_phones_mobile_phone_gsm 

210_virus_viruses_infected_programs 



In [None]:
# Topics most similar to the search term "Microsoft"; the result is a pair
# (topic ids, similarity scores).
md = model.find_topics(search_term="Microsoft")
md


([545, 4, 296, 598, 14],
 [0.7990591148043709,
  0.7755663838789957,
  0.6766453740707444,
  0.663554185106069,
  0.6568542535793394])

In [None]:
# Show the (word, weight) pairs of every topic id returned for "Microsoft",
# leaving a blank line after each topic.
for topic_id in md[0]:
    print(model.get_topic(topic_id), end="\n\n")

[('office', 0.060668963071285574), ('microsoft', 0.05801730897503321), ('onedrive', 0.029258234345362965), ('premium', 0.026567596540771198), ('subscription', 0.02449503272098211), ('skype', 0.02349709782793276), ('sharepoint', 0.02284263497664613), ('enterprise', 0.02256589797010997), ('plans', 0.02211851449485912), ('business', 0.02130393865910669)]

[('windows', 0.042986325295883386), ('vista', 0.027856195236314798), ('xp', 0.015582919934191189), ('microsoft', 0.013353976904710973), ('editions', 0.012191947504467346), ('nt', 0.010403079608584644), ('server', 0.008562427345287556), ('pack', 0.007537683780862969), ('edition', 0.007430142639396355), ('versions', 0.007090702617753443)]

[('explorer', 0.11041592522580929), ('internet', 0.042346033442830354), ('browser', 0.03852165917131308), ('ie', 0.03356754176385891), ('windows', 0.017090392026106495), ('addons', 0.015597443632612028), ('microsoft', 0.015496437118646837), ('mshtml', 0.015310699300809643), ('html', 0.01462111760644583),

In [None]:
# Overview table of all topics: id, document count and generated name.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,10173,-1_security_key_displaystyle_cryptography
1,0,280,0_rick_portrayed_negan_saviors
2,1,279,1_tenchi_ryoko_jurai_masaki
3,2,278,2_dutch_netherlands_holland_amsterdam
4,3,269,3_dexter_debra_dexters_laguerta
...,...,...,...
950,955,10,955_culture_chinese_subcultures_han
949,956,10,956_xxxviii_schleihauf_warship_dumaresq
947,958,10,958_log_logs_event_events
946,952,10,952_scada_sewage_mosaics_systems


# Saving Model

In [None]:
# Persist the fitted topic model under the name "topic_model"; the embedding
# model itself is excluded from the saved artifact.
model.save(
    "topic_model",
    save_embedding_model=False,
)

-----------------------------------------------------------------------------------------------------
Road Runner is saying to upvote this notebook

Bye

![](https://media2.giphy.com/media/yhRhIgnJIRD0I/giphy.gif)