In [70]:
import re

import nltk
import pandas as pd
from langdetect import detect

nltk.download("stopwords")

tech5G = pd.read_csv(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/tc_scrapped_final.csv",
    index_col=0,
)

crunchbase_industries = pd.read_csv(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/crunchbase_industries.csv"
)


html = pd.read_csv(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/html.csv", index_col=0
)
idx_to_drop = html[
    (html["Content"].isna() == True) | (html["Content"] == "Error")
].index
html.drop(index=idx_to_drop, inplace=True)
html["lang"] = html["Content"].apply(lambda x: detect(x))
html = html[html["lang"] == "en"].drop(columns="lang")
idx_to_drop = html[html["Content"].apply(lambda x: len(x.split())) < 80].index
html.drop(index=idx_to_drop, inplace=True)

html_cb = pd.read_csv(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/html2.csv", index_col=0
)
idx_to_drop = html_cb[
    (html_cb["Content"].isna() == True) | (html_cb["Content"] == "Error")
].index
html_cb.drop(index=idx_to_drop, inplace=True)
html_cb["lang"] = html_cb["Content"].apply(lambda x: detect(x))
html_cb = html_cb[html_cb["lang"] == "en"].drop(columns="lang")
idx_to_drop = html_cb[html_cb["Content"].apply(lambda x: len(x.split())) < 80].index
html_cb.drop(index=idx_to_drop, inplace=True)


def clean_text(text):
    """text = re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", text)
     # text = str(text).lower()  # Lowercase words
    #  text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
      text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
      text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
      text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
      #text = re.sub(
      #    f"[{re.escape(string.punctuation)}]", "", text
    #"""
    text = text.encode("ascii", "ignore").decode("utf-8")

    text = re.sub(r"([a-z\.!?])([A-Z])", r"\1 \2", text)
    text = re.sub(r"(\d)([a-z]|[A-Z])", r"\1 \2", text)
    text = re.sub(r"([a-z]|[A-Z])(\d)", r"\1 \2", text)
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    return text


data_ch = [clean_text(html.loc[i, "Content"]) for i in html.index]
data_cb = [clean_text(html_cb.loc[i, "Content"]) for i in html_cb.index]
data_tc = [clean_text(tech5G.loc[i, "content"]) for i in tech5G.index]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cordiez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Bert topic

In [78]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(data_ch)
vectorizer_model = CountVectorizer(stop_words="english")
eng_stop_words = set(vectorizer_model.get_stop_words())
additional_stops = [
    "mk",
    "milton",
    "keynes",
    "world",
    "work",
    "uk",
    "service",
    "search",
    "contact",
    "use",
    "we",
    "provide",
    "website",
    "help",
    "product",
    "solution",
    "company",
    "design",
    "get",
    "team",
    "experience",
    "customer",
    "need",
    "client",
    "develop",
    "quick",
    "sale",
    "day",
    "ltd",
    "project",
    "work",
    "market",
    "can",
    "will",
    "offer",
    "find",
    "new",
    "year",
    "look",
    "call",
    "see",
    "one",
    "service",
    "cookies",
    "facebook",
    "events",
    "accept",
    "contact",
    "search",
]

vectorizer_model = CountVectorizer(stop_words=eng_stop_words.union(additional_stops))
topic_model.update_topics(data_ch, topics=topics, vectorizer_model=vectorizer_model)

counts = topic_model.get_topic_info()

In [89]:
counts

Unnamed: 0,Topic,Count,Name
0,-1,729,-1_services_business_management_policy
1,0,164,0_business_services_digital_marketing
2,1,87,1_hsbcad_licensee_construction_licensed
3,2,86,2_car_vehicle_cars_mercedes
4,3,71,3_food_menu_sauce_chicken
5,4,66,4_business_accounting_accountants_accountancy
6,5,64,5_property_rent_estate_house
7,6,58,6_sensor_cnc_sensors_forklift
8,7,53,7_recruitment_candidates_jobs_job
9,8,52,8_shop_incense_wedding_accessories


In [95]:
topic_model.visualize_hierarchy(height=800, width=600, orientation="left")

In [81]:
categories = pd.DataFrame.from_dict(topic_model.get_topics(), orient="index")
categories.reset_index(inplace=True)
categories.head()

for i in categories.index:
    ch = ""
    for c in categories.columns:
        if type(c) == int:
            ch = ch + categories.loc[i, c][0] + ", "
    ch = ch[:-2]
    categories.loc[i, "terms"] = ch

topic_results = pd.merge(
    categories[["index", "terms"]],
    counts[["Topic", "Count"]],
    left_on="index",
    right_on="Topic",
)
topic_results = topic_results.rename(
    columns={"terms": "10 most representative words", "Topic": "Topic Id"}
).drop(columns=["index"])
topic_results = topic_results[["Topic Id", "Count", "10 most representative words"]]
topic_results.style.set_properties(
    subset=["10 most representative words"], **{"width": "40px"}
)

from pandas import option_context

with option_context("display.max_colwidth", 500):
    display(topic_results)

Unnamed: 0,Topic Id,Count,10 most representative words
0,-1,729,"services, business, management, policy, systems, visit, information, email, courses, customers"
1,0,164,"business, services, digital, marketing, management, cloud, software, clients, web, automation"
2,1,87,"hsbcad, licensee, construction, licensed, projects, property, residential, landscaping, licensees, surveyors"
3,2,86,"car, vehicle, cars, mercedes, tyres, tyre, benz, vehicles, volkswagen, kia"
4,3,71,"food, menu, sauce, chicken, roast, dishes, pizza, restaurant, delicious, beef"
5,4,66,"business, accounting, accountants, accountancy, accounts, financial, clients, accountant, finance, audit"
6,5,64,"property, rent, estate, house, landlords, tenants, tenant, landlord, bedrooms, sell"
7,6,58,"sensor, cnc, sensors, forklift, machining, machines, welding, photoelectric, metal, inspection"
8,7,53,"recruitment, candidates, jobs, job, manager, recruit, employee, clients, vacancies, salary"
9,8,52,"shop, incense, wedding, accessories, wish, brands, art, clothing, womens, bridesmaids"


In [82]:
html["Category"] = topics

In [84]:
# html.to_csv('/Users/cordiez/Documents/CompaniesHouse/OutputFiles/companieshouse_clusters.csv')

In [351]:
topic_model.visualize_term_rank(log_scale=True)

In [293]:
# topic_results.to_excel('/Users/cordiez/Documents/CompaniesHouse/OutputFiles/bert_blue_41classes.xls')

In [302]:
classes = pd.read_excel(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/bert_blue_41classes.xls",
    index_col=0,
)
with option_context("display.max_colwidth", 500):
    display(classes)

Unnamed: 0,Topic Id,Count,10 most representative words,Supposed activity
0,-1,771,"business, services, management, information, privacy, customers, email, courses, 01908, clients",Outliers
1,0,95,"car, vehicle, cars, mercedes, tyres, tyre, benz, vehicles, wheel, volkswagen",Vehicule dealer
2,1,73,"food, menu, sauce, chicken, roast, adipisicing, dishes, restaurant, pizza, delicious",Restaurants
3,2,68,"business, accounting, accountants, accountancy, accounts, financial, accountant, finance, corporate, audit",Accounting
4,3,63,"sensor, cnc, sensors, machining, machines, aluminium, welding, photoelectric, inspection, metal",Industrial automation
5,4,61,"licensee, licensed, construction, property, projects, residential, licensees, license, homes, landscaping",Property management
6,5,60,"marketing, business, brand, services, clients, management, sales, customers, consulting, businesses",Management consulting
7,6,59,"property, rent, valuation, estate, landlords, tenants, tenant, landlord, bedrooms, sell",Real estate
8,7,52,"recruitment, candidates, jobs, job, manager, recruit, employee, clients, vacancies, salary",Recruiting
9,8,45,"wedding, bridal, clothing, bridesmaids, wear, dress, womens, dresses, salon, suit",Event management


In [303]:
len(data_ch)

2055

In [173]:
html["Category"] = topics

from pandas import option_context

with option_context("display.max_colwidth", 500):
    display(html[html["Category"] == 9])
index_cat_9 = html[html["Category"] == 9].index

Unnamed: 0,Website,Content,Category
68,https://abeminedu.business.site/?utm_source=gmb&utm_medium=referral,AbeMin Education Centre Ltd - Education CenterAbeMin Education Centre LtdGet QuoteCall nowGet directionsTestimonialsGalleryContactAbeMin Education Centre LtdEducation CenterOpening at 9:00 AM tomorrowGet QuoteCall 07944 388924Get directionsWhatsApp 07944 388924Message 07944 388924Contact UsFind TableMake AppointmentPlace OrderView MenuTestimonialsa week agoReport review- Robert AWrite a ReviewRead MoreGalleryContact UsContactCall now07944 388924AddressGet directions39 MilecastleBancroftM...,9
141,https://find.shell.com/gb/fuel/12038524-hks-kempston-service-station,"HKS KEMPSTON SERVICE STATION Go to our Station Locator HKS KEMPSTON SERVICE STATION WOBURN ROAD, MK42 7QA, Kempston, GB +44 1234 841767 Get Directions Opening Hours Day Mon Tue Wed Thu Fri Sat Sun Forecourt Hours 05:00 - 23:00 05:00 - 23:00 05:00 - 23:00 05:00 - 23:00 05:00 - 23:00 05:00 - 23:00 05:00 - 23:00 Fuels Shell V-Power Unleaded Shell V-Power Diesel Shell FuelSave Unleaded Shell FuelSave Diesel Services & Amenities Car Wash Mobile Payment Shop - Select fuelService Air Loyalty cards ...",9
151,http://www.alertcctvsys.co.uk/,Alert CCTV Get a free online quote Request a Free Quote We offer a free no obligation quotation and would be happy to meet with you to discuss your requirements. However If you are just looking for idea of cost please complete the information below and we will be happy to give you a quotation. Are you looking to upgrade an existing system? Yes No Request a Quote Phone: 01908 698 777 Email: info@alertcctvsys.co.uk Home About Services Blog Contact GET A QUOTE External & Internal High Definitio...,9
192,http://amohealthcare.co.uk/,"AMO HEALTHCARE LTD - Home Expand/collapse navigation HomeContact UsLegal Notice AMO HEALTHCARE LTD AMO HEALTHCARE LTD HomeContact UsLegal Notice Dr. Osigbeme Awudu MBBCh. LFHom. PGDip. My Button My Button Personalised Professional Healthcare Services We are located at: AMO HEALTHCARE LTD 5 Chicksands Avenue Milton Keynes MK10 9DP UK Contact us today! If you have any queries or wish to make an appointment, please contact us: +44 795 8034939 +44 795 8034939 Dr.awudu@amohealthcare.co.uk Or use...",9
208,https://anointed-hands-ltd.business.site/?utm_source=gmb&utm_medium=referral,Anointed hands Ltd | Door To Door UK to Ghana - Freight Forwarding ServiceAnointed hands Ltd | Door To Door UK to GhanaGet QuoteCall nowGet directionsTestimonialsGalleryContactAnointed hands Ltd | Door To Door UK to GhanaFreight Forwarding ServiceOpen today until 10:00 PMGet QuoteCall 07737 776640Get directionsWhatsApp 07737 776640Message 07737 776640Contact UsFind TableView MenuMake AppointmentPlace OrderTestimonials4 months agoReport reviewI've been using them for the past 14 years and...,9
310,https://azu-electrical-ltd.business.site/?utm_source=gmb&utm_medium=referral,"AZU Electrical Ltd - Electrical Installation ServiceYour domain registration is pending. Check back in an hourAZU Electrical LtdGet QuoteCall nowGet directionsUpdatesGalleryContactAZU Electrical LtdElectrical Installation ServiceOpen 24 hoursGet QuoteCall 07574 605177Get directionsWhatsApp 07574 605177Message 07574 605177Contact UsFind TableView MenuMake AppointmentPlace OrderUpdatesPosted on Apr 4, 2021You can call us any time, we are working 24/7 under guidance of Covid rules strictly, ...",9
414,https://blindmatrix.business.site/?utm_source=gmb&utm_medium=referral,"BlindMatrix - World's Best Software for Retailers and Wholesalers of Blinds, Curtains and Shutters.Your domain registration is pending. Check again in an hourBlindMatrixGet QuoteCall nowGet directionsUpdatesTestimonialsAbout usGalleryContactBlindMatrixWorld's Best Software for Retailers and Wholesalers of Blinds, Curtains and Shutters.Sherwood Drive, BletchleyGet QuoteCall 020 7096 1371Get directionsWhatsApp 020 7096 1371Message 020 7096 1371Contact UsFind TableMake AppointmentPlace Order...",9
420,https://blueskay888.business.site/?utm_source=gmb&utm_medium=referral,BlueSkay888 Ltd - Master Technician Car RepairYour domain registration is pending. Check again in an hourBlueSkay888 LtdMessage usMake AppointmentView MenuTestimonialsGalleryContactBlueSkay888 LtdMaster Technician Car RepairOpen until midnight on FridayMessage 07788 930598Make AppointmentView MenuCall 07788 930598Get directionsWhatsApp 07788 930598Contact UsGet QuoteFind TablePlace OrderTestimonials5 months agoReport reviewArrived quickly and promptly to diagnose electrical problem with ...,9
424,https://find.shell.com/gb/fuel/12038450-shell-budgens-broughton,"SHELL BUDGENS BROUGHTON Go to our Station Locator SHELL BUDGENS BROUGHTON CHILDS WAY, MK10 9AB, MILTON KEYNES, GB +44 1908 239291 Get Directions Opening Hours Day Mon Tue Wed Thu Fri Sat Sun Forecourt Hours Open 24 Hours Open 24 Hours Open 24 Hours Open 24 Hours Open 24 Hours Open 24 Hours Open 24 Hours Fuels Shell V-Power Unleaded Shell V-Power Diesel Shell FuelSave Unleaded Shell FuelSave Diesel Services & Amenities Car Wash Toilet Costa Express Mobile Payment Shop - Select Budgens fuelSer...",9
436,https://www.bonguk.com/contact-us/,"Contact us - Bong UK HomeAbout UsExpertise & ServicesProductsSolutionsNewsEnvironmentDealer ZoneContact us Home > Contact us How to find us Stockists Testimonials Contact us Contact our friendly Customer Services Team t 01908 216216 f 01908 216217 08:30am – 5:00pm, 5 days a week (call charges vary according to service provider). Customer Services customerservicesuk@bong.com print_link mail_link_text Bong Websites Dealer Zone | Envelope Lounge Quick Links Our Heritage Choosing the Right Enve...",9


In [263]:
topic_results

Unnamed: 0,Topic Id,Count,10 most representative words
0,-1,742,"services, business, management, info..."
1,0,226,"services, courses, digital, cloud, s..."
2,1,101,"car, vehicle, cars, mercedes, tyres,..."
3,2,78,"wedding, shop, incense, hair, access..."
4,3,71,"sensor, sensors, forklift, industria..."
5,4,71,"food, menu, chicken, roast, pork, di..."
6,5,67,"business, accounting, accountants, a..."
7,6,66,"recruitment, candidates, jobs, job, ..."
8,7,66,"care, support, families, supported, ..."
9,8,63,"property, rent, estate, landlords, t..."


## BERTopic Crunchbase

In [20]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0, metric="cosine")
vectorizer_model = CountVectorizer(stop_words="english")
eng_stop_words = set(vectorizer_model.get_stop_words())
additional_stops = [
    "mk",
    "milton",
    "keynes",
    "world",
    "work",
    "uk",
    "service",
    "search",
    "contact",
    "use",
    "we",
    "provide",
    "website",
    "help",
    "product",
    "solution",
    "company",
    "design",
    "get",
    "team",
    "experience",
    "customer",
    "need",
    "client",
    "develop",
    "quick",
    "sale",
    "day",
    "ltd",
    "project",
    "work",
    "market",
    "can",
    "will",
    "offer",
    "find",
    "new",
    "year",
    "look",
    "call",
    "see",
    "one",
    "service",
    "cookies",
    "cookie",
    "facebook",
    "events",
    "accept",
    "contact",
    "search",
]


vectorizer_model = CountVectorizer(
    stop_words=eng_stop_words.union(
        ["mk", "milton", "keynes", "world", "work", "uk", "service"]
    )
)

# Pass the custom models to BERTopic
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
)

topics, probs = topic_model.fit_transform(data_cb)
counts = topic_model.get_topic_info()

In [32]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

topic_model = BERTopic(n_gram_range=(1, 2))
topics, probs = topic_model.fit_transform(data_cb)

vectorizer_model = CountVectorizer(stop_words="english")
eng_stop_words = set(vectorizer_model.get_stop_words())
additional_stops = [
    "mk",
    "milton",
    "keynes",
    "world",
    "work",
    "uk",
    "service",
    "search",
    "contact",
    "use",
    "we",
    "provide",
    "website",
    "help",
    "product",
    "solution",
    "company",
    "design",
    "get",
    "team",
    "experience",
    "customer",
    "need",
    "client",
    "develop",
    "quick",
    "sale",
    "day",
    "ltd",
    "project",
    "work",
    "market",
    "can",
    "will",
    "offer",
    "find",
    "new",
    "year",
    "look",
    "call",
    "see",
    "one",
    "service",
    "cookies",
    "cookie",
    "facebook",
    "events",
    "accept",
    "contact",
    "search",
]

vectorizer_model = CountVectorizer(
    stop_words=eng_stop_words.union(
        ["mk", "milton", "keynes", "world", "work", "uk", "service"]
    )
)
topic_model.update_topics(data_cb, topics=topics, vectorizer_model=vectorizer_model)


counts = topic_model.get_topic_info()


# topic_model.get_topics()

In [33]:
categories = pd.DataFrame.from_dict(topic_model.get_topics(), orient="index")
categories.reset_index(inplace=True)
categories.head()

for i in categories.index:
    ch = ""
    for c in categories.columns:
        if type(c) == int:
            ch = ch + categories.loc[i, c][0] + ", "
    ch = ch[:-2]
    categories.loc[i, "terms"] = ch

topic_results = pd.merge(
    categories[["index", "terms"]],
    counts[["Topic", "Count"]],
    left_on="index",
    right_on="Topic",
)
topic_results = topic_results.rename(
    columns={"terms": "10 most representative words", "Topic": "Topic Id"}
).drop(columns=["index"])
topic_results = topic_results[["Topic Id", "Count", "10 most representative words"]]
topic_results.style.set_properties(
    subset=["10 most representative words"], **{"width": "40px"}
)

topic_results.style.set_properties(
    subset=["10 most representative words"], **{"width": "600px"}
)

Unnamed: 0,Topic Id,Count,10 most representative words
0,-1,163,"services, business, contact, cookies, management, privacy, technology, marketing, cookie, customers"
1,0,33,"management, security, services, software, cloud, business, cyber, systems, customer, manage"
2,1,18,"software, business, design, creative, photography, bespoke, app, studio, clients, apps"
3,2,17,"property, marketing, rent, pcm, sale, denbigh, house, removals, houses, 2021"
4,3,16,"camera, cables, packaging, charge, plugs, video, cyc, hdmi, clamps, accessories"
5,4,15,"business, jobs, accountants, accounting, cookies, services, companies, clients, accountancy, businesses"
6,5,14,"services, technology, amn, customer, waam, company, tjr, technologies, supplier, transcosmos"
7,6,12,"envisics, marketing, events, automotive, media, digital, holographic, business, event, robots"


In [69]:
topic_model.get_representative_docs(topic=3)

['Distributors of Quality Engineering Products - Good Hand UK 01908 221151 Call Us sales@goodhanduk.co.uk Email Us Close Login Register Contact 0 items 0.00 My Basket Toggle Clamps Vertical Toggle Clamps 127 Products Horizontal Toggle Clamps 60 Products Push Pull Toggle Clamps 58 Products Latch & Hook Toggle Clamps 63 Products Safety Lock Toggle Clamps With Safety Lock 24 Products Stainless Steel Stainless Steel Toggle Clamps 76 Products Heavy Duty Toggle Clamps 18 Products Black Toggle Clamps 6 Products Toggle Latches 76 Products Pull Back & Flip Flop Toggle clamps 4 Products Toggle Presses 6 Products Pneumatic Toggle Clamps 20 Products Toggle Pliers 7 Products G Clamps Heavy Duty 8 Products Toggle Clamp Accessories 163 Products Exair Air Products Air Knives 161 Products Line Vacs (Product Conveyors) 120 Products Static Elimination Products 67 Products Air Amplifiers Fume Extraction Dry & Clean 22 Products Vortex Tubes Tool Cooling 71 Products Cabinet & Panel Coolers IP 54 & IP 66 86 

In [71]:
html_cb["Category"] = topics

In [68]:
from keybert import KeyBERT
from stop_words import get_stop_words

en_stop = get_stop_words("en")
# en_stop = en_stop.extend(['milton','keynes','world','work','uk','support','service','search','contact','shop','usage','business', 'manage','provide','website','help','product','solution','company','design','get','team','experience','customer','need','client','develop','quick','sale','day','ltd','project','work','office','market','can','will','offer','find','new','year','look','call','see','one','service','news',"news","events","press","contact","search","http"])
html_cb = pd.read_csv(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/crunchbase_clusters.csv",
    index_col=0,
)
html_cb = (
    html_cb.groupby(by="Category")["Content"].apply(lambda x: " ".join(x)).reset_index()
)

for i in html_cb.index:
    print(i)
    doc = html_cb.loc[html_cb.index[i], "Content"]
    kw_model = KeyBERT()
    print(kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None))

0
[('homebase', 0.6385), ('homeshop', 0.4801), ('homeadmin2021', 0.4634), ('homego', 0.4495), ('homepagewhat', 0.4389)]
1
[('nhbc', 0.4194), ('dbs', 0.3322), ('hsbcmoving', 0.3273), ('certifications', 0.3251), ('hrms', 0.321)]
2
[('keynesmaking', 0.4791), ('keynesmk14', 0.4337), ('designing', 0.4144), ('creative', 0.4101), ('strategy', 0.405)]
3
[('avenuewatfordwd18price', 0.4088), ('marketingseo', 0.4033), ('customer', 0.3747), ('ecommerce', 0.3743), ('keynesmk9', 0.3653)]
4
[('uks', 0.4074), ('fleet', 0.3928), ('courier', 0.3922), ('txm', 0.3916), ('logistics', 0.3911)]
5
[('startups', 0.4133), ('f6s', 0.3957), ('startupsicon', 0.3556), ('startup', 0.3531), ('cookie_analytics', 0.3292)]
6
[('amn', 0.5342), ('provider', 0.4674), ('mnos', 0.4658), ('networksaugust', 0.4447), ('network', 0.4421)]
7
[('envisics', 0.5374), ('technologiesenvisics', 0.4875), ('technologyour', 0.4832), ('technology', 0.4756), ('technologies', 0.4644)]


In [75]:
html_cb

Unnamed: 0,Website,Content,Category
0,www.homebase.co.uk/,Homebase - DIY That Turns Your House Into A Ho...,-1
1,www.envisics.com,Envisics - We redefine the way people see and ...,6
2,www.africamobilenetworks.com/,AMN Search this site HomeAbout UsAbout UsMissi...,5
3,www.sbdautomotive.com/,Automotive Technology Consultancy and Research...,-1
7,Www.smartrecruitonline.com,Smart Recruit Online – UK’s leading Talent Acq...,-1
...,...,...,...
456,idealcombi.com/,Idealcombi Windows & Doors - Danish quality co...,-1
457,readphotography.co.uk/,Commercial Photography Milton Keynes - Noel Re...,1
458,www.rapidfulfillment.co.uk/,Home - Rapid Fulfillment Services Ltd +44 (0)1...,3
464,www.badmintonengland.co.uk,Badminton England | The Nation’s Favourite Rac...,-1


In [74]:
# html_cb.to_csv('/Users/cordiez/Documents/CompaniesHouse/OutputFiles/crunchbase_clusters.csv')

In [355]:
topic_model.visualize_term_rank(log_scale=True)

In [None]:
seed_list = [
    ["buiness", "services"],
    ["design", "website"],
    ["security", "services"],
    ["products", "cards"],
    [],
]

In [356]:
classes = pd.read_excel(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/bert_crunchbase_8classes.xls",
    index_col=0,
)
with option_context("display.max_colwidth", 500):
    display(classes)

Unnamed: 0,Topic Id,Count,10 most representative words,Supposed activity
0,-1,186,"business, services, management, contact, privacy, systems, marketing, customers, technology, cookie",Outliers
1,0,17,"design, website, marketing, business, services, clients, app, apps, privacy, agency",Digital marketing & SEO
2,1,16,"security, services, cloud, microsoft, cyber, 365, secure, consultancy, ibm, ccl",Cloud data services
3,2,15,"products, cards, balloons, packaging, accessories, balloon, gift, supplies, decorations, bags",E-commerce
4,3,15,"business, accountants, accounting, taxi, companies, coupons, accountancy, businesses, payroll, accountant",Business intelligence
5,4,15,"technology, vehicle, connectivity, amn, waam, mobility, connected, technologies, telematics, mobile",Internet of things
6,5,14,"property, rent, seo, sale, leaflet, removals, houses, house, search, estate",Property management
7,6,10,"envisics, marketing, events, automotive, holographic, business, robots, robotazia, technology, patent",Robotics & automation


## BERT on Techcrunch

In [100]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(data_tc)


vectorizer_model = CountVectorizer(stop_words="english")
eng_stop_words = set(vectorizer_model.get_stop_words())
additional_stops = ["5g", "5G"]
vectorizer_model = CountVectorizer(stop_words=eng_stop_words.union(additional_stops))
topic_model.update_topics(data_tc, topics=topics, vectorizer_model=vectorizer_model)


counts = topic_model.get_topic_info()


# topic_model.get_topics()

In [109]:
topic_model.visualize_hierarchy(height=450, width=500)

In [102]:
categories = pd.DataFrame.from_dict(topic_model.get_topics(), orient="index")
categories.reset_index(inplace=True)
categories.head()

for i in categories.index:
    ch = ""
    for c in categories.columns:
        if type(c) == int:
            ch = ch + categories.loc[i, c][0] + ", "
    ch = ch[:-2]
    categories.loc[i, "terms"] = ch

topic_results = pd.merge(
    categories[["index", "terms"]],
    counts[["Topic", "Count"]],
    left_on="index",
    right_on="Topic",
)
topic_results = topic_results.rename(
    columns={"terms": "10 most representative words", "Topic": "Topic Id"}
).drop(columns=["index"])
topic_results = topic_results[["Topic Id", "Count", "10 most representative words"]]


topic_results.style.set_properties(
    subset=["10 most representative words"], **{"width": "600px"}
)

topic_results.to_excel(
    "/Users/cordiez/Documents/CompaniesHouse/OutputFiles/bert_tc_14classes.xls"
)

In [107]:
topic_results.style.set_properties(
    subset=["10 most representative words"], **{"width": "650px"}
)

Unnamed: 0,Topic Id,Count,10 most representative words
0,-1,118,"company, companies, tech, technology, industry, mobile, startups, vr, broadband, startup"
1,0,99,"huawei, china, chinese, eu, european, technology, chinas, huaweis, risk, billion"
2,1,61,"phone, apple, pod, apples, phones, smartphone, devices, device, mini, pods"
3,2,60,"samsung, galaxy, device, fold, samsungs, devices, camera, foldable, smartphone, phone"
4,3,51,"company, cloud, fund, bank, funding, investment, startups, companies, investors, startup"
5,4,38,"wireless, network, cities, mobile, networking, bandwidth, verizon, connectivity, internet, networks"
6,5,26,"qualcomm, broadcom, qualcomms, shareholders, company, companies, chipmaker, takeover, nxp, stockholders"
7,6,25,"app, apps, apple, startup, facebook, startups, google, developer, developers, apples"
8,7,17,"intel, chip, intels, company, chips, computing, wireless, companies, technologies, technology"
9,8,17,"merger, deal, carriers, wireless, network, broadband, verizon, transaction, fcc, communications"


In [None]:
tech5G[["title"]].style.set_properties(subset=["title"], **{"width": "500px"})

## Supervised

In [357]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))
docs = data["data"]
categories = data["target"]
category_names = data["target_names"]

In [375]:
category_names[2]

'comp.os.ms-windows.misc'

In [367]:
labels_to_add = [
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
]
indices = [category_names.index(label) for label in labels_to_add]
y = [label if label in indices else -1 for label in categories]

In [370]:
category_names.index("comp.graphics")

1

In [386]:
"""5G technology has a number of features which will positively impact digital experiences and smart cities.  In addition to a higher speed to upload and download data, it ensures very short latency times and the ability to connect multiple devices at the same time.  Less latency means compressing the time between sending and receiving the signal. 
5G brings the range to at least under 10 milliseconds (that is, half the most advanced 4G could achieve) and in best cases around 1 millisecond delays, meaning data will be transferred about in real time.  Moreover, with new networks, speed and latency don't get worse even with tens of thousands of connected devices.  5G therefore offers more device density. 
clean

Predict

The combination of high density and low latency will deeply transform our cities.  Today, in crowded vacation spots or at stadiums, connection can sometimes get worse.  With 5G it will no longer be like this: it will be possible to have a huge number (up to one million) of connections simultaneously for each square kilometer. This means, in addition to personal devices such as smartphones, tablets, smart speakers and PCs, also many other devices, objects and sensors will be capable of capturing information and dialogue with each other. 
""".replace(
    "\n", " "
)

"5G technology has a number of features which will positively impact digital experiences and smart cities.  In addition to a higher speed to upload and download data, it ensures very short latency times and the ability to connect multiple devices at the same time.  Less latency means compressing the time between sending and receiving the signal.  5G brings the range to at least under 10 milliseconds (that is, half the most advanced 4G could achieve) and in best cases around 1 millisecond delays, meaning data will be transferred about in real time.  Moreover, with new networks, speed and latency don't get worse even with tens of thousands of connected devices.  5G therefore offers more device density.  clean  Predict  The combination of high density and low latency will deeply transform our cities.  Today, in crowded vacation spots or at stadiums, connection can sometimes get worse.  With 5G it will no longer be like this: it will be possible to have a huge number (up to one million) of