# **SETTING UP PYMONGO, THE REQUIRED LIBRARIES & CONNECTING IT TO THE CLUSTER**

In [1]:
import pymongo
import pandas as pd
import os
import plotly.express as px
from sentence_transformers import SentenceTransformer
from pymongo.operations import SearchIndexModel

In [2]:
# Access my MONGO_URI
# The connection string carries the Atlas username and password, so it is read from
# the environment instead of being committed inside the notebook.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    raise RuntimeError(
        "Set the MONGO_URI environment variable to the MongoDB Atlas connection string "
        "before running this notebook."
    )

# creating a connection to my cluster
myclient = pymongo.MongoClient(MONGO_URI)
myclient

MongoClient(host=['cluster0-shard-00-02.ptlhq.mongodb.net:27017', 'cluster0-shard-00-01.ptlhq.mongodb.net:27017', 'cluster0-shard-00-00.ptlhq.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-7gzezq-shard-0', tls=True)

# **CREATING THE SAMPLE CSVS**

Creating the smaller Dealroom and unicorns CSVs from the 16 GB source CSVs.

In [5]:
# Explicit dtype per column so pandas does not have to infer types for the large file.
dtype_mapping = {
    'domain': 'string',
    'name': 'string',
    'role': 'string',
    'is_unicorn': 'float64',  # kept as float because the column contains NULL values
    'country': 'string',
    'dr_uuid': 'string',
    'def_description': 'string',
    'cb_uuid': 'string'
}
# Only the columns that have a dtype declared above are loaded.
columns_to_load = list(dtype_mapping)

file_path = r'data_for_assignment/domains.csv'
df_filtered = pd.read_csv(file_path, usecols=columns_to_load, dtype=dtype_mapping)

# Keep unicorn rows only, drop the now-constant flag, keep the first row per company
# name and discard companies that have no Crunchbase id.
df_filtered_unicorn = (
    df_filtered.loc[df_filtered['is_unicorn'] == True]
    .drop(columns='is_unicorn')
    .drop_duplicates(subset=['name'], keep='first')
    .dropna(subset=['cb_uuid'])
)

df_filtered_unicorn.to_csv('filtered_unicorns.csv', index=False)
print(len(df_filtered_unicorn))

1846


In [6]:
# Dealroom UUIDs of the unicorns selected above.
l = df_filtered_unicorn['dr_uuid'].to_list()

dealroom_columns = ['uuid', 'name', 'launch_date', 'year_became_unicorn', 'industries', 'employees', 'all_locations', 'investors']
dealroom_df = pd.read_csv('data_for_assignment/dealroom_companies.csv', usecols=dealroom_columns)

# Only keep dealroom companies that are unicorns; copy so the frame can be modified safely.
is_unicorn_row = dealroom_df['uuid'].isin(l)
filtered_df = dealroom_df.loc[is_unicorn_row].copy()
# Function to convert the employee range to an integer (top range - lower range)
def convert_to_employee_count(employee_range):
    """Turn a raw ``employees`` value into a single number.

    Rules:
      * missing values (NaN/None/NA) -> 1
      * values that are already numeric -> returned unchanged
      * open-ended values such as ``"10001+"`` -> the number before the ``+``
      * ranges such as ``"51-200"`` -> width of the range (upper - lower)
      * anything that cannot be parsed -> 1

    Parameters
    ----------
    employee_range : str | number | missing
        Raw value from the ``employees`` column.

    Returns
    -------
    number
        The converted value; 1 is the fallback for missing or malformed input.
    """
    # Local import: the notebook's import cell does not provide `numbers`.
    import numbers

    # If the value is NaN, return 1
    if pd.isna(employee_range):
        return 1

    # Already numeric. numbers.Real also covers NumPy scalars, which are not
    # instances of the built-in int and would otherwise fail on the '+' check.
    if isinstance(employee_range, numbers.Real):
        return employee_range

    # Both string formats are parsed inside the same try so a malformed value of
    # either kind falls back to 1 instead of raising.
    try:
        if '+' in employee_range:
            return int(employee_range.replace('+', ''))

        lower, upper = employee_range.split('-')
        return int(upper) - int(lower)
    except ValueError:
        return 1  # In case the format is unexpected or the value is missing

# Replace the raw 'employees' values with their numeric conversion.
employee_counts = filtered_df['employees'].apply(convert_to_employee_count)
filtered_df = filtered_df.assign(employees=employee_counts)

print(filtered_df.shape[0])
filtered_df.to_csv('filtered_dealrooms.csv', index=False)

1846


Creating the smaller founders CSV.

In [9]:
founder_columns = ['domain', 'person_name', 'person_facebook_url', 'person_linkedin_url', 'person_twitter_url', 'country_code']
founders_df = pd.read_csv('data_for_assignment/domains_founders.csv', usecols=founder_columns)

# Domains of the unicorns; only founders attached to one of them are kept.
l = df_filtered_unicorn['domain'].to_list()
founded_a_unicorn = founders_df['domain'].isin(l)
filtered_df = founders_df.loc[founded_a_unicorn]

print(filtered_df.shape[0])
filtered_df.to_csv('filtered_founders.csv', index=False)

4388


Creating the smaller Crunchbase rounds CSV.

In [7]:
# Forward slashes work on every OS (Windows included) and match the other loading cells;
# the previous backslash path only resolved on Windows.
rounds_df = pd.read_csv(
    'data_for_assignment/crunchbase_rounds.csv',
    usecols=['round_announced_on', 'round_raised_amount_usd', 'org_uuid'],
)

# Crunchbase UUIDs of the unicorns.
l = df_filtered_unicorn['cb_uuid'].to_list()

# Keep unicorn rounds only, align the key name with the unicorns CSV and drop rounds
# whose amount is unknown. Chained (no inplace) so re-running the cell is safe.
rounds_df = (
    rounds_df[rounds_df['org_uuid'].isin(l)]
    .rename(columns={'org_uuid': 'cb_uuid'})
    .dropna(subset=['round_raised_amount_usd'])
)

print(len(rounds_df))
rounds_df.to_csv('filtered_rounds.csv', index=False)

7926


# **CREATING THE HUCKLETREE DB & LOADING EACH CSV INTO A COLLECTION**

Dropping the DB in case it has already been created.

In [8]:
# Remove any Huckletree database left over from a previous run.
myclient.drop_database("Huckletree")

Creating the DB

In [9]:
# Handle to the Huckletree database on the connected cluster.
mydb = myclient.get_database("Huckletree")

In [11]:
# Drop each collection individually so the loading cells below can be re-run on their own.
for stale_collection in (
    'unicorns',
    'dealroom_companies',
    'geo_locations',
    'unicorn_founders',
    'crunchbase_rounds',
):
    mydb[stale_collection].drop()

Creating the collections

In [5]:
# One handle per collection used in the rest of the notebook.
(
    unicorns,
    geo_locations,
    dealroom_companies,
    unicorn_founders,
    crunchbase_rounds,
) = (
    mydb[collection_name]
    for collection_name in (
        "unicorns",
        "geo_locations",
        "dealroom_companies",
        "unicorn_founders",
        "crunchbase_rounds",
    )
)

Loading the geo_location and dealroom collections

In [None]:
# Forward slashes keep the path portable (the backslash form only resolved on Windows).
# keep_default_na=False stops pandas from converting strings such as "NA" into NaN.
geos = pd.read_csv("data_for_assignment/geo_countries_emojis_capitals.csv", keep_default_na=False)
geos_dict = geos.to_dict(orient="records")
geo_locations.insert_many(geos_dict)

In [None]:
# Load the reduced dealroom CSV and insert one document per row.
deals = pd.read_csv("filtered_dealrooms.csv")
deals_dict = deals.to_dict("records")
dealroom_companies.insert_many(documents=deals_dict)

For the dealroom, geolocation and unicorns collections, we add new relationships using the object IDs.

In [None]:
unis = pd.read_csv("filtered_unicorns.csv")

# Pull both referenced collections back from MongoDB, keeping only the join key
# and the generated ObjectId (_id).
droom_df = pd.DataFrame(list(mydb.dealroom_companies.find()))[['uuid', '_id']]
geo_df = pd.DataFrame(list(mydb.geo_locations.find()))[['name', '_id']]

# Lookup tables: country name -> ObjectId and dealroom uuid -> ObjectId.
geo_name_to_objectid = geo_df.set_index('name')['_id'].to_dict()
droom_uuid_to_objectid = droom_df.set_index('uuid')['_id'].to_dict()

# Attach the references to every unicorn: _GeoID from its country, _DroomID from its dr_uuid.
unis = unis.assign(
    _GeoID=unis['country'].map(geo_name_to_objectid),
    _DroomID=unis['dr_uuid'].map(droom_uuid_to_objectid),
)

unis_dict = unis.to_dict(orient="records")
unicorns.insert_many(unis_dict)

Inserting data into the founders collection and adding relationships from the geo_location and unicorn collections using the object IDs

In [None]:
founders = pd.read_csv("filtered_founders.csv")

# Pull the referenced collections back from MongoDB, keeping only the join key
# and the generated ObjectId (_id).
unis_df = pd.DataFrame(list(mydb.unicorns.find()))[['domain', '_id']]
geo_df = pd.DataFrame(list(mydb.geo_locations.find()))[['alpha_3_code', '_id']]

# Lookup tables: alpha-3 country code -> ObjectId and unicorn domain -> ObjectId.
geo_name_to_objectid = geo_df.set_index('alpha_3_code')['_id'].to_dict()
uni_domain_to_objectid = unis_df.set_index('domain')['_id'].to_dict()

# Attach the references to every founder: _Unicorn from the domain, _GeoID from the country code.
founders = founders.assign(
    _Unicorn=founders['domain'].map(uni_domain_to_objectid),
    _GeoID=founders['country_code'].map(geo_name_to_objectid),
)

founders_dict = founders.to_dict(orient="records")
unicorn_founders.insert_many(founders_dict)

Inserting data into the crunchbase_rounds collection and adding the one to many relationship with the unicorn collection using the Object IDs

In [None]:
rounds = pd.read_csv("filtered_rounds.csv")

# Pull the unicorns back from MongoDB, keeping only cb_uuid and the generated ObjectId (_id).
unis_df = pd.DataFrame(list(mydb.unicorns.find()))[['cb_uuid', '_id']]

# Lookup table: Crunchbase uuid -> unicorn ObjectId.
uni_cb_uuid_to_objectid = unis_df.set_index('cb_uuid')['_id'].to_dict()

# Each round points at its unicorn through uni_ref.
rounds = rounds.assign(uni_ref=rounds['cb_uuid'].map(uni_cb_uuid_to_objectid))

rounds_dict = rounds.to_dict(orient="records")
crunchbase_rounds.insert_many(rounds_dict)

# **LOADING AND CREATING THE VECTOR SEARCH INDEX FOR COMPANIES' DESCRIPTIONS**

In [17]:
# Load the embedding model
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

# Function to generate embeddings
def get_embedding(data):
    """Encode ``data`` with the loaded model and return the embedding as a plain list.

    The vector is explicitly normalised to unit length: the search index created
    later in this notebook uses ``dotProduct`` similarity, which is only a valid
    similarity measure on unit-length vectors. Requesting normalisation here makes
    that precondition independent of how the model happens to be configured.

    Parameters
    ----------
    data : str
        Text to embed.

    Returns
    -------
    list
        The embedding converted with ``.tolist()`` so it can be stored in MongoDB.
    """
    embedding = model.encode(data, normalize_embeddings=True)
    return embedding.tolist()

  state_dict = loader(resolved_archive_file)
<All keys matched successfully>


Creating the vector embedding for each company's description, and setting it to None if it isn't a string

In [18]:
# Build every update first and send them in one bulk write: this avoids one network
# round trip per company and avoids modifying the collection while its cursor is
# still being iterated. Only the fields that are actually read are fetched.
pending_updates = []
for document in unicorns.find({}, {"def_description": 1, "name": 1}):
    description = document.get("def_description", "")

    # Check if the description is valid (not empty or NaN)
    if description and isinstance(description, str):
        vector = get_embedding(description)  # Generate the embedding
    else:
        print(f"Description is NA for {document['name']}.")
        vector = None  # Explicitly set None for invalid descriptions

    # Queue the update that stores the embedding (or None) on the document
    pending_updates.append(
        pymongo.UpdateOne(
            {"_id": document["_id"]},
            {"$set": {"vectorised_description": vector}}
        )
    )

# bulk_write rejects an empty request list, so only call it when there is work to do.
if pending_updates:
    unicorns.bulk_write(pending_updates, ordered=False)


Description is NA for Frubana.
Description is NA for Loom.


Creating the vector search index

In [19]:
# Description of the single vector field covered by the index.
vector_field = {
    "type": "vector",                  # this field holds a vector
    "path": "vectorised_description",  # document field where the embeddings are stored
    "similarity": "dotProduct",        # similarity measure used for vector search
    "numDimensions": 768,              # number of dimensions in the embedding
}

# Index model for vector search, named "vector_index".
search_index_model = SearchIndexModel(
    definition={"fields": [vector_field]},
    name="vector_index",
    type="vectorSearch",
)

# Create the search index on the 'unicorns' collection
unicorns.create_search_index(model=search_index_model)

'vector_index'

# **SHOWING ONE INSTANCE OF EACH COLLECTION IN OUR DB**

In [20]:
collections = mydb.list_collection_names()

# Iterate through each collection and print one document
for collection_name in collections:
    collection = mydb[collection_name]
    document = collection.find_one()  # Get one document from the collection (None if empty)
    print(f"Collection: {collection_name}")
    if document is None:
        # find_one() returns None for an empty collection; report it instead of crashing.
        print("(empty collection)")
    else:
        # Long lists (768 or more items) are the stored embeddings; show a placeholder for them.
        print([
            f"{field}: {'vector-embeddings-list' if isinstance(value, list) and len(value) >= 768 else value}"
            for field, value in document.items()
        ])
    print("-" * 50)

Collection: crunchbase_rounds
['_id: 678172f765df5454e4a87033', 'cb_uuid: 53c7e35a-2078-bb61-b1a3-215976c14f1b', 'round_announced_on: 2016-09-29', 'round_raised_amount_usd: 6349900.0', 'uni_ref: 67816f4d65df5454e4a85e3d']
--------------------------------------------------
Collection: unicorn_founders
['_id: 67816f5465df5454e4a85f0f', 'domain: 01zhuanche.com', 'person_name: Dong Wei', 'person_facebook_url: nan', 'person_twitter_url: nan', 'person_linkedin_url: nan', 'country_code: CHN', '_Unicorn: 67816f4d65df5454e4a85a9f', '_GeoID: 67816f3c65df5454e4a84fdd']
--------------------------------------------------
Collection: unicorns
['_id: 67816f4d65df5454e4a857d9', 'domain: skype.com', 'name: Ring2Skype', 'role: company', 'country: United States', 'def_description: Ring2Skype is a platform that allows users to have a local phone number forwarded to their Skype accounts, wherever they may be located.', 'cb_uuid: a6e0d9fb-e377-714f-45c4-1a609142e9ad', 'dr_uuid: d656cd18-d63d-4a37-a8a3-87bd8

# **MY FIRST 5 QUERIES**

## 1: *COMPANY KEY INFORMATION DASHBOARD*

In [346]:
def _join(collection, local_field, foreign_field, alias):
    """Build a $lookup stage joining `collection` on local_field == foreign_field."""
    return {
        "$lookup": {
            "from": collection,
            "localField": local_field,
            "foreignField": foreign_field,
            "as": alias
        }
    }


def _unwind_keep_empty(path):
    """Build an $unwind stage that keeps documents whose array is empty or missing."""
    return {"$unwind": {"path": path, "preserveNullAndEmptyArrays": True}}


def _omit_if_nan(field_path):
    """Build a $cond that removes the field when it holds NaN and keeps its value otherwise."""
    return {
        "$cond": {
            "if": {"$eq": [field_path, float("nan")]},
            "then": "$REMOVE",
            "else": field_path
        }
    }


def _first(field_path):
    """Build a $first accumulator for `field_path`."""
    return {"$first": field_path}


# Fields returned for the selected company, in output order.
_dashboard_fields = [
    "name", "domain", "def_description", "launch_date", "all_locations", "employees",
    "investors", "industries", "founders", "total_funding_rounds", "total_funds_raised",
    "latest_round_date",
]

pipeline = [
    # Randomly pick one company
    {"$sample": {"size": 1}},

    # Dealroom details; the company is dropped here if it has no dealroom match
    _join("dealroom_companies", "_DroomID", "_id", "company_info"),
    {"$unwind": "$company_info"},

    # One document per founder (the company is kept even when it has no founders)
    _join("unicorn_founders", "_id", "_Unicorn", "founders_info"),
    _unwind_keep_empty("$founders_info"),

    # Country of each founder
    _join("geo_locations", "founders_info._GeoID", "_id", "geo_info"),
    _unwind_keep_empty("$geo_info"),
    {
        "$addFields": {
            "geo_location": {
                "$cond": {
                    "if": {"$ne": ["$geo_info.prettified_name", None]},
                    "then": "$geo_info.prettified_name",
                    "else": "$REMOVE"
                }
            }
        }
    },

    # Funding rounds and the figures derived from them
    _join("crunchbase_rounds", "_id", "uni_ref", "funding_rounds"),
    {
        "$addFields": {
            "total_funding_rounds": {"$size": "$funding_rounds"},
            # Sum total funds raised across all rounds
            "total_funds_raised": {
                "$sum": {
                    "$map": {
                        "input": "$funding_rounds",
                        "as": "round",
                        "in": "$$round.round_raised_amount_usd"
                    }
                }
            },
            # Get the latest funding round date
            "latest_round_date": {"$max": "$funding_rounds.round_announced_on"}
        }
    },

    # Collapse the per-founder documents back into one document per company
    {
        "$group": {
            "_id": "$_id",
            "name": _first("$name"),
            "domain": _first("$domain"),
            "def_description": _first("$def_description"),
            "launch_date": _first("$company_info.launch_date"),
            "all_locations": _first("$company_info.all_locations"),
            "employees": _first("$company_info.employees"),
            "investors": _first("$company_info.investors"),
            "industries": _first("$company_info.industries"),
            "founders": {
                "$push": {
                    "name": "$founders_info.person_name",
                    "geo_location": "$geo_location",
                    "facebook": _omit_if_nan("$founders_info.person_facebook_url"),
                    "twitter": _omit_if_nan("$founders_info.person_twitter_url"),
                    "linkedin": _omit_if_nan("$founders_info.person_linkedin_url")
                }
            },
            "total_funding_rounds": _first("$total_funding_rounds"),
            "total_funds_raised": _first("$total_funds_raised"),
            "latest_round_date": _first("$latest_round_date")
        }
    },

    # Keep every dashboard field
    {"$project": dict.fromkeys(_dashboard_fields, 1)}
]

# Run the dashboard pipeline against the unicorns collection.
cursor = unicorns.aggregate(pipeline=pipeline)

def beautify(money):
    """Format an amount with two decimals and a K / M / B suffix.

    The unit is chosen from the magnitude of ``money``. Two edge cases are handled:

    * Amounts that would round up to ``1000.00`` of a unit are promoted to the
      next unit (999_999_999 -> ``"1.00B"`` rather than ``"1000.00M"``).
    * Negative amounts are abbreviated like positive ones and keep their sign.

    Parameters
    ----------
    money : int | float
        Amount to format.

    Returns
    -------
    str
        For example ``"961.95M"``, ``"1.50K"`` or ``"12.00"``.
    """
    scales = ((1.0, ""), (1e3, "K"), (1e6, "M"), (1e9, "B"))
    sign = "-" if money < 0 else ""
    magnitude = abs(money)

    # Largest scale the amount reaches; amounts below 1 (and NaN) stay unscaled.
    level = 0
    for index, (divisor, _suffix) in enumerate(scales):
        if magnitude >= divisor:
            level = index

    text = f"{magnitude / scales[level][0]:.2f}"

    # Rounding to two decimals can push the value to "1000.00"; show it in the next unit.
    if text == "1000.00" and level < len(scales) - 1:
        level += 1
        text = f"{magnitude / scales[level][0]:.2f}"

    return f"{sign}{text}{scales[level][1]}"

def _print_split_values(raw_value, separator):
    """Print one '- item' line per separated value, or a notice when the value is not text.

    Missing CSV cells were loaded as float NaN, so anything that is not a string is
    treated as "no data".
    """
    if isinstance(raw_value, str):
        for item in raw_value.split(separator):
            print('-', item)
    else:
        print('No data available')


# Print the formatted result for the selected unicorn
# Start from None so an empty cursor is reported instead of raising NameError or
# silently showing a company left over from an earlier run.
res = None
for document in cursor:
    res = document

if res is None:
    print('No company found.')
else:
    # Some companies have no description (stored as NaN); concatenating that to a
    # string would raise a TypeError.
    summary = res.get('def_description')
    if not isinstance(summary, str):
        summary = 'No description available'
    print(res['name'], res['domain'], '\n' + summary, '\n' + 'Founded: ', res['launch_date'], '\nFounders: ')

    for founder in res['founders']:
        # Everything except the name: country and any social links that were kept.
        extras = [value for key, value in founder.items() if key != 'name']
        if extras:
            print('-', founder.get('name', 'Unknown'), extras)
        elif 'name' in founder:
            print('-', founder['name'])
        else:
            # A company without founders yields a single empty founder entry.
            print('No founders found.')

    print('Locations: ')
    _print_split_values(res['all_locations'], ';')

    total_rounds = res['total_funding_rounds']
    if total_rounds >= 1:
        round_word = 'rounds.' if total_rounds > 1 else 'round.'
        print('Funding: ', '$', beautify(res['total_funds_raised']), 'in', total_rounds, round_word, 'Latest round was on', res['latest_round_date'],  '\nEmployees: ', res['employees'])
    else:
        print('Funding: No data available'  '\nEmployees: ', res['employees'])

    print('Investors: ')
    investors = res['investors']
    _print_split_values(investors, ',')

    print('Industries: ')
    _print_split_values(res['industries'], ',')



Toast toasttab.com 
 Toast is a point-of-sale and restaurant management platform designed for businesses in the food service and hospitality industry. 
 Founded:  2011-12-01 
 Founders: 
Jonathan Grimm
Matias Brecher
Aman Narang
Steve Fredette
Chris Comparato ['🇺🇸United States', 'facebook.com/chris.comparato', 'twitter.com/chriscomparato', 'linkedin.com/in/chriscomparato']
Locations: 
Boston, United States
Funding:  $ 961.95M in 8 rounds. Latest round was on 2020-11-23 
 Employees:  4999
Investors: 
Bessemer Venture Partners
 F-Prime
 Generation Investment Management
 Greenoaks Capital Partners
 GV
 Lead Edge Capital
 Princeton Ventures
 Technology Crossover Ventures
 Tiger Global
 TPG
Industries: 
fintech
 food


## 2: *UNICORNS PER COUNTRY HEATMAP GROUPED BY CONTINENTS*

In [7]:
# Get unique continents
continents = mydb.geo_locations.distinct("continent")
continent_dict={'EU':'europe', 'AS':'asia', 'AF':'africa', 'NA':'north america', 'SA':'south america', 'OC':'oceania'}
# Continent scopes a plotly choropleth can zoom to. Oceania is not one of them, so it
# is skipped below, as is any continent code that has no entry in continent_dict.
plotly_scopes = {'europe', 'asia', 'africa', 'north america', 'south america'}

# Count unicorns per country ONCE (the join does not depend on the continent), keeping
# the continent of each country so the result can be split per continent afterwards.
result = mydb.unicorns.aggregate([
    {
        "$lookup": {
            "from": "geo_locations",
            "localField": "_GeoID",
            "foreignField": "_id",
            "as": "geo_info"
        }
    },
    {
        "$unwind": "$geo_info"
    },
    {
        "$group": {
            "_id": "$geo_info.alpha_3_code",
            "unicorn_count": {"$sum": 1},
            "name": {"$first": "$geo_info.prettified_name"},
            "capital": {"$first": "$geo_info.capital"},
            "currency": {"$first": "$geo_info.currency"},
            "languages": {"$first": "$geo_info.languages"},
            "population2022": {"$first": "$geo_info.population2022"},
            "continent": {"$first": "$geo_info.continent"}
        }
    },
    {
        "$project": {
            "alpha_3_code": "$_id",
            "unicorn_count": 1,
            "name": 1,
            "capital": 1,
            "currency": 1,
            "languages": 1,
            "population2022": 1,
            "continent": 1,
            "_id": 0
        }
    }
])

data = list(result)
country_counts = pd.DataFrame(data)

# One DataFrame per continent code, with the helper 'continent' column removed again.
per_continent = {}
if not country_counts.empty and "continent" in country_counts.columns:
    per_continent = {
        code: frame.drop(columns="continent").reset_index(drop=True)
        for code, frame in country_counts.groupby("continent")
    }

# Iterate through each continent
for continent in continents:
    df = per_continent.get(continent)
    scope = continent_dict.get(continent)

    # Skip continents with no unicorns, and continents plotly has no scope for (e.g. Oceania)
    if df is None or scope not in plotly_scopes:
        continue

    # Create the choropleth map
    fig = px.choropleth(
        df,
        locations="alpha_3_code",  # Use ISO Alpha-3 country codes
        color="unicorn_count",
        hover_name="name",
        hover_data={
            'alpha_3_code': False, "unicorn_count":True, "name":False, "capital":True, "currency":True, "languages":True, "population2022":True
        },
        color_continuous_scale="Viridis",  # Color scale
        title=f"Unicorn Companies by Continent - {continent}",
        projection="aitoff",
        scope=scope  # plotly scope name for this continent
    )

    fig.show()

## 3: *TIMESERIES ANALYSIS: NUMBER OF UNICORNS PER COUNTRY GROUPED BY YEAR HEATMAP*

In [10]:
# Country attributes carried into the output: output name -> geo_locations field.
_geo_attributes = {
    "name": "prettified_name",
    "capital": "capital",
    "currency": "currency",
    "languages": "languages",
    "population2022": "population2022",
    "continent": "continent",
}

_unicorns_by_year_pipeline = [
    # Join with geo_locations; unicorns without a matching country are dropped
    {"$lookup": {"from": "geo_locations", "localField": "_GeoID", "foreignField": "_id", "as": "geo_info"}},
    {"$unwind": "$geo_info"},

    # Join with dealroom_companies; unicorns without a dealroom match are dropped
    {"$lookup": {"from": "dealroom_companies", "localField": "_DroomID", "foreignField": "_id", "as": "dealroom_info"}},
    {"$unwind": "$dealroom_info"},

    # One row per (country, year the company became a unicorn)
    {
        "$group": {
            "_id": {
                "country_code": "$geo_info.alpha_3_code",
                "year": "$dealroom_info.year_became_unicorn"
            },
            "unicorn_count": {"$sum": 1},
            **{
                alias: {"$first": f"$geo_info.{source}"}
                for alias, source in _geo_attributes.items()
            }
        }
    },

    # Flatten the grouping key into plain columns
    {
        "$project": {
            "country": "$_id.country_code",
            "year": "$_id.year",
            "unicorn_count": 1,
            **dict.fromkeys(_geo_attributes, 1),
            "_id": 0
        }
    },

    # Sort by year (ascending)
    {"$sort": {"year": 1}}
]

result = mydb.unicorns.aggregate(_unicorns_by_year_pipeline)

data = list(result)
df = pd.DataFrame(data)

# Hover box contents: hide the columns already shown elsewhere, show the country facts.
hover_columns = {
    "country": False,  # the ISO Alpha-3 code is only used to locate the country
    "unicorn_count": True,
    "name": False,     # already shown as the hover title
    "capital": True,
    "currency": True,
    "languages": True,
    "population2022": True,
    "continent": True
}

# World map coloured by unicorn count, with one animation frame per year.
fig = px.choropleth(
    df,
    locations="country",
    color="unicorn_count",
    hover_name="name",
    animation_frame="year",
    hover_data=hover_columns,
    color_continuous_scale="Viridis",
    projection="natural earth",
    title="Growth of Unicorn Companies by Country"
)

fig.show()

## 4: *VECTOR SEARCH: TOP 5 COMPANIES SIMILAR TO A CHOSEN COMPANY BASED ON DESCRIPTION*

In [7]:
# Draw one random company from the unicorns collection
random_document = next(unicorns.aggregate([{"$sample": {"size": 1}}]))

# Details of the drawn company used for the printout and the search
company_id = random_document.get("_id")
company_name = random_document.get("name", "")
company_domain = random_document.get("domain", "")
description = random_document.get("def_description", "")

# Show which company was drawn
print(f"Randomly chosen company: {company_name} ({company_domain})")
print(f"Description: {description}")
print("-" * 50)

# Reuse the embedding already stored on the document (None when it has none)
query_embedding = random_document.get("vectorised_description")

if query_embedding is None:
    print(f"No vectorised description found for the company: {company_name}.")
else:
    # Perform the vector search to find top 5 most similar companies
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "vectorised_description",  # Field with embeddings
                "queryVector": query_embedding,    # Embedding for query
                "exact": True,
                "limit": 6                        # Limit to top 6 as it also returns the company itself
            }
        },
        {
            "$match": {
                "_id": {"$ne": company_id}  # Exclude the current company by matching the _id
            }
        },
        {
            # Guarantee at most 5 results even if the query company was not among the
            # 6 hits (e.g. several companies share an identical description).
            "$limit": 5
        },
        {
            "$project": {
                "_id": 0,  # Exclude _id field
                "name": 1,
                "domain": 1,
                "def_description": 1,
                "score": {
                    "$meta": "vectorSearchScore"
                }
            }
        }
    ]

    results = unicorns.aggregate(pipeline)

    # Print the top 5 similar companies (excluding the query company itself)
    print(f"Top 5 similar companies to {company_name} based on description:")
    for rank, result in enumerate(results, start=1):
        print(f"\nRank {rank}: {result['name']} ({result['domain']})")
        print(f"Description: {result['def_description']}")
        print(f"Score: {result['score']}")


Randomly chosen company: GrubMarket (grubmarket.com)
Description: GrubMarket is the AI-powered technology enabler and digital transformer of American Food Supply Chain industry. Our mission is to build and provide the eCommerce and software technologies to this industry, to transform this completely offline and highly manual industry into modernized online industry powered by software technologies, and improve the efficiency of American food supply chain.

--------------------------------------------------
Top 5 similar companies to GrubMarket based on description:

Rank 1: Afresh (afresh.com)
Description: Afresh is an AI-powered company selling software to track demand and manage orders for fresh produce in grocery stores.
Score: 0.8609310984611511

Rank 2: Text (text.com)
Description: Text builds AI-based communication software used for ecommerce and providing amazing customer service!
Score: 0.8284006118774414

Rank 3: Yijiupi (yijiupi.com)
Description: Yijiupi is an e-commerce comp

## 5: *VECTOR SEARCH AND CUSTOM NORMALISATION OF EMPLOYEE SIMILARITY: GET THE TOP 5 SIMILAR COMPANIES BASED ON DESCRIPTION AND EMPLOYEE COUNT SIMILARITY SCORE*

In [8]:
# Choose a random company from the unicorns collection
random_document = unicorns.aggregate([{"$sample": {"size": 1}}]).next()

# Get the description and dealroom_id for the randomly chosen company
description = random_document.get("def_description", "")
company_name = random_document.get("name", "")
company_domain = random_document.get("domain", "")
company_id = random_document.get("_id", None)
dealroom_id = random_document.get("_DroomID", None)

# Print the description of the randomly selected company
print(f"Randomly chosen company: {company_name} ({company_domain})")
print(f"Description: {description}")
print("-" * 50)

# Retrieve employee count from the dealroom_companies collection for the randomly chosen company.
# find_one() returns None when the unicorn has no dealroom match; fall back to 0 in
# that case instead of failing with an AttributeError.
dealroom_document = dealroom_companies.find_one({"_id": dealroom_id})
company_employees = dealroom_document.get("employees", 0) if dealroom_document else 0

print(f"Employee Count: {company_employees}")

# Get the maximum number of employees from the dealroom_companies collection
# (None when the collection is empty, so the aggregation returns no row).
max_employees_row = next(
    dealroom_companies.aggregate([
        {"$group": {"_id": None, "max_employees": {"$max": "$employees"}}}
    ]),
    None,
)
max_employees = max_employees_row.get("max_employees") if max_employees_row else None

# Use the existing embedding for the company
query_embedding = random_document.get("vectorised_description", None)

# Number of companies to report.
top_n = 5
# Number of description matches that are re-ranked. Re-ranking only the 5 nearest
# matches can reorder them but never change WHICH companies are returned, so the
# employee similarity would have no influence on the selection. A wider pool lets
# the combined score actually decide the top 5.
candidate_pool = 50
# Divisor used to normalise the employee difference; fall back to 1 so a missing or
# zero maximum cannot cause a division by zero inside the pipeline.
employee_scale = max_employees if max_employees else 1

if query_embedding is None:
    print(f"No vectorised description found for the company: {company_name}.")
else:
    # Vector search for the candidate pool, then re-rank by the combined score
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "path": "vectorised_description",  # Field with embeddings
                "queryVector": query_embedding,    # Embedding for query
                "exact": True,
                "limit": candidate_pool + 1        # +1 as the results include the company itself
            }
        },
        {
            "$match": {
                "_id": {"$ne": company_id}  # Exclude the current company by matching the _id
            }
        },
        {
            "$lookup": {
                "from": "dealroom_companies",  # Lookup to join with the dealroom_companies collection
                "localField": "_DroomID",      # Field in unicorns collection to match
                "foreignField": "_id",         # Field in dealroom_companies to match
                "as": "dealroom_data"
            }
        },
        {
            "$unwind": "$dealroom_data"
        },
        {
            "$project": {
                "_id": 0,  # Exclude _id field
                "name": 1,
                "domain": 1,
                "def_description": 1,
                "dealroom_data.employees": 1,
                "score": {
                    "$meta": "vectorSearchScore"
                }
            }
        },
        {
            "$addFields": {
                # Calculate the employee similarity score (absolute difference)
                "employee_similarity": {
                    "$abs": {
                        "$subtract": [
                            "$dealroom_data.employees",  # Employees of the matched company
                            company_employees  # Employees of the randomly chosen company
                        ]
                    }
                }
            }
        },
        {
            "$addFields": {
                # Normalize the employee similarity score (closer to 0, higher normalized similarity)
                "normalized_employee_similarity": {
                    "$subtract": [
                        1,  # Max similarity score
                        {"$divide": ["$employee_similarity", employee_scale]}
                    ]
                }
            }
        },
        {
            "$addFields": {
                # Combine the vector search score and normalized employee similarity score into a single score
                "combined_score": {
                    "$add": [
                        {"$multiply": ["$score", 0.7]},  # Weight the vector search score higher
                        {"$multiply": ["$normalized_employee_similarity", 0.3]}  # Weight the normalized employee similarity lower
                    ]
                }
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude _id field
                "name": 1,
                "domain": 1,
                "def_description": 1,
                "employee_similarity": 1,
                "normalized_employee_similarity": 1,
                "score": 1,
                "combined_score": 1
            }
        },
        {
            "$sort": {
                "combined_score": -1  # Sort by the combined score in descending order
            }
        },
        {
            "$limit": top_n  # Get the top 5 results
        }
    ]

    results = unicorns.aggregate(pipeline)

    # Print the top 5 similar companies (excluding the query company itself)
    print(f"Top 5 similar companies to {company_name} based on description and employee similarity:")
    for rank, result in enumerate(results, start=1):
        print(f"\nRank {rank}: {result['name']} ({result['domain']})")
        print(f"Description: {result['def_description']}")
        print(f"Vector Search Score: {result['score']}")
        print(f"Employee Similarity: {result['employee_similarity']}")
        print(f"Normalized Employee Similarity: {result['normalized_employee_similarity']}")
        print(f"Combined Score: {result['combined_score']}")


Randomly chosen company: Quantron (quantron.net)
Description: Quantron is a system provider of clean battery and hydrogen-powered e-mobility for commercial vehicles such as trucks, buses, and vans.
--------------------------------------------------
Employee Count: 149
Top 5 similar companies to Quantron based on description and employee similarity:

Rank 1: H2 MOBILITY (h2-mobility.de)
Description: H2 MOBILITY is a hydrogen filling station operator that offers consulting, planning, and construction services for hydrogen stations.
Vector Search Score: 0.8402981758117676
Employee Similarity: 0
Normalized Employee Similarity: 1.0
Combined Score: 0.8882087230682372

Rank 2: Niu Technologies (niu.com)
Description: Niu Technologies is an electric scooter manufacturing company that provides smart urban mobility solutions.
Vector Search Score: 0.8133228421211243
Employee Similarity: 0
Normalized Employee Similarity: 1.0
Combined Score: 0.8693259894847869

Rank 3: Battery Smart (batterysmart.in