In [1]:
# Step 1: Install Required Libraries
pip install spacy scikit-learn

Collecting spacy
  Downloading spacy-3.7.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downlo

In [4]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz (13.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting spacy<3.1.0,>=3.0.0 (from en-core-web-sm==3.0.0)
  Downloading spacy-3.0.9.tar.gz (989 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m989.2/989.2 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting thinc<8.1.0,>=8.0.3 (from spacy<3.1.0,>=3.0.0->en-core-web-sm==3.0.0)
  Using ca

In [None]:
!python -m spacy download en_core_web_sm

In [5]:



# Imports for this cell, grouped up front so its dependencies are easy to see.
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Preprocess Your Data
# Small in-memory corpus; each string is one searchable document.
documents = [
    "Artificial intelligence is revolutionizing various industries with its ability to automate tasks and make data-driven decisions.",
    "Machine learning algorithms analyze large datasets to identify patterns and make predictions without explicit programming.",
    "Natural language processing enables computers to understand and generate human language, facilitating communication between humans and machines.",
    "Robotics technology is advancing rapidly, with robots being used in manufacturing, healthcare, exploration, and other fields.",
    "Blockchain technology provides a secure and transparent way to record transactions and maintain decentralized databases.",
    "Augmented reality overlays digital information onto the physical world, enhancing user experiences in gaming, education, and navigation.",
    "Internet of Things (IoT) devices connect everyday objects to the internet, enabling remote monitoring, control, and automation.",
    "Biotechnology innovations are transforming healthcare, agriculture, and environmental sustainability through genetic engineering and bioinformatics.",
    "Renewable energy sources like solar, wind, and hydro power offer sustainable alternatives to fossil fuels, reducing carbon emissions and mitigating climate change.",
    "Space exploration efforts aim to expand humanity's understanding of the universe and potentially establish colonies on other planets.",
]


# Step 3: Tokenization and Vectorization
# NOTE: the spaCy pipeline is loaded here but is not used by the TF-IDF steps
# below; TfidfVectorizer tokenizes the text with its own default settings.
nlp = spacy.load("en_core_web_sm")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Step 4: User Query Processing
# The query is projected into the vocabulary learned from `documents`;
# words that never occur in the corpus contribute nothing to the vector.
user_query = "What are the applications of artificial intelligence in various industries?"
query_vector = tfidf_vectorizer.transform([user_query])

# Step 5: Similarity Search
# TF-IDF similarity is based on shared terms (lexical overlap), not meaning.
# cosine_similarity returns one row per query; there is a single query, so
# take row 0 explicitly instead of relying on argmax over the flattened array.
cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
query_scores = cosine_similarities[0]
most_similar_document_index = int(query_scores.argmax())

# Step 6: Retrieve Results
# argmax returns index 0 when every score is 0, which would wrongly present the
# first document as a match; report "no match" in that case instead.
if query_scores[most_similar_document_index] > 0:
    most_similar_document = documents[most_similar_document_index]
    print("Most similar document:", most_similar_document)
else:
    most_similar_document = None
    print("No document shares any vocabulary with the query.")




Most similar document: Artificial intelligence is revolutionizing various industries with its ability to automate tasks and make data-driven decisions.


In [7]:
# Step 1: Install Required Libraries
!pip install transformers



In [9]:
# Step 2: Preprocess Your Data
# Assume documents and user_query are defined as in the previous example

# Imports for this cell, grouped up front so its dependencies are easy to see.
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer

# Step 3: Tokenize and Encode Documents
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # inference only: make sure dropout is disabled


def embed_text(text):
    """Return a mean-pooled BERT embedding for a single text.

    Args:
        text: The string to encode.

    Returns:
        A tensor of shape (1, hidden_size): the model's last hidden state
        averaged over the token dimension.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Embeddings are only used for retrieval, so skip autograd bookkeeping;
    # this saves memory and lets the result be converted to NumPy directly.
    with torch.no_grad():
        outputs = model(**inputs)
    # One text is encoded per call, so there are no padding positions and a
    # plain mean over tokens is correct.
    return outputs.last_hidden_state.mean(dim=1)  # Average over tokens


# Stack the per-document rows into a single (n_documents, hidden_size) tensor.
document_embeddings = torch.cat([embed_text(document) for document in documents])

# Move document embeddings to CPU to convert to NumPy
document_embeddings_np = document_embeddings.cpu().numpy()

# Step 4: Tokenize and Encode User Query
# The query goes through exactly the same encoder and pooling as the documents.
user_query_embedding = embed_text(user_query)

# Move user query embedding to CPU to convert to NumPy
user_query_embedding_np = user_query_embedding.cpu().numpy()

# Step 5: Semantic Search
# cosine_similarity returns one row per query; there is a single query, so
# take row 0 explicitly before locating the best-scoring document.
similarities = cosine_similarity(user_query_embedding_np, document_embeddings_np)
most_similar_document_index = int(similarities[0].argmax())

# Step 6: Retrieve Results
most_similar_document = documents[most_similar_document_index]
print("Most similar document:", most_similar_document)

Most similar document: Artificial intelligence is revolutionizing various industries with its ability to automate tasks and make data-driven decisions.
