In [2]:
!pip install sentence-transformers scikit-learn



Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-3.4.1

In [15]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the scraped data (material_name and price)
scraped_df = pd.read_csv("incision/material_data.csv")

# Load the target CSV file (with missing prices)
target_df = pd.read_csv("incision/Materials in Protocols - Salford Royal Hospital (1).csv")


# Load the NLP model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Normalize material names: missing values become an empty string, then the
# text is lowercased and trimmed. fillna("") must run BEFORE astype(str),
# because astype(str) alone turns NaN into the literal text "nan", which would
# then be embedded and matched as if it were a real material name.
scraped_df["material_name"] = (
    scraped_df["material_name"].fillna("").astype(str).str.lower().str.strip()
)
target_df["material_name"] = (
    target_df["material_name"].fillna("").astype(str).str.lower().str.strip()
)

# Encode material names into vector embeddings (one embedding per row, in row order)
scraped_embeddings = model.encode(scraped_df["material_name"].tolist())
target_embeddings = model.encode(target_df["material_name"].tolist())

# Function to find the best match based on cosine similarity
def get_best_match_index(target_embedding, scraped_embeddings, threshold=0.7):
    """Return the position of the scraped embedding most similar to the target.

    Parameters
    ----------
    target_embedding : 1-D array-like
        Embedding vector of the item to look up.
    scraped_embeddings : 2-D array-like
        Candidate embeddings, one per row.
    threshold : float, default 0.7
        Minimum cosine similarity (exclusive) the best candidate must exceed
        to be accepted as a match.

    Returns
    -------
    int or None
        Row position of the best candidate in ``scraped_embeddings``, or
        ``None`` when there are no candidates or the best similarity does not
        exceed ``threshold``.
    """
    # cosine_similarity raises on an empty candidate set; "no candidates" simply
    # means "no match".
    if len(scraped_embeddings) == 0:
        return None

    # The result has shape (1, n_candidates); keep the single row.
    similarities = cosine_similarity([target_embedding], scraped_embeddings)[0]
    best_match_idx = int(similarities.argmax())

    if similarities[best_match_idx] > threshold:
        return best_match_idx
    return None

# Find matches and fill in missing prices
matched_prices = []
for material_name, target_embedding in zip(target_df["material_name"], target_embeddings):
    # A blank name carries no information, so it is never priced by fuzzy match.
    best_match_idx = (
        get_best_match_index(target_embedding, scraped_embeddings)
        if material_name
        else None
    )
    if best_match_idx is not None:
        matched_prices.append(scraped_df["price"].iloc[best_match_idx])
    else:
        matched_prices.append(None)  # No suitable match found

# Update target DataFrame with matched prices. If the target file already has
# a price column, keep the prices it contains and fill only the missing ones;
# overwriting the whole column would discard known prices (or replace them
# with None wherever no match cleared the similarity threshold).
if "price" in target_df.columns:
    matched_series = pd.Series(matched_prices, index=target_df.index)
    target_df["price"] = target_df["price"].combine_first(matched_series)
else:
    target_df["price"] = matched_prices

# Save the updated target data
target_df.to_csv("updated_target_data.csv", index=False)

print("Prices updated using NLP-based matching!")


Prices updated using NLP-based matching!
