In [1]:
import ast
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

In [2]:
# Load the tokenizer and the encoder from one shared checkpoint name so the
# two are always created from the same pretrained weights/vocabulary.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)




In [3]:
# Reference phrases that every extracted keyword is compared against.
candidate_keywords = [
    "Ethical Sustainable Design",
    "Safety Risk Management",
    "Professional Responsibility",
    "Animal Research Ethics",
    "Human Movement Analysis Ethics",
    "Robotics Ethics",
    "Prosthetics Ethics",
    "Research Data Management",
    "Research supervision",
    "Intellectual Property Rights",
    "Ethical Dilemma",
    "Doctor-Patient Relationship",
    "Clinical Trial Ethics",
    "Bioethics",
    "Biomedicine",
    "Patient Consent",
    "Legal Professional Ethics",
    "Justice and Fairness in Law",
    "Client Confidentiality",
]

In [4]:
def get_word_embedding(word, tokenizer, model):
    """Embed ``word`` as the mean of the model's final hidden states.

    Args:
        word: Text (a single word or a short phrase) to embed.
        tokenizer: Callable invoked as
            ``tokenizer(word, return_tensors='pt', truncation=True)``; its
            result is unpacked as keyword arguments into ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state`` averaged over dim 1, as a tensor.
    """
    # truncation=True caps over-long text at the tokenizer's maximum length
    # instead of letting the forward pass fail on it; inputs that already fit
    # are tokenized exactly as before.
    encoded = tokenizer(word, return_tensors='pt', truncation=True)
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoded)
    return outputs.last_hidden_state.mean(dim=1)


In [5]:
# Embed every candidate keyword a single time so the per-row comparisons can
# reuse the vectors instead of re-running the model on the same phrases.
candidate_embeddings = [
    get_word_embedding(candidate, tokenizer, model)
    for candidate in candidate_keywords
]

In [6]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two tensors as a plain Python number.

    The tensors are compared with ``torch.nn.functional.cosine_similarity``
    and the result is unwrapped with ``.item()``, so the comparison has to
    produce exactly one value.
    """
    score = torch.nn.functional.cosine_similarity(vec1, vec2)
    return score.item()

In [8]:
def _parse_keywords(raw):
    """Turn a stringified keyword list into a list of clean, non-empty strings."""
    if not isinstance(raw, str):
        # e.g. NaN from an empty CSV cell: there is nothing to parse.
        return []
    text = raw.strip()
    try:
        # Proper parsing keeps commas and apostrophes that live inside a keyword.
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, str):
        items = [parsed]
    elif isinstance(parsed, (list, tuple)):
        items = [str(item) for item in parsed]
    else:
        # Not a Python literal list (e.g. "a, b" or "[a, b]"): fall back to
        # the plain bracket/quote stripping and comma split.
        items = text.strip("[]").replace("'", "").split(",")
    stripped = (item.strip() for item in items)
    return [item for item in stripped if item]


def calculate_similarity(keywords, threshold=0.7):
    """Count the keywords that are similar to at least one candidate embedding.

    Args:
        keywords: String form of a keyword list, e.g. ``"['a', 'b']"``. A
            plain comma-separated string is accepted as well. Non-string
            values (such as NaN for a missing cell) are treated as "no
            keywords".
        threshold: Minimum cosine similarity for a keyword to count as a
            match. Defaults to 0.7.

    Returns:
        A ``(count, matching_keywords)`` tuple where ``matching_keywords`` is
        the list of whitespace-stripped keywords that matched and ``count``
        is its length.
    """
    matching_keywords = []
    for keyword in _parse_keywords(keywords):
        keyword_vec = get_word_embedding(keyword, tokenizer, model)
        # any() stops at the first candidate that clears the threshold, so a
        # keyword is counted at most once.
        if any(
            cosine_similarity(keyword_vec, candidate_vec) >= threshold
            for candidate_vec in candidate_embeddings
        ):
            matching_keywords.append(keyword)
    return len(matching_keywords), matching_keywords

In [10]:
# Read the extracted-keywords dataset from disk into a DataFrame
file_path = "/Users/faizanmulla/Desktop/MENG Project 2/CSV MENG 2/final_extracted_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)


In [11]:
def process_row(row):
    """Score one row and flatten its matches into a single string.

    Passes ``row.BusinessKeywords`` to ``calculate_similarity`` and returns
    the resulting count together with the matching keywords joined by
    ``', '``. Written as a one-argument function so it can be handed to
    ``executor.map``.
    """
    total, hits = calculate_similarity(row.BusinessKeywords)
    joined_hits = ', '.join(hits)
    return total, joined_hits

In [14]:
# Score the rows concurrently on a pool of 12 worker threads; the results are
# collected into a list before the pool is shut down.
with ThreadPoolExecutor(max_workers=12) as executor:
    results = [*executor.map(process_row, df.itertuples(index=False))]

In [15]:
# Update the dataframe with the results.
# Each column is built on its own rather than by unpacking zip(*results):
# with zero rows zip(*results) yields nothing, and unpacking it into two
# targets would raise ValueError.
df['EthicsCount'] = [count for count, _ in results]
df['ExtractedKeywords'] = [extracted for _, extracted in results]

# Save the updated dataframe to a new CSV file
output_file_path = "/Users/faizanmulla/Desktop/MENG Project 2/MEngProject2/Final_Data_Analysis_Ethics.csv"
df.to_csv(output_file_path, index=False)

print("Ethics Count and ExtractedKeywords columns added and CSV file saved.")

Ethics Count and ExtractedKeywords columns added and CSV file saved.
