In [1]:
import torch
import pandas as pd
from transformers import BertModel, BertTokenizer
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Load the pretrained tokenizer and encoder from one shared checkpoint name
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)




In [3]:
# Reference vocabulary: each row keyword is compared against these terms
candidate_keywords = [
    "marketing",
    "finance",
    "investment",
    "startup",
    "entrepreneurship",
    "management",
    "corporate",
    "economics",
    "venture capital",
    "market analysis",
    "business development",
    "commercialization",
    "innovation",
    "strategic planning",
]

In [4]:
# Embedding helper: text in, mean-pooled tensor out
def get_word_embedding(word, tokenizer, model):
    """Return the mean of the model's last hidden state for ``word``.

    Args:
        word: Text to embed (a single word or a short phrase).
        tokenizer: Callable invoked as ``tokenizer(word, return_tensors='pt')``
            whose result is unpacked as keyword arguments into ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state`` averaged over dim 1. For a
        (batch, tokens, hidden) output this is one vector per input,
        averaged across all tokens -- presumably including any special
        tokens the tokenizer adds.
    """
    encoded = tokenizer(word, return_tensors='pt')
    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled


In [5]:
# Embed every candidate keyword once up front so per-row scoring can reuse them
candidate_embeddings = list(
    map(lambda candidate: get_word_embedding(candidate, tokenizer, model), candidate_keywords)
)

In [6]:
# Cosine similarity between two embedding tensors, as a plain Python float
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2`` as a float.

    The similarity is taken along dim 1, so the inputs are expected to be
    batched (e.g. shape ``(1, hidden)``). ``.item()`` requires the result to
    hold exactly one element, i.e. one pair of vectors per call.
    """
    score = torch.nn.functional.cosine_similarity(vec1, vec2, dim=1)
    return score.item()

In [7]:
# Function to calculate similarity for each row
def calculate_similarity(keywords, threshold=0.84):
    """Count the keywords of one row that are close to any candidate keyword.

    Args:
        keywords: String form of a keyword list, e.g. "['finance', 'biology']".
            Anything that is not a string (such as the NaN pandas produces for
            an empty CSV cell) is treated as "no keywords".
        threshold: Minimum cosine similarity against a candidate embedding for
            a keyword to count as a match. Defaults to 0.84.

    Returns:
        A ``(count, matching_keywords)`` tuple: the number of matched keywords
        and the matched keywords themselves, in input order.
    """
    # Missing cells arrive as float NaN / None; there is nothing to score.
    if not isinstance(keywords, str):
        return 0, []

    # Split the list-like string, then trim whitespace and surrounding quotes
    # from each item. Empty items (from "[]" or stray commas) are dropped so
    # they are never embedded or reported.
    parsed = []
    for raw in keywords.strip().strip("[]").split(","):
        cleaned = raw.strip().strip("'\"").strip()
        if cleaned:
            parsed.append(cleaned)

    matching_keywords = []
    for keyword in parsed:
        keyword_vec = get_word_embedding(keyword, tokenizer, model)
        # any() stops at the first candidate that clears the threshold, so a
        # keyword is counted at most once.
        if any(
            cosine_similarity(keyword_vec, business_word_vec) >= threshold
            for business_word_vec in candidate_embeddings
        ):
            matching_keywords.append(keyword)
    return len(matching_keywords), matching_keywords

In [8]:
# Load the CSV file
# NOTE(review): absolute, machine-specific path -- this cell only runs on a
# machine where that exact file exists.
file_path = "/Users/faizanmulla/Desktop/MENG Project 2/MEngProject2/final_extracted_data.csv"
df = pd.read_csv(file_path)

# Fail fast with a clear message: later cells read the BusinessKeywords column
# from every row, and a missing column would otherwise only surface as an
# AttributeError raised from inside the worker threads.
if "BusinessKeywords" not in df.columns:
    raise KeyError(
        "Expected a 'BusinessKeywords' column in " + file_path
        + "; found columns: " + ", ".join(map(str, df.columns))
    )


In [12]:
# Per-row worker: scores one dataframe row (meant to be mapped over the rows)
def process_row(row):
    """Score a single row's keywords.

    Args:
        row: Object exposing a ``BusinessKeywords`` attribute (for example a
            namedtuple produced by ``DataFrame.itertuples``).

    Returns:
        ``(count, matches)`` where ``count`` is the number of matching
        keywords and ``matches`` is those keywords joined with ``', '``.
    """
    hits, matched = calculate_similarity(row.BusinessKeywords)
    joined = ', '.join(matched)
    return hits, joined

In [13]:
# Score every row on a pool of 10 worker threads.
# Executor.map yields results in the same order as the input rows, so
# `results` lines up with the dataframe's rows.
row_tuples = df.itertuples(index=False)
with ThreadPoolExecutor(max_workers=10) as executor:
    results = [row_result for row_result in executor.map(process_row, row_tuples)]

In [14]:
# Update the dataframe with the results
# Each column is built explicitly instead of unpacking zip(*results):
# unpacking raises ValueError when `results` is empty (a CSV with no rows),
# whereas these comprehensions simply produce two empty columns.
df['BusinessCount'] = [count for count, _ in results]
df['ExtractedKeywords'] = [matched for _, matched in results]

# Save the updated dataframe to a new CSV file
# NOTE(review): absolute, machine-specific output path.
output_file_path = "/Users/faizanmulla/Desktop/MENG Project 2/MEngProject2/Final_Data_Analysis_Business.csv"
df.to_csv(output_file_path, index=False)

print("BusinessCount and ExtractedKeywords columns added and CSV file saved.")

BusinessCount and ExtractedKeywords columns added and CSV file saved.
