In [31]:
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from transformers import BertModel, BertTokenizer
import torch
from scipy.spatial.distance import cosine

# Initialize KeyBERT with a specific SentenceTransformer model
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
kw_model = KeyBERT(model=sentence_model)

# Define candidate business-related keywords
candidate_keywords = [
    "marketing", "finance", "investment", "startup",
    "entrepreneurship", "management", "corporate", "economics",
    "venture capital", "market analysis", "business development",
    "commercialization", "innovation", "strategic planning"
]

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get word embedding using BERT
def get_word_embedding(word, tokenizer, model):
    inputs = tokenizer(word, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Precompute embeddings for candidate keywords
candidate_embeddings = [get_word_embedding(word, tokenizer, model) for word in candidate_keywords]

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Function to count business-related keywords in text using KeyBERT
def count_business_keywords(text):
    if pd.isna(text):
        return 0
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=50, diversity=0.4)
    count = 0
    for keyword, _ in keywords:
        keyword_vec = get_word_embedding(keyword, tokenizer, model)
        for business_word_vec in candidate_embeddings:
            similarity = cosine_similarity(keyword_vec, business_word_vec)
            if similarity >= 0.85:
                count += 1
                break  # Assuming each keyword is counted only once even if it matches multiple business keywords
    return count

# Define the file path of the updated CSV
updated_file_path = r'C:\Users\Lenovo\OneDrive-CornellUniversity\Desktop\Cornell Meng CS\MEng Project 2\updated_extracted_data.csv'

# Read the updated CSV file
data = pd.read_csv(updated_file_path)

# Add the "BusinessWords" column
data['BusinessWords'] = data['Description'].apply(count_business_keywords)

# Save the updated DataFrame to a new CSV file
output_file_path = r'C:\Users\Lenovo\OneDrive-CornellUniversity\Desktop\Cornell Meng CS\MEng Project 2\final_extracted_data.csv'
data.to_csv(output_file_path, index=False)

print(f"Updated CSV saved to {output_file_path}")

                                         Description  \
3                                                NaN   
4  Entertainment forms of political communication...   
5  Introduction to the theory of computational co...   
6  Punk Culture–comprised of music, fashion, lite...   

                                  CleanedDescription  BusinessWords  
3                                        EmptyString              0  
4  entertainment form political communication pop...              7  
5  introduction theory computational complexity b...             12  
6  punk culture comprise music fashion literature...              8  
