In [33]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import nltk

# Download NLTK data files (if not already downloaded)
nltk.download('stopwords')
nltk.download('wordnet')

# Load the existing list of capabilities
capabilities_df = pd.read_excel('/Users/aaryanshah/Oncampus-Job/NLP_Gal/data/List of capabilities.xlsx')

# Concatenate Category, Segment, Sub-Segment, and Technologies/Skills into a single string
capabilities_df['combined'] = capabilities_df[['Category', 'Segment', 'Sub-Segment', 'Technologies/Skills']].agg(' '.join, axis=1)
existing_capabilities = capabilities_df['combined'].tolist()

# Load the text data
file_path = "/Users/aaryanshah/Oncampus-Job/NLP_Gal/data/TrainingSet 1.xlsx"
df = pd.read_excel(file_path, engine='openpyxl')

# Drop duplicate rows
df = df.drop_duplicates()

# Function to clean text
def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Apply text cleaning function
df['cleaned_text'] = df['Target Business Description'].apply(clean_text)

# Preprocess the cleaned text data into lists of words
processed_text_data = df['cleaned_text'].apply(lambda x: simple_preprocess(x)).tolist()

# Train Word2Vec model
model = Word2Vec(sentences=processed_text_data, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")  # Save the model for later use (optional)

# Preprocess the existing capabilities
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_capability(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

preprocessed_existing_capabilities = [preprocess_capability(cap) for cap in existing_capabilities]

# Find new capabilities by comparing extracted features with existing capabilities
new_capabilities = [word for word in model.wv.index_to_key if word not in preprocessed_existing_capabilities]

# Use BERT embeddings for semantic similarity
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed existing capabilities and new capabilities
existing_capabilities_embeddings = bert_model.encode(preprocessed_existing_capabilities)
new_capabilities_embeddings = bert_model.encode(new_capabilities)

# Calculate cosine similarity with a higher threshold
similarity_threshold = 0.4  # Increase threshold to get more relevant capabilities
filtered_capabilities = []

for i, new_embedding in enumerate(new_capabilities_embeddings):
    similarity_scores = cosine_similarity(new_embedding.reshape(1, -1), existing_capabilities_embeddings)
    max_similarity = max(similarity_scores[0])
    if max_similarity < similarity_threshold:
        filtered_capabilities.append(new_capabilities[i])

print("Filtered Capabilities:", filtered_capabilities[:20])  # Displaying first 20 filtered capabilities for brevity

print("Length", len(filtered_capabilities))

# Get vectors for filtered capabilities
filtered_capabilities_vectors = bert_model.encode(filtered_capabilities)

# Cluster the filtered capabilities using DBSCAN
dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine')  # Adjust eps and min_samples as needed
clusters = dbscan.fit_predict(filtered_capabilities_vectors)

# Group the words by cluster
clustered_capabilities = {}
for i, word in enumerate(filtered_capabilities):
    cluster_label = clusters[i]
    if cluster_label not in clustered_capabilities:
        clustered_capabilities[cluster_label] = []
    clustered_capabilities[cluster_label].append(word)

# Output the clusters
print("Clusters of Filtered Capabilities:")
for label, cluster_keywords in clustered_capabilities.items():
    print(f"Cluster {label}: {cluster_keywords}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaryanshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aaryanshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Filtered Capabilities: ['located', 'company', 'service', 'founded', 'manufacture', 'product', 'natural', 'ltd', 'energy', 'sa', 'business', 'engaged', 'power', 'field', 'crude', 'provide', 'station', 'co', 'asset', 'operates']
Length 4515
Clusters of Filtered Capabilities:
Cluster 1: ['located', 'brazil', 'thailand', 'france', 'australia', 'spain', 'texas', 'south', 'bangkok', 'italy', 'india', 'norway', 'north', 'germany', 'argentina', 'canada', 'russian', 'uk', 'alberta', 'japan', 'indonesia', 'area', 'mexico', 'city', 'west', 'london', 'netherlands', 'san', 'africa', 'chile', 'state', 'east', 'calgary', 'china', 'austria', 'colombia', 'region', 'singapore', 'western', 'madrid', 'belgium', 'romania', 'colorado', 'america', 'california', 'thai', 'dhabi', 'portugal', 'houston', 'egypt', 'delaware', 'buenos', 'northern', 'turkey', 'asia', 'peru', 'central', 'korea', 'eastern', 'paris', 'oklahoma', 'york', 'papua', 'europe', 'la', 'tokyo', 'vietnam', 'emirate', 'rio', 'arab', 'vienna', '