<a href="https://colab.research.google.com/github/Aftabbs/Sentence-Similarity-Model-with-NLP-and-Transformers/blob/main/Sentence_Similarity_Transformers_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime; later cells change into a folder beneath it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Switch to the project folder on Drive so the workbooks can be opened by bare filename.
os.chdir(path='/content/drive/MyDrive/Colab Notebooks/Projects/Linktransformer')

### Transformers/NLP

In [None]:
pip install sentence_transformers



### Base Transformers Usage

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# Source workbooks: df1 holds the manager names, df2 the advisor names.
file_df1 = "ADV Unique Managers.xlsx"
file_df2 = "Euro Advisors.xlsx"
df1 = pd.read_excel(file_df1, sheet_name='Sheet1')
df2 = pd.read_excel(file_df2, sheet_name='Sheet1')

# Add a lower-cased, trimmed copy of each name column; the raw columns are
# left untouched because they are what the match table reports.
for frame, name_col in ((df1, 'Unique Manager'), (df2, 'Advisor')):
    frame['clean_name'] = frame[name_col].str.lower().str.strip()

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode managers first, then advisors (same order as before).
embeddings_df1, embeddings_df2 = (
    model.encode(frame['clean_name'].tolist(), convert_to_tensor=True)
    for frame in (df1, df2)
)

# Pairwise cosine similarity: one row per advisor (df2), one column per manager (df1).
cosine_scores = util.pytorch_cos_sim(embeddings_df2, embeddings_df1)

# Rank the managers for every advisor with one batched top-k call instead of
# one call (plus several .item() transfers) per row.
# k is clamped to the number of managers: torch.topk raises when k exceeds the
# dimension size, which made the old "fewer than two results" guards unreachable.
TOP_K = 2  # best match plus one runner-up
k = min(TOP_K, cosine_scores.shape[1])
top_scores, top_indices = torch.topk(cosine_scores, k=k, dim=1)
top_scores, top_indices = top_scores.tolist(), top_indices.tolist()

# Pull the display names out once; per-row .iloc lookups build a Series each time.
advisor_names = df2['Advisor'].tolist()
manager_names = df1['Unique Manager'].tolist()

matches = []
for advisor, idxs, scores in zip(advisor_names, top_indices, top_scores):
    ranked = [(manager_names[j], score) for j, score in zip(idxs, scores)]
    # Pad with (None, None) so every row has TOP_K (name, score) pairs.
    ranked += [(None, None)] * (TOP_K - len(ranked))
    matches.append((advisor, *(value for pair in ranked for value in pair)))

df_matches = pd.DataFrame(matches, columns=['Advisor', 'Best_Match_Manager', 'Best_Match_Score', 'Next_Closest_Manager', 'Next_Closest_Score'])



# df_matches.to_csv('matched_manager_advisor_with_next_closest.csv', index=False)




In [None]:
# Number of advisor rows in the match table.
df_matches.shape[0]

9175

In [None]:
# Rows whose best-match score is strictly above 0.80.
df_matches.loc[df_matches['Best_Match_Score'] > 0.80]

Unnamed: 0,Advisor,Best_Match_Manager,Best_Match_Score,Next_Closest_Manager,Next_Closest_Score
12,AKTSIASELTS TRIGON ASSET MANAGEMENT,AS TRIGON ASSET MANAGEMENT,0.849886,TRIENT ASSET MANAGEMENT AS,0.760598
14,ALEXANDRIA,ALEXANDRIA CAPITAL,0.858322,ALEXANDRIA CAPITAL MANAGEMENT,0.688048
20,ALTAN CAPITAL SGIIC SA,ALTA VIA CAPITAL,0.817855,"ALTAI CAPITAL MANAGEMENT, L.P.",0.766206
21,ALTUS,ALTUS,1.000000,ALTI,0.819256
24,ANTIN INFRASTRUCTURE PARTNERS,ANTIN INFRASTRUCTURE PARTNERS,1.000000,TACTICAL INFRASTRUCTURE PARTNERS,0.628234
...,...,...,...,...,...
9156,WAYSTONE ASSET MANAGEMENT IE LTD,WAYSTONE FUND MANAGEMENT IE,0.871629,WAYSTONE ADVISORS,0.828751
9157,WAYSTONE INVESTMENT MANAGEMENT (IE) LTD,WAYSTONE FUND MANAGEMENT IE,0.855825,WAYSTONE ADVISORS,0.777870
9163,CRESCA INVESTMENT ADVISORY LLC,"CR INVESTMENT ADVISORS, LLC",0.821460,CRA INVESTMENTS,0.786869
9170,CCB INTERNATIONAL ASSET MANAGEMENT LTD,"CCG ASSET MANAGEMENT, LLC",0.807297,CCB SECURITIES,0.784094


### Tune/Enhancement

In [None]:
# empty_cache() only hands back cached blocks that no live tensor occupies, so
# first drop the notebook's references to the large score matrices from the
# previous section. `row` is included because it is a view that keeps the whole
# score matrix alive. The following section recomputes all of these.
cosine_scores = tfidf_cosine_scores = combined_scores = row = None
embeddings_df1 = embeddings_df2 = None
torch.cuda.empty_cache()

In [None]:
# memory_summary() returns the report as one multi-line string; print it so the
# table renders instead of appearing as an escaped repr. Only ask for it when a
# GPU is actually present.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("CUDA is not available; no GPU memory summary to show.")



In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs_tfidf

# Fetch the NLTK stop-word corpus, then build the English lookup set that
# preprocess_text filters against.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise a firm name before embedding / TF-IDF matching.

    Missing values (``None`` or a float NaN, as pandas typically yields for
    blank cells) return an empty string instead of being stringified into the
    tokens ``"none"`` / ``"nan"``, which could otherwise match one another.
    Any other non-string value is converted with ``str()``.

    The text then has every character that is neither a word character nor
    whitespace removed, is lower-cased, and has words found in the module-level
    ``stop_words`` set dropped.

    Args:
        text: Raw cell value (usually a string).

    Returns:
        str: The remaining words joined by single spaces; may be empty.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    # NOTE(review): English stop-word removal can also strip distinguishing
    # words from firm names (e.g. "up", "most", "what", "if"), leaving different
    # firms with identical cleaned names - confirm this is intended.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Release score matrices left over from an earlier run before building new
# ones; otherwise old and new matrices sit in GPU memory at the same time.
# `row` is a view that would keep an old matrix alive.
cosine_scores = tfidf_cosine_scores = combined_scores = row = None

# Source workbooks: df1 holds the manager names, df2 the advisor names.
file_df1 = "ADV Unique Managers.xlsx"
file_df2 = "Euro Advisors.xlsx"

df1 = pd.read_excel(file_df1, sheet_name='Sheet1')
df2 = pd.read_excel(file_df2, sheet_name='Sheet1')

df1['clean_name'] = df1['Unique Manager'].apply(preprocess_text)
df2['clean_name'] = df2['Advisor'].apply(preprocess_text)

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings_df1 = model.encode(df1['clean_name'].tolist(), convert_to_tensor=True)
embeddings_df2 = model.encode(df2['clean_name'].tolist(), convert_to_tensor=True)

# Semantic similarity: one row per advisor (df2), one column per manager (df1).
cosine_scores = util.pytorch_cos_sim(embeddings_df2, embeddings_df1)

# Lexical similarity. The TF-IDF vocabulary is fitted on manager names only, so
# advisor words that never occur in a manager name are ignored by transform().
vectorizer = TfidfVectorizer()
tfidf_matrix_df1 = vectorizer.fit_transform(df1['clean_name'])
tfidf_matrix_df2 = vectorizer.transform(df2['clean_name'])
tfidf_cosine_scores = cs_tfidf(tfidf_matrix_df2, tfidf_matrix_df1)

device = embeddings_df1.device
cosine_scores = cosine_scores.to(device)
# The TF-IDF scores arrive as a NumPy array (float64 by default). Cast them to
# the embedding scores' dtype on the CPU before moving them to the device, so
# the sum is not promoted to float64 and GPU memory use is not doubled.
tfidf_cosine_scores = torch.from_numpy(tfidf_cosine_scores).to(cosine_scores.dtype).to(device)

# Unweighted mean of the two signals; div_ halves the sum in place to avoid a
# second full-size temporary.
combined_scores = (cosine_scores + tfidf_cosine_scores).div_(2)

# Rank the managers for every advisor with one batched top-k call instead of
# one call (plus several .item() transfers) per row.
# k is clamped to the number of managers: torch.topk raises when k exceeds the
# dimension size, which made the old "fewer than two results" guards unreachable.
TOP_K = 2  # best match plus one runner-up
k = min(TOP_K, combined_scores.shape[1])
top_scores, top_indices = torch.topk(combined_scores, k=k, dim=1)
top_scores, top_indices = top_scores.tolist(), top_indices.tolist()

# Pull the display names out once; per-row .iloc lookups build a Series each time.
advisor_names = df2['Advisor'].tolist()
manager_names = df1['Unique Manager'].tolist()

matches = []
for advisor, idxs, scores in zip(advisor_names, top_indices, top_scores):
    ranked = [(manager_names[j], score) for j, score in zip(idxs, scores)]
    # Pad with (None, None) so every row has TOP_K (name, score) pairs.
    ranked += [(None, None)] * (TOP_K - len(ranked))
    matches.append((advisor, *(value for pair in ranked for value in pair)))

df_matches = pd.DataFrame(matches, columns=['Advisor', 'Best_Match_Manager', 'Best_Match_Score', 'Next_Closest_Manager', 'Next_Closest_Score'])

# df_matches.to_csv('advisor_manager_matches_enhanced.csv', index=False)


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Number of advisor rows in the match table.
df_matches.shape[0]

9175

In [None]:
# Rows whose best-match score is at least 0.8.
df_matches.loc[df_matches['Best_Match_Score'].ge(.8)]

Unnamed: 0,Advisor,Best_Match_Manager,Best_Match_Score,Next_Closest_Manager,Next_Closest_Score
12,AKTSIASELTS TRIGON ASSET MANAGEMENT,AS TRIGON ASSET MANAGEMENT,0.915792,TRIENT ASSET MANAGEMENT AS,0.475989
14,ALEXANDRIA,ALEXANDRIA CAPITAL,0.899158,ALEXANDRIA CAPITAL MANAGEMENT,0.808675
21,ALTUS,ALTUS,1.000000,ALTUS CAPITAL INC.,0.789961
24,ANTIN INFRASTRUCTURE PARTNERS,ANTIN INFRASTRUCTURE PARTNERS,1.000000,TACTICAL INFRASTRUCTURE PARTNERS,0.520319
27,APRIN INVEST,INVEST BY DOING,0.890486,INVEST CAPITAL,0.817566
...,...,...,...,...,...
9122,RICHTWERT FUNDS,UP FUNDS,0.862806,LL FUNDS,0.862806
9143,GALILEE ASSET MANAGEMENT SA,SA ASSET MANAGEMENT,0.842598,CAPITAL FUND MANAGEMENT S.A.,0.628884
9156,WAYSTONE ASSET MANAGEMENT IE LTD,WAYSTONE FUND MANAGEMENT IE,0.823756,WAYSTONE ADVISORS,0.629607
9157,WAYSTONE INVESTMENT MANAGEMENT (IE) LTD,WAYSTONE FUND MANAGEMENT IE,0.849431,WAYSTONE ADVISORS,0.638106


### Different Models Exploration

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs_tfidf

# Fetch the NLTK stop-word corpus, then build the English lookup set that
# preprocess_text filters against.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise a firm name before embedding / TF-IDF matching.

    Missing values (``None`` or a float NaN, as pandas typically yields for
    blank cells) return an empty string instead of being stringified into the
    tokens ``"none"`` / ``"nan"``, which could otherwise match one another.
    Any other non-string value is converted with ``str()``.

    Args:
        text: Raw cell value (usually a string).

    Returns:
        str: The remaining words joined by single spaces; may be empty.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    # NOTE(review): English stop-word removal can also strip distinguishing
    # words from firm names (e.g. "up", "most", "what", "if"), leaving different
    # firms with identical cleaned names - confirm this is intended.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Release score matrices left over from an earlier run before building new
# ones; otherwise old and new matrices sit in GPU memory at the same time.
# `row` is a view that would keep an old matrix alive.
cosine_scores = tfidf_cosine_scores = combined_scores = row = None

# Load datasets from Excel files
file_df1 = "ADV Unique Managers.xlsx"
file_df2 = "Euro Advisors.xlsx"

df1 = pd.read_excel(file_df1, sheet_name='Sheet1')
df2 = pd.read_excel(file_df2, sheet_name='Sheet1')

# Preprocess names
df1['clean_name'] = df1['Unique Manager'].apply(preprocess_text)
df2['clean_name'] = df2['Advisor'].apply(preprocess_text)

# Load the plain BERT base (uncased) checkpoint as the encoder.
# NOTE(review): this is a general-purpose BERT checkpoint rather than a model
# published for sentence embeddings - confirm it is the intended comparison.
model = SentenceTransformer('google-bert/bert-base-uncased')

# Compute embeddings
embeddings_df1 = model.encode(df1['clean_name'].tolist(), convert_to_tensor=True)
embeddings_df2 = model.encode(df2['clean_name'].tolist(), convert_to_tensor=True)

# Compute cosine similarity for embeddings (rows: advisors, columns: managers)
cosine_scores = util.pytorch_cos_sim(embeddings_df2, embeddings_df1)

# Compute TF-IDF cosine similarity. The vocabulary is fitted on manager names
# only, so advisor words absent from it are ignored by transform().
vectorizer = TfidfVectorizer()
tfidf_matrix_df1 = vectorizer.fit_transform(df1['clean_name'])
tfidf_matrix_df2 = vectorizer.transform(df2['clean_name'])
tfidf_cosine_scores = cs_tfidf(tfidf_matrix_df2, tfidf_matrix_df1)

# Ensure all tensors are on the same device
device = embeddings_df1.device
cosine_scores = cosine_scores.to(device)
# The TF-IDF scores arrive as a NumPy array (float64 by default). Cast them to
# the embedding scores' dtype on the CPU before moving them to the device, so
# the sum is not promoted to float64 and GPU memory use is not doubled.
tfidf_cosine_scores = torch.from_numpy(tfidf_cosine_scores).to(cosine_scores.dtype).to(device)

# Combine similarity scores with an unweighted mean; div_ halves the sum in
# place to avoid a second full-size temporary.
combined_scores = (cosine_scores + tfidf_cosine_scores).div_(2)

# Determine matches
# Rank the managers for every advisor with one batched top-k call instead of
# one call (plus several .item() transfers) per row.
# k is clamped to the number of managers: torch.topk raises when k exceeds the
# dimension size, which made the old "fewer than two results" guards unreachable.
TOP_K = 2  # best match plus one runner-up
k = min(TOP_K, combined_scores.shape[1])
top_scores, top_indices = torch.topk(combined_scores, k=k, dim=1)
top_scores, top_indices = top_scores.tolist(), top_indices.tolist()

# Pull the display names out once; per-row .iloc lookups build a Series each time.
advisor_names = df2['Advisor'].tolist()
manager_names = df1['Unique Manager'].tolist()

matches = []
for advisor, idxs, scores in zip(advisor_names, top_indices, top_scores):
    ranked = [(manager_names[j], score) for j, score in zip(idxs, scores)]
    # Pad with (None, None) so every row has TOP_K (name, score) pairs.
    ranked += [(None, None)] * (TOP_K - len(ranked))
    matches.append((advisor, *(value for pair in ranked for value in pair)))

# Convert matches to DataFrame
df_matches = pd.DataFrame(matches, columns=['Advisor', 'Best_Match_Manager', 'Best_Match_Score', 'Next_Closest_Manager', 'Next_Closest_Score'])

# Output or further process df_matches as needed
# df_matches.to_csv('advisor_manager_matches_enhanced.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.53 GiB. GPU 

In [None]:
# Spot-check a random handful of matches scoring above 0.8. The seed makes the
# displayed rows reproducible, and the sample size is capped because
# DataFrame.sample raises when asked for more rows than exist.
high_confidence = df_matches[df_matches['Best_Match_Score'] > .8]
high_confidence.sample(n=min(10, len(high_confidence)), random_state=42)

Unnamed: 0,Advisor,Best_Match_Manager,Best_Match_Score,Next_Closest_Manager,Next_Closest_Score
2497,DIFFUSION CAPITAL PARTNERS,T CAPITAL PARTNERS,0.89789,HAVEN CAPITAL PARTNERS,0.89789
5195,EXANE ASSET MANAGEMENT SA,SA ASSET MANAGEMENT,0.905552,CAPITAL FUND MANAGEMENT S.A.,0.711315
1044,ARCADIA BEFEKTETESI,ARCADIA INVESTMENT PARTNERS,0.81085,ARCADIA WEALTH MANAGEMENT,0.80644
3969,LIVELIHOODS FUNDS,UP FUNDS,0.872686,LL FUNDS,0.872686
6011,HOLD ALAPKEZELO INVESTMENT FUND MANAGEMENT,FUND MANAGEMENT,0.813798,F & H FUND MANAGEMENT,0.792913
1835,SFM STOCKHOLM,"SFM, LLC",0.810035,SFM FINANCIAL ADVISORS,0.738392
2211,SENEVE CAPITAL,MOST CAPITAL,0.846572,WHAT IF CAPITAL,0.846572
4112,BNP PARIBAS ASSET MANAGEMENT MALAYSIA SDN BHD,BNP PARIBAS ASSET MANAGEMENT GROUP,0.896727,BNP GROUP,0.552474
8364,SG ASSET MANAGEMENT LTD,SG CAPITAL MANAGEMENT LLC,0.815835,SG WEALTH MANAGERS LLC,0.706798
2981,KAROLL CAPITAL MANAGEMENT EAD,CAPITAL Q MANAGEMENT,0.831042,E CAPITAL MANAGEMENT,0.824034


### Additional Match Columns

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs_tfidf

# Fetch the NLTK stop-word corpus, then build the English lookup set that
# preprocess_text filters against.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise a firm name before embedding / TF-IDF matching.

    Missing values (``None`` or a float NaN, as pandas typically yields for
    blank cells) return an empty string instead of being stringified into the
    tokens ``"none"`` / ``"nan"``, which could otherwise match one another.
    Any other non-string value is converted with ``str()``.

    The text then has every character that is neither a word character nor
    whitespace removed, is lower-cased, and has words found in the module-level
    ``stop_words`` set dropped.

    Args:
        text: Raw cell value (usually a string).

    Returns:
        str: The remaining words joined by single spaces; may be empty.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    # NOTE(review): English stop-word removal can also strip distinguishing
    # words from firm names (e.g. "up", "most", "what", "if"), leaving different
    # firms with identical cleaned names - confirm this is intended.
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Release score matrices left over from an earlier run before building new
# ones; otherwise old and new matrices sit in GPU memory at the same time.
# `row` is a view that would keep an old matrix alive.
cosine_scores = tfidf_cosine_scores = combined_scores = row = None

# Source workbooks: df1 holds the manager names, df2 the advisor names.
file_df1 = "ADV Unique Managers.xlsx"
file_df2 = "Euro Advisors.xlsx"

df1 = pd.read_excel(file_df1, sheet_name='Sheet1')
df2 = pd.read_excel(file_df2, sheet_name='Sheet1')

df1['clean_name'] = df1['Unique Manager'].apply(preprocess_text)
df2['clean_name'] = df2['Advisor'].apply(preprocess_text)

model = SentenceTransformer('sentence-transformers/LaBSE')

embeddings_df1 = model.encode(df1['clean_name'].tolist(), convert_to_tensor=True)
embeddings_df2 = model.encode(df2['clean_name'].tolist(), convert_to_tensor=True)

# Semantic similarity: one row per advisor (df2), one column per manager (df1).
cosine_scores = util.pytorch_cos_sim(embeddings_df2, embeddings_df1)

# Lexical similarity. The TF-IDF vocabulary is fitted on manager names only, so
# advisor words that never occur in a manager name are ignored by transform().
vectorizer = TfidfVectorizer()
tfidf_matrix_df1 = vectorizer.fit_transform(df1['clean_name'])
tfidf_matrix_df2 = vectorizer.transform(df2['clean_name'])
tfidf_cosine_scores = cs_tfidf(tfidf_matrix_df2, tfidf_matrix_df1)

device = embeddings_df1.device
cosine_scores = cosine_scores.to(device)
# The TF-IDF scores arrive as a NumPy array (float64 by default). Cast them to
# the embedding scores' dtype on the CPU before moving them to the device, so
# the sum is not promoted to float64 and GPU memory use is not doubled.
tfidf_cosine_scores = torch.from_numpy(tfidf_cosine_scores).to(cosine_scores.dtype).to(device)

# Unweighted mean of the two signals; div_ halves the sum in place to avoid a
# second full-size temporary.
combined_scores = (cosine_scores + tfidf_cosine_scores).div_(2)

# Rank the managers for every advisor with one batched top-k call instead of
# one call (plus several .item() transfers) per row.
# k is clamped to the number of managers: torch.topk raises when k exceeds the
# dimension size, which made the old "fewer than three results" guards unreachable.
TOP_K = 3  # best match plus two runners-up
k = min(TOP_K, combined_scores.shape[1])
top_scores, top_indices = torch.topk(combined_scores, k=k, dim=1)
top_scores, top_indices = top_scores.tolist(), top_indices.tolist()

# Pull the display names out once; per-row .iloc lookups build a Series each time.
advisor_names = df2['Advisor'].tolist()
manager_names = df1['Unique Manager'].tolist()

matches = []
for advisor, idxs, scores in zip(advisor_names, top_indices, top_scores):
    ranked = [(manager_names[j], score) for j, score in zip(idxs, scores)]
    # Pad with (None, None) so every row has TOP_K (name, score) pairs.
    ranked += [(None, None)] * (TOP_K - len(ranked))
    matches.append((advisor, *(value for pair in ranked for value in pair)))

df_matches = pd.DataFrame(matches, columns=[
    'Advisor',
    'Best_Match_Manager',
    'Best_Match_Score',
    'Next_Closest_Manager_1',
    'Next_Closest_Score_1',
    'Next_Closest_Manager_2',
    'Next_Closest_Score_2'
])

# Persist the table; the path is relative to the current working directory.
df_matches.to_csv('advisor_manager_matches_enhanced.csv', index=False)

# print(df_matches.head())


Recommended Models:
XLM-R (XLM-RoBERTa)

Description: XLM-R is a multilingual variant of RoBERTa, trained on 100 languages. It is known for its strong performance on a wide range of tasks and languages.
Model Name: xlm-roberta-base or xlm-roberta-large
Usage: Good for tasks involving multiple languages and complex text understanding.
mBERT (Multilingual BERT)

Description: mBERT is a multilingual version of BERT, trained on the 104 languages with the largest Wikipedias. It is designed to work well across a variety of languages.
Model Name: bert-base-multilingual-cased or bert-base-multilingual-uncased
Usage: Suitable for general multilingual tasks.
LaBSE (Language-agnostic BERT Sentence Embedding)

Description: LaBSE is designed specifically for producing high-quality sentence embeddings across a wide variety of languages.
Model Name: sentence-transformers/LaBSE
Usage: Excellent for multilingual sentence embeddings and cross-lingual tasks.
MUSE (Multilingual Universal Sentence Encoder)

Description: MUSE is developed by Google and is capable of generating embeddings for multiple languages. It focuses on providing universal sentence representations.
Model Name: google/universal-sentence-encoder-multilingual (published on TensorFlow Hub)
Usage: Great for tasks requiring universal sentence representations.