In [22]:
# Import required libraries
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def first_word_penalty(name1, name2):
    """Apply a penalty if the first two words are completely different"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    if len(tokens1) > 1 and len(tokens2) > 1 and tokens1[:2] != tokens2[:2]:
        return 0.2  # Apply a penalty if the first two words do not match
    return 0.0

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric combining multiple approaches"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(tokens1, tokens2)
    subset_bonus = subset_match(name1, name2)
    penalty = first_word_penalty(name1, name2)
    
    hybrid_score = (seq_sim * 0.3 + jaccard_sim * 0.4 + subset_bonus * 0.3) * (1 - penalty)
    return hybrid_score

def compare_names(df, threshold=0.7):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    
    return pd.DataFrame(matches)

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east",
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine services", 
    "almuhit alhadi marine services",
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")


Matching Results:
                           Name 1               Name 2  Similarity
0        litasco middle east dmcc  litasco middle east       0.865
1  star voyages shipping services         star voyages       0.671


In [33]:
# Import required libraries
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 0.5  # Higher importance for earlier words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 1.5  # Increase weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric combining multiple approaches"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    if len(tokens1) < 3 or len(tokens2) < 3:
        return sequence_similarity(name1_clean, name2_clean)  # Simple similarity for short names
    
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    
    hybrid_score = (seq_sim * 0.2 + jaccard_sim * 0.3 + subset_bonus * 0.2 + position_bonus * 0.3)
    return hybrid_score

def compare_names(df, threshold=0.7):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    
    return pd.DataFrame(matches)

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east",
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine services", 
    "almuhit alhadi marine services",
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")


Matching Results:
                             Name 1                            Name 2  \
0          litasco middle east dmcc               litasco middle east   
7                    vladimir putin          president vladimir putin   
5    star voyages shipping services                      star voyages   
4  alqutb alshamali marine services    almuhit alhadi marine services   
2          one moon marine services  alqutb alshamali marine services   
3          one moon marine services    almuhit alhadi marine services   
6                    vladimir putin                             putin   
1                    dynamik trader                     nari strength   

   Similarity  
0       0.902  
7       0.737  
5       0.571  
4       0.568  
2       0.529  
3       0.526  
6       0.526  
1       0.519  


In [34]:
matches.sort_values('Similarity', ascending=False)

Unnamed: 0,Name 1,Name 2,Similarity
0,litasco middle east dmcc,litasco middle east,0.902
7,vladimir putin,president vladimir putin,0.737
5,star voyages shipping services,star voyages,0.571
4,alqutb alshamali marine services,almuhit alhadi marine services,0.568
2,one moon marine services,alqutb alshamali marine services,0.529
3,one moon marine services,almuhit alhadi marine services,0.526
6,vladimir putin,putin,0.526
1,dynamik trader,nari strength,0.519


In [47]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric combining multiple approaches with specific focus for long names"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # If both names have less than 3 tokens, apply sequence similarity directly
    if len(tokens1) < 3 or len(tokens2) < 3:
        return sequence_similarity(name1_clean, name2_clean)  # Simple similarity for short names
    
    # Calculate Jaccard, Sequence, Position, and Common Word Bonus
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Adjust sequence similarity: If few common words, reduce its weight
    if common_bonus < 0.2:  # Low commonality between words
        seq_sim *= 0.4  # Reduce the influence of sequence similarity
    
    # Hybrid scoring with reduced penalty for dissimilar names
    hybrid_score = (seq_sim * 0.2 + jaccard_sim * 0.3 + subset_bonus * 0.1 + position_bonus * 0.2 + common_bonus * 0.2)
    
    # If very low common words, apply a more significant penalty
    if common_bonus < 0.2:
        hybrid_score *= 0.4  # Apply stronger penalty for minimal common words
    
    return hybrid_score

def compare_names(df, threshold=0.5):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    
    return pd.DataFrame(matches)

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east",
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")

Matching Results:
                           Name 1                          Name 2  Similarity
0        litasco middle east dmcc             litasco middle east       0.852
5                  vladimir putin        president vladimir putin       0.737
3  star voyages shipping services                    star voyages       0.571
2        one moon marine services  almuhit alhadi marine services       0.526
4                  vladimir putin                           putin       0.526
1                  dynamik trader                   nari strength       0.519


In [49]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric with stronger penalties for dissimilar names"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # Immediately return very low score if no common words at all
    common_tokens = set(tokens1).intersection(set(tokens2))
    if len(common_tokens) == 0:
        return 0.1  # Very low baseline score for completely different names
    
    # If both names have less than 3 tokens, apply sequence similarity directly
    if len(tokens1) < 3 or len(tokens2) < 3:
        return sequence_similarity(name1_clean, name2_clean)
    
    # Calculate Jaccard, Sequence, Position, and Common Word Bonus
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Adjust sequence similarity: If few common words, reduce its weight more aggressively
    if common_bonus < 0.3:  # Low commonality between words
        seq_sim *= 0.3  # Reduce the influence of sequence similarity more
    
    # Hybrid scoring with stronger penalty for dissimilar names
    hybrid_score = (seq_sim * 0.2 + jaccard_sim * 0.3 + subset_bonus * 0.1 + position_bonus * 0.2 + common_bonus * 0.2)
    
    # Apply stronger penalties for minimal common words
    if common_bonus < 0.3:
        hybrid_score *= 0.5
    
    return hybrid_score

def compare_names(df, threshold=0.6):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    
    return pd.DataFrame(matches)

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc", #
    "litasco middle east", #
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", # 
    "uae shipping association", 
    "star voyages", #
    "vladimir putin", # 
    "putin", #
    "president vladimir putin" #
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)  # Increased threshold for more precision

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")

Matching Results:
                           Name 1                          Name 2  Similarity
0        litasco middle east dmcc             litasco middle east       0.852
4                  vladimir putin        president vladimir putin       0.737
2  star voyages shipping services                    star voyages       0.571
1        one moon marine services  almuhit alhadi marine services       0.526
3                  vladimir putin                           putin       0.526


In [62]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric balancing common industry terms with unique identifiers"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # Slightly reduced list of common industry terms
    common_industry_terms = {
        'marine', 'services', 'shipping', 'association', 'inc', 'ltd', 'llc',
        'corp', 'company', 'co', 'group', 'international', 'global', 'trader',
        'trading', 'logistics', 'transport', 'agency', 'middle', 'east', 'voyages'
    }
    
    # Calculate standard similarity metrics
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Calculate meaningful common words (excluding common industry terms)
    meaningful_tokens1 = set(t for t in tokens1 if t not in common_industry_terms)
    meaningful_tokens2 = set(t for t in tokens2 if t not in common_industry_terms)
    meaningful_common = meaningful_tokens1.intersection(meaningful_tokens2)
    
    # Calculate common industry terms
    common_tokens = set(tokens1).intersection(set(tokens2))
    industry_common = set(t for t in common_tokens if t in common_industry_terms)
    
    # Apply specific penalty for the "one moon/almuhit alhadi" case
    # If they only share industry terms and have different distinctive words
    if common_tokens and len(industry_common) > 0 and len(meaningful_common) == 0:
        # Only reduce score for specific case, not eliminate completely
        seq_sim *= 0.5
        jaccard_sim *= 0.5
        position_bonus *= 0.5
    
    # Standard hybrid scoring 
    hybrid_score = (seq_sim * 0.25 + jaccard_sim * 0.3 + subset_bonus * 0.15 + 
                   position_bonus * 0.15 + common_bonus * 0.15)
    
    # Apply a less aggressive penalty for specific case
    if common_tokens and len(industry_common) / len(common_tokens) > 0.5 and len(meaningful_common) == 0:
        hybrid_score *= 0.7  # Moderate penalty
    
    return hybrid_score

def compare_names(df, threshold=0.6):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    
    return pd.DataFrame(matches)

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east",
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.1)  # You can adjust threshold as needed

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")

Matching Results:
                            Name 1                          Name 2  Similarity
0         litasco middle east dmcc             litasco middle east       0.858
10  star voyages shipping services                    star voyages       0.668
12                  vladimir putin        president vladimir putin       0.634
11                  vladimir putin                           putin       0.507
13                           putin        president vladimir putin       0.386
6         one moon marine services  almuhit alhadi marine services       0.195
8          alqutb alshamali marine  almuhit alhadi marine services       0.133
3                   dynamik trader                   nari strength       0.130
2              litasco middle east  almuhit alhadi marine services       0.112
5         one moon marine services         alqutb alshamali marine       0.111
7         one moon marine services  star voyages shipping services       0.110
9   almuhit alhadi marine services

In [65]:
common_industry_terms = {
    # Company types and legal entities
    'inc', 'ltd', 'llc', 'corp', 'corporation', 'company', 'co', 'group', 'holdings', 
    'enterprises', 'international', 'global', 'worldwide', 'national', 'incorporated',
    'limited', 'partners', 'partnership', 'gmbh', 'srl', 'sa', 'ag', 'bv', 'pte',
    
    # Industry descriptors
    'marine', 'services', 'shipping', 'logistics', 'transport', 'transportation', 
    'cargo', 'freight', 'forwarding', 'trading', 'trader', 'import', 'export',
    'commercial', 'business', 'industrial', 'industries', 'solutions', 'systems',
    
    # Maritime specific
    'shipping', 'maritime', 'sea', 'ocean', 'port', 'harbor', 'vessel', 'boat',
    'ship', 'tanker', 'carrier', 'fleet', 'navigation', 'offshore', 'voyages',
    
    # General business terms
    'association', 'agency', 'bureau', 'center', 'centre', 'office', 'department',
    'division', 'unit', 'management', 'consulting', 'consultancy', 'advisory',
    'resources', 'operations', 'ventures', 'investment', 'investments',
    
    # Geographical/regional indicators
    'middle', 'east', 'west', 'north', 'south', 'central', 'eastern', 'western',
    'northern', 'southern', 'regional', 'local', 'international', 'gulf', 'asia',
    'european', 'american', 'africa', 'pacific', 'atlantic',
    
    # Common words in company names
    'the', 'and', 'of', 'for', 'to', 'by', 'on', 'with', 'in', 'al', 'el'
}

import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric balancing common industry terms with unique identifiers"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # Slightly reduced list of common industry terms
    
    # Calculate standard similarity metrics
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Calculate meaningful common words (excluding common industry terms)
    meaningful_tokens1 = set(t for t in tokens1 if t not in common_industry_terms)
    meaningful_tokens2 = set(t for t in tokens2 if t not in common_industry_terms)
    meaningful_common = meaningful_tokens1.intersection(meaningful_tokens2)
    
    # Calculate common industry terms
    common_tokens = set(tokens1).intersection(set(tokens2))
    industry_common = set(t for t in common_tokens if t in common_industry_terms)
    
    # Apply specific penalty for the "one moon/almuhit alhadi" case
    # If they only share industry terms and have different distinctive words
    if common_tokens and len(industry_common) > 0 and len(meaningful_common) == 0:
        # Only reduce score for specific case, not eliminate completely
        seq_sim *= 0.5
        jaccard_sim *= 0.5
        position_bonus *= 0.5
    
    # Standard hybrid scoring 
    hybrid_score = (seq_sim * 0.25 + jaccard_sim * 0.3 + subset_bonus * 0.15 + 
                   position_bonus * 0.15 + common_bonus * 0.15)
    
    # Apply a less aggressive penalty for specific case
    if common_tokens and len(industry_common) / len(common_tokens) > 0.5 and len(meaningful_common) == 0:
        hybrid_score *= 0.7  # Moderate penalty
    
    return hybrid_score

def compare_names(df, threshold=0.6):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    print(matches)
    return pd.DataFrame(matches)

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east"
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin",

    

    "John Smith",
    "Jon Smith",
    "Dr. John Smith",
    "John J. Smith",
    "Jonathan Smith",
    "Johnny Smith",
    "Sarah Jones",
    "Michael Johnson"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)  # You can adjust threshold as needed

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")

[{'Name 1': 'litasco middle east dmcc', 'Name 2': 'litasco middle eastdynamik trader', 'Similarity': 0.509}, {'Name 1': 'star voyages shipping services', 'Name 2': 'star voyages', 'Similarity': 0.668}, {'Name 1': 'vladimir putin', 'Name 2': 'putin', 'Similarity': 0.507}, {'Name 1': 'vladimir putin', 'Name 2': 'president vladimir putin', 'Similarity': 0.634}, {'Name 1': 'John Smith', 'Name 2': 'Jon Smith', 'Similarity': 0.532}, {'Name 1': 'John Smith', 'Name 2': 'Dr. John Smith', 'Similarity': 0.667}, {'Name 1': 'John Smith', 'Name 2': 'John J. Smith', 'Similarity': 0.827}, {'Name 1': 'John Smith', 'Name 2': 'Jonathan Smith', 'Similarity': 0.503}, {'Name 1': 'John Smith', 'Name 2': 'Johnny Smith', 'Similarity': 0.522}, {'Name 1': 'Jon Smith', 'Name 2': 'Johnny Smith', 'Similarity': 0.509}, {'Name 1': 'Dr. John Smith', 'Name 2': 'John J. Smith', 'Similarity': 0.546}]
Matching Results:
                            Name 1                             Name 2  \
6                       John Sm

In [None]:
common_industry_terms = {
    # Company types and legal entities
    'inc', 'ltd', 'llc', 'corp', 'corporation', 'company', 'co', 'group', 'holdings', 
    'enterprises', 'international', 'global', 'worldwide', 'national', 'incorporated',
    'limited', 'partners', 'partnership', 'gmbh', 'srl', 'sa', 'ag', 'bv', 'pte',
    
    # Industry descriptors
    'marine', 'services', 'shipping', 'logistics', 'transport', 'transportation', 
    'cargo', 'freight', 'forwarding', 'trading', 'trader', 'import', 'export',
    'commercial', 'business', 'industrial', 'industries', 'solutions', 'systems',
    
    # Maritime specific
    'shipping', 'maritime', 'sea', 'ocean', 'port', 'harbor', 'vessel', 'boat',
    'ship', 'tanker', 'carrier', 'fleet', 'navigation', 'offshore', 'voyages',
    
    # General business terms
    'association', 'agency', 'bureau', 'center', 'centre', 'office', 'department',
    'division', 'unit', 'management', 'consulting', 'consultancy', 'advisory',
    'resources', 'operations', 'ventures', 'investment', 'investments',
    
    # Geographical/regional indicators
    'middle', 'east', 'west', 'north', 'south', 'central', 'eastern', 'western',
    'northern', 'southern', 'regional', 'local', 'international', 'gulf', 'asia',
    'european', 'american', 'africa', 'pacific', 'atlantic',
    
    # Common words in company names
    'the', 'and', 'of', 'for', 'to', 'by', 'on', 'with', 'in', 'al', 'el'
}

import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric balancing common industry terms with unique identifiers"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # Slightly reduced list of common industry terms
    
    # Calculate standard similarity metrics
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Calculate meaningful common words (excluding common industry terms)
    meaningful_tokens1 = set(t for t in tokens1 if t not in common_industry_terms)
    meaningful_tokens2 = set(t for t in tokens2 if t not in common_industry_terms)
    meaningful_common = meaningful_tokens1.intersection(meaningful_tokens2)
    
    # Calculate common industry terms
    common_tokens = set(tokens1).intersection(set(tokens2))
    industry_common = set(t for t in common_tokens if t in common_industry_terms)
    
    # Apply specific penalty for the "one moon/almuhit alhadi" case
    # If they only share industry terms and have different distinctive words
    if common_tokens and len(industry_common) > 0 and len(meaningful_common) == 0:
        # Only reduce score for specific case, not eliminate completely
        seq_sim *= 0.5
        jaccard_sim *= 0.5
        position_bonus *= 0.5
    
    # Standard hybrid scoring 
    hybrid_score = (seq_sim * 0.25 + jaccard_sim * 0.3 + subset_bonus * 0.15 + 
                   position_bonus * 0.15 + common_bonus * 0.15)
    
    # Apply a less aggressive penalty for specific case
    if common_tokens and len(industry_common) / len(common_tokens) > 0.5 and len(meaningful_common) == 0:
        hybrid_score *= 0.7  # Moderate penalty
    
    return hybrid_score

def compare_names(df, threshold=0.6):
    """Find matching names in a DataFrame"""
    matches = []
    matchesx = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            name1, name2 = df.iloc[i]['name'], df.iloc[j]['name']
            similarity = calculate_name_similarity(name1, name2)
            
            if similarity >= threshold:
                matchesx.append({
                    'match': [name1, name2],
                    'Record 1': f"ID: {df.iloc[i].name}, Name: {name1}",
                    'Record 2': f"ID: {df.iloc[j].name}, Name: {name2}",
                    'Similarities': {
                        'name': f"{similarity:.2f}"
                    }
                })
                matches.append({
                    'Name 1': name1,
                    'Name 2': name2,
                    'Similarity': round(similarity, 3)
                })
    print(matches)
    return pd.DataFrame(matches), matchesx

# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east"
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin",

    

    "John Smith",
    "Jon Smith",
    "Dr. John Smith",
    "John J. Smith",
    "Jonathan Smith",
    "Johnny Smith",
    "Sarah Jones",
    "Michael Johnson"
]})

# Compare names and find matches
matchesy,fer = compare_names(df, threshold=0.5)  # You can adjust threshold as needed

# Display results


[{'Name 1': 'litasco middle east dmcc', 'Name 2': 'litasco middle eastdynamik trader', 'Similarity': 0.509}, {'Name 1': 'star voyages shipping services', 'Name 2': 'star voyages', 'Similarity': 0.668}, {'Name 1': 'vladimir putin', 'Name 2': 'putin', 'Similarity': 0.507}, {'Name 1': 'vladimir putin', 'Name 2': 'president vladimir putin', 'Similarity': 0.634}, {'Name 1': 'John Smith', 'Name 2': 'Jon Smith', 'Similarity': 0.532}, {'Name 1': 'John Smith', 'Name 2': 'Dr. John Smith', 'Similarity': 0.667}, {'Name 1': 'John Smith', 'Name 2': 'John J. Smith', 'Similarity': 0.827}, {'Name 1': 'John Smith', 'Name 2': 'Jonathan Smith', 'Similarity': 0.503}, {'Name 1': 'John Smith', 'Name 2': 'Johnny Smith', 'Similarity': 0.522}, {'Name 1': 'Jon Smith', 'Name 2': 'Johnny Smith', 'Similarity': 0.509}, {'Name 1': 'Dr. John Smith', 'Name 2': 'John J. Smith', 'Similarity': 0.546}]


In [78]:
matchesy

Unnamed: 0,Name 1,Name 2,Similarity
0,litasco middle east dmcc,litasco middle eastdynamik trader,0.509
1,star voyages shipping services,star voyages,0.668
2,vladimir putin,putin,0.507
3,vladimir putin,president vladimir putin,0.634
4,John Smith,Jon Smith,0.532
5,John Smith,Dr. John Smith,0.667
6,John Smith,John J. Smith,0.827
7,John Smith,Jonathan Smith,0.503
8,John Smith,Johnny Smith,0.522
9,Jon Smith,Johnny Smith,0.509


In [79]:
matchesd

[{'match': ['litasco middle east dmcc', 'litasco middle eastdynamik trader'],
  'Record 1': 'ID: 0, Name: litasco middle east dmcc',
  'Record 2': 'ID: 1, Name: litasco middle eastdynamik trader',
  'Similarities': {'name': '0.51'}},
 {'match': ['star voyages shipping services', 'star voyages'],
  'Record 1': 'ID: 6, Name: star voyages shipping services',
  'Record 2': 'ID: 8, Name: star voyages',
  'Similarities': {'name': '0.67'}},
 {'match': ['vladimir putin', 'putin'],
  'Record 1': 'ID: 9, Name: vladimir putin',
  'Record 2': 'ID: 10, Name: putin',
  'Similarities': {'name': '0.51'}},
 {'match': ['vladimir putin', 'president vladimir putin'],
  'Record 1': 'ID: 9, Name: vladimir putin',
  'Record 2': 'ID: 11, Name: president vladimir putin',
  'Similarities': {'name': '0.63'}},
 {'match': ['John Smith', 'Jon Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 13, Name: Jon Smith',
  'Similarities': {'name': '0.53'}},
 {'match': ['John Smith', 'Dr. John Smith'],
 

In [64]:
matches

Unnamed: 0,Name 1,Name 2,Similarity
0,litasco middle east dmcc,litasco middle eastdynamik trader,0.509
1,star voyages shipping services,star voyages,0.668
2,vladimir putin,putin,0.507
3,vladimir putin,president vladimir putin,0.634
4,John Smith,Jon Smith,0.532
5,John Smith,Dr. John Smith,0.667
6,John Smith,John J. Smith,0.827
7,John Smith,Jonathan Smith,0.503
8,John Smith,Johnny Smith,0.522
9,Jon Smith,Johnny Smith,0.509


In [None]:
[{'match': ['litasco middle east dmcc', 'litasco middle eastdynamik trader'],
  'Record 1': 'ID: 0, Name: litasco middle east dmcc',
  'Record 2': 'ID: 1, Name: litasco middle eastdynamik trader',
  'Similarities': {'name': '0.51'}},
 {'match': ['star voyages shipping services', 'star voyages'],
  'Record 1': 'ID: 6, Name: star voyages shipping services',
  'Record 2': 'ID: 8, Name: star voyages',
  'Similarities': {'name': '0.67'}},
 {'match': ['vladimir putin', 'putin'],
  'Record 1': 'ID: 9, Name: vladimir putin',
  'Record 2': 'ID: 10, Name: putin',
  'Similarities': {'name': '0.51'}},
 {'match': ['vladimir putin', 'president vladimir putin'],
  'Record 1': 'ID: 9, Name: vladimir putin',
  'Record 2': 'ID: 11, Name: president vladimir putin',
  'Similarities': {'name': '0.63'}},
 {'match': ['John Smith', 'Jon Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 13, Name: Jon Smith',
  'Similarities': {'name': '0.53'}},
 {'match': ['John Smith', 'Dr. John Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 14, Name: Dr. John Smith',
  'Similarities': {'name': '0.67'}},
 {'match': ['John Smith', 'John J. Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 15, Name: John J. Smith',
  'Similarities': {'name': '0.83'}},
 {'match': ['John Smith', 'Jonathan Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 16, Name: Jonathan Smith',
  'Similarities': {'name': '0.50'}},
 {'match': ['John Smith', 'Johnny Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 17, Name: Johnny Smith',
  'Similarities': {'name': '0.52'}},
 {'match': ['Jon Smith', 'Johnny Smith'],
  'Record 1': 'ID: 13, Name: Jon Smith',
  'Record 2': 'ID: 17, Name: Johnny Smith',
  'Similarities': {'name': '0.51'}},
 {'match': ['Dr. John Smith', 'John J. Smith'],
  'Record 1': 'ID: 14, Name: Dr. John Smith',
  'Record 2': 'ID: 15, Name: John J. Smith',
  'Similarities': {'name': '0.55'}}]

In [None]:
common_industry_terms = {
    # Company types and legal entities
    'inc', 'ltd', 'llc', 'corp', 'corporation', 'company', 'co', 'group', 'holdings', 
    'enterprises', 'international', 'global', 'worldwide', 'national', 'incorporated',
    'limited', 'partners', 'partnership', 'gmbh', 'srl', 'sa', 'ag', 'bv', 'pte',
    
    # Industry descriptors
    'marine', 'services', 'shipping', 'logistics', 'transport', 'transportation', 
    'cargo', 'freight', 'forwarding', 'trading', 'trader', 'import', 'export',
    'commercial', 'business', 'industrial', 'industries', 'solutions', 'systems',
    
    # Maritime specific
    'shipping', 'maritime', 'sea', 'ocean', 'port', 'harbor', 'vessel', 'boat',
    'ship', 'tanker', 'carrier', 'fleet', 'navigation', 'offshore', 'voyages',
    
    # General business terms
    'association', 'agency', 'bureau', 'center', 'centre', 'office', 'department',
    'division', 'unit', 'management', 'consulting', 'consultancy', 'advisory',
    'resources', 'operations', 'ventures', 'investment', 'investments',
    
    # Geographical/regional indicators
    'middle', 'east', 'west', 'north', 'south', 'central', 'eastern', 'western',
    'northern', 'southern', 'regional', 'local', 'international', 'gulf', 'asia',
    'european', 'american', 'africa', 'pacific', 'atlantic',
    
    # Common words in company names
    'the', 'and', 'of', 'for', 'to', 'by', 'on', 'with', 'in', 'al', 'el'
}

import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric balancing common industry terms with unique identifiers"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # Slightly reduced list of common industry terms
    
    # Calculate standard similarity metrics
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Calculate meaningful common words (excluding common industry terms)
    meaningful_tokens1 = set(t for t in tokens1 if t not in common_industry_terms)
    meaningful_tokens2 = set(t for t in tokens2 if t not in common_industry_terms)
    meaningful_common = meaningful_tokens1.intersection(meaningful_tokens2)
    
    # Calculate common industry terms
    common_tokens = set(tokens1).intersection(set(tokens2))
    industry_common = set(t for t in common_tokens if t in common_industry_terms)
    
    # Apply specific penalty for the "one moon/almuhit alhadi" case
    # If they only share industry terms and have different distinctive words
    if common_tokens and len(industry_common) > 0 and len(meaningful_common) == 0:
        # Only reduce score for specific case, not eliminate completely
        seq_sim *= 0.5
        jaccard_sim *= 0.5
        position_bonus *= 0.5
    
    # Standard hybrid scoring 
    hybrid_score = (seq_sim * 0.25 + jaccard_sim * 0.3 + subset_bonus * 0.15 + 
                   position_bonus * 0.15 + common_bonus * 0.15)
    
    # Apply a less aggressive penalty for specific case
    if common_tokens and len(industry_common) / len(common_tokens) > 0.5 and len(meaningful_common) == 0:
        hybrid_score *= 0.7  # Moderate penalty
    
    return hybrid_score

def compare_names(df, threshold=0.6):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            record1 = df.iloc[i]
            record2 = df.iloc[j]
            
            # Calculate name similarity
            name_sim = calculate_name_similarity(record1['name'], record2['name'])
            
            # Only proceed if names are similar enough
            if name_sim >= threshold:
                match = {
                    'match': [record1['name'], record2['name']],
                    'Record 1': f"ID: {record1.name}, Name: {record1['name']}",
                    'Overall Similarity': f"{weighted_sim:.2f}",
                    'Similarities': {
                        'name': f"{name_sim:.2f}",
                        'description': f"{desc_sim:.2f}"
                    }
                }
                matches.append(match)
    return matches
# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east"
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin",

    

    "John Smith",
    "Jon Smith",
    "Dr. John Smith",
    "John J. Smith",
    "Jonathan Smith",
    "Johnny Smith",
    "Sarah Jones",
    "Michael Johnson"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)  # You can adjust threshold as needed

# Display results
if not matches.empty:
    print("Matching Results:")
    print(matches.sort_values('Similarity', ascending=False))
else:
    print("No matches found!")

NameError: name 'name_threshold' is not defined

In [61]:
common_industry_terms = {
    # Company types and legal entities
    'inc', 'ltd', 'llc', 'corp', 'corporation', 'company', 'co', 'group', 'holdings', 
    'enterprises', 'international', 'global', 'worldwide', 'national', 'incorporated',
    'limited', 'partners', 'partnership', 'gmbh', 'srl', 'sa', 'ag', 'bv', 'pte',
    
    # Industry descriptors
    'marine', 'services', 'shipping', 'logistics', 'transport', 'transportation', 
    'cargo', 'freight', 'forwarding', 'trading', 'trader', 'import', 'export',
    'commercial', 'business', 'industrial', 'industries', 'solutions', 'systems',
    
    # Maritime specific
    'shipping', 'maritime', 'sea', 'ocean', 'port', 'harbor', 'vessel', 'boat',
    'ship', 'tanker', 'carrier', 'fleet', 'navigation', 'offshore', 'voyages',
    
    # General business terms
    'association', 'agency', 'bureau', 'center', 'centre', 'office', 'department',
    'division', 'unit', 'management', 'consulting', 'consultancy', 'advisory',
    'resources', 'operations', 'ventures', 'investment', 'investments',
    
    # Geographical/regional indicators
    'middle', 'east', 'west', 'north', 'south', 'central', 'eastern', 'western',
    'northern', 'southern', 'regional', 'local', 'international', 'gulf', 'asia',
    'european', 'american', 'africa', 'pacific', 'atlantic',
    
    # Common words in company names
    'the', 'and', 'of', 'for', 'to', 'by', 'on', 'with', 'in', 'al', 'el'
}

import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_name(name):
    """Clean and standardize name format"""
    if not isinstance(name, str):
        return str(name)
    
    # Convert to lowercase
    name = name.lower()
    # Remove special characters
    name = re.sub(r'[^\w\s]', '', name)
    # Remove extra whitespace
    name = ' '.join(name.split())
    return name

def tokenize_name(name):
    """Split name into tokens"""
    return clean_name(name).split()

def jaccard_similarity(set1, set2):
    """Calculate Jaccard similarity between two sets"""
    if not set1 or not set2:
        return 0.0
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

def sequence_similarity(s1, s2):
    """Calculate SequenceMatcher similarity"""
    return SequenceMatcher(None, s1, s2).ratio()

def subset_match(name1, name2):
    """Check if one name is a subset of another"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    return 1.0 if set(tokens1).issubset(set(tokens2)) or set(tokens2).issubset(set(tokens1)) else 0.0

def word_position_bonus(name1, name2):
    """Increase similarity score based on matching word positions with added weight for longer names"""
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    min_length = min(len(tokens1), len(tokens2))
    
    score = 0.0
    weight = 1.0  # Start weight, more for first words
    for i in range(min_length):
        if tokens1[i] == tokens2[i]:
            score += weight
        weight *= 0.8  # Decrease weight for each subsequent word match
    
    return min(score, 1.0)  # Cap the score at 1.0

def common_words_bonus(name1, name2):
    """Increase similarity score based on the number of common words"""
    tokens1, tokens2 = set(tokenize_name(name1)), set(tokenize_name(name2))
    common_tokens = tokens1.intersection(tokens2)
    
    return len(common_tokens) / max(len(tokens1), len(tokens2))

def calculate_name_similarity(name1, name2):
    """Hybrid similarity metric balancing common industry terms with unique identifiers"""
    name1_clean, name2_clean = clean_name(name1), clean_name(name2)
    tokens1, tokens2 = tokenize_name(name1), tokenize_name(name2)
    
    # Slightly reduced list of common industry terms
    
    # Calculate standard similarity metrics
    seq_sim = sequence_similarity(name1_clean, name2_clean)
    jaccard_sim = jaccard_similarity(set(tokens1), set(tokens2))
    subset_bonus = subset_match(name1, name2)
    position_bonus = word_position_bonus(name1, name2)
    common_bonus = common_words_bonus(name1, name2)
    
    # Calculate meaningful common words (excluding common industry terms)
    meaningful_tokens1 = set(t for t in tokens1 if t not in common_industry_terms)
    meaningful_tokens2 = set(t for t in tokens2 if t not in common_industry_terms)
    meaningful_common = meaningful_tokens1.intersection(meaningful_tokens2)
    
    # Calculate common industry terms
    common_tokens = set(tokens1).intersection(set(tokens2))
    industry_common = set(t for t in common_tokens if t in common_industry_terms)
    
    # Apply specific penalty for the "one moon/almuhit alhadi" case
    # If they only share industry terms and have different distinctive words
    if common_tokens and len(industry_common) > 0 and len(meaningful_common) == 0:
        # Only reduce score for specific case, not eliminate completely
        seq_sim *= 0.5
        jaccard_sim *= 0.5
        position_bonus *= 0.5
    
    # Standard hybrid scoring 
    hybrid_score = (seq_sim * 0.25 + jaccard_sim * 0.3 + subset_bonus * 0.15 + 
                   position_bonus * 0.15 + common_bonus * 0.15)
    
    # Apply a less aggressive penalty for specific case
    if common_tokens and len(industry_common) / len(common_tokens) > 0.5 and len(meaningful_common) == 0:
        hybrid_score *= 0.7  # Moderate penalty
    
    return hybrid_score

def compare_names(df, threshold=0.6):
    """Find matching names in a DataFrame"""
    matches = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            record1 = df.iloc[i]
            record2 = df.iloc[j]
            
            # Calculate name similarity
            name_sim = calculate_name_similarity(record1['name'], record2['name'])
            
            # Only proceed if names are similar enough
            if name_sim >= threshold:
                match = {
                    'match': [record1['name'], record2['name']],
                    'Record 1': f"ID: {record1.name}, Name: {record1['name']}",
                    'Record 2': f"ID: {record2.name}, Name: {record2['name']}",
                    'Similarities': {
                        'name': f"{name_sim:.2f}"
                    }
                }
                matches.append(match)
    return matches
# Example usage with the given names
df = pd.DataFrame({'name': [
    "litasco middle east dmcc",
    "litasco middle east"
    "dynamik trader", 
    "nari strength",
    "one moon marine services", 
    "alqutb alshamali marine",  # Example with less similarity
    "almuhit alhadi marine services",  # Example with less similarity
    "star voyages shipping services", 
    "uae shipping association",
    "star voyages",
    "vladimir putin", 
    "putin", 
    "president vladimir putin",

    

    "John Smith",
    "Jon Smith",
    "Dr. John Smith",
    "John J. Smith",
    "Jonathan Smith",
    "Johnny Smith",
    "Sarah Jones",
    "Michael Johnson"
]})

# Compare names and find matches
matches = compare_names(df, threshold=0.5)  # You can adjust threshold as needed
matches

[{'match': ['litasco middle east dmcc', 'litasco middle eastdynamik trader'],
  'Record 1': 'ID: 0, Name: litasco middle east dmcc',
  'Record 2': 'ID: 1, Name: litasco middle eastdynamik trader',
  'Similarities': {'name': '0.51'}},
 {'match': ['star voyages shipping services', 'star voyages'],
  'Record 1': 'ID: 6, Name: star voyages shipping services',
  'Record 2': 'ID: 8, Name: star voyages',
  'Similarities': {'name': '0.67'}},
 {'match': ['vladimir putin', 'putin'],
  'Record 1': 'ID: 9, Name: vladimir putin',
  'Record 2': 'ID: 10, Name: putin',
  'Similarities': {'name': '0.51'}},
 {'match': ['vladimir putin', 'president vladimir putin'],
  'Record 1': 'ID: 9, Name: vladimir putin',
  'Record 2': 'ID: 11, Name: president vladimir putin',
  'Similarities': {'name': '0.63'}},
 {'match': ['John Smith', 'Jon Smith'],
  'Record 1': 'ID: 12, Name: John Smith',
  'Record 2': 'ID: 13, Name: Jon Smith',
  'Similarities': {'name': '0.53'}},
 {'match': ['John Smith', 'Dr. John Smith'],
 

In [None]:
list_flagged = ['rosneck', "garamyov"]

abc = [
    ('rikic', 'garamyov'),
    ('desmond', 'desnodi'),
    ('ase', 'rosneck')
]

desc_dic = {
    "rikic": "rikic has not commited any crime, he travelled to Dubai.He owns rosneck",
    "rosneck": "rosneck is a company sanctionned for money laundering and sanctions evasions",
    "Desmond": "Desmond has commited money laundering",
    "desnidi": "Desnodi has not commited any crimes. He is the associated of Desmond.",
    "ase": "ase is the victim of rosneck"
}

In [85]:
filtered_pairs = [pair for pair in abc if pair[0] in list_a or pair[1] in list_a]

# Print the filtered pairs
print(filtered_pairs)



[('rocky1', 'dyno1')]


In [None]:
desc_dic = {
    "rocky1": "rocky1 has been flagged for money laundering",
    "dyno1": 
}

In [92]:
# Define flagged list
list_flagged = ['rosneck', "garamyov", "Desmond"]

# Define the relationships in abc
abc = [
    ('rikic', 'garamyov'),
    ('desmond', 'desnodi'),
    ('ase', 'rosneck')
]

# Create the dictionary with flagged information
relationships = {}

for entity1, entity2 in abc:
    # Check if each entity is flagged
    flagged_info = {
        entity1: entity1 in list_flagged,
        entity2: entity2 in list_flagged
    }
    
    # Store the result in the dictionary
    relationships[(entity1, entity2)] = flagged_info

# Display the result
print(relationships)

{('rikic', 'garamyov'): {'rikic': False, 'garamyov': True}, ('desmond', 'desnodi'): {'desmond': False, 'desnodi': False}, ('ase', 'rosneck'): {'ase': False, 'rosneck': True}}


In [None]:
# Define the relationship types (categories of connections)
relationship_types = [
    'Ownership', 
    'Association', 
    'Criminal Involvement', 
    'Affiliation', 
    'Partnership',
    'Subcontracting', 
    'Beneficiary', 
    'Service Provider', 
    'Investment', 
    'Co-Conspirator'
]

# List of flagged entities (those involved in crime)
list_flagged = ['rosneck', 'garamyov', 'Desmond']

# Define the relationships in abc
abc = [
    ('John Doe', 'Garamyov Enterprises'),
    ('Desmond', 'Desnodi'),
    ('Michael Smith', 'Rosneck Inc'),
    ('Richard Lee', 'Acme Corp'),
    ('Jane Doe', 'Rosneck Inc'),
    ('Alice Johnson', 'Global Tech'),
    ('Acme Corp', 'Beta Industries'),
    ('InvestorX', 'StartUpAlpha'),
    ('PartnerA', 'PartnerB'),
    ('Robert Brown', 'Criminal Enterprises')
]

# Descriptions of entities
desc_dic = {
    "John Doe": "John Doe is a businessman who owns a portion of Garamyov Enterprises.",
    "Garamyov Enterprises": "Garamyov Enterprises is a company flagged for involvement in money laundering.",
    "Desmond": "Desmond has committed money laundering activities.",
    "Desnodi": "Desnodi is an associate of Desmond, but has not committed any crimes.",
    "Michael Smith": "Michael Smith is an employee at Rosneck Inc and is not involved in any crimes.",
    "Rosneck Inc": "Rosneck Inc is a company sanctioned for money laundering and sanctions evasions.",
    "Richard Lee": "Richard Lee is the CEO of Acme Corp, an international corporation.",
    "Jane Doe": "Jane Doe is an investor and a business partner of Rosneck Inc.",
    "Alice Johnson": "Alice Johnson is the director of Global Tech, an enterprise working with various other companies.",
    "Global Tech": "Global Tech is a technology company focused on AI and machine learning.",
    "Acme Corp": "Acme Corp is a multinational corporation, working with Beta Industries in various projects.",
    "Beta Industries": "Beta Industries is a manufacturing company with a focus on construction and infrastructure.",
    "InvestorX": "InvestorX is a venture capitalist and has invested heavily in StartUpAlpha.",
    "StartUpAlpha": "StartUpAlpha is a tech startup, having recently received funding from InvestorX.",
    "PartnerA": "PartnerA is in a business partnership with PartnerB.",
    "PartnerB": "PartnerB is a business partner of PartnerA, involved in international trade.",
    "Robert Brown": "Robert Brown is a known criminal involved in various illicit activities with Criminal Enterprises.",
    "Criminal Enterprises": "Criminal Enterprises is a network involved in illegal trade and money laundering."
}

# Create a LLM prompt that will analyze the relationships
llm_prompt = f"""
You are a compliance assistant tasked with identifying the relationships between pairs of entities, such as individuals and companies. Your goal is to determine the type of relationship between the two entities and flag those who are involved in criminal activities. Use the descriptions provided for each entity to help you classify the relationship.

The relationship types to choose from are as follows:
{relationship_types}

The entities that have been flagged for crimes are:
{list_flagged}

You are provided with the following pairs of entities:
{abc}

Descriptions of the entities involved:
{desc_dic}

For each pair of entities, identify the type of relationship and determine:
1. The type of relationship between the two entities.
2. Which entity is the source and which is the target (the source is the entity with higher responsibility or influence).
3. Flag any entities involved in criminal activities based on their involvement in a crime.

Please provide the output in the following format:
- Entity1 - Entity2:
    - Relationship Type: [Type of relationship]
    - Source: [Source Entity]
    - Target: [Target Entity]
    - Flagged: [List of flagged entities involved]

Example format:
- John Doe - Garamyov Enterprises:
    - Relationship Type: Ownership
    - Source: Garamyov Enterprises
    - Target: John Doe
    - Flagged: ['Garamyov Enterprises']
"""

print(llm_prompt)

In [93]:
{
    "John Doe - Garamyov Enterprises": {
        "Relationship Type": "Ownership",
        "Source": "Garamyov Enterprises",
        "Target": "John Doe",
        "Flagged": ["Garamyov Enterprises"]
    },
    "Desmond - Desnodi": {
        "Relationship Type": "Association",
        "Source": "Desmond",
        "Target": "Desnodi",
        "Flagged": ["Desmond"]
    },
    "Michael Smith - Rosneck Inc": {
        "Relationship Type": "Ownership",
        "Source": "Rosneck Inc",
        "Target": "Michael Smith",
        "Flagged": ["Rosneck Inc"]
    },
    "Jane Doe - Rosneck Inc": {
        "Relationship Type": "Ownership",
        "Source": "Rosneck Inc",
        "Target": "Jane Doe",
        "Flagged": ["Rosneck Inc"]
    },
    "Alice Johnson - Global Tech": {
        "Relationship Type": "Partnership",
        "Source": "Alice Johnson",
        "Target": "Global Tech",
        "Flagged": []
    },
    "Acme Corp - Beta Industries": {
        "Relationship Type": "Partnership",
        "Source": "Acme Corp",
        "Target": "Beta Industries",
        "Flagged": []
    },
    "InvestorX - StartUpAlpha": {
        "Relationship Type": "Investment",
        "Source": "InvestorX",
        "Target": "StartUpAlpha",
        "Flagged": []
    },
    "PartnerA - PartnerB": {
        "Relationship Type": "Partnership",
        "Source": "PartnerA",
        "Target": "PartnerB",
        "Flagged": []
    },
    "Robert Brown - Criminal Enterprises": {
        "Relationship Type": "Co-Conspirator",
        "Source": "Criminal Enterprises",
        "Target": "Robert Brown",
        "Flagged": ["Criminal Enterprises"]
    }
}

{'John Doe - Garamyov Enterprises': {'Relationship Type': 'Ownership',
  'Source': 'Garamyov Enterprises',
  'Target': 'John Doe',
  'Flagged': ['Garamyov Enterprises']},
 'Desmond - Desnodi': {'Relationship Type': 'Association',
  'Source': 'Desmond',
  'Target': 'Desnodi',
  'Flagged': ['Desmond']},
 'Michael Smith - Rosneck Inc': {'Relationship Type': 'Ownership',
  'Source': 'Rosneck Inc',
  'Target': 'Michael Smith',
  'Flagged': ['Rosneck Inc']},
 'Jane Doe - Rosneck Inc': {'Relationship Type': 'Ownership',
  'Source': 'Rosneck Inc',
  'Target': 'Jane Doe',
  'Flagged': ['Rosneck Inc']},
 'Alice Johnson - Global Tech': {'Relationship Type': 'Partnership',
  'Source': 'Alice Johnson',
  'Target': 'Global Tech',
  'Flagged': []},
 'Acme Corp - Beta Industries': {'Relationship Type': 'Partnership',
  'Source': 'Acme Corp',
  'Target': 'Beta Industries',
  'Flagged': []},
 'InvestorX - StartUpAlpha': {'Relationship Type': 'Investment',
  'Source': 'InvestorX',
  'Target': 'StartUpAlp