In [11]:
#Pre-Processing
import unicodedata
import re
from cleanco import basename


# Reference company names, written in lower case and including their
# legal-form suffix ("nv", "plc"). These are the inputs cleaned by
# preprocess_company_name further down in this cell.
company_names = [
    "asml holding nv",
    "unilever nv",
    "royal dutch shell plc",
    "ing group nv",
    "koninklijke philips nv",
    "adyen nv",
    "relx plc",
    "prosus nv",
    "koninklijke dsm nv",
    "koninklijke ahold delhaize nv",
    "heineken nv"
]
# Variant spellings (different suffixes, abbreviations, misspellings) of the
# same eleven companies, listed in the same order as company_names.
company_name_dataset = ["asml nv", "unilever bv", "shell bv", "ing bank nv", "koninklijke filips", "adyen nv", "relx plc", "prosus group", "dsm", "ahold-delheize", "heineken breweries"]

def preprocess_company_name(company_name):
    """Normalise a company name so that it can be compared fuzzily.

    The steps run in this order:

    1. lower-case the text;
    2. NFKD-decompose it and drop everything that is not ASCII (this removes
       diacritics, and also any character with no ASCII decomposition);
    3. delete every character that is neither a word character nor whitespace;
    4. pass the result through ``cleanco.basename`` (in this notebook's output
       that turns e.g. "asml holding nv" into "asml holding");
    5. delete the stop words listed in ``common_words``;
    6. collapse runs of whitespace and trim both ends.

    Step 6 matters because deleting a stop word leaves the spaces that
    surrounded it behind; without it "bank of x" would come back with a
    doubled space and a leading stop word would leave a leading space.

    :param company_name: raw company name.
    :return: the cleaned name, with single spaces between the remaining words.
    """
    company_name = company_name.lower()
    company_name = unicodedata.normalize('NFKD', company_name).encode('ASCII', 'ignore').decode()
    company_name = re.sub(r'[^\w\s]', '', company_name)
    company_name = basename(company_name)

    common_words = ('and', 'the', 'of', 'in', 'for', 'on', 'at', 'with')
    # One alternation removes all stop words in a single pass; the word
    # boundaries keep e.g. "in" from being cut out of "ing".
    stop_word_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in common_words) + r')\b'
    company_name = re.sub(stop_word_pattern, '', company_name, flags=re.IGNORECASE)

    # Remove the whitespace left behind by the deletions above.
    return ' '.join(company_name.split())


# Clean every reference name with preprocess_company_name.
preprocessed_names = [preprocess_company_name(name) for name in company_names]


# Print one cleaned name per line for a quick visual check.
for name in preprocessed_names:
    print(name)

asml holding
unilever
royal dutch shell
ing group
koninklijke philips
adyen
relx
prosus
koninklijke dsm
koninklijke ahold delhaize
heineken


In [12]:
#Vectorization
!pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over 2- and 3-character n-grams; lowercase=False
# leaves the casing of the input untouched.
vec = TfidfVectorizer(lowercase=False, analyzer="char", ngram_range=(2, 3))
# Learn the n-gram vocabulary from the variant names.
# NOTE(review): the vocabulary is fitted on the raw company_name_dataset while
# the names transformed below went through preprocess_company_name - confirm
# that this asymmetry is intended.
vec.fit(company_name_dataset)
# Last expression of the cell: the sparse TF-IDF matrix is displayed as output.
vec.transform(preprocessed_names)



<11x171 sparse matrix of type '<class 'numpy.float64'>'
	with 173 stored elements in Compressed Sparse Row format>

In [22]:
#Discounted Levenshtein Calculations
!pip install abydos
import abydos.distance as abd
# Discounted Levenshtein measure from abydos; its .sim() value is printed
# below as the similarity score.
dl_distance = abd.DiscountedLevenshtein()

# Score every reference name against every dataset variant (full cross product).
# NOTE(review): the raw company_names are compared here, not
# preprocessed_names - confirm that skipping the preprocessing is intended.
for name in company_names:
    for dataset_name in company_name_dataset:
        similarity = dl_distance.sim(name, dataset_name)
        print(f"Similarity between '{name}' and '{dataset_name}': {similarity:.2f}")

Similarity between 'asml holding nv' and 'asml nv': 0.55
Similarity between 'asml holding nv' and 'unilever bv': 0.19
Similarity between 'asml holding nv' and 'shell bv': 0.33
Similarity between 'asml holding nv' and 'ing bank nv': 0.27
Similarity between 'asml holding nv' and 'koninklijke filips': 0.08
Similarity between 'asml holding nv' and 'adyen nv': 0.31
Similarity between 'asml holding nv' and 'relx plc': 0.14
Similarity between 'asml holding nv' and 'prosus group': 0.10
Similarity between 'asml holding nv' and 'dsm': 0.20
Similarity between 'asml holding nv' and 'ahold-delheize': 0.21
Similarity between 'asml holding nv' and 'heineken breweries': 0.04
Similarity between 'unilever nv' and 'asml nv': 0.30
Similarity between 'unilever nv' and 'unilever bv': 0.93
Similarity between 'unilever nv' and 'shell bv': 0.25
Similarity between 'unilever nv' and 'ing bank nv': 0.33
Similarity between 'unilever nv' and 'koninklijke filips': 0.25
Similarity between 'unilever nv' and 'adyen nv'

In [23]:
#SSK Similarity Calculation
# Uses the abydos SSK measure (abd is imported in the Discounted Levenshtein
# cell, so that cell has to run first).
ssk = abd.SSK()
# Score every reference name against every dataset variant (full cross product).
for name in company_names:
    for dataset_name in company_name_dataset:
        similarity = ssk.sim(name, dataset_name)
        print(f"SSK similarity between '{name}' and '{dataset_name}': {similarity:.2f}")

SSK similarity between 'asml holding nv' and 'asml nv': 0.63
SSK similarity between 'asml holding nv' and 'unilever bv': 0.12
SSK similarity between 'asml holding nv' and 'shell bv': 0.28
SSK similarity between 'asml holding nv' and 'ing bank nv': 0.40
SSK similarity between 'asml holding nv' and 'koninklijke filips': 0.15
SSK similarity between 'asml holding nv' and 'adyen nv': 0.30
SSK similarity between 'asml holding nv' and 'relx plc': 0.07
SSK similarity between 'asml holding nv' and 'prosus group': 0.08
SSK similarity between 'asml holding nv' and 'dsm': 0.07
SSK similarity between 'asml holding nv' and 'ahold-delheize': 0.22
SSK similarity between 'asml holding nv' and 'heineken breweries': 0.06
SSK similarity between 'unilever nv' and 'asml nv': 0.22
SSK similarity between 'unilever nv' and 'unilever bv': 0.87
SSK similarity between 'unilever nv' and 'shell bv': 0.19
SSK similarity between 'unilever nv' and 'ing bank nv': 0.24
SSK similarity between 'unilever nv' and 'koninklij

In [25]:
#Fuzz Similarity Calculation
!pip install thefuzz
from thefuzz import fuzz
# Score every reference name against every dataset variant (full cross product).
for name in company_names:

    for dataset_name in company_name_dataset:

        similarity = fuzz.token_sort_ratio(name, dataset_name)

        # Divide by 100 so the printed value is on the same 0-1 scale as the
        # other similarity cells.
        similarity_score = similarity / 100.0

        print(f"Token sort ratio similarity between '{name}' and '{dataset_name}': {similarity_score:.2f}")

Token sort ratio similarity between 'asml holding nv' and 'asml nv': 0.64
Token sort ratio similarity between 'asml holding nv' and 'unilever bv': 0.23
Token sort ratio similarity between 'asml holding nv' and 'shell bv': 0.26
Token sort ratio similarity between 'asml holding nv' and 'ing bank nv': 0.62
Token sort ratio similarity between 'asml holding nv' and 'koninklijke filips': 0.30
Token sort ratio similarity between 'asml holding nv' and 'adyen nv': 0.52
Token sort ratio similarity between 'asml holding nv' and 'relx plc': 0.26
Token sort ratio similarity between 'asml holding nv' and 'prosus group': 0.15
Token sort ratio similarity between 'asml holding nv' and 'dsm': 0.22
Token sort ratio similarity between 'asml holding nv' and 'ahold-delheize': 0.41
Token sort ratio similarity between 'asml holding nv' and 'heineken breweries': 0.36
Token sort ratio similarity between 'unilever nv' and 'asml nv': 0.33
Token sort ratio similarity between 'unilever nv' and 'unilever bv': 0.91
T

In [None]:
#Fuzzy Chinese Match
import pandas as pd
from fuzzychinese import FuzzyChineseMatch

# Define the dictionary to match against (test_dict) and the raw query words
test_dict = pd.Series(['长白朝鲜族自治县','长阳土家族自治县','城步苗族自治县','达尔罕茂明安联合旗','汨罗市'])
raw_word = pd.Series(['达茂联合旗','长阳县','汩罗市'])

# The last query differs from the last dictionary entry only in its first
# character (汩 vs 汨); make sure the two strings really are different.
assert('汩罗市' != '汨罗市')

# Initialize FuzzyChineseMatch with stroke-based 3-grams
fcm = FuzzyChineseMatch(ngram_range=(3, 3), analyzer='stroke')

# Fit the model on the test dictionary
fcm.fit(test_dict)

# Transform the raw words to get the top 2 similar matches for each of them
top2_similar = fcm.transform(raw_word, n=2)

# Concatenate, column-wise, the query words, their two best matches, the
# matching scores and the indices of the matches into a single DataFrame
res = pd.concat([raw_word,
        pd.DataFrame(top2_similar, columns=['top1', 'top2']),
        pd.DataFrame(
            fcm.get_similarity_score(),
            columns=['top1_score', 'top2_score']),
        pd.DataFrame(
            fcm.get_index(),
            columns=['top1_index', 'top2_index'])],
                    axis=1)

# Display the results
print(res)


In [10]:
#Name Matcher Test
import pandas as pd
from name_matching.name_matcher import NameMatcher

# Master dataset: the company names that incoming names are matched against
df_companies_a = pd.DataFrame({'Company name': [
    'asml holding nv', 'unilever nv', 'royal dutch shell plc', 'ing group nv',
    'koninklijke philips nv', 'adyen nv', 'relx plc', 'prosus nv', 'koninklijke dsm nv',
    'koninklijke ahold delhaize nv', 'heineken nv'
]})

# Modified variants of those names, used to test the matching
df_companies_b = pd.DataFrame({'name': [
    'asml nv', 'unilever bv', 'rd shell bv', 'ing bank nv', 'koninklijke philips nv',
    'adyen nv', 'relx plc', 'prosus group', 'koninkdsm', 'ahold-delheize', 'heineken breweries'
]})

# initialise the name matcher
matcher = NameMatcher(number_of_matches=1,
                      legal_suffixes=True,
                      common_words=False,
                      top_n=50,
                      verbose=True)

# adjust the distance metrics to use
matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])

# load the data to which the names should be matched
matcher.load_and_process_master_data(column='Company name',
                                     df_matching_data=df_companies_a,
                                     transform=True)

# perform the name matching on the data you want matched
matches = matcher.match_names(to_be_matched=df_companies_b,
                              column_matching='name')

# Print the matches found
print("Matches Found:")
print(matches)

# Combine the datasets based on the matches. This is done once: first attach
# each match to the master row it points at (via match_index), then attach the
# to-be-matched names by index.
combined = pd.merge(df_companies_a, matches, how='left', left_index=True, right_on='match_index')
combined = pd.merge(combined, df_companies_b, how='left', left_index=True, right_index=True)

# Print the combined dataframe to see the matching results
#print("\nCombined DataFrame:")
#print(combined)


preprocessing...

preprocessing complete 
 searching for matches...



100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1805.55it/s]


possible matches found   
 fuzzy matching...



100%|███████████████████████████████████████████| 11/11 [00:00<00:00, 35.11it/s]

done
Matches Found:
             original_name                     match_name       score  \
0                  asml nv                asml holding nv   38.888889   
1              unilever bv                    unilever nv  100.000000   
2              rd shell bv          royal dutch shell plc   50.420168   
3              ing bank nv                   ing group nv   45.121456   
4   koninklijke philips nv         koninklijke philips nv  100.000000   
5                 adyen nv                       adyen nv  100.000000   
6                 relx plc                       relx plc  100.000000   
7             prosus group                      prosus nv   52.380952   
8                koninkdsm             koninklijke dsm nv   63.333333   
9            aholddelheize  koninklijke ahold delhaize nv   44.021169   
10      heineken breweries                    heineken nv   46.296296   

    match_index  
0             0  
1             1  
2             2  
3             3  
4            




In [59]:
#Chinese English Splitter
import re

# Separator used to break non-CJK text into tokens: any run of characters that
# are not word characters and not one of & _ + * \ / ' # -
ENGLISH_SPLITTER_REGEX = re.compile(r"[^\w&_+*\\/'#\-]+")
# Code-point ranges treated as CJK; interpolated into the character class below.
CJK_CHARACTERS = r'\u1100-\u11ff\u2e80-\u2fff\u3040-\u31ff\u3400-\u9fff\ua960-\ua97f\uac00-\ud7ff\uf900-\ufaff'
# Two named alternatives: "cjk" is a run of CJK characters; "numeric" is a run
# of digits (optionally continued by short separator-plus-digits groups) that
# satisfies the surrounding look-behind / look-ahead conditions.
CJK_OR_NUMERIC_REGEX = re.compile(rf"(?P<cjk>[{CJK_CHARACTERS}]+)|(?P<numeric>((?<=^\D)|(?<=[^\W0-9_]|\s))(?<!\b[a-zA-Z])(\d+([\W_]{{0,5}}\d+){{0,5}})(?=($|[^\W0-9_]|\s)))")


def split_english_number_cjk(text: str, separate_return=False, split_same_language=False):
    """
    Split mixed text into CJK runs and everything else (other text and numbers).

    The text is scanned left to right with CJK_OR_NUMERIC_REGEX. Every match is
    either a CJK run or a number-like token; the stripped text found between
    two matches, and after the last one, is treated as non-CJK text.

    :param text: string to split,
        e.g. "non_asians string: 并删除掉Machine-learning多余的, 字符串"
    :param separate_return: when True return ``(non_cjk_tokens, cjk_tokens)``;
        when False return one list holding every token in order of appearance.
    :param split_same_language: when True, CJK runs are broken into single
        characters and non-CJK fragments are split on ENGLISH_SPLITTER_REGEX
        (empty pieces are dropped); when False each fragment is kept whole.
    :return: a list, or a tuple of two lists, as selected by ``separate_return``.

    For the example text above:
      separate_return=False, split_same_language=True ->
        ['non_asians', 'string', '并', '删', '除', '掉', 'Machine-learning', '多', '余', '的', '字', '符', '串']
      separate_return=True, split_same_language=True ->
        (['non_asians', 'string', 'Machine-learning'], ['并', '删', '除', '掉', '多', '余', '的', '字', '符', '串'])

    With split_same_language=False, punctuation that sits between two CJK runs
    (the "," above) is not part of either run; it is emitted as a non-CJK token
    of its own.
    """

    def _non_cjk_tokens(fragment):
        # Whole fragment, or its non-empty pieces when splitting is requested.
        if not split_same_language:
            return [fragment]
        stripped_pieces = (piece.strip() for piece in ENGLISH_SPLITTER_REGEX.split(fragment))
        return [piece for piece in stripped_pieces if piece]

    non_cjk_out = []
    cjk_out = []
    # CJK tokens go to their own list only when the caller asked for that.
    cjk_sink = cjk_out if separate_return else non_cjk_out

    cursor, total = 0, len(text)
    while cursor < total:
        match = CJK_OR_NUMERIC_REGEX.search(text, pos=cursor)
        if match is None:
            break

        # Text between the previous match and this one.
        leading = text[cursor:match.start()].strip()
        if leading:
            non_cjk_out.extend(_non_cjk_tokens(leading))

        cjk_run = match.group("cjk")
        if cjk_run:
            cjk_sink.extend(list(cjk_run) if split_same_language else [cjk_run])
        else:
            # Not a CJK run, so the "numeric" alternative matched.
            non_cjk_out.extend(_non_cjk_tokens(match.group("numeric")))

        cursor = match.end()

    # Whatever follows the last match.
    if total > cursor:
        trailing = text[cursor:].strip()
        if trailing:
            non_cjk_out.extend(_non_cjk_tokens(trailing))

    if separate_return:
        return non_cjk_out, cjk_out
    return non_cjk_out

# Demo input mixing English text, CJK runs, punctuation and digits.
text = "non_asians string: 并删除掉Machine-learning多余的子3围观围观, 字符串12345"
# Ask for the non-CJK and CJK tokens as two separate lists, keeping each run whole.
result = split_english_number_cjk(text, separate_return=True, split_same_language=False)
print(result)


(['non_asians string:', 'Machine-learning', '3', ',', '12345'], ['并删除掉', '多余的子', '围观围观', '字符串'])


In [None]:
#Read CSV Column
import pandas as pd

# Load your CSV file
# NOTE(review): 'chinese_search_strings.csv' is written by a later cell in this
# notebook (df_chinese.to_csv), so on a fresh kernel that cell has to run
# before this one unless the file already exists on disk.
df = pd.read_csv('chinese_search_strings.csv')  # Replace with your CSV file path

# Print the column names to check for any discrepancies
print(df.columns)

In [5]:
#Chinese English Splitter for name-matching(CSV)
import pandas as pd
import json
import re
import string
from name_matching.name_matcher import NameMatcher
from cleanco import basename

# Code-point ranges treated as CJK, and the scanner built from them. The
# scanner has two named alternatives: "cjk" (a run of CJK characters) and
# "numeric" (a digit run that satisfies the surrounding look-around conditions).
CJK_CHARACTERS = r'\u1100-\u11ff\u2e80-\u2fff\u3040-\u31ff\u3400-\u9fff\ua960-\ua97f\uac00-\ud7ff\uf900-\ufaff'
CJK_OR_NUMERIC_REGEX = re.compile(rf"(?P<cjk>[{CJK_CHARACTERS}]+)|(?P<numeric>((?<=^\D)|(?<=[^\W0-9_]|\s))(?<!\b[a-zA-Z])(\d+([\W_]{{0,5}}\d+){{0,5}})(?=($|[^\W0-9_]|\s)))")

# Separator for non-CJK text: any run of characters that are not word
# characters and not one of & _ + * \ / ' # -
ENGLISH_SPLITTER_REGEX = re.compile(r"[^\w&_+*\\/'#\-]+")

def split_english_number_cjk(text, separate_return=True, split_same_language=False):
    """Split mixed text into non-CJK tokens (other text, numbers) and CJK runs.

    The text is scanned left to right with CJK_OR_NUMERIC_REGEX; the stripped
    text between matches, and after the last match, counts as non-CJK text.

    :param text: string to split.
    :param separate_return: when True (the default here) return
        ``(non_cjk_tokens, cjk_tokens)``; when False return a single list with
        every token in order of appearance.
    :param split_same_language: when True, CJK runs are broken into single
        characters and non-CJK fragments are split on ENGLISH_SPLITTER_REGEX
        with empty pieces dropped; when False fragments are kept whole.
    :return: a tuple of two lists, or one list, as selected by ``separate_return``.
    """
    english_bucket, cjk_bucket = [], []
    # Without separate_return the CJK tokens share the list of the other tokens.
    cjk_target = cjk_bucket if separate_return else english_bucket

    def add_english(chunk):
        # Store the chunk whole, or its non-empty pieces when splitting is on.
        if not split_same_language:
            english_bucket.append(chunk)
            return
        for part in ENGLISH_SPLITTER_REGEX.split(chunk):
            part = part.strip()
            if part:
                english_bucket.append(part)

    position = 0
    length = len(text)
    while position < length:
        found = CJK_OR_NUMERIC_REGEX.search(text, pos=position)
        if found is None:
            break

        # Text lying between the previous match and this one.
        prefix = text[position:found.start()].strip()
        if prefix:
            add_english(prefix)

        captured = found.groupdict()
        cjk_run = captured.get("cjk")
        if cjk_run:
            if split_same_language:
                cjk_target.extend(list(cjk_run))
            else:
                cjk_target.append(cjk_run)
        else:
            # Not a CJK run, so the "numeric" alternative matched.
            add_english(captured.get("numeric"))

        position = found.end()

    # Remainder after the last match.
    if length > position:
        remainder = text[position:].strip()
        if remainder:
            add_english(remainder)

    return (english_bucket, cjk_bucket) if separate_return else english_bucket

# Function to parse JSONL and separate English/Chinese strings
def parse_and_separate_jsonl(input_path):
    """Read a JSONL file of companies and split their search strings by script.

    Each non-blank line is parsed as one JSON object. Its 'requiredSearchStrings'
    and 'aliases' lists are merged and de-duplicated, every string is passed
    through split_english_number_cjk, and one row per resulting token is
    recorded together with the object's 'companyName'.

    :param input_path: path of the UTF-8 encoded JSONL file.
    :return: ``(df_english, df_chinese)`` - two DataFrames with the columns
        'Company Name' and 'Search String'; the first holds the non-CJK
        tokens, the second the CJK tokens. Both always have these two
        columns, even when no rows were produced.
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    columns = ['Company Name', 'Search String']
    english_list = []
    chinese_list = []

    with open(input_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads fail.
            if not line.strip():
                continue

            data = json.loads(line)
            company_name = data.get('companyName', '')
            # "or []" also covers keys that are present but set to null.
            required_search_strings = data.get('requiredSearchStrings') or []
            aliases = data.get('aliases') or []

            # dict.fromkeys removes duplicates while keeping first-seen order,
            # so the produced rows come out in the same order on every run
            # (iterating a set of strings does not guarantee that).
            unique_strings = dict.fromkeys(list(required_search_strings) + list(aliases))

            # Named search_string so the imported `string` module is not shadowed.
            for search_string in unique_strings:
                en_strings, cjk_strings = split_english_number_cjk(search_string, separate_return=True)
                for en_str in en_strings:
                    english_list.append({'Company Name': company_name, 'Search String': en_str})
                for cjk_str in cjk_strings:
                    chinese_list.append({'Company Name': company_name, 'Search String': cjk_str})

    # Passing the column names keeps the frames (and the CSV headers written
    # from them) well-formed even when a list is empty.
    df_english = pd.DataFrame(english_list, columns=columns)
    df_chinese = pd.DataFrame(chinese_list, columns=columns)

    return df_english, df_chinese

# Parse JSONL and separate English/Chinese strings
input_file_path = 'bd_companies_international.jsonl'
df_english, df_chinese = parse_and_separate_jsonl(input_file_path)

# Save Chinese strings to CSV (without the index column; 'utf-8-sig' writes a
# byte-order mark at the start of the file). Other cells of this notebook read
# this file back with pd.read_csv.
df_chinese.to_csv('chinese_search_strings.csv', index=False, encoding='utf-8-sig')

# Save English strings to CSV (same options as above)
df_english.to_csv('english_search_strings.csv', index=False, encoding='utf-8-sig')

请输入匹配的名称:  apple


preprocessing...

preprocessing complete 
 searching for matches...



100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 407.81it/s]


possible matches found   
 fuzzy matching...



100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 52.25it/s]

done
行缺少预期的列: original_name        apple
match_name_0         apple
score_0              100.0
match_index_0        49423
match_name_1         apple
score_1              100.0
match_index_1        49421
match_name_2        lapple
score_2          86.363636
match_index_2        71824
match_name_3        lapple
score_3          86.363636
match_index_3        71825
match_name_4        apples
score_4          86.363636
match_index_4        49419
match_name_5        apples
score_5          86.363636
match_index_5        49420
match_name_6      appleone
score_6          81.662088
match_index_6        49437
Name: 0, dtype: object





In [51]:
#Working fuzzychinese code
!pip install fuzzychinese
import pandas as pd
from fuzzychinese import FuzzyChineseMatch

# Load the CSV of Chinese search strings (written by the JSONL-splitting cell)
df = pd.read_csv('chinese_search_strings.csv')  

# Extract the 'Search String' and 'Company Name' columns
search_strings = df['Search String']
# NOTE(review): this rebinds the global `company_names`, which the first cell
# defined as a list of reference names and which the similarity cells iterate
# over - re-running those cells after this one would use this column instead.
company_names = df['Company Name']

# Initialize FuzzyChineseMatch with stroke-based 3-grams and fit it on the
# search strings
fcm = FuzzyChineseMatch(ngram_range=(3, 3), analyzer='stroke')
fcm.fit(search_strings)

# User input (read interactively, so the cell blocks until something is typed)
user_input = pd.Series([input("Please enter something: ")])  # Replace with the actual user input

# Find the top matches for the user input
# NOTE(review): top_matches itself is never read below; only the index
# reported by fcm.get_index() after this call is used.
top_matches = fcm.transform(user_input, n=1)  # Get the top 1 match

# Retrieve the matching company name: the matched index is used as a row
# position into the 'Company Name' column
matched_index = fcm.get_index()[0][0]  # Index of the top match
matched_company_name = company_names.iloc[matched_index]

# Output the result
print(f"User Input: {user_input[0]}")
print(f"Matched Company Name: {matched_company_name}")



Please enter something:  鞍山钢铁集团公司


User Input: 鞍山钢铁集团公司
Matched Company Name: JFE Holdings, Inc.


In [65]:
import pandas as pd
from fuzzychinese import FuzzyChineseMatch

def find_company_name(user_input, csv_file='chinese_search_strings.csv'):
    """Return the company whose search string best matches ``user_input``.

    The CSV is read and a stroke-based 3-gram FuzzyChineseMatch model is
    fitted on its 'Search String' column on every call; the single best match
    for ``user_input`` is then looked up and the 'Company Name' of that row is
    returned.

    :param user_input: text to look up.
    :param csv_file: CSV file with the columns 'Search String' and
        'Company Name'.
    :return: the 'Company Name' value of the best-matching row.
    """
    lookup = pd.read_csv(csv_file)
    strings_col, names_col = lookup['Search String'], lookup['Company Name']

    matcher = FuzzyChineseMatch(ngram_range=(3, 3), analyzer='stroke')
    matcher.fit(strings_col)

    # Only the top match is requested. The returned match text is not needed;
    # the index of the match is read from the matcher right after this call.
    matcher.transform(pd.Series([user_input]), n=1)

    # First (and only) query, first (and only) match; used as a row position.
    best_row = matcher.get_index()[0][0]
    return names_col.iloc[best_row]

# User input (read interactively, so the cell blocks until something is typed)
user_input = input("Please enter something: ")  # Replace with the actual user input

# Get the matched company name (uses the default CSV file of find_company_name)
matched_company_name = find_company_name(user_input)

# Output the result
print(f"User Input: {user_input}")
print(f"Matched Company Name: {matched_company_name}")

Please enter something:  索尼


User Input: 索尼
Matched Company Name: Sony Corporation
