In [None]:
import pandas as pd
from openai import OpenAI
import json
import os

# Client for the OpenAI-compatible server at localhost:8000.
# "EMPTY" is presumably a placeholder key for that local server — confirm.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:8000/v1",
)

Data Preparation

In [None]:
# Load both workbooks with pandas' missing-value detection switched off
# (na_filter=False). 'PartNumber' is read as str in the PLM-DB workbook.
df_plmdb = pd.read_excel(
    'PLM-DB.xlsx',
    dtype={'PartNumber': str},
    na_filter=False,
)
df_qc = pd.read_excel(
    'QC.xlsx',
    na_filter=False,
)

In [None]:
# Pre-filters for QC
#...
# Pre-filters for PLM-DB

# Filter and rename columns
#...

# Define exclusion lists
#...

# Filter entries
#...

In [None]:
# Distinct manufacturer spellings found across both filtered tables (QC first).
manufacturer_columns = [
    df_qc_filtered['Manufacturer'],
    df_plmdb_filtered['Manufacturer'],
]
unique_manufacturers = pd.concat(manufacturer_columns).unique().tolist()
len(unique_manufacturers)

In [None]:
# Build the prompt asking the model to group the manufacturer spellings and
# return a JSON list of {"variants": [...], "standardized_name": "..."} objects.
prompt = f"""Given a list of unique manufacturer names from a database, clean and standardize them to ensure consistency. The names currently can appear in multiple inconsistent forms, such as abbreviations, different spellings, casing variations, suffix differences (e.g., 'Inc.', 'Corp.', 'Ltd.'), and other inconsistencies, such as a company changing its name over time.  

### Task:  
- Identify and group equivalent manufacturer names that represent the same entity.  
- Standardize each group into a single, consistent name using the most common or official version (if available).  
- Ignore minor differences in punctuation, casing, and common suffixes while ensuring important distinctions are maintained.  
- Ensure every manufacturer name is either grouped into an existing standardized name or explicitly recognized as unique.
- Ensure the output strictly follows the JSON format as shown below:  

### Example Output Format:  
[
    {{
        "variants": ["ABC Corporation", "ABC Corp", "ABC"],
        "standardized_name": "ABC Corporation"
    }},
    {{
        "variants": ["XYZ Inc.", "XYZ Incorporated", "X.Y.Z. Inc"],
        "standardized_name": "XYZ Inc."
    }}
]

Here is the list of unique manufacturer names to clean:  

\"\"\"  
{unique_manufacturers}  
\"\"\"  

"""  


# The following cell loads 'manufacturer_names.json' when that file exists and
# never reads `response` in that case, so the model call is skipped here to
# keep a full re-run of the notebook cheap. Delete the file to force a new call.
if os.path.exists('manufacturer_names.json'):
    print("manufacturer_names.json already exists; skipping the model call.")
else:
    completion = client.chat.completions.create(
        model="gpt-oss-120b",
        messages=[
            {"role": "system", "content": "You are an expert system specialized in data cleaning and homogeneization."},
            {"role": "user", "content": prompt},
        ],
    )

    # Raw text of the first choice; the next cell strips markdown fences and parses it.
    response = completion.choices[0].message.content

    print(response)

In [None]:
# Load the manufacturer grouping from the on-disk cache when present; otherwise
# parse the model's reply, check it against the input list, patch in any names
# the model dropped, and write the result to the cache file.
file_path = 'manufacturer_names.json'
if os.path.exists(file_path):
    # Context manager so the handle is closed as soon as the JSON is read.
    with open(file_path, 'r', encoding='utf-8') as cache_file:
        data = json.load(cache_file)
else:
    # The reply may be wrapped in a markdown code fence; strip it before parsing.
    data = json.loads(response.replace("```json", "").replace("```", ""))

    # Flatten every variant mentioned in the JSON output into one list.
    json_variants = [variant for entry in data for variant in entry['variants']]

    # True only if the model returned exactly the input names (as a multiset).
    print(sorted(json_variants) == sorted(unique_manufacturers))

    # Sets give constant-time membership checks; the list comprehensions keep
    # the original ordering of the reported names.
    returned_names = set(json_variants)
    expected_names = set(unique_manufacturers)
    not_included = [manufacturer for manufacturer in unique_manufacturers if manufacturer not in returned_names]
    hallucinated = [variant for variant in json_variants if variant not in expected_names]

    print(not_included)
    # NOTE: hallucinated variants are only reported here; they stay in `data`.
    print(hallucinated)

    # Add missing manufacturers to JSON as individual entries
    for manufacturer in not_included:
        data.append({
            "variants": [manufacturer],  # Only itself as a variant
            "standardized_name": manufacturer  # Standardized to itself
        })
    print(data)

    # Write through a context manager so the file is flushed and closed, and
    # reuse file_path so the read and write locations cannot drift apart.
    with open(file_path, 'w', encoding='utf-8') as cache_file:
        json.dump(data, cache_file, indent=4)

In [None]:
# Show full column content so long variant lists are not truncated.
pd.set_option("display.max_colwidth", None)

# Keep only the groups that contain more than one variant.
data_with_variants = [group for group in data if len(group["variants"]) > 1]
display(pd.DataFrame(data_with_variants))

# To inspect every group instead, lift the row limit and display all of `data`:
#   pd.set_option('display.max_rows', None)
#   display(pd.DataFrame(data))

In [None]:
# Position of the QC row used by this cell and the cells below.
index = 10

# Display the 'Notes' value at that position.
qc_notes = df_qc_filtered['Notes'].tolist()
qc_notes[index]

In [None]:
# Instruction text; the sentence to analyse is appended after it and closed
# with a triple quote.
prompt = """Extract the product number from the following sentence. The product number (pn) is an alphanumeric sequence. If a product number is found, return only the number. If no product number is present, return 'None'.
Examples of product numbers: '...'.

Sentence: \"""
"""

# The sentence is the 'Board Title' of the QC row selected by `index`.
board_title = str(df_qc_filtered['Board Title'].tolist()[index])
prompt = prompt + board_title + '"""'

completion = client.chat.completions.create(
    model="gpt-oss-120b",
    messages=[
        {
            "role": "system",
            "content": "You are a highly accurate and detail-oriented expert system specialized in data extraction from unstructured text. Always prioritize precision, consistency, adherence to the rules provided.",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ],
)

# Text of the first choice returned by the model.
response = completion.choices[0].message.content

print(response)

PLM-DB

In [None]:
# PLM-DB rows whose part number equals the model's answer exactly.
part_number_matches = df_plmdb_filtered['Part Number (PN)'] == response
display(df_plmdb_filtered[part_number_matches])

Qualification Catalog

In [None]:
# QC rows that share the board title of the row selected by `index`.
selected_board_title = df_qc_filtered['Board Title'].tolist()[index]
same_title_rows = df_qc_filtered['Board Title'] == selected_board_title
display(df_qc_filtered[same_title_rows])

In [None]:
# Distinct package codes found across both filtered tables (QC first).
package_code_columns = [
    df_qc_filtered['Package Code'],
    df_plmdb_filtered['Package Code'],
]
package_name_list = pd.concat(package_code_columns).unique().tolist()

# Report how many distinct codes there are, then the codes themselves.
print(len(package_name_list))
print(package_name_list)

In [None]:
# Distinct subpackage codes found across both filtered tables (QC first).
subpackage_code_columns = [
    df_qc_filtered['Subpackage Code'],
    df_plmdb_filtered['Subpackage Code'],
]
subpackage_name_list = pd.concat(subpackage_code_columns).unique().tolist()

# Report how many distinct codes there are, then the codes themselves.
print(len(subpackage_name_list))
print(subpackage_name_list)