### Machine Learning Scrubbing Corrections

In [1]:
import torch
print(torch.__version__)

2.3.1+cpu


In [2]:
from torch.cuda.amp import autocast

In [3]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define the data
original_strings = [
    "GDZ5V1LP3", "DMP2035UVT", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A/SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951-33DRG", "LM76002RNP", "LM2734Y", "VLMH3100",
    "VLMH3100", "SMBJ3V3", "BZX584C10-V-G"
]
corrected_strings = [
    "GDZ5V1LP3-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMC2990UDJ-7", "D1213A-01LP4-7B",
    "BAT54A-7-F", "74LVC1G04SE-7", "TPS61080DRCT", "TPS3808G30DBVT", "TPS22994RUKT", "TPS22994RUKT",
    "TPS2051BDBVT", "TLV75901PDRVT", "TL431IDBVT", "TL431IDBVT", "SN74LVC1T45DBVT", "SN74LVC1T45DBVT",
    "OPA365AIDBVRG4", "LP2951-33DRG4", "LP2951-33DRG4", "LM76002RNPT", "LM2734YQMKE/NOPB", "VLMH3100-GS08",
    "VLMH3100-GS08", "SMBJ3V3/52", "BZX584C10-V-G-08"
]

# Load pre-trained model and tokenizer
model_name = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(original_strings, corrected_strings)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(input_ids=input_encodings.input_ids, labels=target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("GDZ5V1LP3"))  # Output should be "GDZ5V1LP3-7"
print(correct_text("DMP2035UVT"))  # Output should be "DMP2035UVT-7"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


correct correct: GDZ5V1LP3
DMP2035UVT


In [11]:
correct_text("GDZ5V")

'GDZ5V'

In [3]:
print(f"MPN GDZ5V1LP3 \t SE_PART {correct_text("GDZ5V1LP3")}")

SyntaxError: f-string: unmatched '(' (338295964.py, line 1)

In [5]:
print(correct_text("GDZ5V1LP3"))  # Output should be "GDZ5V1LP3-7"
print(correct_text("DMP2035UVT"))  # Output should be "DMP2035UVT-7"

GDZ5V1LP3
DMP2035UVT


In [12]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define the data
original_strings = [
    "GDZ5V1LP3", "DMP2035UVT", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A/SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951-33DRG", "LM76002RNP", "LM2734Y", "VLMH3100",
    "VLMH3100", "SMBJ3V3", "BZX584C10-V-G"
]
corrected_strings = [
    "GDZ5V1LP3-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMC2990UDJ-7", "D1213A-01LP4-7B",
    "BAT54A-7-F", "74LVC1G04SE-7", "TPS61080DRCT", "TPS3808G30DBVT", "TPS22994RUKT", "TPS22994RUKT",
    "TPS2051BDBVT", "TLV75901PDRVT", "TL431IDBVT", "TL431IDBVT", "SN74LVC1T45DBVT", "SN74LVC1T45DBVT",
    "OPA365AIDBVRG4", "LP2951-33DRG4", "LP2951-33DRG4", "LM76002RNPT", "LM2734YQMKE/NOPB", "VLMH3100-GS08",
    "VLMH3100-GS08", "SMBJ3V3/52", "BZX584C10-V-G-08"
]

# Load pre-trained model and tokenizer
model_name = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(original_strings, corrected_strings)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 2  # Small batch size due to limited data
epochs = 50  # Increase number of epochs for better learning

# Train the model
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=input_encodings.input_ids, labels=target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("GDZ5V1LP3"))  # Output should be "GDZ5V1LP3-7"
print(correct_text("DMP2035UVT"))  # Output should be "DMP2035UVT-7"


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 1/50, Loss: 3.928147792816162
Epoch 2/50, Loss: 4.002223968505859
Epoch 3/50, Loss: 3.335390329360962
Epoch 4/50, Loss: 3.438812494277954
Epoch 5/50, Loss: 3.148794412612915
Epoch 6/50, Loss: 2.886232376098633
Epoch 7/50, Loss: 2.717434883117676
Epoch 8/50, Loss: 3.2918918132781982
Epoch 9/50, Loss: 3.307771921157837
Epoch 10/50, Loss: 2.9674534797668457
Epoch 11/50, Loss: 3.000439405441284
Epoch 12/50, Loss: 2.825639009475708
Epoch 13/50, Loss: 2.8077619075775146
Epoch 14/50, Loss: 2.6439552307128906
Epoch 15/50, Loss: 2.520214796066284
Epoch 16/50, Loss: 3.060020923614502
Epoch 17/50, Loss: 2.6868293285369873
Epoch 18/50, Loss: 2.2225213050842285
Epoch 19/50, Loss: 2.5690698623657227
Epoch 20/50, Loss: 2.7356319427490234
Epoch 21/50, Loss: 2.6360042095184326
Epoch 22/50, Loss: 2.3264708518981934
Epoch 23/50, Loss: 2.5517923831939697
Epoch 24/50, Loss: 2.3474032878875732
Epoch 25/50, Loss: 2.194178819656372
Epoch 26/50, Loss: 2.1760733127593994
Epoch 27/50, Loss: 1.8778218030929

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GDZ5V1LP3
DMP2035UVT


In [13]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import random

# Define the data
original_strings = [
    "GDZ5V1LP3", "DMP2035UVT", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A/SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951-33DRG", "LM76002RNP", "LM2734Y", "VLMH3100",
    "VLMH3100", "SMBJ3V3", "BZX584C10-V-G"
]
corrected_strings = [
    "GDZ5V1LP3-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMC2990UDJ-7", "D1213A-01LP4-7B",
    "BAT54A-7-F", "74LVC1G04SE-7", "TPS61080DRCT", "TPS3808G30DBVT", "TPS22994RUKT", "TPS22994RUKT",
    "TPS2051BDBVT", "TLV75901PDRVT", "TL431IDBVT", "TL431IDBVT", "SN74LVC1T45DBVT", "SN74LVC1T45DBVT",
    "OPA365AIDBVRG4", "LP2951-33DRG4", "LP2951-33DRG4", "LM76002RNPT", "LM2734YQMKE/NOPB", "VLMH3100-GS08",
    "VLMH3100-GS08", "SMBJ3V3/52", "BZX584C10-V-G-08"
]

# Data augmentation function
def augment_data(original, corrected, num_augmentations=3):
    augmented_original = []
    augmented_corrected = []
    for orig, corr in zip(original, corrected):
        augmented_original.append(orig)
        augmented_corrected.append(corr)
        for _ in range(num_augmentations):
            # Randomly insert a character from the correction into the original
            index = random.randint(0, len(orig) - 1)
            char = random.choice(corr)
            new_orig = orig[:index] + char + orig[index:]
            augmented_original.append(new_orig)
            augmented_corrected.append(corr)
    return augmented_original, augmented_corrected

# Augment the data
augmented_original, augmented_corrected = augment_data(original_strings, corrected_strings)

# Load pre-trained model and tokenizer
model_name = 't5-base'  # Using a larger model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(augmented_original, augmented_corrected)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 4  # Adjusted batch size for the larger model
epochs = 50  # Increase number of epochs for better learning

# Train the model
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=input_encodings.input_ids, labels=target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("GDZ5V1LP3"))  # Output should be "GDZ5V1LP3-7"
print(correct_text("DMP2035UVT"))  # Output should be "DMP2035UVT-7"


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/50, Loss: 6.638936519622803
Epoch 2/50, Loss: 5.816136360168457
Epoch 3/50, Loss: 4.893213272094727
Epoch 4/50, Loss: 4.126890659332275
Epoch 5/50, Loss: 3.540929079055786
Epoch 6/50, Loss: 3.1761319637298584
Epoch 7/50, Loss: 2.7373435497283936
Epoch 8/50, Loss: 2.7316083908081055
Epoch 9/50, Loss: 2.502169132232666
Epoch 10/50, Loss: 2.4096741676330566
Epoch 11/50, Loss: 2.359092950820923
Epoch 12/50, Loss: 2.2819361686706543
Epoch 13/50, Loss: 2.171050548553467
Epoch 14/50, Loss: 2.1392455101013184
Epoch 15/50, Loss: 2.0251879692077637
Epoch 16/50, Loss: 1.9782861471176147
Epoch 17/50, Loss: 1.8503508567810059
Epoch 18/50, Loss: 1.8784576654434204
Epoch 19/50, Loss: 1.7560721635818481
Epoch 20/50, Loss: 1.6789259910583496
Epoch 21/50, Loss: 1.6070284843444824
Epoch 22/50, Loss: 1.5710372924804688
Epoch 23/50, Loss: 1.615328073501587
Epoch 24/50, Loss: 1.4380848407745361
Epoch 25/50, Loss: 1.357596755027771
Epoch 26/50, Loss: 1.2929190397262573
Epoch 27/50, Loss: 1.2471916675

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GDZ5V1LP3-7
DMP2035UVT-7


In [22]:
print(correct_text("LM2734Y"))  # Output should be "GDZ5V1LP3-7"

LM2734YK-T


In [23]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import random

# Define the data
original_strings = [
    "GDZ5V1LP3", "DMP2035UVT", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A/SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951-33DRG", "LM76002RNP", "LM2734Y", "VLMH3100",
    "VLMH3100", "SMBJ3V3", "BZX584C10-V-G"
]
corrected_strings = [
    "GDZ5V1LP3-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMC2990UDJ-7", "D1213A-01LP4-7B",
    "BAT54A-7-F", "74LVC1G04SE-7", "TPS61080DRCT", "TPS3808G30DBVT", "TPS22994RUKT", "TPS22994RUKT",
    "TPS2051BDBVT", "TLV75901PDRVT", "TL431IDBVT", "TL431IDBVT", "SN74LVC1T45DBVT", "SN74LVC1T45DBVT",
    "OPA365AIDBVRG4", "LP2951-33DRG4", "LP2951-33DRG4", "LM76002RNPT", "LM2734YQMKE/NOPB", "VLMH3100-GS08",
    "VLMH3100-GS08", "SMBJ3V3/52", "BZX584C10-V-G-08"
]

# Data augmentation function
def augment_data(original, corrected, num_augmentations=3):
    augmented_original = []
    augmented_corrected = []
    for orig, corr in zip(original, corrected):
        augmented_original.append(orig)
        augmented_corrected.append(corr)
        for _ in range(num_augmentations):
            # Randomly insert a character from the correction into the original
            index = random.randint(0, len(orig) - 1)
            char = random.choice(corr)
            new_orig = orig[:index] + char + orig[index:]
            augmented_original.append(new_orig)
            augmented_corrected.append(corr)
    return augmented_original, augmented_corrected

# Augment the data
augmented_original, augmented_corrected = augment_data(original_strings, corrected_strings)

# Load pre-trained model and tokenizer
model_name = 't5-base'  # Using a larger model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(augmented_original, augmented_corrected)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 4  # Adjusted batch size for the larger model
epochs = 100  # Increase number of epochs for better learning

# Train the model
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=input_encodings.input_ids, labels=target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("GDZ5V1LP3"))  # Output should be "GDZ5V1LP3-7"
print(correct_text("DMP2035UVT"))  # Output should be "DMP2035UVT-7"


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/100, Loss: 6.85982084274292
Epoch 2/100, Loss: 5.8824968338012695
Epoch 3/100, Loss: 5.031924724578857
Epoch 4/100, Loss: 4.291821002960205
Epoch 5/100, Loss: 3.627164363861084
Epoch 6/100, Loss: 3.2053158283233643
Epoch 7/100, Loss: 3.0328004360198975
Epoch 8/100, Loss: 2.8952414989471436
Epoch 9/100, Loss: 2.839867115020752
Epoch 10/100, Loss: 2.6082260608673096
Epoch 11/100, Loss: 2.5520877838134766
Epoch 12/100, Loss: 2.4886207580566406
Epoch 13/100, Loss: 2.3434622287750244
Epoch 14/100, Loss: 2.2522172927856445
Epoch 15/100, Loss: 2.269653081893921
Epoch 16/100, Loss: 2.3157200813293457
Epoch 17/100, Loss: 2.081393003463745
Epoch 18/100, Loss: 2.0173497200012207
Epoch 19/100, Loss: 2.006260395050049
Epoch 20/100, Loss: 1.8795157670974731
Epoch 21/100, Loss: 1.907446265220642
Epoch 22/100, Loss: 1.81791353225708
Epoch 23/100, Loss: 1.836065411567688
Epoch 24/100, Loss: 1.6820017099380493
Epoch 25/100, Loss: 1.7044832706451416
Epoch 26/100, Loss: 1.6037298440933228
Epoch 27

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GDZ5V1LP3-7
DMP2035UVT-7


In [25]:
print(correct_text("BAT54A/SOT"))

BAT54A-7-F


In [27]:
print(correct_text("BAT54A/SOT"))

BAT54A-7-F


In [26]:
print(correct_text("LM2734Y"))

LM2734YQMKE/NOPB


In [28]:
print(correct_text("LM76002RNP"))

LM76002RNPT


In [32]:
partlist=[
    "GDZ5V1LP", "DMP2035UV", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A//SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951/33DRG", "LM76002RNP", "LM2734Y", "VLMH3-100",
    "VLMH3100", "SMBJ3V3", "BZX584C10_V-G"
]

In [33]:
for part in partlist:
    print(f"{part}\t{correct_text(part)}")

GDZ5V1LP	GDZ5V1LP4-7
DMP2035UV	DMP2035UVH-7
DMP2035UVT	DMP2035UVT-7
DMP2035UVT	DMP2035UVT-7
DMC2990UDJ	DMC2990UDJ-7
D1213A-01LP4	D1213A-01LP4-7B
BAT54A//SOT	BAT54A-7-F
74LVC1G04SE	74LVC1G04SE-7
TPS61080DRC	TPS61080DRCT
TPS3808G30DBV	TPS3808G30DBVT
TPS22994RUK	TPS22994RUKT
TPS22994RUK	TPS22994RUKT
TPS2051BDBV	TPS2051BDBVT
TLV759PDRVT	TLV75901PDRVT
TL431IDBV	TL431IDBVT
TL431IDBV	TL431IDBVT
SN74LVC1T45DBV	SN74LVC1T45DBVT
SN74LVC1T45DBV	SN74LVC1T45DBVT
OPA365AIDBVRG	OPA365AIDBVRG4
LP2951-33DRG	LP2951-33DRG4
LP2951/33DRG	LP2951/33DRG4
LM76002RNP	LM76002RNPT
LM2734Y	LM2734YQMKE/NOPB
VLMH3-100	VLMH3-100
VLMH3100	VLMH3100-GS08
SMBJ3V3	SMBJ3V3/52
BZX584C10_V-G	BZX584C10_V-G-08


In [42]:
print(f"{'RK73H1J/TTD475'}\t{correct_text('RK73H1J/TTD475')}")

RK73H1J/TTD475	RK73H1J/7DT475


### Try Vishay and Murata suppliers correction with 17K rows trained data

In [21]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import random
import pandas as pd
df=pd.read_csv("IMPUT_MPN_SE_PART.txt",sep='\t',encoding='iso-8859-1')
input_list=df['MPN'].to_list()
ouptu_list=df['SE_PART'].to_list()

# Define the data
original_strings = input_list
corrected_strings = ouptu_list


# Load pre-trained model and tokenizer
model_name = 't5-base'  # Using a larger model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
print("start preprocess data")
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(original_strings, corrected_strings)

print("end preprocess data")
# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 4  # Adjusted batch size for the larger model
epochs = 100  # Increase number of epochs for better learning

print("start model train)")
# Train the model
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=input_encodings.input_ids, labels=target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("RCA1206 106 J 200 RT6"))  # Output should be "RCA120610M0JNTC"
print(correct_text("LQW15AN51NJ00"))  # Output should be "LQW15AN51NJ00D"


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


start preprocess data
end preprocess data
start model train)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 6240829440 bytes.

In [1]:
import random
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define the data
original_strings = [
    "GDZ5V1LP3", "DMP2035UVT", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A/SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951-33DRG", "LM76002RNP", "LM2734Y", "VLMH3100",
    "VLMH3100", "SMBJ3V3", "BZX584C10-V-G"
]
corrected_strings = [
    "GDZ5V1LP3-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMP2035UVT-7", "DMC2990UDJ-7", "D1213A-01LP4-7B",
    "BAT54A-7-F", "74LVC1G04SE-7", "TPS61080DRCT", "TPS3808G30DBVT", "TPS22994RUKT", "TPS22994RUKT",
    "TPS2051BDBVT", "TLV75901PDRVT", "TL431IDBVT", "TL431IDBVT", "SN74LVC1T45DBVT", "SN74LVC1T45DBVT",
    "OPA365AIDBVRG4", "LP2951-33DRG4", "LP2951-33DRG4", "LM76002RNPT", "LM2734YQMKE/NOPB", "VLMH3100-GS08",
    "VLMH3100-GS08", "SMBJ3V3/52", "BZX584C10-V-G-08"
]

# Data augmentation function
def augment_data(original, corrected, num_augmentations=3):
    augmented_original = []
    augmented_corrected = []
    for orig, corr in zip(original, corrected):
        augmented_original.append(orig)
        augmented_corrected.append(corr)
        for _ in range(num_augmentations):
            # Randomly insert a character from the correction into the original
            index = random.randint(0, len(orig) - 1)
            char = random.choice(corr)
            new_orig = orig[:index] + char + orig[index:]
            augmented_original.append(new_orig)
            augmented_corrected.append(corr)
    return augmented_original, augmented_corrected

# Augment the data
augmented_original, augmented_corrected = augment_data(original_strings, corrected_strings)

# Load pre-trained model and tokenizer
model_name = 't5-base'  # Using a larger model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(augmented_original, augmented_corrected)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 4  # Adjusted batch size for the larger model
epochs = 50  # Increase number of epochs for better learning

# Train the model
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=input_encodings.input_ids, labels=target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("GDZ5V1LP3"))  # Output should be "GDZ5V1LP3-7"
print(correct_text("DMP2035UVT"))  # Output should be "DMP2035UVT-7"


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/50, Loss: 6.703507900238037
Epoch 2/50, Loss: 5.922305583953857
Epoch 3/50, Loss: 4.811218738555908
Epoch 4/50, Loss: 4.206692695617676
Epoch 5/50, Loss: 3.800602674484253
Epoch 6/50, Loss: 3.2254796028137207
Epoch 7/50, Loss: 3.007265329360962
Epoch 8/50, Loss: 2.8541789054870605
Epoch 9/50, Loss: 2.7452476024627686
Epoch 10/50, Loss: 2.7033519744873047
Epoch 11/50, Loss: 2.4654407501220703
Epoch 12/50, Loss: 2.339860200881958
Epoch 13/50, Loss: 2.333681344985962
Epoch 14/50, Loss: 2.283118963241577
Epoch 15/50, Loss: 2.2288670539855957
Epoch 16/50, Loss: 2.0995965003967285
Epoch 17/50, Loss: 1.9308871030807495
Epoch 18/50, Loss: 2.079052448272705
Epoch 19/50, Loss: 1.8566230535507202
Epoch 20/50, Loss: 1.7367079257965088
Epoch 21/50, Loss: 1.703191876411438
Epoch 22/50, Loss: 1.5965909957885742
Epoch 23/50, Loss: 1.523922085762024
Epoch 24/50, Loss: 1.5287590026855469
Epoch 25/50, Loss: 1.4866442680358887
Epoch 26/50, Loss: 1.4051965475082397
Epoch 27/50, Loss: 1.332137823104

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GDZ5V1LP3-7
DMP2035UVT-7


### Calculate Accuracy and performance measures

In [2]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch.nn.functional as F

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Function to calculate confidence score
def calculate_confidence(logits, predicted_tokens):
    probabilities = [F.softmax(logit, dim=-1) for logit in logits]
    predicted_probs = [prob[0, token].item() for prob, token in zip(probabilities, predicted_tokens[0])]
    confidence = sum(predicted_probs) / len(predicted_probs)
    return confidence

# Example inference with confidence score
def correct_text_with_confidence(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, output_scores=True, return_dict_in_generate=True)
    
    # Decode the generated tokens to get the corrected text
    corrected_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    
    # Calculate the confidence score
    confidence = calculate_confidence(outputs.scores, outputs.sequences[:, 1:])
    
    return corrected_text, confidence

# Example usage
corrected_text, confidence = correct_text_with_confidence("GDZ5V1LP3")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Corrected Text: GDZ5V1LP3FKEA, Confidence: 0.82
Corrected Text: DMP2035UVT0, Confidence: 0.85


In [11]:

MPNlist=["CRCW2010470RFK","CRCW201056R0JN","VO615-A-3X019","CRCW0402470KJN","CRCW04028K87FK","WSL2512R0120F","WSL1206R1500FE","TNPW120613K0BEE","TNPW08059K88BE","TNPW08056K19BEE","TNPW080561K9BEE","TNPW08055K05BEE","TNPW080554K9BE","CRCW0402200KFKT","CRCW04021M43FK","CRCW04021K82FK","CRCW040215K0FKT","CRCW0402110KFK","CRCW04020000ZS","CRCW04020000Z0","CRCW020133R0FN","593D476X9016C","TNPW04028K87BE","TNPW0402715RBE","TNPW04023K74BE","RK73H1J1501F","SR731JTR220F","RK73HW3A2T1500F","MOS3C123J","RK73BW2HT333J","CFPS1/4C150J","CFPS1/4C105J","CFPS1/2C105J","CFPS1/2C102J","CFPS1/2C101J","X7R0603HT?471K","X7R0603HT?103K","X7R0603HT?102K","RK73Z2BL","RK73H2A1330F","RK73H1JT8201F","RK73H1JL9091F","TLR3AWDXX4L00F75","RK73Z1ETTP#","RM73B3A301J","rk73h2ht5110f","rk73h2ht4322f","rk73h2bt1374f","RK73H1HT~~10R0F","RK73H1HT~~10R0F","RK73H1HT~~1004F","RK73H1HT~~1002F","RK73H1ET~~91R0F","RK73H1ET~~8201F","RK73H1ET~~7R50F","RK73H1ET~~5104F","RK73H1ET~~3002F","RK73H1ET~~2703F","RK73H1ET~~2203F","RK73H1ET~~1800F","RK73H1ET~~~9100F","RK73H1ETx110kF","RK73H1ETx180kF","RK73H1ETx18RF","RK73H1ETx1k5F","RK73H1ETx33kF","RK73H1ETx3MF","RK73H1ETx430kF","RK73H1ETx49R9F","RK73H1ETx51kF","RK73H1ETx750kF"]
for MPN in MPNlist:
    
    corrected_text, confidence = correct_text_with_confidence(MPN)
    print(f"{MPN}\t{corrected_text}\t{confidence:.2f}")

CRCW2010470RFK	CRCW2010470RFKEA	0.95
CRCW201056R0JN	CRCW201056R0JNKEA	0.86
VO615-A-3X019	VO615-A-3X019NW	0.83
CRCW0402470KJN	CRCW0402470KJN	0.92
CRCW04028K87FK	CRCW04028K87FKEA	0.98
WSL2512R0120F	WSL2512R0120FKEA	0.98
WSL1206R1500FE	WSL1206R1500FEA	0.99
TNPW120613K0BEE	TNPW120613K0BEEA	0.99
TNPW08059K88BE	TNPW08059K88BEEA	0.95
TNPW08056K19BEE	TNPW08056K19BEEA	1.00
TNPW080561K9BEE	TNPW080561K9BEEA	1.00
TNPW08055K05BEE	TNPW08055K05BEEA	1.00
TNPW080554K9BE	TNPW080554K9BEEA	0.94
CRCW0402200KFKT	CRCW0402200KFKTAT	0.91
CRCW04021M43FK	CRCW04021M43FKEA	0.96
CRCW04021K82FK	CRCW04021K82FKEA	0.98
CRCW040215K0FKT	CRCW040215K0FKTED	0.89
CRCW0402110KFK	CRCW0402110KFKEA	0.97
CRCW04020000ZS	CRCW04020000ZS.	0.89
CRCW04020000Z0	CRCW04020000Z0FKEA	0.99
CRCW020133R0FN	CRCW020133R0FNED	1.00
593D476X9016C	593D476X9016C/H	0.89
TNPW04028K87BE	TNPW04028K87BEEA	0.94
TNPW0402715RBE	TNPW0402715RBEA	0.95
TNPW04023K74BE	TNPW04023K74BEA	0.92
RK73H1J1501F	RK73H1J1501FKEA	0.97
SR731JTR220F	SR731JTR220FKEA	0.97
RK73HW3

In [18]:
corrected_text, confidence = correct_text_with_confidence("DMP2035UVT")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")

Corrected Text: DMP2035UVT-7, Confidence: 0.95


In [20]:
partlist=[
    "GDZ5V1LP", "DMP2035UV", "DMP2035UVT", "DMP2035UVT", "DMC2990UDJ", "D1213A-01LP4",
    "BAT54A//SOT", "74LVC1G04SE", "TPS61080DRC", "TPS3808G30DBV", "TPS22994RUK", "TPS22994RUK",
    "TPS2051BDBV", "TLV759PDRVT", "TL431IDBV", "TL431IDBV", "SN74LVC1T45DBV", "SN74LVC1T45DBV",
    "OPA365AIDBVRG", "LP2951-33DRG", "LP2951/33DRG", "LM76002RNP", "LM2734Y", "VLMH3-100",
    "VLMH3100", "SMBJ3V3", "BZX584C10_V-G"
]
for part in partlist:
    corrected_text, confidence = correct_text_with_confidence(part)
    print(f"{part}\t{corrected_text}\t{confidence:.2f}")



GDZ5V1LP	GDZ5V1LP	0.90
DMP2035UV	DMP2035UV-7	0.88
DMP2035UVT	DMP2035UVT-7	0.95
DMP2035UVT	DMP2035UVT-7	0.95
DMC2990UDJ	DMC2990UDJ-00	0.75
D1213A-01LP4	D1213A-01LP4-7	0.89
BAT54A//SOT	BAT54A//SOT	0.87
74LVC1G04SE	74LVC1G04SE-T	0.83
TPS61080DRC	TPS61080DRC	0.89
TPS3808G30DBV	TPS3808G30DBVT	0.97
TPS22994RUK	TPS22994RUKT	0.92
TPS22994RUK	TPS22994RUKT	0.92
TPS2051BDBV	TPS2051BDVT	0.84
TLV759PDRVT	TLV759PDRVT	0.90
TL431IDBV	TL431IDBVT	0.90
TL431IDBV	TL431IDBVT	0.90
SN74LVC1T45DBV	SN74LVC1T45DBVT	0.99
SN74LVC1T45DBV	SN74LVC1T45DBVT	0.99
OPA365AIDBVRG	OPA365AIDBVRG	0.85
LP2951-33DRG	LP2951-33DRG-7	0.91
LP2951/33DRG	LP2951/33DRG-00	0.76
LM76002RNP	LM76002RNPT	0.93
LM2734Y	LM2734YYT	0.70
VLMH3-100	VLMH3-100	0.85
VLMH3100	VLMH3100-G1-C6	0.52
SMBJ3V3	SMBJ3V3-T3	0.69
BZX584C10_V-G	BZX584C10_V-G-00	0.87


In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import pandas as pd

# Load data
df = pd.read_csv("IMPUT_MPN_SE_PART.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()

# Load pre-trained model and tokenizer
model_name = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(input_list, output_list)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 100
epochs = 50
accumulation_steps = 16  # Adjust this to accumulate gradients over multiple steps

# Mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Train the model
model.train()
for epoch in range(epochs):
    for i in range(0, len(input_encodings.input_ids), batch_size):
        input_ids = input_encodings.input_ids[i:i + batch_size]
        labels = target_encodings.input_ids[i:i + batch_size]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i // batch_size + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        print(f"Epoch {epoch + 1}/{epochs}, Step {i // batch_size + 1}/{len(input_encodings.input_ids) // batch_size}, Loss: {loss.item() * accumulation_steps}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("RCA1206 106 J 200 RT6"))
print(correct_text("LQW15AN51NJ00"))


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/50, Step 1/99, Loss: 5.538491249084473
Epoch 1/50, Step 2/99, Loss: 5.579962730407715
Epoch 1/50, Step 3/99, Loss: 5.70332145690918
Epoch 1/50, Step 4/99, Loss: 5.503213882446289
Epoch 1/50, Step 5/99, Loss: 6.001406669616699
Epoch 1/50, Step 6/99, Loss: 5.678353309631348
Epoch 1/50, Step 7/99, Loss: 5.310850620269775
Epoch 1/50, Step 8/99, Loss: 6.643881797790527
Epoch 1/50, Step 9/99, Loss: 5.545333385467529
Epoch 1/50, Step 10/99, Loss: 5.376596450805664
Epoch 1/50, Step 11/99, Loss: 5.627132415771484
Epoch 1/50, Step 12/99, Loss: 5.947790622711182
Epoch 1/50, Step 13/99, Loss: 5.566442489624023
Epoch 1/50, Step 14/99, Loss: 7.416925430297852
Epoch 1/50, Step 15/99, Loss: 8.470320701599121
Epoch 1/50, Step 16/99, Loss: 8.49547290802002
Epoch 1/50, Step 17/99, Loss: 6.484898090362549
Epoch 1/50, Step 18/99, Loss: 6.4276580810546875
Epoch 1/50, Step 19/99, Loss: 6.3027544021606445
Epoch 1/50, Step 20/99, Loss: 6.603045463562012
Epoch 1/50, Step 21/99, Loss: 4.0932111740112305


KeyboardInterrupt: 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random

# Load data
df = pd.read_csv("IMPUT_MPN_SE_PART.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()


# Define the data
original_strings = input_list
corrected_strings = output_list


# Data augmentation function
def augment_data(original, corrected, num_augmentations=3):
    # augmented_original = []
    # augmented_corrected = []
    # for orig, corr in zip(original, corrected):
    #     augmented_original.append(orig)
    #     augmented_corrected.append(corr)
    #     for _ in range(num_augmentations):
    #         # Randomly insert a character from the correction into the original
    #         index = random.randint(0, len(orig) - 1)
    #         char = random.choice(corr)
    #         new_orig = orig[:index] + char + orig[index:]
    #         augmented_original.append(new_orig)
    #         augmented_corrected.append(corr)
    # return augmented_original, augmented_corrected
    return original, corrected

# Augment the data
augmented_original, augmented_corrected = augment_data(original_strings, corrected_strings)

# Load pre-trained model and tokenizer
model_name = 't5-base'  # Using a larger model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings


# Split data into training and validation sets
train_original, val_original, train_corrected, val_corrected = train_test_split(
    augmented_original, augmented_corrected, test_size=0.2, random_state=42)

# Prepare data for training
train_input_encodings, train_target_encodings = preprocess_data(train_original, train_corrected)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
batch_size = 8
epochs = 100

# Training loop
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=train_input_encodings.input_ids, labels=train_target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Prepare data for validation
val_input_encodings, _ = preprocess_data(val_original, val_corrected)

# Inference on validation set
model.eval()
predictions = []
with torch.no_grad():
    for i in range(len(val_original)):
        input_ids = tokenizer("correct: " + val_original[i], return_tensors="pt").input_ids.to(device)
        outputs = model.generate(input_ids)
        corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(corrected_text)

# Calculate metrics
accuracy = accuracy_score(val_corrected, predictions)
precision = precision_score(val_corrected, predictions, average='weighted')
recall = recall_score(val_corrected, predictions, average='weighted')
f1 = f1_score(val_corrected, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/100, Loss: 7.051552772521973
Epoch 11/100, Loss: 2.5047192573547363
Epoch 21/100, Loss: 2.356590986251831
Epoch 31/100, Loss: 2.410599946975708
Epoch 41/100, Loss: 2.347416639328003
Epoch 51/100, Loss: 2.3547701835632324
Epoch 61/100, Loss: 2.43377947807312
Epoch 71/100, Loss: 2.2297239303588867


KeyboardInterrupt: 

### train model with knowing accuracy metrics

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import random
import pandas as pd

# Load data
df = pd.read_csv("IMPUT_MPN_SE_PART.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()


# Define the data
original_strings = input_list
corrected_strings = output_list

# Data augmentation function
def augment_data(original, corrected, num_augmentations=3):
    # augmented_original = []
    # augmented_corrected = []
    # for orig, corr in zip(original, corrected):
    #     augmented_original.append(orig)
    #     augmented_corrected.append(corr)
    #     for _ in range(num_augmentations):
    #         # Randomly insert a character from the correction into the original
    #         index = random.randint(0, len(orig) - 1)
    #         char = random.choice(corr)
    #         new_orig = orig[:index] + char + orig[index:]
    #         augmented_original.append(new_orig)
    #         augmented_corrected.append(corr)
    # return augmented_original, augmented_corrected
    return original, corrected

# Augment the data
augmented_original, augmented_corrected = augment_data(original_strings, corrected_strings)

# Split data into training and validation sets
train_original, val_original, train_corrected, val_corrected = train_test_split(
    augmented_original, augmented_corrected, test_size=0.2, random_state=42)

# Load pre-trained model and tokenizer
model_name = 't5-base'  # Using a larger model
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

train_input_encodings, train_target_encodings = preprocess_data(train_original, train_corrected)
val_input_encodings, val_target_encodings = preprocess_data(val_original, val_corrected)

# Training parameters
learning_rate = 5e-5
batch_size = 8
epochs = 100
weight_decay = 1e-5

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Training loop
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(input_ids=train_input_encodings.input_ids, labels=train_target_encodings.input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Evaluate on validation set
model.eval()
val_predictions = []
with torch.no_grad():
    for i in range(len(val_original)):
        input_ids = tokenizer("correct: " + val_original[i], return_tensors="pt").input_ids
        outputs = model.generate(input_ids)
        corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        val_predictions.append(corrected_text)

# Calculate accuracy metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(val_corrected, val_predictions)
precision = precision_score(val_corrected, val_predictions, average='weighted')
recall = recall_score(val_corrected, val_predictions, average='weighted')
f1 = f1_score(val_corrected, val_predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import pandas as pd

# Load data
df = pd.read_csv("IMPUT_MPN_SE_PART.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()

# Load pre-trained model and tokenizer
model_name = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(input_list, output_list)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
batch_size = 100
epochs = 50
accumulation_steps = 16  # Adjust this to accumulate gradients over multiple steps

# Mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Train the model
model.train()
for epoch in range(epochs):
    for i in range(0, len(input_encodings.input_ids), batch_size):
        input_ids = input_encodings.input_ids[i:i + batch_size]
        labels = target_encodings.input_ids[i:i + batch_size]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i // batch_size + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        print(f"Epoch {epoch + 1}/{epochs}, Step {i // batch_size + 1}/{len(input_encodings.input_ids) // batch_size}, Loss: {loss.item() * accumulation_steps}")

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("RCA1206 106 J 200 RT6"))
print(correct_text("LQW15AN51NJ00"))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/50, Step 1/99, Loss: 5.4451189041137695
Epoch 1/50, Step 2/99, Loss: 5.6883063316345215
Epoch 1/50, Step 3/99, Loss: 5.944300651550293
Epoch 1/50, Step 4/99, Loss: 5.540075302124023


### Train Model on KOA Speer Parts Only

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import pandas as pd

# Load data
df = pd.read_csv("KOA_SPEER_Train_Data.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()

# Load pre-trained model and tokenizer
model_name = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(input_list, output_list)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
batch_size = 400
epochs = 15
accumulation_steps = 16  # Adjust this to accumulate gradients over multiple steps

# Mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Train the model
model.train()
for epoch in range(epochs):
    for i in range(0, len(input_encodings.input_ids), batch_size):
        input_ids = input_encodings.input_ids[i:i + batch_size]
        labels = target_encodings.input_ids[i:i + batch_size]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i // batch_size + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        print(f"Epoch {epoch + 1}/{epochs}, Step {i // batch_size + 1}/{len(input_encodings.input_ids) // batch_size}, Loss: {loss.item() * accumulation_steps}",flush=True)

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("# RN73R2ATTD2053B25"))
print(correct_text("(OBS) RN731ETTP2401F25"))

### Final KOA MODEL

In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch.nn.functional as F

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./corrector_model_KOA")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model_KOA")

# Function to calculate confidence score
def calculate_confidence(logits, predicted_tokens):
    probabilities = [F.softmax(logit, dim=-1) for logit in logits]
    predicted_probs = [prob[0, token].item() for prob, token in zip(probabilities, predicted_tokens[0])]
    confidence = sum(predicted_probs) / len(predicted_probs)
    return confidence

# Example inference with confidence score
def correct_Koa_text_with_confidence(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, output_scores=True, return_dict_in_generate=True)
    
    # Decode the generated tokens to get the corrected text
    corrected_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    
    # Calculate the confidence score
    confidence = calculate_confidence(outputs.scores, outputs.sequences[:, 1:])
    
    return corrected_text, confidence

# Example usage
corrected_text, confidence = correct_Koa_text_with_confidence("RK73C2BTE1693D *OBS*")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Corrected Text: RK73C2BTE1693D, Confidence: 0.94




### Run KOA Model on New Parts

In [4]:
MPNlist=pd.read_csv("koa_PS_Input.txt",sep='\t')['MPN'].to_list()
for MPN in MPNlist:
    
    corrected_text, confidence = correct_Koa_text_with_confidence(MPN)
    print(f"{MPN}\t{corrected_text}\t{confidence:.2f}")

RK73H1JTTD0000F	RK73H1JTTD0000F	1.00
CF1/2-392-J	CF1/2-392-J	0.98
RK73H1JTTDB3241F	RK73H1JTTD3241F	0.99
PCF1/2C222K	PCF1/2C222K	0.99
RK73H2ATTDB8252F	RK73H2ATTD8252F	0.99
CF1/8-223-J	CF1/8-223-J	0.98
CF1/4CBK5R6J	CF1/4CBK5R6J	0.99
CF1/8-273-J	CF1/8-273-J	0.99
CN2B4T223J	CN2B4T223J	0.98
RM73B2ETE-272J	RM73B2ETTD272J	0.91
RN73C2BTE20R0F	RN73C2BTE20R0F	0.97
NPO0402HTTD4R7J	NPO0402HTTD4R7J	0.97
RK73H1JTE1132F	RK73H1JTTD1132F	0.98
CF1/8-104-J	CF1/8-104-J	1.00
CF1/8-470-J	CF1/8-470-J	1.00
RK73H1JTTE1911F	RK73H1JTTD1911F	0.99
CF1/8-391-J	CF1/8-391-J	0.99
MF1/4CCL1211F	MF1/4CCL1211F	0.99
RN732A7.5KF50	RN732ATTD50	0.89
RN73C2AT3011F	RN73C2ATTD3011F	0.98
RN73C2AT4750F	RN73C2ATTD4750F	0.96
RK73H2ATE1743F	RK73H2ATTD1743F	0.99
RK73H2ATTD0R00F	RK73H2ATTD0R00F	1.00
RK73H1FTTD1000F	RK73H1FTTD1000F	0.99
RK73H1JTTE2432F	RK73H1JTTD2432F	0.99
RK73H1JTE2261F	RK73H1JTTD2261F	0.97
RK73H1JTTDB2670F	RK73H1JTTD2670F	0.99
RK73H1JTTDB8252F	RK73H1JTTD8252F	1.00
RM73B2ATE225J	RM73B2ATTD225J	0.94
CF1/4822J	CF1/4822J

### Train Model on Vishay Parts Only

In [39]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import pandas as pd

# Load data
df = pd.read_csv("Vishay_Trained_Data.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()

# Load pre-trained model and tokenizer
model_name = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected):
    inputs = ["correct: " + sentence for sentence in original]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(input_list, output_list)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
batch_size = 400
epochs = 15
accumulation_steps = 16  # Adjust this to accumulate gradients over multiple steps

# Mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Train the model
model.train()
for epoch in range(epochs):
    for i in range(0, len(input_encodings.input_ids), batch_size):
        input_ids = input_encodings.input_ids[i:i + batch_size]
        labels = target_encodings.input_ids[i:i + batch_size]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i // batch_size + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        print(f"Epoch {epoch + 1}/{epochs}, Step {i // batch_size + 1}/{len(input_encodings.input_ids) // batch_size}, Loss: {loss.item() * accumulation_steps}",flush=True)

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("CRCW0603180RFK(EA, EB or EC)"))
print(correct_text("ZY 27 DO-41"))
print(correct_text("ZM4746A-GS08-CUTTAPE"))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/15, Step 1/242, Loss: 7.746111869812012
Epoch 1/15, Step 2/242, Loss: 6.90484094619751
Epoch 1/15, Step 3/242, Loss: 7.565450668334961
Epoch 1/15, Step 4/242, Loss: 6.954629421234131
Epoch 1/15, Step 5/242, Loss: 6.049954891204834
Epoch 1/15, Step 6/242, Loss: 7.510733127593994
Epoch 1/15, Step 7/242, Loss: 6.589042663574219
Epoch 1/15, Step 8/242, Loss: 6.022075653076172
Epoch 1/15, Step 9/242, Loss: 6.757442951202393
Epoch 1/15, Step 10/242, Loss: 6.997124671936035
Epoch 1/15, Step 11/242, Loss: 6.4465742111206055


KeyboardInterrupt: 

### Vishay Model output function for result

In [10]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch.nn.functional as F

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Function to calculate confidence score
def calculate_confidence(logits, predicted_tokens):
    probabilities = [F.softmax(logit, dim=-1) for logit in logits]
    predicted_probs = [prob[0, token].item() for prob, token in zip(probabilities, predicted_tokens[0])]
    confidence = sum(predicted_probs) / len(predicted_probs)
    return confidence

# Example inference with confidence score
def correct_text_with_confidence(input_text):
    input_ids = tokenizer("correct: " + input_text, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, output_scores=True, return_dict_in_generate=True)
    
    # Decode the generated tokens to get the corrected text
    corrected_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    
    # Calculate the confidence score
    confidence = calculate_confidence(outputs.scores, outputs.sequences[:, 1:])
    
    return corrected_text, confidence

# Example usage
corrected_text, confidence = correct_text_with_confidence("RK73C2BTE1693D *OBS*")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Corrected Text: RK73C2BTE1693D, Confidence: 0.93


In [24]:
corrected_text, confidence = correct_text_with_confidence("293D226X010C2T")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")


Corrected Text: 293D226X010C2T, Confidence: 0.93


### Multi Suppliers Training Model

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import pandas as pd
import random
import warnings
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv("multi_supplier_sample_train_data.txt", sep='\t', encoding='iso-8859-1')
input_list = df['MPN'].to_list()
output_list = df['SE_PART'].to_list()
supplier_list = df['Supplier_Name'].to_list()  # Assuming Supplier_Name column exists

# Load pre-trained model and tokenizer
model_name = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Prepare data
def preprocess_data(original, corrected, suppliers):
    inputs = ["correct: " + orig + " supplier: " + supplier for orig, supplier in zip(original, suppliers)]
    targets = corrected
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")
    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(input_list, output_list, supplier_list)

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
batch_size = 64
epochs = 3
accumulation_steps = 1  # Adjust this to accumulate gradients over multiple steps

# Mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Train the model
model.train()
for epoch in range(epochs):
    for i in range(0, len(input_encodings.input_ids), batch_size):
        input_ids = input_encodings.input_ids[i:i + batch_size]
        labels = target_encodings.input_ids[i:i + batch_size]

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i // batch_size + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        print(f"Epoch {epoch + 1}/{epochs}, Step {i // batch_size + 1}/{len(input_encodings.input_ids) // batch_size}, Loss: {loss.item() * accumulation_steps}", flush=True)

# Save the fine-tuned model
model.save_pretrained("./corrector_model")
tokenizer.save_pretrained("./corrector_model")

# Load the model for inference
model = T5ForConditionalGeneration.from_pretrained("./corrector_model")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model")

# Example inference
def correct_text(input_text, supplier_name):
    input_ids = tokenizer("correct: " + input_text + " supplier: " + supplier_name, return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Example usage
print(correct_text("LTC691ISW#", "Analog Devices"))
print(correct_text("0402ZC473KATA", "KYOCERA AVX Components Corporation"))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/15, Step 1/5278, Loss: 6.859422206878662
Epoch 1/15, Step 2/5278, Loss: 2.6417412757873535
Epoch 1/15, Step 3/5278, Loss: 1.6776739358901978
Epoch 1/15, Step 4/5278, Loss: 1.4394644498825073
Epoch 1/15, Step 5/5278, Loss: 1.159550428390503
Epoch 1/15, Step 6/5278, Loss: 1.2424094676971436
Epoch 1/15, Step 7/5278, Loss: 1.1110467910766602
Epoch 1/15, Step 8/5278, Loss: 1.0472608804702759
Epoch 1/15, Step 9/5278, Loss: 0.5300707221031189
Epoch 1/15, Step 10/5278, Loss: 0.4151744246482849
Epoch 1/15, Step 11/5278, Loss: 0.42924851179122925
Epoch 1/15, Step 12/5278, Loss: 0.14078333973884583
Epoch 1/15, Step 13/5278, Loss: 0.12746335566043854
Epoch 1/15, Step 14/5278, Loss: 0.3953395187854767
Epoch 1/15, Step 15/5278, Loss: 0.8659993410110474
Epoch 1/15, Step 16/5278, Loss: 0.05891311168670654
Epoch 1/15, Step 17/5278, Loss: 0.05587388202548027
Epoch 1/15, Step 18/5278, Loss: 0.048178210854530334
Epoch 1/15, Step 19/5278, Loss: 0.030336961150169373
Epoch 1/15, Step 20/5278, Loss: 0

KeyboardInterrupt: 

### Final Model for Multi Suppliers ()

In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch.nn.functional as F

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./corrector_model_Multi")
tokenizer = T5Tokenizer.from_pretrained("./corrector_model_Multi")

# Function to calculate confidence score
def calculate_confidence(logits, predicted_tokens):
    probabilities = [F.softmax(logit, dim=-1) for logit in logits]
    predicted_probs = [prob[0, token].item() for prob, token in zip(probabilities, predicted_tokens[0])]
    confidence = sum(predicted_probs) / len(predicted_probs)
    return confidence

# Updated inference function with confidence score
def correct_MPN_Multi_with_confidence(input_text, supplier_name):
    # Combine input text with supplier name
    combined_input = f"correct: {input_text} supplier: {supplier_name}"
    
    # Tokenize the combined input
    input_ids = tokenizer(combined_input, return_tensors="pt").input_ids
    
    # Generate output with scores
    outputs = model.generate(input_ids, output_scores=True, return_dict_in_generate=True)
    
    # Decode the generated tokens to get the corrected text
    corrected_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    
    # Calculate the confidence score
    confidence = calculate_confidence(outputs.scores, outputs.sequences[:, 1:])
    
    return corrected_text, confidence

# Example usage
corrected_text, confidence = correct_MPN_Multi_with_confidence("RK73C2BTE1693D *OBS*", "KOA")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Corrected Text: RC1206FR-07169KL, Confidence: 0.75


In [56]:
corrected_text, confidence = correct_MPN_Multi_with_confidence("27146-02016D223KAT", "KYOCERA AVX Components Corporation")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")

Corrected Text: CC0603KRX7R9BB223, Confidence: 0.83


In [80]:
corrected_text, confidence = correct_MPN_Multi_with_confidence("ST81D275BFD35PB NO REV", "Amphenol")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")

Corrected Text: ST81D275BFD35PB, Confidence: 0.95


In [84]:
corrected_text, confidence = correct_MPN_Multi_with_confidence("RC0805FR-07221KL//YAGEO CORP", "Yageo")
print(f"Corrected Text: {corrected_text}, Confidence: {confidence:.2f}")

Corrected Text: RC0805FR-07221KL, Confidence: 1.00


### Run Model of Multi Suppliers on list of Parts (MPN,SE_MAN_NAME)

In [87]:
import pandas as pd
df=pd.read_csv("bom_Yageo_Parts_need_ML.txt",encoding='ISO-8859-1',sep='\t')
MPNlist=df['MPN'].to_list()
SE_MAN=df['SE_MAN_NAME'].to_list()
for MPN,SE_MAN  in zip(MPNlist,SE_MAN):
    corrected_Part, confidence = corrected_text, confidence = correct_MPN_Multi_with_confidence(MPN, SE_MAN)
    print(f"{MPN}\t{corrected_Part}\t{confidence:.2f}")



RD 50T	RD50T	0.73
2322734_1002L	RC1206FR-071KL	0.69
232271191005L	RC1206FR-071ML	0.76
2322704_4642L	RC0603FR-0746K4L	0.84
2322704_4641L	RC1206FR-074K64L	0.83
GP 491-0 T9	GP491-0 T9	0.86
2322704_3163L	RC1206FR-073K16L	0.71
2322704_4643L	RC0603FR-07464KL	0.80
RGC 502-0 J	RGC502-0JT	0.77
2322704_1472L	RC1206FR-071K7L	0.84
RD 1/4W	RD14WT1K4L	0.65
2322704_3161L	RC1206FR-073K16L	0.80
2322704_2153L	RC1206FR-07215KL	0.83
RC0805FR series	RC0805FR-071ML	0.85
2322704_5113L	RC1206FR-07511RL	0.74
2322704_2612L	RC0603FR-072K6L	0.80
MRS25	MRS2512KRX7R8BB104	0.70
2322704_1211L	RC1206FR-071K8L	0.77
2322724_1001L	RC1206FR-071KL	0.73
2322704_3164L	RC1206FR-0731K4L	0.82
2322704_7501L	RC1206FR-077K5L	0.79
2322704_5112L	RC1206FR-071K12L	0.80
2322704_1474L	RC1206FR-071M74L	0.77
2322734_1211L	RC1206FR-071K2L	0.77
2322724_6814L	RC0805FR-076K14L	0.81
2322724_1002L	RC1206FR-071KL	0.70
2322734_5112L	RC1206FR-071K12L	0.77
2322704_3831L	RC0805FR-0738K3L	0.80
22 241 75767	CC1206JRNPO9BN470	0.73
22 241 7567	CC1206JR-

### Identifying Not Accept Parts

In [1]:
import pandas as pd

df=pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv",sep='\t',encoding='ISO-8859-1')
df.head()

Unnamed: 0,COM_PARTNUM
0,00001-103
1,014-103
2,015UW
3,01C1001FF
4,01C1001FP


In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd



df=pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv",sep='\t',encoding='ISO-8859-1')
df.head()
# Example Vishay part numbers
Vishay_parts = df["COM_PARTNUM"].to_list()

# Vectorize the part numbers using TF-IDF
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
print("before_vectorizer")
X_train = vectorizer.fit_transform(Vishay_parts)
print(X_train.shape)
print("after_vectorizer")
# Create and train the One-Class SVM model
oc_svm = OneClassSVM(kernel='linear', gamma='auto', nu=0.1)
print("before_train")
oc_svm.fit(X_train)
print("after_vectorizer")
# Example new part numbers
new_parts = [
    'R234-5678-90', 'B123-4567-89', 'C890-9876-54', 
    'Z123-4567-89', 'R456-7890-12'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

before_vectorizer
(536267, 223037)
after_vectorizer
before_train


### Train on Sample of vishay CM parts SVC

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Example Yageo part numbers
df=pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv",sep='\t',encoding='ISO-8859-1')
df.head()
#Vishay_parts_sample=df.sample(10000)
Vishay_parts = df["COM_PARTNUM"].to_list()

# Vectorize the part numbers using TF-IDF
vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(3,3),lowercase=True)
print("before_vectorizer")
X_train = vectorizer.fit_transform(Vishay_parts)
print(X_train.shape)
print("after_vectorizer")
# Create and train the One-Class SVM model
oc_svm = OneClassSVM(kernel='linear', gamma='auto', nu=0.1,verbose=True)
print("before_train")
oc_svm.fit(X_train)
print("after_train")
# Example new part numbers
new_parts = [
    'R234-5678-90', 'B123-4567-89', 'C890-9876-54', 
    'Z123-4567-89', 'R456-7890-12'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

before_vectorizer
(536267, 31967)
after_vectorizer
before_train
[LibSVM]

In [31]:
new_parts = [
    'CRCW04022', '1233-443211-3344 ver', 'CRCW', 
    'Z123-4567-89', 'RC0603'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

CRCW04022 is likely NOT related to Vishay.
1233-443211-3344 ver is likely NOT related to Vishay.
CRCW is likely NOT related to Vishay.
Z123-4567-89 is likely NOT related to Vishay.
RC0603 is likely related to Vishay.


In [36]:
new_parts = [
    'CRCW04022', '1233-443211-3344 ver', 'CRCW', 
    'Z123-4567-89', 'RC0603'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

CRCW04022 is likely related to Vishay.
1233-443211-3344 ver is likely related to Vishay.
CRCW is likely NOT related to Vishay.
Z123-4567-89 is likely NOT related to Vishay.
RC0603 is likely related to Vishay.


In [2]:
new_parts = [
    'CRCW04022', '1233-443211-3344 ver', 'CRCW06', 
    'Z123-4567-89', 'rc0608'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

CRCW04022 is likely related to Vishay.
1233-443211-3344 ver is likely NOT related to Vishay.
CRCW06 is likely NOT related to Vishay.
Z123-4567-89 is likely NOT related to Vishay.
rc0608 is likely NOT related to Vishay.


In [4]:
new_parts = [
    'CRCW04022', '1233-443211-3344 ver', 'CRCW06', 
    'Z123-4567-89', 'rc0608'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

CRCW04022 is likely NOT related to Vishay.
1233-443211-3344 ver is likely NOT related to Vishay.
CRCW06 is likely NOT related to Vishay.
Z123-4567-89 is likely NOT related to Vishay.
rc0608 is likely NOT related to Vishay.


In [40]:
new_parts = [
    'CRCW04022', '12323-445411-3344ver', 'CRCW060', 
    'Z123-4567-89', '13508582-3 REV B'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

CRCW04022 is likely related to Vishay.
12323-445411-3344ver is likely related to Vishay.
CRCW060 is likely NOT related to Vishay.
Z123-4567-89 is likely NOT related to Vishay.
13508582-3 REV B is likely NOT related to Vishay.


In [1]:
import pandas as pd
# Load your data
df = pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv", sep='\t', encoding='ISO-8859-1')

Vishay_parts = df["COM_PARTNUM"].sample(100000)

### Other Model Forest

In [13]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse as sp


# Load your data
#df = pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv", sep='\t', encoding='ISO-8859-1')

#Vishay_parts = df["COM_PARTNUM"].sample(100000)

# Vectorize the part numbers using TF-IDF
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,3), lowercase=True)
print("before_vectorizer")
X_train = vectorizer.fit_transform(Vishay_parts).toarray()
print(X_train.shape)
print("after_vectorizer")
# Ensure the matrix is in CSR format
#X_train_csr = csr_matrix(X_train)
print(type(X_train))


# Create and train the IsolationForest model
rf_One_class = IsolationForest(contamination=0.1,random_state=42,n_jobs=-1,n_estimators=50, max_samples='auto')
print("before_train")
rf_One_class.fit(X_train)
print("after_train")

# Example new part numbers
new_parts = [
    'R234-5678-90', 'B123-4567-89', 'C890-9876-54', 
    'Z123-4567-89', 'R456-7890-12'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Convert the test data to CSR format as well
X_test_csr = X_test.tocsr()

# Predict using the trained IsolationForest model
predictions = rf_One_class.predict(X_test_csr)
scores=rf_One_class.decision_function(X_test_csr)
# Interpret the predictions
for part, pred , score in zip(new_parts, predictions,scores):
    if pred == 1:
        print(f"{part} related. with score {score:.4f}")
    else:
        print(f"{part} NOT related. with score {score:.4f}")


before_vectorizer
(100000, 24708)
after_vectorizer
<class 'numpy.ndarray'>
before_train
after_train
R234-5678-90 related. with score 0.0131
B123-4567-89 related. with score 0.0082
C890-9876-54 related. with score 0.0131
Z123-4567-89 related. with score 0.0082
R456-7890-12 related. with score 0.0082


In [4]:
Vishay_parts

407370         RW79U6381B
182328         MMBZ5260-G
27523         2KBP005M/45
204213     P0603C2152BBPA
62948     BSI06324000JA22
               ...       
317800       RNN70C2771BM
282752         RN60D44R2F
268690       RLR05C39R2FR
394749         RW78U5350D
293853    RNC55H5941FSB14
Name: COM_PARTNUM, Length: 100000, dtype: object

In [8]:
# Vectorize the new part numbers
new_parts = [
    'CRCW04022', 'CMF55748R00BEEA','L-1225M1R10FBBS','WSC0001392R0DEA','MCRL00701R000JHB00','1233-443211-3344 ver','CRCW08054321FRT1', 'SMM02040D2051BB000', 
    'Z123-4567-89', 'RC0603','BSI06324000JA22','RNC55H5941FSB14','2KBP005M/45','111111111','555555432-8'
]
X_test = vectorizer.transform(new_parts)

# Convert the test data to CSR format as well

# Predict using the trained IsolationForest model
predictions = rf_One_class.predict(X_test)
scores=rf_One_class.decision_function(X_test)
# Interpret the predictions
for part, pred ,score in zip(new_parts, predictions,scores):
    if pred == 1:
        print(f"{part} related score {score:.4f}")
    else:
        print(f"{part} NOT related  score {score:.4f}")

CRCW04022 NOT related  score -0.0085
CMF55748R00BEEA related score 0.0000
L-1225M1R10FBBS related score 0.0000
WSC0001392R0DEA related score 0.0000
MCRL00701R000JHB00 related score 0.0000
1233-443211-3344 ver related score 0.0000
CRCW08054321FRT1 NOT related  score -0.0028
SMM02040D2051BB000 related score 0.0000
Z123-4567-89 related score 0.0000
RC0603 related score 0.0000
BSI06324000JA22 related score 0.0000
RNC55H5941FSB14 related score 0.0000
2KBP005M/45 related score 0.0000
111111111 related score 0.0000
555555432-8 related score 0.0000


In [57]:
predictions

array([ 1,  1,  1, -1,  1, -1,  1,  1])

### Get most important features


In [24]:
# Fit the vectorizer to the data
#X_tfidf = vectorizer.fit_transform(Vishay_parts)
# Get feature names (n-grams)
feature_names = vectorizer.get_feature_names_out()
# Get the IDF values for each n-gram
idf_values = vectorizer.idf_

# Combine the feature names with their corresponding IDF scores
features_with_idf = list(zip(feature_names, idf_values))
# Sort features by IDF values (higher IDF means more unique/important)
sorted_features = sorted(features_with_idf, key=lambda x: x[1], reverse=True)

# Get the top 10 most important features
top_n = 50
most_important_features = sorted_features[:5000]

# Print the top features
for feature, idf in most_important_features:
    print(f"{feature}, IDF: {idf}")

 % a2, IDF: 15.993645111206595
 % b0 , IDF: 15.993645111206595
 % b0 2, IDF: 15.993645111206595
 % b1, IDF: 15.993645111206595
 % b1 , IDF: 15.993645111206595
 % b1 2, IDF: 15.993645111206595
 % bl, IDF: 15.993645111206595
 % bl , IDF: 15.993645111206595
 % bl 2, IDF: 15.993645111206595
 % bo, IDF: 15.993645111206595
 % bo5, IDF: 15.993645111206595
 % bo50, IDF: 15.993645111206595
 % et5e, IDF: 15.993645111206595
 % jo, IDF: 15.993645111206595
 % jo1, IDF: 15.993645111206595
 % m3 , IDF: 15.993645111206595
 % m3 2, IDF: 15.993645111206595
 % rt7, IDF: 15.993645111206595
 % tu, IDF: 15.993645111206595
 %ef4, IDF: 15.993645111206595
 %ef4e, IDF: 15.993645111206595
 %ef4e3, IDF: 15.993645111206595
 (/t), IDF: 15.993645111206595
 (die, IDF: 15.993645111206595
 (die), IDF: 15.993645111206595
 (roh, IDF: 15.993645111206595
 (rohs, IDF: 15.993645111206595
 (rohs), IDF: 15.993645111206595
 *** , IDF: 15.993645111206595
 *** 4, IDF: 15.993645111206595
 *** 4., IDF: 15.993645111206595
 + 80, IDF

Try Forest Model by all data

In [2]:
import pandas as pd

# Load your data

df = pd.read_csv(r"Vishay_ALL_SE_PARTS.tsv", sep='\t', encoding='ISO-8859-1')
df.info()
Vishay_parts = df["COM_PARTNUM"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6496617 entries, 0 to 6496616
Data columns (total 1 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   COM_PARTNUM  object
dtypes: object(1)
memory usage: 49.6+ MB


In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
import pandas as pd
from scipy.sparse import csr_matrix
import scipy.sparse as sp


# Load your data
df = pd.read_csv(r"Vishay_ALL_SE_PARTS.tsv", sep='\t', encoding='ISO-8859-1')

Vishay_parts = df["COM_PARTNUM"].sample(10000).to_list()

# Vectorize the part numbers using TF-IDF
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(5,5), lowercase=True)

print("before_vectorizer")
X_train = vectorizer.fit_transform(Vishay_parts)
print(X_train.shape)
print("after_vectorizer")
# Ensure the matrix is in CSR format
#X_train_csr = csr_matrix(X_train)
print(type(X_train))
X_train=X_train.toarray()
print(type(X_train))

# Create and train the IsolationForest model
rf_One_class = IsolationForest(contamination=0.2,random_state=42,n_jobs=-1)
print("before_train")
rf_One_class.fit(X_train)
print("after_train")

# Example new part numbers
new_parts = [
    'R234-5678-90', 'B123-4567-89', 'C890-9876-54', 
    'Z123-4567-89', 'R456-7890-12'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Convert the test data to CSR format as well
X_test_csr = X_test.tocsr()

# Predict using the trained IsolationForest model
predictions = rf_One_class.predict(X_test_csr)
scores=rf_One_class.decision_function(X_test_csr)
# Interpret the predictions
for part, pred , score in zip(new_parts, predictions,scores):
    if pred == 1:
        print(f"{part} related. with score {score:.4f}")
    else:
        print(f"{part} NOT related. with score {score:.4f}")

before_vectorizer
(10000, 57872)
after_vectorizer
<class 'scipy.sparse._csr.csr_matrix'>
<class 'numpy.ndarray'>
before_train
after_train
R234-5678-90 related. with score 0.0032
B123-4567-89 related. with score 0.0032
C890-9876-54 related. with score 0.0032
Z123-4567-89 related. with score 0.0032
R456-7890-12 related. with score 0.0032


In [25]:
# Fit the vectorizer to the data
#X_tfidf = vectorizer.fit_transform(Vishay_parts)
# Get feature names (n-grams)
feature_names = vectorizer.get_feature_names_out()
# Get the IDF values for each n-gram
idf_values = vectorizer.idf_

# Combine the feature names with their corresponding IDF scores
features_with_idf = list(zip(feature_names, idf_values))
# Sort features by IDF values (higher IDF means more unique/important)
sorted_features = sorted(features_with_idf, key=lambda x: x[1], reverse=True)

# Get the top 10 most important features
top_n = 50
most_important_features = sorted_features[:10]

# Print the top features
for feature, idf in most_important_features:
    print(f"{feature}, IDF: {idf}")

 % a2, IDF: 15.993645111206595
 % b0 , IDF: 15.993645111206595
 % b0 2, IDF: 15.993645111206595
 % b1, IDF: 15.993645111206595
 % b1 , IDF: 15.993645111206595
 % b1 2, IDF: 15.993645111206595
 % bl, IDF: 15.993645111206595
 % bl , IDF: 15.993645111206595
 % bl 2, IDF: 15.993645111206595
 % bo, IDF: 15.993645111206595


In [27]:
feature_names.tolist().index('crcw0')

5793262

In [21]:
feature_names[179658]

'crcw'

In [41]:

# Vectorize the part numbers using TF-IDF
#vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(8,9), lowercase=True)

print("before_vectorizer")
#X_train = vectorizer.fit_transform(Vishay_parts)
print(X_train.shape)
#X_train=X_train.toarray()
# Create and train the IsolationForest model
rf_One_class = IsolationForest(contamination=0.22,random_state=42,n_jobs=-1,max_features=10)
print("before_train")
rf_One_class.fit(X_train)
print("after_train")

# Example new part numbers
new_parts = [
    'R234-5678-90', 'B123-4567-89', 'C890-9876-54', 
    'Z123-4567-89', 'R456-7890-12'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Convert the test data to CSR format as well
X_test_csr = X_test.tocsr()

# Predict using the trained IsolationForest model
predictions = rf_One_class.predict(X_test_csr)
scores=rf_One_class.decision_function(X_test_csr)
# Interpret the predictions
for part, pred , score in zip(new_parts, predictions,scores):
    if pred == 1:
        print(f"{part} related. with score {score:.18f}")
    else:
        print(f"{part} NOT related. with score {score:.18f}")

before_vectorizer
(10000, 138582)
before_train
after_train
R234-5678-90 related. with score 0.000000000000000000
B123-4567-89 related. with score 0.000000000000000000
C890-9876-54 related. with score 0.000000000000000000
Z123-4567-89 related. with score 0.000000000000000000
R456-7890-12 related. with score 0.000000000000000000


In [46]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from scipy.sparse import csr_matrix, isspmatrix_csr

# Load your data
df = pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv", sep='\t', encoding='ISO-8859-1')
Vishay_parts = df["COM_PARTNUM"].to_list()

# Vectorize the part numbers using TF-IDF
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), lowercase=True)
X_train = vectorizer.fit_transform(Vishay_parts)

# Convert to CSR format if it's not already
if not isspmatrix_csr(X_train):
    X_train = X_train.tocsr()

# Create a label array for KNN (assuming you want to classify all parts as 1 for Vishay)
y_train = np.ones(X_train.shape[0])

# Create and train the KNN model
knn = KNeighborsClassifier(n_neighbors=3)
print("before_train")
knn.fit(X_train, y_train)
print("after_train")

# Example new part numbers
new_parts = ['R234-5678-90', 'B123-4567-89', 'C890-9876-54', 'Z123-4567-89', 'R456-7890-12']

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Convert the test data to CSR format as well
if not isspmatrix_csr(X_test):
    X_test = X_test.tocsr()

# Predict using the trained KNN model
predictions = knn.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")


before_train
after_train
R234-5678-90 is likely related to Vishay.
B123-4567-89 is likely related to Vishay.
C890-9876-54 is likely related to Vishay.
Z123-4567-89 is likely related to Vishay.
R456-7890-12 is likely related to Vishay.


In [45]:
predictions

array([1., 1., 1., 1., 1.])

In [47]:
# Example new part numbers
new_parts = ['R234-5678-90', '11111111111111111', '4455666666666666', 'Z123-4567-89', 'R456-7890-12']

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Convert the test data to CSR format as well
if not isspmatrix_csr(X_test):
    X_test = X_test.tocsr()

# Predict using the trained KNN model
predictions = knn.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} is likely related to Vishay.")
    else:
        print(f"{part} is likely NOT related to Vishay.")

R234-5678-90 is likely related to Vishay.
11111111111111111 is likely related to Vishay.
4455666666666666 is likely related to Vishay.
Z123-4567-89 is likely related to Vishay.
R456-7890-12 is likely related to Vishay.
