In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import time
import joblib

In [7]:
# Example Yageo part numbers
df=pd.read_csv(r"500K_Sample_Vishay_CM_Parts.tsv",sep='\t',encoding='ISO-8859-1')
df.head()
#Vishay_parts_sample=df.sample(10000)
Vishay_parts = df["COM_PARTNUM"].to_list()

In [10]:


# Vectorize the part numbers using TF-IDF
vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(3,3),lowercase=True)
print("before_vectorizer")
X_train = vectorizer.fit_transform(Vishay_parts)
print(X_train.shape)
print("after_vectorizer")
# Create and train the One-Class SVM model
oc_svm = OneClassSVM(kernel='linear', gamma='auto', nu=0.1,verbose=True)
start_time=time.time()
oc_svm.fit(X_train)
print(f"Elapsed Time is {time.time()-start_time} Seconds")
joblib.dump(oc_svm,"SVM_NOTACCEPT_MODEL.pkl")
# Example new part numbers
new_parts = [
    'R234-5678-90', 'B123-4567-89', 'C890-9876-54', 
    'Z123-4567-89', 'R456-7890-12'
]

# Vectorize the new part numbers
X_test = vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm.predict(X_test)

# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part} related to Vishay.")
    else:
        print(f"{part} NOT related to Vishay.")

before_vectorizer
(1000, 4683)
after_vectorizer
[LibSVM].*Elapsed Time is 0.09565210342407227 Seconds

optimization finished, #iter = 1568
obj = 16.572450, rho = 0.331442
nSV = 771, nBSV = 0
R234-5678-90 NOT related to Vishay.
B123-4567-89 NOT related to Vishay.
C890-9876-54 NOT related to Vishay.
Z123-4567-89 NOT related to Vishay.
R456-7890-12 NOT related to Vishay.


In [14]:
df=pd.read_excel('test_not_accept.xlsx')
df.head()

Unnamed: 0,MPN,Man
0,84277,Vishay
1,2000,Vishay
2,5000,Vishay
3,10000,Vishay
4,84008,Vishay


In [11]:
oc_svm_model=joblib.load('SVM_NOTACCEPT_MODEL.pkl')
tf_vectorizer=joblib.load('Vectroizer_Model.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
new_parts = df["MPN"].to_list()

# Vectorize the new part numbers
X_test = tf_vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm_model.predict(X_test)
# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part}\trelated to Vishay.")
    else:
        print(f"{part}\tNOT related to Vishay.")

84277	NOT related to Vishay.
2000	related to Vishay.
5000	related to Vishay.
10000	related to Vishay.
84008	related to Vishay.
84278	NOT related to Vishay.
99630	NOT related to Vishay.
150329	related to Vishay.
209213	NOT related to Vishay.
1181400	related to Vishay.
1410636	related to Vishay.
11462571	NOT related to Vishay.
222203790046	related to Vishay.
222262908103	related to Vishay.
222262918223	related to Vishay.
222262919103	related to Vishay.
222263008681	related to Vishay.
222263018471	related to Vishay.
222263019102	related to Vishay.
222263051331	related to Vishay.
222268010189	related to Vishay.
222268034339	related to Vishay.
222268234339	related to Vishay.
222268309228	related to Vishay.
222268310189	related to Vishay.
222268334101	related to Vishay.
222268334121	related to Vishay.
222268334151	related to Vishay.
222268334339	related to Vishay.
222268358271	related to Vishay.
222268370471	related to Vishay.
03028-BP821AJZC	related to Vishay.
0402B103K250CT	related to Vish

In [None]:
df=pd.read_excel('test_not_accept.xlsx')
oc_svm_model=joblib.load('SVM_NOTACCEPT_MODEL.pkl')
tf_vectorizer=joblib.load('Vectroizer_Model.pkl')
new_parts = df["MPN"].to_list()

# Vectorize the new part numbers
X_test = tf_vectorizer.transform(new_parts)

# Predict using the trained One-Class SVM model
predictions = oc_svm_model.predict(X_test)
# Interpret the predictions
for part, pred in zip(new_parts, predictions):
    if pred == 1:
        print(f"{part}\trelated to Vishay.")
    else:
        print(f"{part}\tNOT related to Vishay.")

### Output of Not Accept Model with Confidence Score

In [17]:
import pandas as pd
import joblib

# Load the data, model, and vectorizer
df = pd.read_excel('test_not_accept.xlsx')
oc_svm_model = joblib.load('SVM_NOTACCEPT_MODEL.pkl')
tf_vectorizer = joblib.load('Vectroizer_Model.pkl')

new_parts = df["MPN"].to_list()

# Vectorize the new part numbers
X_test = tf_vectorizer.transform(new_parts)

# Get the decision function scores
decision_scores = oc_svm_model.decision_function(X_test)

# Predict using the trained One-Class SVM model
predictions = oc_svm_model.predict(X_test)

# Interpret the predictions along with confidence scores
for part, pred, score in zip(new_parts, predictions, decision_scores):
    if pred == 1:
        print(f"{part}\trelated to Vishay.\tConfidence Score: {score:.2f}")
    else:
        print(f"{part}\tNOT related to Vishay.\tConfidence Score: {score:.2f}")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


84277	NOT related to Vishay.	Confidence Score: -24.24
2000	related to Vishay.	Confidence Score: 36.99
5000	related to Vishay.	Confidence Score: 38.83
10000	related to Vishay.	Confidence Score: 40.06
84008	related to Vishay.	Confidence Score: 2.29
84278	NOT related to Vishay.	Confidence Score: -25.65
99630	NOT related to Vishay.	Confidence Score: -22.94
150329	related to Vishay.	Confidence Score: 8.46
209213	NOT related to Vishay.	Confidence Score: -16.96
1181400	related to Vishay.	Confidence Score: 19.33
1410636	related to Vishay.	Confidence Score: 0.04
11462571	NOT related to Vishay.	Confidence Score: -14.86
222203790046	related to Vishay.	Confidence Score: 30.48
222262908103	related to Vishay.	Confidence Score: 19.96
222262918223	related to Vishay.	Confidence Score: 19.39
222262919103	related to Vishay.	Confidence Score: 19.24
222263008681	related to Vishay.	Confidence Score: 27.58
222263018471	related to Vishay.	Confidence Score: 26.25
222263019102	related to Vishay.	Confidence Scor

In [18]:

df_arrow=pd.read_excel('arrow_parts_vishay.xlsx')
new_parts=df_arrow["MPN"].tolist()
# Vectorize the new part numbers
X_test = tf_vectorizer.transform(new_parts)

# Get the decision function scores
decision_scores = oc_svm_model.decision_function(X_test)

# Predict using the trained One-Class SVM model
predictions = oc_svm_model.predict(X_test)

# Interpret the predictions along with confidence scores
for part, pred, score in zip(new_parts, predictions, decision_scores):
    if pred == 1:
        print(f"{part}\trelated to Vishay.\t{score:.3f}")
    else:
        print(f"{part}\tNOT related to Vishay.\t{score:.3f}")

84277	NOT related to Vishay.	C-24.235
2000	related to Vishay.	36.991
5000	related to Vishay.	38.830
10000	related to Vishay.	40.063
84008	related to Vishay.	2.287
84278	NOT related to Vishay.	C-25.650
99630	NOT related to Vishay.	C-22.936
150329	related to Vishay.	8.465
209213	NOT related to Vishay.	C-16.958
1181400	related to Vishay.	19.329
1410636	related to Vishay.	0.042
11462571	NOT related to Vishay.	C-14.858
222203790046	related to Vishay.	30.478
222262908103	related to Vishay.	19.961
222262918223	related to Vishay.	19.391
222262919103	related to Vishay.	19.235
222263008681	related to Vishay.	27.575
222263018471	related to Vishay.	26.247
222263019102	related to Vishay.	33.532
222263051331	related to Vishay.	24.548
222268010189	related to Vishay.	34.039
222268034339	related to Vishay.	20.156
222268234339	related to Vishay.	21.693
222268309228	related to Vishay.	20.629
222268310189	related to Vishay.	26.307
222268334101	related to Vishay.	30.685
222268334121	related to Vishay.	24.2

In [49]:


df_arrow=pd.read_excel('Vishay_Non_Public_Parts.xlsx')
new_parts=df_arrow["MPN"].tolist()
new_parts=[x if str(x)!=str(np.nan) else "Marzouk" for x in new_parts ]
# Vectorize the new part numbers
X_test = tf_vectorizer.transform(new_parts)

# Get the decision function scores
decision_scores = oc_svm_model.decision_function(X_test)

# Predict using the trained One-Class SVM model
predictions = oc_svm_model.predict(X_test)

# Interpret the predictions along with confidence scores
for part, pred, score in zip(new_parts, predictions, decision_scores):
    if pred == 1:
        print(f"{part}\trelated to Vishay.\t{score:.3f}")
    else:
        print(f"{part}\tNOT related to Vishay.\t{score:.3f}")

2.32216E+11	NOT related to Vishay.	-16.827
22k	NOT related to Vishay.	-23.041
22k	NOT related to Vishay.	-23.041
3.0k	NOT related to Vishay.	-37.538
330	NOT related to Vishay.	-11.749
MEPpart_vangala	NOT related to Vishay.	-40.986
SEE COMMENT	NOT related to Vishay.	-44.052
POWER & WATER SOLUTIONS	NOT related to Vishay.	-41.782
Marzouk	NOT related to Vishay.	-45.365
Marzouk	NOT related to Vishay.	-45.365
Marzouk	NOT related to Vishay.	-45.365
Marzouk	NOT related to Vishay.	-45.365
XXX	NOT related to Vishay.	-24.878
NONE	NOT related to Vishay.	-45.828
NOT FOUND	NOT related to Vishay.	-45.601
BY DESCRIPTION	NOT related to Vishay.	-41.935
SEE REMARKS	NOT related to Vishay.	-41.034
SMD	NOT related to Vishay.	-23.685
SEE ENG. COMMENTS	NOT related to Vishay.	-43.953
N	NOT related to Vishay.	-46.108
NC	NOT related to Vishay.	-46.108
PROCESS SYSTEMS & SOLUTIONS	NOT related to Vishay.	-35.796
not provided	NOT related to Vishay.	-46.108
NOT FOUND	NOT related to Vishay.	-45.601
SEE TABULATION	NOT 

### Try KNN Model

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Example Yageo part numbers
yageo_parts = [
    'R123-4567-89', 'C890-1234-56', 'F789-0123-45',
    'L234-5678-90', 'R654-3210-98', 'C678-9012-34'
]

# Convert to lowercase (uncased characters)
yageo_parts_uncased = [part.lower() for part in yageo_parts]

# Vectorize the part numbers using TF-IDF at the character level
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
data = vectorizer.fit_transform(yageo_parts_uncased).toarray()

# Normalize the data (optional but recommended for KNN)
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Fit KNN model (using k=3 for this example)
knn = NearestNeighbors(n_neighbors=3)
knn.fit(data)

# Example new part numbers (some may be anomalies)
new_parts = ['R234-5678-90', 'Z123-4567-89']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)

# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = np.mean(distances) + np.std(distances)  # Adjust based on your needs

# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.mean(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Yageo. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Yageo. (Distance: {distance:.4f})")

R234-5678-90 is likely related to Yageo. (Distance: 11.5907)
Z123-4567-89 is likely related to Yageo. (Distance: 12.6414)


In [10]:
new_parts = ['zzzzzzzzz', 'dummy']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = np.mean(distances) + np.std(distances)  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.mean(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Yageo. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Yageo. (Distance: {distance:.4f})")

[[12.18747275 12.8204253  12.83267992]
 [12.18747275 12.8204253  12.83267992]]
12.91483266019479
zzzzzzzzz is likely related to Yageo. (Distance: 12.6135)
dummy is likely related to Yageo. (Distance: 12.6135)


In [170]:
new_parts = ['CHPHR0805K4990FBW']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data,n_neighbors=5)
print(distances)
print(indices)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 50  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Yageo. (Distance: {distance:.4f})")
        print(vishay_parts[indices[0][0]],vishay_parts[indices[0][1]],vishay_parts[indices[0][2]],vishay_parts[indices[0][3]],vishay_parts[indices[0][4]])
    else:
        print(f"{part} is likely related to Yageo. (Distance: {distance:.4f})")
        print(vishay_parts[indices[0][0]],vishay_parts[indices[0][1]],vishay_parts[indices[0][2]],vishay_parts[indices[0][3]],vishay_parts[indices[0][4]])

[[1.34869915e-06 5.05375350e+01 5.51655363e+01 5.55063114e+01
  5.66720631e+01]]
[[534505 534321 534333 534307 534393]]
50
CHPHR0805K4990FBW is likely related to Yageo. (Distance: 0.0000)
CHPHR0805K4990FBW CHPHR0805K10R0FBW CHPHR0805K1201FBW CHPHR0805K1002FBW CHPHR0805K2001FBW


In [146]:
features_name=vectorizer.get_feature_names_out().tolist()
features_name.index('sam')

32493

In [145]:
features_name[32493]

'sam'

In [189]:
new_parts = ['CHPHR0805K4990FBW_33 SAMPLE']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
#new_data = scaler.transform(new_data)
for i,feature in zip(new_data[0],features_name):
    if i!=0.0:
        print(i,feature)

0.3240747158992717  sa
0.14602962323283106 05k
0.09039454104613268 080
0.1482002444459769 0fb
0.3398917335319423 3 s
0.26737130081290245 33 
0.1423323161353594 499
0.16807924953044984 5k4
0.0910346030257139 805
0.15259557412207453 90f
0.17201809304115565 990
0.3649611133529705 _33
0.20800366754133065 amp
0.17254111322607052 chp
0.19434157875239805 fbw
0.21056248538717906 hph
0.17618531107729957 hr0
0.1774987029917848 k49
0.1817624483885995 phr
0.17237036795508742 r08
0.3649611133529705 sam


In [129]:
print(vishay_parts[306942],vishay_parts[2408552],vishay_parts[534333])


615RX5SAM203EN102M MCT0603-50_2%_RF_33R CHPHR0805K1201FBW


In [41]:
import pandas as pd
# Example Vishay part numbers
df=pd.read_csv(r"Vishay_ALL_SE_PARTS.tsv",sep='\t',encoding='ISO-8859-1')
df.head()
#Vishay_parts_sample=df.sample(10000)
Vishay_parts = df["COM_PARTNUM"].to_list()

### Try KNN with Vishay

In [42]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import joblib

# Example Yageo part numbers
vishay_parts = Vishay_parts

# Convert to lowercase (uncased characters)
vishay_parts_uncased = [part.lower() for part in vishay_parts]

# Vectorize the part numbers using TF-IDF at the character level
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
data = vectorizer.fit_transform(vishay_parts_uncased)
print(data.shape)

# Normalize the data (optional but recommended for KNN)
scaler = StandardScaler(with_mean=False)
data = scaler.fit_transform(data)

# Fit KNN model (using k=3 for this example)
knn = NearestNeighbors(n_neighbors=3)
knn.fit(data)
joblib.dump(knn,"KNN_MODEL.pkl")

# Example new part numbers (some may be anomalies)
new_parts = ['R234-5678-90', 'Z123-4567-89']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)

# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = np.mean(distances) + np.std(distances)  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.mean(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")

(6496617, 37966)
531.149077869715
R234-5678-90 is likely related to Vishay. (Distance: 370.1625)
Z123-4567-89 is likely related to Vishay. (Distance: 526.2399)


In [11]:
df_arrow=pd.read_excel('arrow_parts_vishay.xlsx')
df_arrow=df_arrow["MPN"].tolist()

In [43]:
# Example new part numbers (some may be anomalies)
new_parts = df_arrow
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
new_data = scaler.transform(new_data)

KNN_MODEL=joblib.load("KNN_MODEL.pkl")
# Calculate distances to the nearest neighbors
distances, indices = KNN_MODEL.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 50  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}")

(12219, 37966)
[[ 82.40035422  82.58261526  82.79482253]
 [ 38.92614695  39.65030499  40.43005745]
 [ 38.40444851  40.20425273  40.4616048 ]
 ...
 [ 61.82886607  74.06786264 110.72793237]
 [516.59791736 981.65670553 984.43302209]
 [370.23871733 373.18417293 375.19496822]]
50
84277	Not Related	82.4004
2000	Related	38.9261
5000	Related	38.4044
10000	Related	36.2909
84008	Not Related	61.2681
84278	Not Related	89.4973
99630	Not Related	104.6710
150329	Not Related	71.0368
209213	Not Related	87.1497
1181400	Not Related	55.1776
1410636	Not Related	73.0673
11462571	Not Related	100.5861
222203790046	Not Related	63.7894
222262908103	Not Related	74.8465
222262918223	Related	39.5150
222262919103	Related	16.0649
222263008681	Not Related	82.5030
222263018471	Not Related	73.6748
222263019102	Not Related	54.7083
222263051331	Not Related	56.9140
222268010189	Related	49.1755
222268034339	Not Related	71.1488
222268234339	Not Related	65.0498
222268309228	Not Related	78.9608
222268310189	Not Related	69.602

In [46]:
df_non_punlic=pd.read_excel('Vishay_Non_Public_Parts.xlsx')
df_non_punlic=df_non_punlic["MPN"].tolist()
df_non_punlic

['2.32216E+11',
 '22k',
 '22k',
 '3.0k',
 '330',
 'MEPpart_vangala',
 'SEE COMMENT',
 'POWER & WATER SOLUTIONS',
 'XXX',
 'NONE',
 'NOT FOUND',
 'BY DESCRIPTION',
 'SEE REMARKS',
 'SMD',
 'SEE ENG. COMMENTS',
 'N',
 'NC',
 'PROCESS SYSTEMS & SOLUTIONS',
 'not provided',
 'NOT FOUND',
 'SEE TABULATION',
 '33u',
 'no P/N',
 'DNP',
 'DO NOT BUY',
 'DO NOT BUY',
 'SPACER',
 'TBD',
 'UNKNOWN',
 'UNKNOWN',
 'UNKNOWN',
 'UNKNOWN',
 'SEE TABULATION',
 '.',
 '[blank]',
 '0',
 '0EA',
 '0R',
 '1',
 '15k',
 'TK50',
 'ORDER BY DESCRIPTION',
 'Order by Description',
 'CAP LD PP 630V 1%1200pF axial',
 'CHECK PRICE',
 '1K',
 '1W',
 '20K',
 '240K',
 'NEVER USED',
 'nicht spezifizierbar',
 'A SUPPRIMER',
 'SEE ENG COMMENT',
 'BUY TO DESC.',
 '0',
 '0',
 '0',
 'SEE HONEYWELL DRAWING',
 '2K7',
 '30k',
 '450BXC2R2MEFC8X11.5',
 '47K',
 '820R',
 '9.1K',
 'See Omnify',
 'SEE CHART',
 'CUSTOM',
 'CUSTOM',
 'CUSTOM',
 '?',
 '?',
 '5,6M',
 'TBD',
 'CAP',
 'BC-COMPONENTS',
 'SEE ENG COMMENTS',
 'VISHAY: +ROHS+LÖT

In [47]:
# Example new part numbers (some may be anomalies)
new_parts = df_non_punlic
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
new_data = scaler.transform(new_data)

KNN_MODEL=joblib.load("KNN_MODEL.pkl")
# Calculate distances to the nearest neighbors
distances, indices = KNN_MODEL.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 50  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}")

(1699, 37966)
[[ 245.84589568  248.83148049  249.47132094]
 [  47.8550935    64.76690356   65.1864    ]
 [  47.8550935    64.76690356   65.1864    ]
 ...
 [  27.12259158   33.76604202   34.12635992]
 [1905.9684534  3324.37561389 3324.43645343]
 [1032.4736795  1046.49995962 2489.98577153]]
50
2.32216E+11	Not Related	245.8459
22k	Related	47.8551
22k	Related	47.8551
3.0k	Not Related	622.0366
330	Related	41.8680
MEPpart_vangala	Not Related	1676.9936
SEE COMMENT	Not Related	3358.3244
POWER & WATER SOLUTIONS	Not Related	1332.1303
XXX	Not Related	616.7294
NONE	Not Related	8876.7670
NOT FOUND	Not Related	6568.3644
BY DESCRIPTION	Not Related	3022.6170
SEE REMARKS	Not Related	3659.2069
SMD	Not Related	139.2361
SEE ENG. COMMENTS	Not Related	3542.5188
N	Related	27.1226
NC	Related	27.1226
PROCESS SYSTEMS & SOLUTIONS	Not Related	2502.6472
not provided	Not Related	4809.0711
NOT FOUND	Not Related	6568.3644
SEE TABULATION	Not Related	2081.7574
33u	Not Related	256.7239
no P/N	Related	27.1226
DNP	Not Rel

### Try KNN without scalling, parts without cleaning and vectorizer n_gram is (3,3)

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import joblib

df=pd.read_csv(r"Vishay_ALL_SE_PARTS.tsv",sep='\t',encoding='ISO-8859-1')
#Vishay_parts_sample=df.sample(10000)
Vishay_parts = df["COM_PARTNUM"].to_list()

# Convert to lowercase (uncased characters)
vishay_parts_uncased = [str(part).lower() for part in Vishay_parts]

# Vectorize the part numbers using TF-IDF at the character level
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))
data = vectorizer.fit_transform(vishay_parts_uncased)
print(data.shape)
##joblib.dump(vectorizer,"vectorizer_without_scalling.pkl")
# Normalize the data (optional but recommended for KNN)
#scaler = StandardScaler(with_mean=False)
#data = scaler.fit_transform(data)

# Fit KNN model (using k=3 for this example)
knn = NearestNeighbors(n_neighbors=3)
knn.fit(data)
##joblib.dump(knn,"KNN_MODEL.pkl")

# Example new part numbers (some may be anomalies)
new_parts = ['R234-5678-90', 'Z123-4567-89']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)

# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = np.mean(distances) + np.std(distances)  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.mean(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")

(6496617, 37966)
1.2337902679532093
R234-5678-90 is likely related to Vishay. (Distance: 1.1601)
Z123-4567-89 is likely related to Vishay. (Distance: 1.2332)


In [22]:
vectorizer=joblib.load('vectorizer_without_scalling.pkl')
knn=joblib.load('KNN_MODEL_without_scalling.pkl')
new_parts = ['CRCW0603 XXX2FKEXC0']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data,n_neighbors=3)
print(distances)
#print(indices)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(f"threshold:{threshold}")
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])

[[1.18838437 1.19175699 1.19666324]]
threshold:1
CRCW0603 XXX2FKEXC0 is likely NOT related to Vishay. (Distance: 1.1884)
CRCW06031002FKEA CRCW06033402FKEA CRCW060342K2FKEA


In [8]:
new_data.shape

(1, 31967)

In [183]:
new_parts = ['CHPHR0805K4990FBW_33 SAMPLE']
new_parts_uncased = [part.lower() for part in new_parts]

new_data = vectorizer.transform(new_parts_uncased).toarray()
for i,feature in zip(new_data[0],features_name):
    if i!=0.0:
        print(i,feature)

0.3240747158992717  sa
0.14602962323283106 05k
0.09039454104613268 080
0.1482002444459769 0fb
0.3398917335319423 3 s
0.26737130081290245 33 
0.1423323161353594 499
0.16807924953044984 5k4
0.0910346030257139 805
0.15259557412207453 90f
0.17201809304115565 990
0.3649611133529705 _33
0.20800366754133065 amp
0.17254111322607052 chp
0.19434157875239805 fbw
0.21056248538717906 hph
0.17618531107729957 hr0
0.1774987029917848 k49
0.1817624483885995 phr
0.17237036795508742 r08
0.3649611133529705 sam


In [195]:
# Example new part numbers (some may be anomalies)
new_parts = df_non_punlic
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)

KNN_MODEL=joblib.load("KNN_MODEL_without_scalling.pkl")
# Calculate distances to the nearest neighbors
distances, indices = KNN_MODEL.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}")

(1699, 37966)
[[1.07727333 1.10064822 1.10331509]
 [0.87339682 1.06679485 1.07530507]
 [0.87339682 1.06679485 1.07530507]
 ...
 [1.         1.         1.        ]
 [1.09408806 1.18229309 1.26855484]
 [1.16185902 1.20259102 1.20868561]]
1
2.32216E+11	Not Related	1.0773
22k	Related	0.8734
22k	Related	0.8734
3.0k	Related	0.9992
330	Related	0.7924
MEPpart_vangala	Not Related	1.2525
SEE COMMENT	Not Related	1.1942
POWER & WATER SOLUTIONS	Not Related	1.2759
XXX	Related	0.8715
NONE	Not Related	1.1940
NOT FOUND	Not Related	1.1749
BY DESCRIPTION	Not Related	1.2384
SEE REMARKS	Not Related	1.2509
SMD	Not Related	1.0949
SEE ENG. COMMENTS	Not Related	1.2347
N	Related	1.0000
NC	Related	1.0000
PROCESS SYSTEMS & SOLUTIONS	Not Related	1.2772
not provided	Not Related	1.2206
NOT FOUND	Not Related	1.1749
SEE TABULATION	Not Related	1.2339
33u	Not Related	1.0851
no P/N	Related	1.0000
DNP	Not Related	1.0331
DO NOT BUY	Not Related	1.0846
DO NOT BUY	Not Related	1.0846
SPACER	Not Related	1.1899
TBD	Related	1.000

In [200]:
# Example new part numbers (some may be anomalies)

new_parts = pd.read_excel(r"C:\Users\A80843\dst.xlsx")["MPN"].to_list()
new_parts_uncased = [str(part).lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)

KNN_MODEL=joblib.load("KNN_MODEL_without_scalling.pkl")
# Calculate distances to the nearest neighbors
distances, indices = KNN_MODEL.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}")

(161, 37966)
[[1.16171544e+00 1.21304404e+00 1.21498880e+00]
 [1.15548505e+00 1.30803072e+00 1.30929307e+00]
 [1.23343236e+00 1.26335870e+00 1.26483322e+00]
 [1.23391705e+00 1.27770165e+00 1.29182782e+00]
 [1.23549827e+00 1.29989419e+00 1.30607237e+00]
 [1.23666457e+00 1.27638129e+00 1.30675789e+00]
 [1.23959974e+00 1.27783007e+00 1.30848438e+00]
 [1.23608674e+00 1.25838860e+00 1.26048674e+00]
 [1.23642408e+00 1.25675832e+00 1.26901564e+00]
 [1.23971195e+00 1.30855042e+00 1.30980634e+00]
 [1.23820316e+00 1.28019835e+00 1.30350306e+00]
 [1.04645473e+00 1.19069335e+00 1.19853251e+00]
 [1.16984375e+00 1.19183965e+00 1.19990590e+00]
 [1.11502791e+00 1.17100676e+00 1.19938859e+00]
 [1.10175797e+00 1.17277840e+00 1.17578095e+00]
 [1.10137879e+00 1.17283650e+00 1.20258794e+00]
 [1.22051046e+00 1.22056966e+00 1.22138411e+00]
 [1.17421467e+00 1.18880953e+00 1.19188982e+00]
 [1.17443831e+00 1.18534676e+00 1.18537454e+00]
 [1.17288942e+00 1.17474348e+00 1.18310682e+00]
 [1.16939514e+00 1.17053591

### Try Arrow Parts without Scalling

In [16]:

df_arrow=pd.read_excel('arrow_parts_vishay.xlsx')
new_parts=df_arrow["MPN"].tolist()

In [18]:
new_parts_uncased = [str(part).lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
KNN_MODEL=joblib.load('KNN_MODEL_without_scalling.pkl')
distances, indices = KNN_MODEL.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}")

(12219, 37966)
[[1.06736137 1.08094153 1.08358839]
 [0.8534575  0.88768576 0.902256  ]
 [0.81858646 0.826118   0.83032784]
 ...
 [0.56337116 0.56639924 0.73416878]
 [0.97937409 0.98466555 0.9887161 ]
 [1.14276981 1.14439904 1.14787991]]
1
84277	Not Related	1.0674
2000	Related	0.8535
5000	Related	0.8186
10000	Related	0.6947
84008	Related	0.9881
84278	Not Related	1.0374
99630	Related	0.7940
150329	Not Related	1.0603
209213	Related	0.9743
1181400	Related	0.8963
1410636	Related	0.7934
11462571	Not Related	1.0698
222203790046	Related	0.5318
222262908103	Related	0.9153
222262918223	Related	0.6639
222262919103	Related	0.3486
222263008681	Related	0.8457
222263018471	Related	0.7502
222263019102	Related	0.6922
222263051331	Related	0.7670
222268010189	Related	0.6264
222268034339	Related	0.8694
222268234339	Related	0.7604
222268309228	Related	0.9508
222268310189	Related	0.7728
222268334101	Related	0.8143
222268334121	Related	0.7977
222268334151	Related	0.8271
222268334339	Related	0.8364
2222683582

### Get ALL Families on our CM

In [31]:
import pandas as pd

# Load the data
#df = pd.read_csv(r"Vishay_ALL_SE_PARTS.tsv", sep='\t', encoding='ISO-8859-1')

# Remove spaces and extract the first 3 characters from the first column
df['first_3_chars'] = df.iloc[:, 0].str.replace(' ', '').str[:3]

# Get distinct values
distinct_values = df['first_3_chars'].unique()

# Save distinct values to a text file
with open('families_3_char.txt', 'w') as f:
    for value in distinct_values:
        f.write(f"{value}\n")

print("Distinct values have been saved to families_3_char.txt")


Distinct values have been saved to families_3_char.txt


### Try KNN without scalling, parts without cleaning and vectorizer n_gram is (3,3)

In [195]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import joblib
se_parts=pd.read_csv(r"Vishay_ALL_SE_PARTS.tsv",sep='\t',encoding='ISO-8859-1')["COM_PARTNUM"].to_list()
pcn_parts=pd.read_csv(r"Vishay_PCN_NM.txt",sep='\t',encoding='ISO-8859-1')['MPN'].to_list()
ddf_parts=pd.read_csv(r"Vishay_DDF_NM.txt",sep='\t',encoding='ISO-8859-1')['MPN'].to_list()
all_parts= se_parts
all_parts.extend(pcn_parts)
all_parts.extend(ddf_parts)
# Convert to lowercase and remove all spaces
vishay_parts_uncased = [str(part).replace(' ', '').lower() for part in all_parts]

# Vectorize the part numbers using TF-IDF at the character level
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3),encoding='ISO-8859-1')
data = vectorizer.fit_transform(vishay_parts_uncased)
print(data.shape)
##joblib.dump(vectorizer,"vectorizer_without_scalling.pkl")
# Normalize the data (optional but recommended for KNN)
#scaler = StandardScaler(with_mean=False)
#data = scaler.fit_transform(data)

# Fit KNN model (using k=5 for this example)
knn = NearestNeighbors(n_neighbors=5)
knn.fit(data)
##joblib.dump(knn,"KNN_MODEL.pkl")

# Example new part numbers (some may be anomalies)
new_parts = ['R234-5678-90', 'Z123-4567-89']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)

# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")

(7288760, 41021)
1
R234-5678-90 is likely NOT related to Vishay. (Distance: 1.1440)
Z123-4567-89 is likely NOT related to Vishay. (Distance: 1.1072)


### Get Families of All Vishay Parts

In [199]:
# Remove spaces, extract the first 3 characters, and convert to lowercase
first_3_chars = [part[:3] for part in vishay_parts_uncased]

# Get distinct values using set
families = list(set(first_3_chars))  # about 4K families

In [32]:
# Read the families from the text file into a list
with open('families_3_char.txt', 'r') as f:
    families = [str(line.strip()) for line in f.readlines()]

### Function to check if the first 3 characters of new_part are in the list of families

In [5]:
# Function to check if the first 3 characters of new_part are in the list of families
def is_part_in_families(new_part):
    # Extract the first 3 characters of the new part
    first_3_chars = str(new_part).replace(" ","").lower()[:3]
    if first_3_chars in families:
        return "Family_Found"
    else:
        return ""


### Function2 to check if first 3 characters from split words in new_part are in the families list

In [42]:
# Function to check if any 3 characters from split words in new_part are in the families list
def is_part_in_families2(new_part):
    
    # Split the new part by spaces
    words = str(new_part).split()
    
    if len(words)==1:
        # Extract the first 3 characters of the new part
        first_3_chars = str(new_part).replace(" ","")[:3]
        if first_3_chars in families:
            return True
        else:
            return False
    else:
          # Check each word for the first 3 characters
        for word in words:
            # Extract the first 3 characters of the new part
            first_3_chars = str(word).replace(" ","")[:3]
            if first_3_chars in families:
                return True
        return False

### Function3 check for every 3 characters in every word

In [43]:
# Function to check if any 3 characters from split words in new_part are in the families list
def is_part_in_families3(new_part, families_file='families_3_char.txt'):
    # Read the families from the text file into a list
    with open(families_file, 'r') as f:
        families = [line.strip() for line in f.readlines()]

    # Split the new part by spaces
    words = new_part.split()

    # Check each word for the first 3 characters
    for word in words:
        # Take every 3 characters from the word
        for i in range(0, len(word) - 2):  # Ensures you can extract 3 characters at a time
            three_chars = word[i:i+3]
            # Check if these 3 characters are in the families list
            if three_chars in families:
                return True

    # If no matches found
    return False

# Example usage
new_part = 'R234 5678-90 B123'
result = is_part_in_families(new_part)
print(result)  # It will print True or False based on the match


False


In [228]:
new_parts = ['FSTTRK20DALEWWCWW']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)
print(distances)
#print(indices)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(f"threshold:{threshold}")
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])

[[0.88975484 0.91548421 0.91563442 0.92182379 0.92467089]]
threshold:1
CRCW0604 is likely related to Vishay. (Distance: 0.8898)
CRCW0603000J CRCW06036040FRT1 CRCW06036040FRT5


In [30]:
features_name=vectorizer.get_feature_names_out().tolist()

In [107]:
new_parts = ['LCF35UTYAIDNG0H000']
new_parts_uncased = [part.lower() for part in new_parts]

new_data = vectorizer.transform(new_parts_uncased).toarray()
for i,feature in zip(new_data[0],features_name):
    if i>0:
        print(i,feature,len(feature))

In [104]:
new_data[0]

array([0.07976544, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [109]:
countrer=0
for i,feature in zip(new_data[0],features_name):
    print(f"{i}\t{feature}\t{len(feature)}")
    countrer+=1
    if len(feature)==1:
        break

0.0	 	1


In [121]:
vectorizer

In [122]:
features_name=vectorizer.get_feature_names_out()
for i,feature in zip(vectorizer.idf_,features_name):
    print(f"{i}\t{feature}\t{len(feature)}")

1.0		0
5.152438071383017	 	1
13.428695753745057	#	1
15.58818000309843	$	1
5.253830370936696	%	1
14.121842934305002	&	1
15.58818000309843	'	1
12.244141035276224	(	1
12.255975492923225	)	1
13.46791646689834	*	1
10.472184193344347	+	1
10.611446260677855	,	1
3.2327395259174327	-	1
5.799542292241243	.	1
3.599523591321973	/	1
1.1851574475792888	0	1
1.3705713323316249	1	1
1.449607680661264	2	1
1.5972734816610017	3	1
1.8570870978276162	4	1
1.460492096634749	5	1
1.6747302998474831	6	1
2.059296962713326	7	1
2.081364530436592	8	1
2.521039736970132	9	1
13.10327335331043	:	1
15.993645111206595	<	1
15.58818000309843	@	1
11.866510726161502	\	1
15.58818000309843	^	1
7.34867454984853	_	1
2.4025439053444324	a	1
1.708079675309821	b	1
1.8737343851865622	c	1
2.2646853090614445	d	1
2.202628085650466	e	1
2.2192586767322697	f	1
3.3627100411576074	g	1
2.778991564186982	h	1
4.734315961757073	i	1
2.428252411983894	j	1
2.222191984481065	k	1
2.6231560508367626	l	1
1.949792935315354	m	1
2.166407089554261	n	1
5.6075

### Try KNN with Sample of Parts 100 PNs from every Family to Avoid Bias towards features that has large counts

In [212]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import joblib

df=pd.read_csv(r"Vishay_Sample_100PNs_from_every_family.tsv",sep='\t',encoding='ISO-8859-1')
#Vishay_parts_sample=df.sample(10000)
Vishay_parts = df["COM_PARTNUM"].to_list()

# Convert to lowercase (uncased characters)
vishay_parts_uncased = [str(part).lower() for part in Vishay_parts]

# Vectorize the part numbers using TF-IDF at the character level
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3),use_idf=True,encoding='ISO-8859-1')
data = vectorizer.fit_transform(vishay_parts_uncased)
print(data.shape)
##joblib.dump(vectorizer,"vectorizer_without_scalling.pkl")
# Normalize the data (optional but recommended for KNN)
#scaler = StandardScaler(with_mean=False)
#data = scaler.fit_transform(data)

# Fit KNN model (using k=3 for this example)
knn = NearestNeighbors(n_neighbors=5,n_jobs=-1,radius=1)
knn.fit(data)
##joblib.dump(knn,"KNN_MODEL.pkl")

# Example new part numbers (some may be anomalies)
new_parts = ['R234-5678-90', 'Z123-4567-89']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)

# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")

(4387075, 36490)
1
R234-5678-90 is likely NOT related to Vishay. (Distance: 1.1736)
Z123-4567-89 is likely NOT related to Vishay. (Distance: 1.2345)


In [214]:
new_parts = ['LCF35UTYAIDNG0H000']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data,n_neighbors=10)
print(distances)
#print(indices)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(f"threshold:{threshold}")
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])

[[1.28749516 1.28755876 1.29235453 1.29312282 1.29388035 1.29692535
  1.29752685 1.29768971 1.29964619 1.30038146]]
threshold:1
LCF35UTYAIDNG0H000 is likely NOT related to Vishay. (Distance: 1.2875)
MALIEYC07CF356P02 BFC2370CF334 EKS20GE310H00


In [186]:
features_name=vectorizer.get_feature_names_out()

In [209]:
new_parts = ['CRCW0806',' CRCW0402','CRCW0402'] #0.3776961688260855   0.3824950633969466 
new_parts_uncased = [part.lower() for part in new_parts]

new_data = vectorizer.transform(new_parts_uncased).toarray()
for i,feature in zip(new_data[1],features_name):
    if i!=0:
        print(i,feature)

0.7410186978891327  cr
0.2449529926175327 040
0.2471533580796606 402
0.2746462982216868 crc
0.2792994629694129 cw0
0.23714553320960036 rcw
0.34659451137202474 w04


In [213]:

new_parts = ['LCF35UTYAIDNG0H000']  #0.3730003147556022 
new_parts_uncased = [part.lower() for part in new_parts]

new_data = vectorizer.transform(new_parts_uncased).toarray()
for i,feature in zip(new_data[0],features_name):
    if i!=0:
        print(i,feature)

0.09722633468480742 000
0.26007277199937706 0h0
0.22098179674860516 35u
0.31652909318463773 5ut
0.31652909318463773 cf3
0.3764969242262611 dng
0.2353014121432442 f35
0.36671190348981675 g0h
0.1950058390148487 h00
0.3597693207912014 lcf
0.2877773831642504 ng0
0.30302398021711463 yai


In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4387075 entries, 0 to 4387074
Data columns (total 1 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   COM_PARTNUM  object
dtypes: object(1)
memory usage: 33.5+ MB


In [9]:
df[df['COM_PARTNUM'].str.startswith('441')]

Unnamed: 0,COM_PARTNUM
270971,4412-11K
270972,4412-12K
270973,4412-13K
270974,4412-1K
270975,4412-2K
270976,4412-3K
270977,4412-4K
270978,4412-5K
270979,4412-6K
270980,4412-7K


In [210]:
features_name=vectorizer.get_feature_names_out()
for i,feature in zip(vectorizer.idf_,features_name):
    print(f"{i}\t{feature}\t{len(feature)}")

8.738268410298504	 % 	3
10.205128628463005	 %a	3
14.68473559147575	 %e	3
10.205128628463005	 %f	3
10.205128628463005	 %l	3
15.601026323349906	 (/	3
15.601026323349906	 (1	3
15.601026323349906	 (d	3
15.601026323349906	 (t	3
15.601026323349906	 **	3
15.601026323349906	 + 	3
14.096948926573631	 +/	3
11.828265385255268	 - 	3
15.19556121524174	 -0	3
13.34973452474341	 -1	3
13.89627823111148	 -2	3
14.68473559147575	 -3	3
12.998336637905522	 -4	3
15.601026323349906	 -5	3
15.19556121524174	 -6	3
15.601026323349906	 -7	3
15.601026323349906	 -t	3
12.683255591265626	 .0	3
12.18329963973654	 .1	3
13.203131050551535	 .2	3
12.998336637905522	 .3	3
13.29844123035586	 .4	3
13.116119673561904	 .5	3
13.729224146448313	 .6	3
15.601026323349906	 .7	3
14.096948926573631	 .8	3
15.601026323349906	 .9	3
13.89627823111148	 / 	3
12.580601437205543	 /1	3
14.214731962230015	 /2	3
12.580601437205543	 /5	3
12.580601437205543	 /7	3
15.601026323349906	 /d	3
13.203131050551535	 0 	3
12.656587344183464	 0,	3
8.46533897

### Try knn with counter vectorizer

In [215]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import joblib

df=pd.read_csv(r"Vishay_Sample_100PNs_from_every_family.tsv",sep='\t',encoding='ISO-8859-1')
#Vishay_parts_sample=df.sample(10000)
Vishay_parts = df["COM_PARTNUM"].to_list()

# Convert to lowercase (uncased characters)
vishay_parts_uncased = [str(part).lower() for part in Vishay_parts]

# Vectorize the part numbers using TF-IDF at the character level
vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3),encoding='ISO-8859-1')
data = vectorizer.fit_transform(vishay_parts_uncased)
print(data.shape)
##joblib.dump(vectorizer,"vectorizer_without_scalling.pkl")
# Normalize the data (optional but recommended for KNN)
#scaler = StandardScaler(with_mean=False)
#data = scaler.fit_transform(data)

# Fit KNN model (using k=3 for this example)
knn = NearestNeighbors(n_neighbors=5,n_jobs=-1,radius=1)
knn.fit(data)
##joblib.dump(knn,"KNN_MODEL.pkl")

# Example new part numbers (some may be anomalies)
new_parts = ['R234-5678-90', 'Z123-4567-89']
new_parts_uncased = [part.lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)

# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")

(4387075, 36490)
1
R234-5678-90 is likely NOT related to Vishay. (Distance: 3.3166)
Z123-4567-89 is likely NOT related to Vishay. (Distance: 3.1623)


In [177]:
new_parts = ['CRCW120650100R1%-E3']
new_parts_uncased = [str(part).lower().replace(" ","") for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased).toarray()
#new_data = scaler.transform(new_data)

# Calculate distances to the nearest neighbors
distances, indices = knn.kneighbors(new_data)
print(distances)
#print(indices)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(f"threshold:{threshold}")
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    if distance > threshold:
        print(f"{part} is likely NOT related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])
    else:
        print(f"{part} is likely related to Vishay. (Distance: {distance:.4f})")
        print(Vishay_parts[indices[0][0]],Vishay_parts[indices[0][1]],Vishay_parts[indices[0][2]])

[[1.03618408 1.03625986 1.04081069 1.04474074 1.06401384]]
threshold:1
CRCW120650100R1%-E3 is likely NOT related to Vishay. (Distance: 1.0362)
CRCW1206 100 100R 1% RT1 CRCW1206 100 100R 1% RT5 CRCW1210 100 100R 1% P5


In [45]:

df_arrow=pd.read_excel(r"C:\Users\A80843\sample_from_arrow.xlsx")
new_parts=df_arrow["MPN"].tolist()
new_parts_uncased = [str(part).replace(' ', '').lower() for part in new_parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
distances, indices = knn.kneighbors(new_data)
print(distances)
# Set a threshold for anomaly detection
# Example: mean distance to nearest neighbors
threshold = 1  # Adjust based on your needs
print(threshold)
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    family_check=is_part_in_families(part)
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}\t{family_check}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}\t{family_check}")

(180, 36205)
[[1.29896997e+00 1.30094251e+00 1.30262048e+00 1.30336659e+00
  1.31073732e+00]
 [1.26407513e+00 1.26443001e+00 1.26865215e+00 1.26909143e+00
  1.27114684e+00]
 [1.25517013e+00 1.25554746e+00 1.26003616e+00 1.26050312e+00
  1.26268785e+00]
 [1.29508472e+00 1.29896659e+00 1.30141115e+00 1.30358514e+00
  1.30411159e+00]
 [1.29389409e+00 1.29588493e+00 1.29942154e+00 1.30023948e+00
  1.30624393e+00]
 [1.29294865e+00 1.30818547e+00 1.30818547e+00 1.31500053e+00
  1.31574893e+00]
 [1.29071030e+00 1.29265306e+00 1.29289367e+00 1.29289367e+00
  1.29382378e+00]
 [1.27443217e+00 1.27799670e+00 1.27865664e+00 1.27891953e+00
  1.27932055e+00]
 [1.27460156e+00 1.27816153e+00 1.27882063e+00 1.27908318e+00
  1.27948369e+00]
 [1.28590617e+00 1.28601736e+00 1.28820136e+00 1.29257460e+00
  1.29363165e+00]
 [1.28529974e+00 1.29097094e+00 1.29130424e+00 1.29158789e+00
  1.29277559e+00]
 [1.25868045e+00 1.26108183e+00 1.26158772e+00 1.26185979e+00
  1.26208744e+00]
 [1.28359750e+00 1.29582278

In [47]:
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    family_check=is_part_in_families(part)
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}\t{family_check}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}\t{family_check}")

LOTCHG-DALEFILMMIL	Not Related	1.2990	False
CUSTOM SLR OPTION 1	Not Related	1.2641	False
CUSTOM SLR OPTION 2	Not Related	1.2552	False
LOTCHG-FSTTRK-F1R	Not Related	1.2951	False
LOTCHG-ANGHERMMIL	Not Related	1.2939	False
DONOTUSELOTCHG-VTFSMNETMLD	Not Related	1.2929	False
LOTCHG-VTFSMCHPMIL	Not Related	1.2907	False
DISFFAI-POT-P11 (2590)	Not Related	1.2744	False
DISFFAI-POT-P11 (2580)	Not Related	1.2746	False
LCF35UTYAIDNG0H000	Not Related	1.2859	True
LCF50FSWAGDNG0H000	Not Related	1.2853	True
ZOASO 6,6/30,2UF/P	Not Related	1.2587	False
LCF50BSIAGDNG0H000	Not Related	1.2836	True
DST-1A082-466UA08REVP	Not Related	1.2046	False
LOTCHG-MAGNETICS	Not Related	1.2814	False
FSTTRK20DALEWWCWW	Not Related	1.2815	True
LCF50BSIAGDNGCH000	Not Related	1.2797	True
LCF50BSIAGDNGDH000	Not Related	1.2803	True
TBD CUSTOM TRANSFORMER FOR SRT	Not Related	1.2834	False
LCF50BTIAGDNG0H000	Not Related	1.2783	True
TORPIPFB00A001632	Not Related	1.2611	False
LMF1.1D502W4450	Not Related	1.2648	True
FSTTRK05DALEFFCO

### Saving Final Models and Functions for Not Accpet Predictor Model

In [198]:
joblib.dump(knn,"Final_Model_Not_Accept.pkl")
joblib.dump(vectorizer,"Final_vectorizer_Not_Accept.pkl")
joblib.dump(families,"families.pkl")
#joblib.dump(is_part_in_families(part),'checking_family.pkl')
#joblib.dump(is_part_in_families2(part),'checking_family2.pkl')
#joblib.dump(is_part_in_families3(part),'checking_family3.pkl')
#pcn_ddf=pcn_parts+ddf_parts
joblib.dump(all_parts,'pcn_ddf_list.pkl') 

['families.pkl']

### Running Final Model on new Part List

In [20]:
import pandas as pd

# Loading all Models
knn_model=joblib.load("Final_knn_Model_Not_Accept.pkl")
vectorizer=joblib.load("Final_vectorizer_Not_Accept.pkl")
families=joblib.load("families.pkl")
all_parts=joblib.load("pcn_ddf_list.pkl")


# Function to check if family of new_part is found
def is_part_in_families(new_part):
    # Extract the first 3 characters of the new part
    first_3_chars = str(new_part).replace(" ","").lower()[:3]
    if first_3_chars in families:
        return "Family_Found"
    else:
        return ""

#get accuracy by customized function
def convert_to_percentage(value):
    max_value_distance = 1.2999  # Find the largest value
    # Convert each value to percentage based on the formula
    percentage = ((max_value_distance - value) / (max_value_distance)) * 100 
    return percentage

# checking part if found
def part_found_checking(part,distance):
    if part in all_parts or distance==0.0:
        return "this Part Found"
    else:
        return ""

# read imput list
new_parts=['TBJC106K035CRLB0043','RNC60H9092FSRE4']
# cleaning and preprocessing input parts
new_parts_uncased = [str(part).replace(' ', '').lower() for part in new_parts]



new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
distances, indices = knn_model.kneighbors(new_data)
# Set a threshold for anomaly detection
threshold = 1  # Adjust based on your needs
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    family_check=is_part_in_families(part)
    pcn_ddf_check=part_found_checking(part,distance)
    accuracy=convert_to_percentage(distance)
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}\t{family_check}\t{100-accuracy}\t{pcn_ddf_check}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}\t{family_check}\t{accuracy}\t{pcn_ddf_check}")

(2, 41021)
TBJC106K035CRLB0043	Not Related	1.1601		89.24296967699296	
RNC60H9092FSRE4	Related	0.0000	Family_Found	99.99999918942133	this Part Found


### Accuracy Percentage

In [167]:
def convert_to_percentage(value):
    max_value_distance = 1.2999  # Find the largest value
    # Convert each value to percentage based on the formula
    percentage = ((max_value_distance - value) / (max_value_distance)) * 100 
    return percentage

### Checking Part in PCN or DDF

In [163]:
df_non_punlic=pd.read_excel('Vishay_Non_Public_Parts.xlsx')
df_non_punlic=df_non_punlic["MPN"].tolist()
df_non_punlic
new_parts_uncased = [str(part).replace(' ', '').lower() for part in df_non_punlic]

### Test Final Model on PS Vishay Parts that finished before

In [180]:
PS_Vishay_Parts=pd.read_excel('Vishay_PS_Parts_Test_ML_Not Accept.xlsx')
PS_Vishay_Parts=PS_Vishay_Parts["MPN"].tolist()

new_parts_uncased = [str(part).replace(' ', '').lower() for part in PS_Vishay_Parts]
new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
distances, indices = knn_model.kneighbors(new_data)
# Set a threshold for anomaly detection
threshold = 1  # Adjust based on your needs
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(PS_Vishay_Parts, distances.min(axis=1))):
    family_check=Family_Checking(part)
    accuracy=convert_to_percentage(distance)
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}\t{family_check}\t{100-accuracy}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}\t{family_check}\t{accuracy}")

CRCW 0603 2800F	Not Related	1.0362	True	79.71259933570312


In [186]:
for i, (part, distance) in enumerate(zip(PS_Vishay_Parts, distances.min(axis=1))):
    family_check=Family_Checking(part)
    accuracy=convert_to_percentage(distance)
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}\t{family_check}\t{100-accuracy}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}\t{family_check}\t{accuracy}")

CRCW 0603 2800F	Not Related	1.0362	True	79.71259933570312


### running on 1800 parts of arrwo that old is not related

In [21]:
import pandas as pd

# Loading all Models
knn_model=joblib.load("Final_knn_Model_Not_Accept.pkl")
vectorizer=joblib.load("Final_vectorizer_Not_Accept.pkl")
families=joblib.load("families.pkl")
all_parts=joblib.load("pcn_ddf_list.pkl")


# Function to check if family of new_part is found
def is_part_in_families(new_part):
    # Extract the first 3 characters of the new part
    first_3_chars = str(new_part).replace(" ","").lower()[:3]
    if first_3_chars in families:
        return "Family_Found"
    else:
        return ""

#get accuracy by customized function
def convert_to_percentage(value):
    max_value_distance = 1.2999  # Find the largest value
    # Convert each value to percentage based on the formula
    percentage = ((max_value_distance - value) / (max_value_distance)) * 100 
    return percentage

# checking part if found
def part_found_checking(part,distance):
    if part in all_parts or distance==0.0:
        return "this Part Found"
    else:
        return ""

# read imput list
new_parts=pd.read_excel('1.8K_arrow_Old_Not_Related.xlsx')["MPN"].to_list()
# cleaning and preprocessing input parts
new_parts_uncased = [str(part).replace(' ', '').lower() for part in new_parts]



new_data = vectorizer.transform(new_parts_uncased)
print(new_data.shape)
distances, indices = knn_model.kneighbors(new_data)
# Set a threshold for anomaly detection
threshold = 1  # Adjust based on your needs
# Determine if each part is an anomaly
for i, (part, distance) in enumerate(zip(new_parts, distances.min(axis=1))):
    family_check=is_part_in_families(part)
    pcn_ddf_check=part_found_checking(part,distance)
    accuracy=convert_to_percentage(distance)
    if distance > threshold:
        print(f"{part}\tNot Related\t{distance:.4f}\t{family_check}\t{100-accuracy}\t{pcn_ddf_check}")
    else:
        print(f"{part}\tRelated\t{distance:.4f}\t{family_check}\t{accuracy}\t{pcn_ddf_check}")

(1825, 41021)
84277	Not Related	1.0638		81.83464403252665	
84278	Not Related	1.0181		78.3240655888405	
1SV285TPH3F	Not Related	1.1576		89.05209434677805	
37-307	Related	0.9099		29.99858035300187	
4-794620-2	Not Related	1.0190		78.38924743918898	
ATSAMHA1G16A-MBT	Not Related	1.1800		90.77398772933721	
BCN318RB103J7	Related	0.9093		30.048950234201836	
BD28019273016W	Not Related	1.1283		86.79888212673221	
CASD20TB	Not Related	1.1368		87.44993434922505	
CGA3E2X7R2A103M080AA	Not Related	1.2151		93.47443922696884	
CHS04TA	Not Related	1.1456		88.13094340600277	
CHS-08TB	Not Related	1.1650		89.62249047394864	
EV1HMC973ALP3	Not Related	1.2715		97.81742858233622	
GRM188R61E475KE11D	Not Related	1.1572		89.018906789653	
GRM31CR6YA106KA12L	Not Related	1.1668		89.75709054689949	
H1313NL	Not Related	1.0966		84.35965610910102	
HCSK4026FTL500	Not Related	1.1316		87.0550345389537	
IM830C-37-FH-15.625MHZ	Not Related	1.1500		88.46659804312262	
ISL9R3060P2	Not Related	1.0841		83.39801915520621	
NAS603-8P	N