In [22]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

# Six short example sentences that make up the toy text corpus.
corpus = [
    "ai is powerful and useful",
    "smart and adaptive system",
    "very smart and useful",
    "learning AI is very hard",
    "ai can be biased",
    "biased and flawed",
]

In [23]:
# Fit a bag-of-words model on the corpus; keep the sparse result in X and a
# dense copy in `matrix`.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
matrix = X.toarray()

# Encode the query sentence with the vocabulary learned above.
sentence1 = ["ai is a smart system"]
X1 = vectorizer.transform(sentence1)

vocabulary_order = vectorizer.get_feature_names_out()
query_counts = X1.toarray()[0]
print("Vocabulary order:", vocabulary_order)
print("Feature vector:", query_counts)

Vocabulary order: ['adaptive' 'ai' 'and' 'be' 'biased' 'can' 'flawed' 'hard' 'is' 'learning'
 'powerful' 'smart' 'system' 'useful' 'very']
Feature vector: [0 1 0 0 0 0 0 0 1 0 0 1 1 0 0]


In [26]:
# Encode a second sentence, then materialise both sentences as dense 2-D
# (single-row) count arrays.
sentence2 = ['learning AI is very hard']
X2 = vectorizer.transform(sentence2)
vector1, vector2 = X1.toarray(), X2.toarray()

In [32]:
def euclid_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vec1: Array-like of numbers (list, tuple or ndarray).
        vec2: Array-like of numbers, broadcast-compatible with ``vec1``.

    Returns:
        A float scalar: the square root of the sum of squared element-wise
        differences, taken over all elements after broadcasting.
    """
    # Coerce to float arrays so that plain Python lists are accepted and
    # unsigned-integer inputs cannot wrap around when subtracted.
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

In [29]:
# Euclidean distance between the two dense sentence vectors.
distance1 = euclid_distance(vec1=vector1, vec2=vector2)
print(distance1)

[2.23606798]


In [34]:
# Rank every corpus row by its Euclidean distance to the query vector.
# Each entry is (row index, distance); ties keep their original row order.
distances = sorted(
    ((row_idx, euclid_distance(vector1, row)) for row_idx, row in enumerate(matrix)),
    key=lambda pair: pair[1],
)

# Report the three closest sentences, numbered from 1 in the printout.
for idx, dist in distances[:3]:
    print(f"- sentence {idx+1}: \"{corpus[idx]}\" (Distance = {dist:.4f})")

- sentence 2: "smart and adaptive system" (Distance = 2.0000)
- sentence 1: "ai is powerful and useful" (Distance = 2.2361)
- sentence 4: "learning AI is very hard" (Distance = 2.2361)


--

In [38]:
# Load the dataset, then pull out three feature columns (as a list of rows)
# and the diagnosis column (as a flat list).
df = pd.read_csv('final_dataset.csv')
feature_columns = ["perimeter_mean", "area_mean", "compactness_mean"]
X_train = df[feature_columns].values.tolist()
labels = df['diagnosis'].values.tolist()

In [39]:
# Encode the diagnosis labels as integers: 'B' becomes 0, anything else 1.
y_train = [0 if label == 'B' else 1 for label in labels]

print(type(y_train))
y_train

<class 'list'>


[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [40]:
# Pair each feature row with its encoded label as (features, label) tuples.
train_data = list(zip(X_train, y_train))
train_data

[([0.29, 0.18, 0.12], 0),
 ([0.11, 0.06, 0.17], 0),
 ([0.34, 0.21, 0.2], 0),
 ([0.26, 0.15, 0.09], 0),
 ([0.22, 0.09, 0.2], 0),
 ([0.58, 0.42, 0.47], 1),
 ([0.56, 0.37, 0.77], 1),
 ([0.48, 0.38, 0.28], 1),
 ([0.63, 0.49, 0.39], 1),
 ([0.54, 0.42, 0.43], 1)]

In [41]:
# Query point that the cells below compare against the training rows.
x = [0.25, 0.25, 0.25]


def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        x1: Array-like of numbers (list, tuple or ndarray).
        x2: Array-like of numbers, broadcast-compatible with ``x1``.

    Returns:
        A float scalar: the sum of absolute element-wise differences.
    """
    # Coerce both sides to float arrays; subtracting two plain lists would
    # otherwise raise a TypeError.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sum(np.abs(a - b))

In [48]:
# Manhattan distance from the query point x to each of the first three training rows.
for row_idx in range(3):
    print(manhattan_distance(x, np.array(X_train[row_idx])))

0.24
0.41000000000000003
0.18000000000000002


In [50]:
# (distance to x, label) for every training sample, in training order.
distances = [
    (manhattan_distance(x, np.array(features)), label)
    for features, label in train_data
]

In [51]:
# Order the (distance, label) pairs from nearest to farthest, in place.
distances.sort(key=lambda pair: pair[0])

# Keep the k closest neighbours and count how many carry each label.
k = 7
nearest_k = distances[:k]
neighbour_labels = [label for _, label in nearest_k]
class_0 = neighbour_labels.count(0)
class_1 = neighbour_labels.count(1)

In [52]:
# Neighbour counts for label 0 and label 1, separated by a single space.
print(f"{class_0} {class_1}")

5 2


--

In [53]:
# Refit the vectorizer on the corpus; X holds the resulting document-term counts.
X = vectorizer.fit_transform(corpus)

# Learned vocabulary (term -> column index), ordered alphabetically by term.
vocabulary_by_term = dict(sorted(vectorizer.vocabulary_.items()))
print("================== Output =================")
display(f"Bộ từ vựng xây dựng từ corpus: {vocabulary_by_term}")
print("===========================================")



"Bộ từ vựng xây dựng từ corpus: {'adaptive': 0, 'ai': 1, 'and': 2, 'be': 3, 'biased': 4, 'can': 5, 'flawed': 6, 'hard': 7, 'is': 8, 'learning': 9, 'powerful': 10, 'smart': 11, 'system': 12, 'useful': 13, 'very': 14}"



In [54]:
# Build the dense document-term matrix from the fitted vectorizer rather than
# converting X in place: once X is a dense ndarray it no longer has .toarray(),
# so an in-place conversion would fail when this cell is run a second time.
X = vectorizer.transform(corpus).toarray()
print("================== Output =================")
print("Vector đại diện cho bộ corpus sau khi vectorization: ")
print(X)
print("===========================================")

Vector đại diện cho bộ corpus sau khi vectorization: 
[[0 1 1 0 0 0 0 0 1 0 1 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 1 1]
 [0 1 0 0 0 0 0 1 1 1 0 0 0 0 1]
 [0 1 0 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 0 0 0]]


In [55]:
# Take document rows 0 and 4 as the two initial cluster centres.
C1, C2 = X[0], X[4]

In [59]:
# Distance from C1 to each remaining document row (rows 0 and 4 are the
# rows C1 and C2 were taken from, so they are skipped).
for doc_idx in (1, 2, 3, 5):
    print(euclid_distance(C1, X[doc_idx]))

2.6457513110645907
2.23606797749979
2.449489742783178
2.449489742783178


In [60]:
# Distance from C2 to each remaining document row (rows 0 and 4 are the
# rows C1 and C2 were taken from, so they are skipped).
for doc_idx in (1, 2, 3, 5):
    print(euclid_distance(C2, X[doc_idx]))

2.8284271247461903
2.8284271247461903
2.6457513110645907
2.23606797749979


In [61]:
# Assign every document row to the nearer of the two centres.
# A row is labelled "C1" only when it is strictly closer to C1; ties go to "C2".
allocation = [
    "C1" if euclid_distance(doc_vec, C1) < euclid_distance(doc_vec, C2) else "C2"
    for doc_vec in X
]

print("Vector biểu diễn phân cụm:")
print(allocation)

Vector biểu diễn phân cụm:
['C1', 'C1', 'C1', 'C1', 'C2', 'C2']


In [62]:
# Centroid of each cluster: element-wise average of its member rows
# (rows 0-3 for the first cluster, rows 4-5 for the second).
members_c1 = X[[0, 1, 2, 3]]
members_c2 = X[[4, 5]]

# Rows are added left to right starting from the first member.
mean_c1 = sum(members_c1[1:], members_c1[0]) / len(members_c1)
mean_c2 = sum(members_c2[1:], members_c2[0]) / len(members_c2)
print(mean_c1, mean_c2)

[0.25 0.5  0.75 0.   0.   0.   0.   0.25 0.5  0.25 0.25 0.5  0.25 0.5
 0.5 ] [0.  0.5 0.5 0.5 1.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]


In [63]:
# Re-assign every document row, this time to the nearer cluster mean.
# A row is labelled "C1" only when it is strictly closer to mean_c1; ties go to "C2".
allocation = []
for doc_vec in X:
    closer_to_first = euclid_distance(doc_vec, mean_c1) < euclid_distance(doc_vec, mean_c2)
    allocation.append("C1" if closer_to_first else "C2")

print("Vector biểu diễn phân cụm:")
print(allocation)

Vector biểu diễn phân cụm:
['C1', 'C1', 'C1', 'C1', 'C2', 'C2']


In [64]:
# For each of the six document rows, print its distance to mean_c1 and then
# to mean_c2, followed by a separator line.
for doc_idx in range(6):
    doc_vec = X[doc_idx]
    for centroid in (mean_c1, mean_c2):
        print(euclid_distance(centroid, doc_vec))
    print('----------------------')

1.4577379737113252
2.29128784747792
----------------------
1.620185174601965
2.29128784747792
----------------------
1.2747548783981961
2.29128784747792
----------------------
1.7677669529663689
2.5
----------------------
2.2638462845343543
1.118033988749895
----------------------
1.9039432764659772
1.118033988749895
----------------------


--

In [70]:
# Use training samples 2 and 8 as the two reference points C1 and C2.
# (A bare `X_train` expression that used to open this cell was dropped: only a
# cell's last expression is displayed, so it had no effect.)
C1 = np.array(X_train[2])
C2 = np.array(X_train[8])

In [73]:
# Total Euclidean distance from each reference point to the ten training
# samples, leaving out the sample the reference point was taken from
# (index 2 for C1, index 8 for C2).
sum_1, sum_2 = 0, 0
for sample_idx in range(10):
    point = np.array(X_train[sample_idx])
    if sample_idx != 2:
        sum_1 = sum_1 + euclid_distance(C1, point)
    if sample_idx != 8:
        sum_2 = sum_2 + euclid_distance(C2, point)
print(sum_1, sum_2)

2.793110965810563 3.7365721014281092


In [75]:
# Label each training sample by the nearer reference point:
# 0 when strictly closer to C1, otherwise 1 (ties included).
allocation = []
for features in X_train:
    point = np.array(features)
    nearer_to_c1 = euclid_distance(point, C1) < euclid_distance(point, C2)
    allocation.append(0 if nearer_to_c1 else 1)

print("Vector biểu diễn phân cụm:")
print(allocation)

Vector biểu diễn phân cụm:
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]


In [76]:
# For every training sample, print its distance to C1 and then to C2,
# followed by a separator line.
for features in X_train:
    point = np.array(features)
    print(euclid_distance(C1, point))
    print(euclid_distance(C2, point))
    print('----------------------')

0.09899494936611669
0.5334791467339656
----------------------
0.2762245463386627
0.7097182539571602
----------------------
0.0
0.44564559910314383
----------------------
0.14866068747318506
0.5852349955359812
----------------------
0.16970562748477142
0.6034898507845845
----------------------
0.41785164831552346
0.1174734012447073
----------------------
0.6315853069855252
0.4045985664828782
----------------------
0.23430749027719963
0.2161018278497431
----------------------
0.44564559910314383
0.0
----------------------
0.37013511046643494
0.1208304597359457
----------------------


In [78]:
# Work on an ndarray so rows can be selected and averaged element-wise.
X_train = np.array(X_train)

# Cluster means: samples 0-4 form the first group, samples 5-9 the second.
group_1 = X_train[[0, 1, 2, 3, 4]]
group_2 = X_train[[5, 6, 7, 8, 9]]

# Rows are added left to right starting from the first member.
mean_c1 = sum(group_1[1:], group_1[0]) / len(group_1)
mean_c2 = sum(group_2[1:], group_2[0]) / len(group_2)
print(mean_c1, mean_c2)

[0.244 0.138 0.156] [0.558 0.416 0.468]


In [80]:
# For every training sample, print its distance to mean_c1 and then to
# mean_c2, followed by a separator line.
for sample in X_train:
    for centroid in (mean_c1, mean_c2):
        print(euclid_distance(centroid, sample))
    print('----------------------')

0.07194442299441979
0.49862210139543567
----------------------
0.155679157243351
0.6451697451058908
----------------------
0.12781236246936367
0.40222381829026493
----------------------
0.06896375859826669
0.5499490885527497
----------------------
0.06939740629158989
0.5406884500338436
----------------------
0.5394589882465579
0.02244994432064354
----------------------
0.7284751196849485
0.30548977069617234
----------------------
0.36004999652826
0.20669784711022035
----------------------
0.5724124387187965
0.12939860895697447
----------------------
0.4921544473028767
0.04223742416388583
----------------------
