In [None]:
import pandas as pd
import numpy as np

#Below are all the distances that will be used for KNN
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        Vectors of equal (or broadcastable) shape. Lists, tuples and NumPy
        arrays are all accepted.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x1 - x2) ** 2))``.
    """
    # Convert to float first: subtracting in the input dtype wraps around for
    # unsigned / small integer arrays (e.g. uint8 0 - 20 -> 236), and plain
    # Python lists do not support ``-`` at all.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

def cosine_similarity(x1, x2):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(x1, x2) / (||x1|| * ||x2||)``. When either vector has zero
        norm the angle is undefined; 0.0 is returned in that case instead of
        dividing by zero (which would produce NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(x1) * np.linalg.norm(x2)
    if norm_product == 0:
        # At least one all-zero vector: treat it as "no similarity".
        return 0.0
    return np.dot(x1, x2) / norm_product

def cosine_distance(x1, x2):
    """Return the cosine distance, i.e. one minus the cosine similarity."""
    similarity = cosine_similarity(x1, x2)
    return 1 - similarity

def jaccard_distance(x1, x2):
    """Return the Jaccard distance between the sets of values in two inputs.

    Both inputs are converted to sets, so duplicates and element order are
    ignored.

    Parameters
    ----------
    x1, x2 : iterable of hashable values

    Returns
    -------
    float
        ``1 - |A & B| / |A | B|``. Two empty inputs are considered identical
        and yield 0.0 (the ratio would otherwise divide by zero).
    """
    set1, set2 = set(x1), set(x2)
    union_size = len(set1 | set2)
    if union_size == 0:
        # Both sets are empty: nothing distinguishes them.
        return 0.0
    return 1.0 - len(set1 & set2) / union_size


# KNN classifier class with a prediction method for each distance metric
class KNN:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    The training data is simply stored by ``fit``; each prediction computes
    the distance from the query point to every training point, keeps the
    ``k`` closest ones and returns the most frequent label among them.

    Parameters
    ----------
    k : int, default 10
        Number of neighbours that vote. If ``k`` exceeds the number of
        training samples, all training samples vote.
    """

    def __init__(self, k=10):
        self.k = k

    def fit(self, X, Y):
        """Store the training samples ``X`` and their labels ``Y``.

        Both are converted to NumPy arrays so that rows are iterated and
        labels are looked up by position, whatever container was passed in
        (list, DataFrame, Series with a non-default index, ...).
        """
        self.X_train = np.asarray(X)
        self.Y_train = np.asarray(Y)

    def predict(self, X, distance_fn):
        """Predict a label for every row of ``X`` using ``distance_fn``.

        Parameters
        ----------
        X : iterable of samples
        distance_fn : callable ``(x, x_train) -> number``
            Smaller values mean "closer".

        Returns
        -------
        numpy.ndarray
            One predicted label per sample.
        """
        return np.array([self._predict_one(x, distance_fn) for x in X])

    def predictEuclidian(self, X):
        """Predict labels for ``X`` using the Euclidean distance."""
        return np.array([self._predictEuclidian(x) for x in X])

    def predictCosine(self, X):
        """Predict labels for ``X`` using the cosine distance."""
        return np.array([self._predictCosine(x) for x in X])

    def predictJaccard(self, X):
        """Predict labels for ``X`` using the Jaccard distance."""
        return np.array([self._predictJaccard(x) for x in X])

    def _predictEuclidian(self, x):
        return self._predict_one(x, euclidean_distance)

    def _predictCosine(self, x):
        return self._predict_one(x, cosine_distance)

    def _predictJaccard(self, x):
        return self._predict_one(x, jaccard_distance)

    def _predict_one(self, x, distance_fn):
        """Return the majority label among the ``k`` nearest training points."""
        distances = [distance_fn(x, x_train) for x_train in self.X_train]
        # Stable sort: equidistant neighbours are ranked by training order,
        # which keeps predictions deterministic.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        # np.unique returns labels sorted ascending, so argmax breaks vote
        # ties in favour of the smallest label. Unlike np.bincount it is not
        # restricted to non-negative integer labels.
        labels, counts = np.unique(self.Y_train[k_indices], return_counts=True)
        return labels[np.argmax(counts)]

def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split ``X`` and ``y`` into train and test subsets.

    Parameters
    ----------
    X, y : numpy.ndarray
        Samples and labels with the same length along the first axis; both
        are indexed with integer index arrays.
    test_size : float, default 0.2
        Fraction of samples placed in the test subset. The test subset has
        ``int(len(X) * test_size)`` elements (rounded down).
    random_state : int or None, default None
        Seed for the shuffle. ``None`` draws a fresh seed, so the split
        differs between calls.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    # Use a private generator instead of np.random.seed(): the same seed gives
    # the same permutation, but the global NumPy random state used by the rest
    # of the notebook is left untouched.
    rng = np.random.RandomState(random_state)
    shuffled = rng.permutation(len(X))
    test_set_size = int(len(X) * test_size)
    test = shuffled[:test_set_size]
    train = shuffled[test_set_size:]
    return X[train], X[test], y[train], y[test]

# Toy dataset (two numeric features + a 'Find' label) used to try out the KNN methods.
# NOTE(review): every row carries label 0, and with 6 rows / test_size=0.2 there
# are 5 training rows and 1 test row, so k=10 exceeds the training set size.
# All three accuracies are therefore trivially 1.0.
data = [[2,2,0],[3,1,0],
        [4,1,0],[1,5,0],
        [-1,3,0],[-3,-2,0]]

df = pd.DataFrame(data, columns=['Test1', 'Test2', 'Find'])

features = df.loc[:, ['Test1', 'Test2']].to_numpy()
target = df.loc[:, 'Find'].to_numpy()

# Hold out part of the data for evaluation (seeded for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(features, target, random_state=42)

# One classifier instance per distance metric, all sharing the same k
k = 10
knnEuclidian, knnCosine, knnJaccard = KNN(k=k), KNN(k=k), KNN(k=k)

# (message, model, bound predict method) for each metric, evaluated in order
evaluations = [
    ("Euclidian distance Accuracy:", knnEuclidian, knnEuclidian.predictEuclidian),
    ("Cosine distance Accuracy:", knnCosine, knnCosine.predictCosine),
    ("Jaccard distance Accuracy:", knnJaccard, knnJaccard.predictJaccard),
]

for message, model, predict_fn in evaluations:
    model.fit(X_train, Y_train)
    y_pred = predict_fn(X_test)
    # Fraction of test samples whose predicted label matches the true label
    accuracy = np.mean(y_pred == Y_test)
    print(message, accuracy)

Euclidian distance Accuracy: 1.0
Cosine distance Accuracy: 1.0
Jaccard distance Accuracy: 1.0


In [None]:
import pandas as pd
import numpy as np

class NaiveBayes:
    """Categorical naive Bayes classifier based on raw frequency counts.

    For a sample, each class is scored as the product of the per-feature
    likelihoods (fraction of that class's training rows with the same feature
    value) multiplied by the class prior; the highest-scoring class wins.
    No smoothing is applied, so a feature value never seen for a class gives
    that class a score of zero.

    Attributes set by ``fit``
    -------------------------
    X_train, y_train : numpy.ndarray
        The training data.
    classes : numpy.ndarray
        Sorted unique labels found in ``y_train``.
    probabilities : dict
        Maps each class to its prior (class count / total samples).
    """

    def fit(self, X, y):
        """Store the training data and compute the class priors.

        Inputs are converted to NumPy arrays so that lists work as well.
        ``fit`` may be called repeatedly on the same instance.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        self.classes = np.unique(self.y_train)
        # Training rows grouped by class, computed once here instead of for
        # every sample at prediction time.
        self._class_data = {
            class_value: self.X_train[self.y_train == class_value]
            for class_value in self.classes
        }
        # The helper has its own name: assigning the result to an attribute
        # named like the method would shadow the method on the instance and
        # make a second fit() call fail.
        self.probabilities = self._compute_priors()

    def likelihood(self, x, feature_values):
        """Return the fraction of ``feature_values`` that equal ``x``."""
        return (feature_values == x).sum() / len(feature_values)

    def _compute_priors(self):
        """Return ``{class: count / total_samples}`` for the training labels."""
        total_samples = len(self.y_train)
        return {
            class_value: len(rows) / total_samples
            for class_value, rows in self._class_data.items()
        }

    def predict(self, X):
        """Return a list with the predicted class for every sample in ``X``.

        Score ties are resolved in favour of the class that comes first in
        ``self.classes``.
        """
        predictions = []
        for sample in X:
            scores = {}
            for class_value in self.classes:
                class_data = self._class_data[class_value]
                score = 1
                for i, value in enumerate(sample):
                    score *= self.likelihood(value, class_data[:, i])
                scores[class_value] = score * self.probabilities[class_value]
            predictions.append(max(scores, key=scores.get))
        return predictions

# Small categorical dataset: three colour attributes and a 'Happy' label per person
data = [['Green','Red','Brown','Yes'],
        ['Red','Blue','Black','No'],
        ['Black','Green','Brown','No'],
        ['Green','Green','Blonde','Yes'],
        ['Black','Brown','Blonde','Yes']]

df = pd.DataFrame(data, columns=['Favorite Color', 'Eye Color', 'Hair Color', 'Happy'])

# One-hot encode every attribute column; the label column is left as-is
categorical_columns = [name for name in df.columns if name != 'Happy']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

features = df_encoded.drop('Happy', axis=1).to_numpy()
target = df_encoded['Happy'].to_numpy()

# Fit the classifier and score it on the same rows it was trained on
gnb = NaiveBayes()
gnb.fit(features, target)

y_pred = gnb.predict(features)
accuracy = np.mean(y_pred == target)
print("Accuracy:", accuracy)



Accuracy: 1.0
