# 1. Implement a K-Nearest Neighbors (KNN) Algorithm

In [1]:
from typing import List, Tuple
import math
from collections import Counter

def euclidean_distance(point1: Tuple[float, float], point2: Tuple[float, float]) -> float:
    """Return the straight-line distance between two 2-D points.

    Only the first two coordinates of each argument are read.
    """
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    squared_length = delta_x ** 2 + delta_y ** 2
    return math.sqrt(squared_length)

def knn_classifier(data_points: List[Tuple[float, float, str]], new_point: Tuple[float, float], k: int = 3) -> str:
    """Predict the label of ``new_point`` by majority vote among its k nearest neighbours.

    Args:
        data_points: Labelled samples given as ``(x, y, label)`` tuples.
        new_point: The ``(x, y)`` point to classify.
        k: Number of neighbours that take part in the vote. If ``k`` exceeds
            the number of samples, every sample votes.

    Returns:
        The most frequent label among the k closest samples. When several
        labels share the top count, the one whose first vote comes from the
        closest neighbour wins.

    Raises:
        ValueError: If ``data_points`` is empty or ``k`` is not positive.
    """
    if not data_points:
        raise ValueError("data_points must contain at least one labelled sample.")
    # A negative k would otherwise slice off the *farthest* samples
    # (distances[:-1]) and silently vote with the wrong neighbours.
    if k <= 0:
        raise ValueError("k must be a positive integer.")

    # Step 1: Distance from the new point to every sample, paired with its label.
    # Computed inline so the result does not depend on which of this file's
    # several euclidean_distance definitions is currently bound.
    distances = [
        (
            math.sqrt((new_point[0] - point[0]) ** 2 + (new_point[1] - point[1]) ** 2),
            point[2],
        )
        for point in data_points
    ]

    # Step 2: Closest first. The sort is stable, so equally distant samples
    # keep their original order.
    distances.sort(key=lambda pair: pair[0])

    # Step 3: Labels of the k nearest neighbours
    k_nearest_labels = [label for _, label in distances[:k]]

    # Step 4: Majority vote
    return Counter(k_nearest_labels).most_common(1)[0][0]


In [2]:
# Labelled training samples, each given as (x, y, label)
data_points = [
    (1.0, 2.0, 'A'),
    (2.0, 3.0, 'B'),
    (3.0, 4.0, 'A'),
    (5.0, 6.0, 'B'),
    (1.5, 1.8, 'A')
]

# Unlabelled (x, y) point whose label we want to predict
new_point = (2.5, 3.5)

# Classify with a majority vote among the 3 closest samples and show the result
predicted_label = knn_classifier(data_points, new_point, k=3)
print(predicted_label)

A


# 2. Remove Outliers from Data

In [3]:
from typing import List
import statistics

def remove_outliers(data: List[float]) -> List[float]:
    """Return ``data`` without the values that lie far from the mean.

    A value is treated as an outlier when it is more than two sample standard
    deviations away from the mean of the list. The mean, the standard
    deviation, the resulting bounds and the filtered list are also printed
    to standard output.

    Args:
        data: The numbers to filter.

    Returns:
        A new list holding the values inside ``mean ± 2 * std_dev``, in their
        original order. Lists with fewer than two values are returned as an
        unchanged copy (nothing is printed in that case).
    """
    # statistics.stdev needs at least two values; with zero or one value
    # there is no spread to measure, so nothing can be an outlier.
    if len(data) < 2:
        return list(data)

    # Step 1: Calculate the mean and (sample) standard deviation
    mean = statistics.mean(data)
    std_dev = statistics.stdev(data)

    # Step 2: Define the threshold for outliers (mean ± 2 * std_dev)
    lower_bound = mean - 2 * std_dev
    upper_bound = mean + 2 * std_dev

    print(f"Mean: {mean}, Std Dev: {std_dev}")
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

    # Step 3: Keep only the values inside the bounds (bounds are inclusive)
    filtered_data = [x for x in data if lower_bound <= x <= upper_bound]

    print(f"Filtered Data: {filtered_data}")

    return filtered_data


In [4]:
# Sample readings: five values close together plus one much larger value
data = [10, 12, 14, 16, 18, 100]
# remove_outliers prints its intermediate statistics before returning
cleaned_data = remove_outliers(data)
print("Cleaned Data:", cleaned_data)


Mean: 28.333333333333332, Std Dev: 35.223098481914775
Lower Bound: -42.11286363049622, Upper Bound: 98.77953029716288
Filtered Data: [10, 12, 14, 16, 18]
Cleaned Data: [10, 12, 14, 16, 18]


# 3. Optimize Matrix Multiplication for a Neural Network

In [5]:
from typing import List

def matrix_multiply(mat1: List[List[int]], mat2: List[List[int]]) -> List[List[int]]:
    """Return the matrix product ``mat1 x mat2``.

    Args:
        mat1: Left operand, an ``m x n`` matrix given as a list of rows.
        mat2: Right operand, an ``n x p`` matrix given as a list of rows.

    Returns:
        A new ``m x p`` matrix (list of rows).

    Raises:
        ValueError: If either matrix is empty, if a matrix has rows of
            different lengths, or if the number of columns in ``mat1`` does
            not equal the number of rows in ``mat2``.
    """
    # Reject empty operands up front; indexing mat1[0] / mat2[0] below would
    # otherwise fail with an unrelated IndexError.
    if not mat1 or not mat2 or not mat1[0] or not mat2[0]:
        raise ValueError("Matrices must be non-empty.")

    inner = len(mat1[0])
    out_cols = len(mat2[0])
    if any(len(row) != inner for row in mat1) or any(len(row) != out_cols for row in mat2):
        raise ValueError("Matrices must be rectangular (all rows the same length).")

    # Check if matrices can be multiplied (columns in mat1 must equal rows in mat2)
    if inner != len(mat2):
        raise ValueError("Incompatible matrices: Cannot multiply matrices with dimensions {}x{} and {}x{}.".format(len(mat1), len(mat1[0]), len(mat2), len(mat2[0])))

    # Transpose mat2 once so each output cell is a plain row-by-column dot
    # product, without repeated mat2[k][j] indexing in the innermost loop.
    columns = list(zip(*mat2))
    return [
        [sum(a * b for a, b in zip(row, column)) for column in columns]
        for row in mat1
    ]


In [6]:
# Left operand: 3 rows x 2 columns
mat1 = [
    [1, 2],
    [3, 4],
    [5, 6]
]

# Right operand: 2 rows x 3 columns
mat2 = [
    [7, 8, 9],
    [10, 11, 12]
]

# Multiply and print the product one row per line
result = matrix_multiply(mat1, mat2)
for row in result:
    print(row)


[27, 30, 33]
[61, 68, 75]
[95, 106, 117]


# 4. Word Embedding Similarity

In [7]:
from typing import List
import math

def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    """Return the cosine of the angle between two equal-length vectors.

    Raises:
        ValueError: If the vectors differ in length, or if either vector has
            zero magnitude (the similarity is undefined in that case).
    """
    if len(vec1) != len(vec2):
        raise ValueError("Vectors must have the same length.")

    # Euclidean norm of each vector
    norm_first = math.sqrt(sum(component ** 2 for component in vec1))
    norm_second = math.sqrt(sum(component ** 2 for component in vec2))

    # A zero-length vector has no direction, so the angle is undefined
    if not (norm_first != 0 and norm_second != 0):
        raise ValueError("One of the vectors has zero magnitude, so cosine similarity is undefined.")

    # Dot product divided by the product of the norms
    inner_product = sum(a * b for a, b in zip(vec1, vec2))
    return inner_product / (norm_first * norm_second)


In [8]:
# Two 3-dimensional example vectors
vec1 = [1.0, 2.0, 3.0]
vec2 = [4.0, 5.0, 6.0]

# Compute and display their cosine similarity
similarity = cosine_similarity(vec1, vec2)
print("Cosine Similarity:", similarity)


Cosine Similarity: 0.9746318461970762


# 5. Implement a Min-Heap Using a Priority Queue 

In [9]:
import heapq

class MinHeap:
    """Min-heap of values backed by a plain list managed through ``heapq``."""

    def __init__(self):
        # Backing list; heapq keeps the smallest value at index 0
        self.heap = []

    def _require_items(self) -> None:
        """Raise IndexError when the heap holds no values."""
        if len(self.heap) == 0:
            raise IndexError("Heap is empty.")

    def insert(self, value: int) -> None:
        """Add ``value`` to the heap."""
        heapq.heappush(self.heap, value)

    def get_min(self) -> int:
        """Return the smallest stored value, leaving the heap unchanged.

        Raises:
            IndexError: If the heap is empty.
        """
        self._require_items()
        smallest = self.heap[0]
        return smallest

    def extract_min(self) -> int:
        """Remove the smallest stored value and return it.

        Raises:
            IndexError: If the heap is empty.
        """
        self._require_items()
        smallest = heapq.heappop(self.heap)
        return smallest


In [10]:
# Start with an empty heap
min_heap = MinHeap()

# Insert values in arbitrary (unsorted) order
min_heap.insert(5)
min_heap.insert(3)
min_heap.insert(8)
min_heap.insert(1)

# Peek at the smallest value without removing it
print("Minimum Element:", min_heap.get_min())

# Remove the smallest value and show it
print("Extracted Minimum:", min_heap.extract_min())

# Peek again: the next-smallest value is now the minimum
print("New Minimum Element:", min_heap.get_min())


Minimum Element: 1
Extracted Minimum: 1
New Minimum Element: 3


# 6. Implement a Support Vector Machine (SVM) Classifier

In [11]:
from typing import List, Tuple

def svm_classifier(data_points: List[Tuple[float, float, int]], new_point: Tuple[float, float]) -> int:
    """Classify ``new_point`` against a fixed linear decision boundary.

    The decision value is ``1.0 * x - 1.0 * y + 0.0``; the point is assigned
    class ``1`` when that value is non-negative and class ``-1`` otherwise.

    NOTE(review): no training happens here. ``data_points`` is accepted but
    never read, and the weights and bias are hard-coded, so the result does
    not depend on the labelled samples passed in.
    """
    # Hand-picked hyperplane parameters (not learned from data_points)
    weight_x, weight_y = 1.0, -1.0
    bias = 0.0

    # Signed score of the point relative to the hyperplane
    score = weight_x * new_point[0] + weight_y * new_point[1] + bias

    # Points on the boundary itself (score == 0) fall into class +1
    return 1 if score >= 0 else -1


In [12]:
# Labelled samples; labels use the +1 / -1 convention
data_points = [
    (1.0, 2.0, 1),    # (x, y, label)
    (2.0, 3.0, 1),
    (3.0, 3.0, -1),
    (4.0, 1.0, -1)
]

# Unlabelled (x, y) point to classify
new_point = (2.5, 2.5)

# Run the classifier and display the predicted class
predicted_label = svm_classifier(data_points, new_point)
print("Predicted Label:", predicted_label)


Predicted Label: 1


# 7. Calculate the Z-Score of Data

In [13]:
from typing import List

def calculate_z_scores(data: List[float]) -> List[float]:
    """Return the z-score of every value in ``data``.

    Each z-score is ``(x - mean) / std_dev``, where ``std_dev`` is the
    population standard deviation (the variance divides by ``len(data)``).

    Args:
        data: The numbers to standardise.

    Returns:
        A list of z-scores in the same order as ``data``. An empty input
        yields an empty list. When every value is identical the standard
        deviation is zero and all z-scores are reported as ``0.0``.
    """
    # No values: nothing to standardise (and the mean would divide by zero)
    if not data:
        return []

    # Step 1: Calculate the mean of the data
    count = len(data)
    mean = sum(data) / count

    # Step 2: Calculate the population standard deviation
    variance = sum((x - mean) ** 2 for x in data) / count
    std_dev = variance ** 0.5

    # Constant data has no spread; every value sits exactly on the mean
    if std_dev == 0:
        return [0.0] * count

    # Step 3: Calculate the Z-score for each data point
    return [(x - mean) / std_dev for x in data]


In [14]:
# Sample data: five small values followed by two much larger ones
data = [10, 12, 14, 16, 18, 100, 102]
# Standardise every value and display the resulting list
z_scores = calculate_z_scores(data)
print("Z-Scores:", z_scores)


Z-Scores: [-0.7328081543718783, -0.6820194704055105, -0.6312307864391427, -0.5804421024727748, -0.5296534185064071, 1.5526826241146732, 1.603471308081041]


# 8. K-Means Clustering Implementation

In [15]:
import random
from typing import List, Tuple

def euclidean_distance(p1: Tuple[float, float], p2: Tuple[float, float]) -> float:
    """Return the Euclidean distance between two 2-D points.

    Only the first two coordinates of each point are read.
    """
    diff_x = p1[0] - p2[0]
    diff_y = p1[1] - p2[1]
    squared_sum = diff_x ** 2 + diff_y ** 2
    return squared_sum ** 0.5

def calculate_centroid(cluster: List[Tuple[float, float]]) -> Tuple[float, float]:
    """Return the coordinate-wise mean of the points in ``cluster``.

    An empty cluster raises ``ZeroDivisionError`` because the mean divides
    by the number of points.
    """
    size = len(cluster)
    total_x = sum(point[0] for point in cluster)
    total_y = sum(point[1] for point in cluster)
    return (total_x / size, total_y / size)

def k_means_clustering(data_points: List[Tuple[float, float]], k: int, max_iterations: int = 10) -> List[Tuple[float, float]]:
    """Cluster 2-D points with k-means and return the final centroids.

    Initial centroids are ``k`` points drawn at random (without replacement)
    from ``data_points``. Each iteration assigns every point to its nearest
    centroid and then moves each centroid to the mean of its assigned points.

    Args:
        data_points: The ``(x, y)`` points to cluster.
        k: Number of clusters.
        max_iterations: Upper limit on assignment/update rounds. The loop
            stops earlier once the centroids no longer change.

    Returns:
        A list of ``k`` centroids as ``(x, y)`` tuples.

    Raises:
        ValueError: If ``k`` is not positive or exceeds the number of points
            (the latter is raised by ``random.sample``).
    """
    if k <= 0:
        raise ValueError("k must be a positive integer.")

    # Step 1: Randomly initialize k centroids from the data points
    centroids = random.sample(data_points, k)

    for _ in range(max_iterations):
        clusters = [[] for _ in range(k)]

        # Step 2: Assign each point to the nearest centroid (first one wins
        # on ties). The distance is computed inline so this function does not
        # depend on which euclidean_distance definition is currently bound.
        for point in data_points:
            distances = [
                ((point[0] - cx) ** 2 + (point[1] - cy) ** 2) ** 0.5
                for cx, cy in centroids
            ]
            clusters[distances.index(min(distances))].append(point)

        # Step 3: Move each centroid to the mean of its cluster. A cluster can
        # end up empty (e.g. duplicate points picked as initial centroids);
        # keep its previous centroid instead of dividing by zero.
        updated = []
        for centroid, cluster in zip(centroids, clusters):
            if cluster:
                size = len(cluster)
                updated.append((
                    sum(p[0] for p in cluster) / size,
                    sum(p[1] for p in cluster) / size,
                ))
            else:
                updated.append(centroid)

        # Converged: further rounds would reproduce the same assignment
        if updated == centroids:
            break
        centroids = updated

    return centroids


In [16]:
# Eight 2-D sample points
data_points = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (5.0, 6.0),
               (8.0, 8.0), (9.0, 10.0), (10.0, 10.0), (10.0, 9.0)]

# Number of clusters to look for
k = 2
# Initial centroids are chosen at random, so the order of the returned
# centroids can differ between runs
centroids = k_means_clustering(data_points, k)
print("Final Centroids:", centroids)


Final Centroids: [(2.75, 3.75), (9.25, 9.25)]


# 9. Evaluate Classification Model Using F1 Score

In [17]:
from typing import List

def f1_score(true_labels: List[int], predicted_labels: List[int], positive_label: int = 1) -> float:
    """Return the F1 score of the predictions for the positive class.

    F1 is the harmonic mean of precision and recall. Any label other than
    ``positive_label`` counts as negative, so both ``0``/``1`` and
    ``-1``/``+1`` encodings are handled.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels, aligned with ``true_labels``.
        positive_label: The label treated as the positive class.

    Returns:
        The F1 score in ``[0, 1]``; ``0.0`` when there are no true positives.

    Raises:
        ValueError: If the two label lists differ in length.
    """
    # zip() would silently drop the unmatched tail and score a partial list
    if len(true_labels) != len(predicted_labels):
        raise ValueError("true_labels and predicted_labels must have the same length.")

    # Count True Positives (TP), False Positives (FP), and False Negatives (FN)
    # in a single pass
    tp = fp = fn = 0
    for true, pred in zip(true_labels, predicted_labels):
        if pred == positive_label:
            if true == positive_label:
                tp += 1
            else:
                fp += 1
        elif true == positive_label:
            fn += 1

    # Without true positives precision and recall are both zero
    if tp == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * (precision * recall) / (precision + recall)


In [18]:
# Ground-truth and predicted binary labels for ten samples (aligned by index)
true_labels = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
predicted_labels = [1, 0, 1, 0, 0, 1, 0, 1, 1, 0]

# Score the predictions and display the result
f1 = f1_score(true_labels, predicted_labels)
print("F1 Score:", f1)


F1 Score: 0.8000000000000002


# 10. Visualize Data Distribution Using a Histogram

In [19]:
from typing import List, Dict

def create_histogram(data: List[float], bins: int) -> Dict[str, int]:
    """Count how many values of ``data`` fall into each of ``bins`` equal-width bins.

    The bins span ``min(data)`` to ``max(data)``. Every bin includes its lower
    edge and excludes its upper edge, except the last bin, which also
    includes the maximum value.

    Args:
        data: The values to bin.
        bins: Number of bins.

    Returns:
        A dict mapping ``"<start> - <end>"`` labels (edges rounded to two
        decimals) to counts, in ascending bin order. Bins whose rounded
        labels coincide share a single entry.

    Raises:
        ValueError: If ``data`` is empty or ``bins`` is not positive.
    """
    if bins <= 0:
        raise ValueError("bins must be a positive integer.")
    if not data:
        raise ValueError("data must contain at least one value.")

    # Find the minimum and maximum values in the data
    min_val = min(data)
    max_val = max(data)

    # All values identical: the range would have zero width and the bin
    # index below would divide by zero. Widen it by 0.5 on each side.
    if min_val == max_val:
        min_val -= 0.5
        max_val += 0.5

    # Calculate the width of each bin
    bin_width = (max_val - min_val) / bins

    # Build every bin label once, in ascending order
    labels = []
    for i in range(bins):
        bin_start = min_val + i * bin_width
        bin_end = bin_start + bin_width
        labels.append(f"{round(bin_start, 2)} - {round(bin_end, 2)}")

    # Assign each data point to the appropriate bin
    counts = [0] * bins
    for value in data:
        # A value equal to max_val computes index == bins; clamp it into the last bin
        bin_index = min(int((value - min_val) / bin_width), bins - 1)
        counts[bin_index] += 1

    # Accumulate by label so that labels which round to the same text merge
    histogram = {}
    for label, count in zip(labels, counts):
        histogram[label] = histogram.get(label, 0) + count

    return histogram


In [20]:
# Sample values between 1 and 10, some of them repeated
data = [1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 10]
bins = 4

# Build the histogram and print one "range: count" line per bin
hist = create_histogram(data, bins)
for bin_range, count in hist.items():
    print(f"{bin_range}: {count}")


1.0 - 3.25: 4
3.25 - 5.5: 3
5.5 - 7.75: 2
7.75 - 10.0: 4


# 11. Implement a Decision Tree Classifier

In [21]:
from typing import List, Tuple, Any

class DecisionTree:
    """Decision tree classifier that splits on exact feature values.

    Internal nodes are dicts of the form
    ``{"feature_index": int, "children": {feature_value: subtree}}``;
    leaves are plain labels. At every node the feature whose split leaves
    the lowest weighted label entropy is chosen.
    """

    def __init__(self):
        # Root of the fitted tree; None until fit() has been called
        self.tree = None

    def fit(self, data: List[Tuple[List[float], str]]):
        """Build the decision tree using the training data.

        Args:
            data: Training samples as ``(features, label)`` pairs.

        Raises:
            ValueError: If ``data`` is empty.
        """
        self.tree = self._build_tree(data)

    def _build_tree(self, data: List[Tuple[List[float], str]]) -> Any:
        """Recursively build the (sub)tree for ``data``."""
        if not data:
            raise ValueError("Training data must not be empty.")

        labels = [label for _, label in data]

        # If all labels are the same, return that label (leaf node)
        if len(set(labels)) == 1:
            return labels[0]

        # Only features that still take more than one value here can separate
        # the samples. Splitting on a constant feature would recurse on the
        # very same data forever.
        feature_count = len(data[0][0])
        usable = [
            index for index in range(feature_count)
            if len({features[index] for features, _ in data}) > 1
        ]

        # No feature can split the data any further: fall back to the
        # majority label
        if not usable:
            return self._majority_label(labels)

        # Find the best feature to split on
        best_feature_index = self._best_feature_to_split(data, usable)

        # Group the samples by their value of the chosen feature (one pass)
        groups = {}
        for sample in data:
            groups.setdefault(sample[0][best_feature_index], []).append(sample)

        # Every group is strictly smaller than data, so the recursion ends
        tree = {"feature_index": best_feature_index, "children": {}}
        for value, subset in groups.items():
            tree["children"][value] = self._build_tree(subset)

        return tree

    @staticmethod
    def _majority_label(labels: List[str]) -> str:
        """Return the most frequent label; ties go to the label seen first."""
        return Counter(labels).most_common(1)[0][0]

    @staticmethod
    def _entropy(labels: List[str]) -> float:
        """Return the Shannon entropy (in bits) of a list of labels."""
        total = len(labels)
        return -sum(
            (count / total) * math.log2(count / total)
            for count in Counter(labels).values()
        )

    def _best_feature_to_split(self, data: List[Tuple[List[float], str]], candidates: List[int] = None) -> int:
        """Return the index of the feature whose split is purest.

        Purity is measured as the weighted average entropy of the labels in
        each value group (lower is better). ``candidates`` restricts the
        features considered; by default every feature is a candidate. Ties
        go to the earliest candidate.
        """
        if candidates is None:
            candidates = range(len(data[0][0]))

        def weighted_entropy(feature_index: int) -> float:
            groups = {}
            for features, label in data:
                groups.setdefault(features[feature_index], []).append(label)
            return sum(
                len(group) / len(data) * self._entropy(group)
                for group in groups.values()
            )

        return min(candidates, key=weighted_entropy)

    def predict(self, point: List[float]) -> str:
        """Predict the label of a new point using the decision tree.

        Returns:
            The predicted label, or ``"Unknown"`` when the point carries a
            feature value that was never seen at the corresponding node.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit().")
        return self._predict(self.tree, point)

    def _predict(self, tree: Any, point: List[float]) -> str:
        """Recursively predict the label for the given point."""
        # Anything that is not a node dict is a leaf label
        if not isinstance(tree, dict):
            return tree

        # Get the feature index to split on
        feature_value = point[tree["feature_index"]]

        # Traverse the tree
        children = tree["children"]
        if feature_value in children:
            return self._predict(children[feature_value], point)
        return "Unknown"


if __name__ == "__main__":
    # Training data: (features, label) pairs with two numeric features each
    data = [
        ([1, 2], 'A'),
        ([1, 3], 'A'),
        ([2, 2], 'B'),
        ([2, 3], 'B'),
        ([3, 3], 'A'),
    ]

    # Create the classifier and build its tree from the training data
    tree = DecisionTree()
    tree.fit(data)

    # Predict the label of a single feature vector and display it
    new_point = [2, 2]
    predicted_label = tree.predict(new_point)
    print("Predicted label for", new_point, "is:", predicted_label)


Predicted label for [2, 2] is: B


# 12. Normalize Data Using Min-Max Scaling

In [22]:
from typing import List

def min_max_normalization(data: List[float]) -> List[float]:
    """Rescale ``data`` linearly so the smallest value maps to 0 and the largest to 1.

    An empty input yields an empty list. When every value is identical there
    is no range to scale by, and each value maps to ``0.0``.
    """
    if not data:
        return []

    lowest, highest = min(data), max(data)

    # Zero-width range: dividing by (highest - lowest) would fail
    if lowest == highest:
        return [0.0 for _ in data]

    span = highest - lowest
    return [(value - lowest) / span for value in data]

# Example usage: normalize five evenly spaced values and show before/after
if __name__ == "__main__":
    data = [10, 15, 20, 25, 30]
    normalized_data = min_max_normalization(data)
    print("Original Data:", data)
    print("Normalized Data:", normalized_data)


Original Data: [10, 15, 20, 25, 30]
Normalized Data: [0.0, 0.25, 0.5, 0.75, 1.0]


# 13. Calculate Euclidean Distance Between Two Points

In [23]:
from typing import List
import math

def euclidean_distance(point1: List[float], point2: List[float]) -> float:
    """Return the Euclidean distance between two points of equal dimension.

    Raises:
        ValueError: If the points have a different number of coordinates.
    """
    if len(point1) != len(point2):
        raise ValueError("Points must have the same number of dimensions")

    # Squared difference along every axis, then the root of their total
    squared_diffs = [(a - b) ** 2 for a, b in zip(point1, point2)]
    return math.sqrt(sum(squared_diffs))

# Example usage: distance between two 3-dimensional points
if __name__ == "__main__":
    point_a = [1.0, 2.0, 3.0]
    point_b = [4.0, 5.0, 6.0]
    distance = euclidean_distance(point_a, point_b)
    print(f"The Euclidean distance between {point_a} and {point_b} is: {distance}")


The Euclidean distance between [1.0, 2.0, 3.0] and [4.0, 5.0, 6.0] is: 5.196152422706632
