# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def task1():
    """Clean the diabetes data and print KNN predictions for several K values.

    Steps, in order:

    1. Read ``diabetes.csv`` from the current working directory.
    2. In the columns Glucose, BloodPressure, SkinThickness, Insulin and BMI,
       treat ``0`` as a missing value and replace it with the column mean
       (truncated to a whole number).
    3. Use the first eight columns as features and the ninth as the target.
    4. Draw a correlation heatmap and print the first rows of the data.
    5. Hold out 20% of the rows as a test set (``random_state=0``).
    6. For K in 1, 3, 5, 7 and 9, fit a ``KNeighborsClassifier`` on the
       training split and print the actual and predicted test labels.

    Returns:
        None. All results are printed to standard output.
    """
    print(f"{format('Task 1', '*^100')}")

    # 1. Load the dataset
    data = pd.read_csv("diabetes.csv")

    # 2. Clean the data: a zero in these columns stands for "not recorded"
    zero_means_missing = [
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
    ]
    for col in zero_means_missing:
        # Mask the zeros first so they do not pull the mean down.
        # ``np.nan`` is used because the ``np.NaN`` alias no longer exists
        # in NumPy 2.0 and later.
        data[col] = data[col].replace(0, np.nan)
        # The fill value is the column mean truncated to an integer.
        mean = int(data[col].mean(skipna=True))
        data[col] = data[col].fillna(mean)

    # 3. Separate the independent (first eight columns) and dependent
    #    (ninth column) variables
    x = data.iloc[:, 0:8]
    y = data.iloc[:, 8]

    # 4. Explore the data. The heatmap is drawn on the current matplotlib
    #    axes; this function does not show or save the figure itself.
    sns.heatmap(data.corr(), annot=True)
    print(data.head())

    # 5. Split the data into training and testing sets (80% / 20%)
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=0
    )

    # 6. Fit and evaluate KNN for one value of K
    def run_knn(k):
        """Fit a K-nearest-neighbours model and print actual vs. predicted labels."""
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        print("Actual values:", y_test.values)
        print("Predicted values:", y_pred)

    # Try several odd neighbourhood sizes
    for k in (1, 3, 5, 7, 9):
        print(f"K = {k}")
        run_knn(k)
        print("\n")


def task2(k=10):
    """Cluster the rows of ``Dataset2.csv`` with k-means and print each cluster.

    The file is read from the current working directory, its categorical
    columns are one-hot encoded with ``pandas.get_dummies``, and a hand-written
    k-means (random initial centroids, Euclidean distance) is run on the
    result. The rows of every cluster are then printed.

    Centroids are chosen with ``DataFrame.sample`` without a fixed seed, so
    the clustering can differ from run to run.

    Args:
        k: Number of clusters. Defaults to 10.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in the dataset.
    """
    print(f"{format('Task 2', '*^100')}")
    # Load the dataset
    data = pd.read_csv("Dataset2.csv")

    # Encode categorical variables using one-hot encoding so every feature
    # can be converted to a number.
    data_encoded = pd.get_dummies(data)

    def initialize_centroids(data, k):
        """Pick ``k`` distinct random rows of ``data`` as the starting centroids."""
        return data.sample(n=k).values.astype(float)

    def assign_clusters(points, centroids):
        """Return, for every row of ``points``, the index of its nearest centroid.

        ``points`` is an (n, d) float array and ``centroids`` a (k, d) float
        array; the result is a length-n integer array.
        """
        # Broadcasting (n, 1, d) - (1, k, d) yields every point/centroid
        # difference at once, replacing a Python-level loop over rows.
        diffs = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        distances = np.sqrt(np.sum(diffs ** 2, axis=2))
        return np.argmin(distances, axis=1)

    def update_centroids(data, clusters, k):
        """Return the mean of each cluster as a (k, d) float array.

        A cluster that received no rows is re-seeded with a random row of
        ``data`` so that the number of centroids stays at ``k``.
        """
        new_centroids = []
        for i in range(k):
            # A boolean mask selects rows by position, so the result does not
            # depend on how the frame's index is labelled.
            members = data[clusters == i]
            if not members.empty:
                new_centroids.append(members.mean(axis=0).values.astype(float))
            else:
                new_centroids.append(data.sample(1).values[0].astype(float))
        return np.array(new_centroids)

    def k_means_clustering(data, k, max_iters=100):
        """Run k-means on ``data`` and return ``(labels, centroids)``.

        Iterates until the centroids stop moving (``np.allclose``) or
        ``max_iters`` updates have been made. The returned labels are always
        the assignment against the returned centroids.
        """
        if not 1 <= k <= len(data):
            raise ValueError(
                f"k must be between 1 and the number of rows ({len(data)}), got {k}"
            )

        # Convert once; the distance computation works on a plain float array.
        points = data.to_numpy(dtype=float)
        centroids = initialize_centroids(data, k)
        clusters = assign_clusters(points, centroids)
        for _ in range(max_iters):
            new_centroids = update_centroids(data, clusters, k)
            if np.allclose(centroids, new_centroids, equal_nan=True):
                break
            centroids = new_centroids
            clusters = assign_clusters(points, centroids)
        return clusters, centroids

    clusters, centroids = k_means_clustering(data_encoded, k)

    # Displaying clusters
    for i in range(k):
        cluster_data = data_encoded[clusters == i]
        print(f"Cluster {i+1} Data:")
        print(cluster_data)
        print("\n")


if __name__ == "__main__":
    # Run both tasks in order. Ctrl-C ends the run with a short message and a
    # non-zero exit status instead of a traceback.
    try:
        task1()
        task2()
    except KeyboardInterrupt:
        print("Execution interrupted by user")
        # ``exit`` is a convenience added by the ``site`` module for
        # interactive sessions and is not guaranteed to exist; raising
        # SystemExit works in every interpreter configuration.
        raise SystemExit(1)
