In [None]:
import numpy as np
import matplotlib.pyplot as plt

Function to load the dataset from a CSV file

In [None]:
def load_dataset(Iris):
    """Read the feature columns and species labels from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line is split on commas; columns 1-4 are converted to
    floats (column 0, the Id, is dropped) and column 5 is kept as the label.

    Parameters
    ----------
    Iris : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(data, species)``: ``data`` is a NumPy array with one row of four
        floats per record, ``species`` is a list with the label string of each
        record, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If one of the feature columns is not a valid number.
    IndexError
        If a non-blank row has fewer than six comma-separated columns.
    """
    data = []
    species = []
    # Open the path the caller passed in rather than a machine-specific literal.
    with open(Iris) as file:
        next(file, None)  # skip the header row (no-op for an empty file)
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline) carry no record.
                continue
            values = line.split(',')
            data.append([float(x) for x in values[1:5]])  # Exclude the Id column
            species.append(values[5])
    return np.array(data), species

Function to initialize the centroids randomly

In [None]:
def initialize_centroids(data, k):
    """Pick ``k`` distinct rows of ``data`` at random as starting centroids.

    Row indices are drawn without replacement from NumPy's global random
    state, so seeding ``np.random`` beforehand makes the choice repeatable.

    Parameters
    ----------
    data : np.ndarray
        Samples, one per row.
    k : int
        Number of centroids to draw.

    Returns
    -------
    np.ndarray
        The ``k`` selected rows of ``data``.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``k`` exceeds the number of rows.
    """
    n_samples = len(data)
    chosen_rows = np.random.choice(n_samples, size=k, replace=False)
    return data[chosen_rows]

Function to calculate the Euclidean distance between two points

In [None]:
def euclidean_distance(p1, p2):
    """Return the straight-line (L2) distance between ``p1`` and ``p2``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape); the squared differences are summed over all elements
    and the square root of that sum is returned.
    """
    delta = p1 - p2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

Function to assign each data point to the nearest centroid

In [None]:
def assign_clusters(data, centroids):
    """Label every sample with the index of its closest centroid.

    For each row of ``data`` the distance to every centroid is computed with
    ``euclidean_distance`` and the position of the smallest one is recorded
    (``np.argmin`` returns the first position when distances tie).

    Parameters
    ----------
    data : iterable of array-like
        Samples, one per row.
    centroids : iterable of array-like
        Current centroid positions.

    Returns
    -------
    np.ndarray
        One centroid index per sample, in the order of ``data``.
    """
    nearest = [
        np.argmin([euclidean_distance(sample, center) for center in centroids])
        for sample in data
    ]
    return np.array(nearest)

Function to update the centroids based on the assigned data points

In [None]:
def update_centroids(data, clusters, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    A cluster that received no samples has no mean (``np.mean`` of an empty
    selection is NaN). Such clusters are re-seeded instead: each one takes the
    sample that currently lies farthest from the centroid of its own cluster,
    which is a deterministic choice and keeps every centroid finite.

    Parameters
    ----------
    data : np.ndarray
        Samples, one per row (2-D).
    clusters : array-like of int
        Cluster index of every sample, aligned with the rows of ``data``.
    k : int
        Number of clusters.

    Returns
    -------
    np.ndarray
        Array of shape ``(k, data.shape[1])`` holding the new centroids.
    """
    clusters = np.asarray(clusters)
    centroids = np.zeros((k, data.shape[1]))
    empty_clusters = []
    for cluster in range(k):
        points = data[clusters == cluster]
        if len(points) > 0:
            centroids[cluster] = np.mean(points, axis=0)
        else:
            empty_clusters.append(cluster)

    if empty_clusters and len(data) > 0:
        # Squared distance of every sample to the centroid of the cluster it
        # belongs to; every label here refers to a non-empty cluster.
        labels = clusters.astype(int)
        spread = np.sum((data - centroids[labels]) ** 2, axis=1)
        for cluster in empty_clusters:
            farthest = int(np.argmax(spread))
            centroids[cluster] = data[farthest]
            # Mark the sample as used so two empty clusters never share a seed.
            spread[farthest] = -1
    return centroids

Function to perform K-Means Clustering

In [None]:
def kmeans_clustering(data, k, max_iters=300, tol=0.0):
    """Cluster ``data`` into ``k`` groups with Lloyd's K-Means iteration.

    Starting from randomly chosen centroids, the samples are repeatedly
    assigned to their nearest centroid and the centroids recomputed, until the
    centroids stop moving or ``max_iters`` rounds have been performed.

    Parameters
    ----------
    data : np.ndarray
        Samples, one per row.
    k : int
        Number of clusters.
    max_iters : int, optional
        Upper bound on the number of assign/update rounds, so the loop always
        terminates. Must be at least 1.
    tol : float, optional
        Largest absolute change of any centroid coordinate that still counts
        as converged. The default ``0.0`` requires the centroids to be
        exactly unchanged.

    Returns
    -------
    tuple
        ``(clusters, centroids)``: the cluster index of every sample from the
        last assignment step, and the final centroid positions.

    Raises
    ------
    ValueError
        If ``max_iters`` is smaller than 1.
    """
    import warnings

    if max_iters < 1:
        raise ValueError("max_iters must be at least 1")

    # Initialize centroids
    centroids = initialize_centroids(data, k)

    for _ in range(max_iters):
        # Assign data points to the nearest centroid
        clusters = assign_clusters(data, centroids)

        # Update centroids
        new_centroids = update_centroids(data, clusters, k)

        # Check for convergence. NaN entries are compared as equal so that a
        # NaN centroid cannot keep the loop running forever.
        if np.allclose(new_centroids, centroids, rtol=0.0, atol=tol, equal_nan=True):
            break
        centroids = new_centroids
    else:
        warnings.warn(
            "kmeans_clustering stopped after max_iters iterations without converging",
            RuntimeWarning,
        )
    return clusters, centroids

Function to visualize the results

In [None]:
def plot_results(data, species, clusters, centroids):
    """Show two scatter plots: the K-Means clusters and the true labels.

    Both figures use column 0 of ``data`` on the x axis and column 1 on the
    y axis (labelled as sepal length and sepal width).

    Parameters
    ----------
    data : np.ndarray
        Samples, one per row, with at least two columns.
    species : sequence of str
        True label of every sample, aligned with the rows of ``data``. A plain
        list is accepted.
    clusters : array-like
        Cluster index of every sample; used to colour the first figure.
    centroids : np.ndarray
        Centroid positions, drawn as red crosses on the first figure.
    """
    # Plot the clusters
    _, cluster_ax = plt.subplots()
    cluster_ax.scatter(data[:, 0], data[:, 1], c=clusters)
    cluster_ax.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x')
    cluster_ax.set_xlabel('Sepal Length (cm)')
    cluster_ax.set_ylabel('Sepal Width (cm)')
    cluster_ax.set_title('K-Means Clustering')
    plt.show()

    # Plot the actual species
    # Convert to an array first: comparing a plain list with a label does not
    # give a per-sample boolean mask, which would leave this plot empty.
    species_labels = np.asarray(species)
    unique_species = np.unique(species_labels)
    species_colors = ['red', 'green', 'blue']
    species_markers = ['o', 's', 'D']
    _, species_ax = plt.subplots()
    for i, species_name in enumerate(unique_species):
        species_data = data[species_labels == species_name]
        # Cycle through the styles so more than three labels cannot overflow.
        species_ax.scatter(species_data[:, 0], species_data[:, 1],
                           c=species_colors[i % len(species_colors)],
                           marker=species_markers[i % len(species_markers)],
                           label=species_name)
    species_ax.set_xlabel('Sepal Length (cm)')
    species_ax.set_ylabel('Sepal Width (cm)')
    species_ax.set_title('Actual Species')
    species_ax.legend()
    plt.show()

Main function

In [None]:
def main():
    """Run the whole pipeline: load the CSV, cluster it with K-Means, plot.

    The dataset path and the number of clusters (3) are fixed inside the
    function. Nothing is returned; the results are shown as figures.
    """
    # Load the dataset. A raw string keeps the backslashes literal instead of
    # relying on an invalid escape sequence being passed through unchanged.
    filename = r'D:\Internships\Iris.csv'
    data, species = load_dataset(filename)

    # Set the number of clusters
    k = 3

    # Perform K-Means Clustering
    clusters, centroids = kmeans_clustering(data, k)

    # Visualize the results
    plot_results(data, species, clusters, centroids)

Run the main function

In [None]:
# Entry-point guard: call main() only when this code is executed as the
# top-level program (``__name__`` equals '__main__'), not when it is imported.
if __name__ == '__main__':
    main()