# K-means clustering

## Importing required libraries

In [1]:
# To load breast cancer dataset
from sklearn import datasets  

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from matplotlib import pyplot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import time

# To execute a cell line by line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

## Question 1 : K-Means Clustering

### Import Data

In [2]:
# Loading the breat cancer dataset from sklearn
data = datasets.load_breast_cancer()
print(data.keys()) # To check all the features and target in the dataset

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
# Loading the dataset as a dataframe with required features
df_cancer = pd.DataFrame(data.data, columns=data.feature_names)

# Adding the target column to this dataframe
df_cancer['target'] = data.target

In [4]:
print("The number of rows in data : {}".format(df_cancer.shape[0]))
print("The number of columns in data : {}".format(df_cancer.shape[1]))
print("The first few rows of the data:")
df_cancer.head()

The number of rows in data : 569
The number of columns in data : 31
The first few rows of the data:


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [5]:
# Exploring the datatypes and null count of the columns
print('Datatypes of Columns:',df_cancer.info())            

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

We observe that all the features are float type and the target is integer type and there are no missing values in the data, so there is no need to treat for missing values

### Implementing K-Means clustering from scratch

In [6]:
# Creating a separate dataset for features in the data
df_features = df_cancer.loc[:,df_cancer.columns != 'target']
features_array = np.array(df_features)  # Converting this feature dataset into an array
target = df_cancer['target']   # Creating an array of the 'target' column

In [7]:
# Fucntion to generate random centroids
def calc_centroid(df,k):
  rand_values = random.sample(range(0, len(df)), k) # Selecting k random indices from the dataset
  centroids = []
  for i in rand_values:
      centroids.append(df.loc[i])  # Getting all the column values for a particular centroid index we have chosen
  centroids = np.array(centroids)
  return centroids

In [8]:
# Function to calculate distance between every data point and assigned centroid
def calc_distance(a, b):
  distance = np.linalg.norm(a-b)
  return distance

In [9]:
# Function to find the nearest centroid for each data point
def calc_nearestCentroid(df_array,centroids):
    nearest_centroid = []
    for i in df_array:
        distance=[]
        for j in centroids:
            distance.append(calc_distance(i, j))
        nearest_centroid.append(np.argmin(distance))
    return nearest_centroid

In [10]:
# Function to calculate new centroids to all points which are nearer than the previous centroids
def calc_newCentroids(nearest_centroids, df_array):
    new_centroids = []
    clusters = []
    new_df = pd.concat([pd.DataFrame(df_array), pd.DataFrame(nearest_centroids, columns=['cluster'])],axis=1)
    for i in set(new_df['cluster']):
        current_cluster = new_df[new_df['cluster'] == i][new_df.columns[:-1]]
        cluster_mean = current_cluster.mean(axis=0)
        new_centroids.append(cluster_mean)
    clusters = np.unique(new_df['cluster'])
    label = new_df['cluster']
    return new_centroids,clusters,label,new_df

In [11]:
# Function to calculate the accuracy
def calc_accuracy(actual, predicted):
	correct = 0
	for i in range(len(actual)):
		if actual[i] == predicted[i]:
			correct += 1
	return (correct/float(len(actual)))*100.0

### Random Initialisation 1

In [29]:
# Initialising random centroids
centroids = calc_centroid(df_features,2)

for i in range(10):
  nearest_centroids = calc_nearestCentroid(features_array,centroids)
  new_centroids,clusters,label,new_df = calc_newCentroids(nearest_centroids, features_array)

accuracy = calc_accuracy(target, label)
print("The accuracy for first random centroid initialisation is {:.2f}%".format(accuracy))

The accuracy for first random centroid initialisation is 40.42%


### Random Intialisation 2

In [15]:
# Initialising random centroids
centroids = calc_centroid(df_features,2)

for i in range(10):
  nearest_centroids = calc_nearestCentroid(features_array,centroids)
  new_centroids,clusters,label,new_df = calc_newCentroids(nearest_centroids, features_array)

accuracy = calc_accuracy(target, label)
print("The accuracy for second random centroid initialisation is {:.2f}%".format(accuracy))

The accuracy for second random centroid initialisation is 80.49%


### Random Intialisation 3

In [20]:
# Initialising random centroids
centroids = calc_centroid(df_features,2)

for i in range(3):
  nearest_centroids = calc_nearestCentroid(features_array,centroids)
  new_centroids,clusters,label,new_df = calc_newCentroids(nearest_centroids, features_array)

accuracy = calc_accuracy(target, label)
print("The accuracy for third random centroid initialisation is {:.2f}%".format(accuracy))

The accuracy for third random centroid initialisation is 90.51%


As the centroids are randomly generated, the accuracy keeps on changing for every initialisation. When the centroids are very random, the accuracy maybe very low, whereas when the centroids are initialised correctly, we can obtain good accuracy. The only way to get good accuracy with random initialisation is to repeat the process until required accuracy is obtained. 


### Implementing a Supervised learning algorithm
Logistic Regression

In [30]:
# Split the data into training and testing sets
X = df_features
y = df_cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

In [31]:
# Perform Feature Scaling for better results
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [32]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
def accuracy(conf_mat):
    correct = conf_mat[0][0] + conf_mat[1][1]
    total = conf_mat[0][0] + conf_mat[0][1] + conf_mat[1][0] + conf_mat[1][1]
    return correct / total * 100

In [38]:
# Making the Confusion Matrix
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy(conf_mat)
print("The accuracy for Logistic Regression is {:.2f}%".format(accuracy))

The accuracy for Logistic Regression is 97.20%


As we can see, we are able to get a better accuracy with Logistic Regression (Supervised learning algorithm). This is because we are using already define labels to train the model in supervised learning which will give better accuracy than a model used to identify the patterns as in the case of K-Means (unsupervised learning)