In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
#    #  Attribute                     Domain
#    -- -----------------------------------------
#    1. Sample code number            id number
#    2. Clump Thickness               1 - 10
#    3. Uniformity of Cell Size       1 - 10
#    4. Uniformity of Cell Shape      1 - 10
#    5. Marginal Adhesion             1 - 10
#    6. Single Epithelial Cell Size   1 - 10
#    7. Bare Nuclei                   1 - 10
#    8. Bland Chromatin               1 - 10
#    9. Normal Nucleoli               1 - 10
#   10. Mitoses                       1 - 10
#   11. Class:                        (2 for benign, 4 for malignant)

# Column labels, in the same order as the 11 attributes listed in the table above.
col_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

In [3]:
# Read the raw data file; every line is treated as data (no header row) and the
# columns are labelled with col_names.
df = pd.read_csv(
    'breast-cancer-wisconsin.data',
    header=None,
    names=col_names,
)

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
# List the distinct values found in each column so unexpected entries stand out.
for column in df.columns:
    if column == 'Sample code number':
        continue  # the ID column is an identifier, not an attribute to inspect
    print(f"uniques({column}):{df[column].unique()}")

uniques(Clump Thickness):[ 5  3  6  4  8  1  2  7 10  9]
uniques(Uniformity of Cell Size):[ 1  4  8 10  2  3  7  5  6  9]
uniques(Uniformity of Cell Shape):[ 1  4  8 10  2  3  5  6  7  9]
uniques(Marginal Adhesion):[ 1  5  3  8 10  4  6  2  9  7]
uniques(Single Epithelial Cell Size):[ 2  7  3  1  6  4  5  8 10  9]
uniques(Bare Nuclei):['1' '10' '2' '4' '3' '9' '7' '?' '5' '8' '6']
uniques(Bland Chromatin):[ 3  9  1  2  4  5  7  8  6 10]
uniques(Normal Nucleoli):[ 1  2  7  4  5  3 10  6  9  8]
uniques(Mitoses):[ 1  5  4  2  3  7 10  8  6]
uniques(Class):[2 4]


In [6]:
# Index labels of the rows whose 'Bare Nuclei' value is the '?' placeholder.
to_drop = df.loc[df['Bare Nuclei'] == '?'].index.tolist()

In [7]:
# Remove the rows collected in to_drop (mutates df in place).
df.drop(index=to_drop, inplace=True)

In [8]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence (mutates df in place).
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [9]:
# (rows, columns) remaining after the '?' rows and duplicate rows were dropped.
df.shape

(675, 11)

In [10]:
# Number of distinct sample codes; compare with the row count to see whether
# some codes occur on more than one row.
df['Sample code number'].nunique(dropna=False)

630

In [11]:
# The unique-values listing shows 'Bare Nuclei' holding strings; with the '?'
# rows gone, convert it to 64-bit integers like the other attributes.
df['Bare Nuclei'] = df['Bare Nuclei'].astype(np.int64)

In [12]:
# Discard the ID column so it is not part of the data used below (mutates df in place).
df.drop(columns=['Sample code number'], inplace=True)

In [13]:
# Separate the 'Class' column (y) from the remaining attribute columns (x).
y = df['Class']
x = df.drop(columns='Class')

In [14]:
# K-means with two clusters (presumably chosen to mirror the two Class values,
# benign / malignant - confirm with the author).
# random_state fixes the centroid initialisation so the cluster ids and the
# exported assignments are identical on every Restart & Run All.
# n_init is given explicitly because its default is not the same in every
# scikit-learn release; 10 restarts keeps the best of ten initialisations.
cluster_algo = KMeans(n_clusters=2, n_init=10, random_state=42)

In [15]:
# Fit the model on x and take the cluster index assigned to each row.
# NOTE: this rebinds y, so the 'Class' column assigned to y earlier is no
# longer reachable under that name (it is still available as df['Class']).
y = cluster_algo.fit(x).labels_

In [16]:
# Display the cluster index assigned to each row of x.
y

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,

In [17]:
# Append the cluster assignments to the attribute table as a 'clusters' column.
x = x.assign(clusters=y)

In [18]:
# Write the attributes plus the 'clusters' column to an Excel workbook
# (the row index is written as well).
x.to_excel('cancer_clusters.xlsx', sheet_name='Sheet1')