In [22]:
import numpy as np
import os
import pandas as pd

In [23]:
# Path to folder containing the raw data
DATA_DIR = '../data/'
# File name of the raw data
DATA_FILENAME = 'ALL_vs_AML_train_set_38_sorted.res'

# Load the tab-separated raw file into a dataframe.
df = pd.read_csv(
    # Full path to the file
    os.path.join(DATA_DIR, DATA_FILENAME),
    # The character used to separate values
    sep='\t',
    # pandas ignores a line entirely when it begins with the comment character,
    # so this skips lines that start with a tab; tabs inside a line still act
    # as separators (the parsed frame has one column per field).
    # NOTE(review): an earlier comment here mentioned '!' - confirm that
    # skipping tab-led lines is what is intended.
    comment='\t',
)

# Drop the first parsed row
df = df.iloc[1:]

# Drop the description (first) column and the last column. The last column was
# previously addressed by the hard-coded position 78; using -1 keeps the same
# intent without depending on the number of samples in the file.
df = df.drop(columns=[df.columns[0], df.columns[-1]])

# Drop the columns that pandas auto-named 'Unnamed: N' (no header in the file)
unnamed_columns = [name for name in df.columns if name.startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

# Use the accession identifiers as the row index
df = df.set_index('Accession')

# Drop the control rows: every row whose accession contains 'control'
# (plain, case-sensitive substring match)
df = df[~df.index.str.contains('control', regex=False)]

# Scikit learn requires samples to be rows in the matrix
#  For this reason we transpose the dataframe
df = df.transpose()
df.head()

Accession,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,AFFX-BioB-5_st,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
ALL_19769_B-cell,-214.0,-153.0,-58.0,88.0,-295.0,-558.0,199.0,-176.0,252.0,206.0,...,185.0,511.0,-125.0,389.0,-37.0,793.0,329.0,36.0,191.0,-37.0
ALL_23953_B-cell,-135.0,-114.0,265.0,12.0,-419.0,-585.0,158.0,-253.0,49.0,31.0,...,240.0,835.0,218.0,174.0,-110.0,627.0,170.0,-50.0,126.0,-91.0
ALL_28373_B-cell,-106.0,-125.0,-76.0,168.0,-230.0,-284.0,4.0,-122.0,70.0,252.0,...,156.0,649.0,57.0,504.0,-26.0,250.0,314.0,14.0,56.0,-25.0
ALL_9335_B-cell,-72.0,-144.0,238.0,55.0,-399.0,-551.0,131.0,-179.0,126.0,-20.0,...,30.0,819.0,-178.0,151.0,-18.0,1140.0,482.0,10.0,369.0,-42.0
ALL_9692_B-cell,-413.0,-260.0,7.0,-2.0,-541.0,-790.0,-275.0,-463.0,70.0,-169.0,...,289.0,629.0,-86.0,302.0,23.0,1798.0,446.0,59.0,781.0,20.0


In [24]:
# Add an extra column to the dataframe containing the sample types:
# 0 for samples whose name starts with "AML", 1 for every other sample.
# A boolean mask over the index replaces the row-by-row loop, which built a
# full Series per row only to inspect its label.
df['type_code'] = 1
df.loc[df.index.str.startswith('AML'), 'type_code'] = 0


In [25]:
# Split the dataframe into the model inputs (every column except the
# type code) and the known sample types, both as numpy arrays
expression_df = df.drop(columns=['type_code'])
features = expression_df.values
labels = df['type_code'].values

# Report the shape of both arrays
for tag, array in (('features:', features), ('labels:', labels)):
    print(tag, array.shape)

features: (38, 7129)
labels: (38,)


In [30]:
from sklearn.cluster import KMeans

# Create a new clustering model and fit the features to it
# K means requires the number of clusters to be specified
# n_init is set explicitly because its default is not the same in every
# scikit-learn release (10 in older versions, 'auto' in newer ones, with a
# FutureWarning in between); pinning it keeps the result reproducible.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(features)

# Cluster ids are arbitrary: compare how the samples are grouped with the
# true type codes, not the literal 0/1 values.
print(kmeans.labels_)
print(labels)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1
 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 0]


In [29]:
from sklearn.cluster import AgglomerativeClustering

# Repeat the experiment with agglomerative clustering:
# build the two-cluster model first, then fit it on the same features
clustering = AgglomerativeClustering(n_clusters=2)
clustering.fit(features)

# Show the assigned cluster ids next to the true type codes
print(clustering.labels_)
print(labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0
 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 0]
