# Unsupervised machine learning

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import jaccard_score, accuracy_score, precision_score, recall_score

In [None]:
def generate_biomodal_2d_data():
    """Return a reproducible, shuffled 2-D sample drawn from two Gaussians.

    150 points are sampled around 3 (standard deviation 1) and another 150
    around 8 (standard deviation 1.5), in both coordinates. The two groups
    are stacked and their rows shuffled. The generator is seeded with 0, so
    every call returns the same array of shape (300, 2).
    """
    import numpy as np

    rng = np.random.RandomState(seed=0)

    # The draw order (low group first, then high group) must stay fixed to
    # keep the generated values reproducible.
    low_group = rng.normal(loc=3, scale=1, size=(150, 2))
    high_group = rng.normal(loc=8, scale=1.5, size=(150, 2))

    points = np.concatenate([low_group, high_group])
    rng.shuffle(points)  # in place, along the first axis (rows)
    return points

In the following data set, we are going to simulate patients with Myeloid Leukemia. We are going to analyze two features, Progression and Mutational Signature. Patients with a faster progression and a higher mutational signature are considered to have Acute Myeloid Leukemia (AML).

In [None]:
# Simulate the patient cohort and plot the raw point cloud in a single
# neutral colour.
data = generate_biomodal_2d_data()

plt.scatter(x=data[:, 0], y=data[:, 1], c='#DDDDDD')
plt.xlabel('progression')
plt.ylabel('mutational signature')

To get a more detailed insight into the data, we print out the first entries.

In [None]:
# Tabular preview of the first 20 rows with named feature columns.
pd.DataFrame(data, columns=["progression", "mutational signature"]).head(20)

## Separating training and validation data
Before we train our k-means model, we need to split the data into subsets. The goal is to enable unbiased validation. We train on the first 200 data points and measure the quality on the next 50.

In [None]:
# Rows 0-199 are used for fitting; rows 200-249 are held out for validation.
# Rows from index 250 onward are not touched by this split.
train_data, validation_data = data[:200], data[200:250]

## Training
With the selected training data, we can train our k-means model.

In [None]:
# Configure a two-cluster model with a fixed random_state, then fit it on the
# training rows only. `fit` returns the estimator itself.
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
kmeans = kmeans.fit(train_data)

In [None]:
# Assign each training point to one of the fitted clusters.
result = kmeans.predict(train_data)

colors = ['orange', 'blue']
# With two clusters the labels are 0 and 1, so `label - 1` is -1 or 0:
# label 0 is drawn with colors[-1] ('blue') and label 1 with colors[0]
# ('orange').
predicted_colors = [colors[label - 1] for label in result]

plt.scatter(train_data[:, 0], train_data[:, 1], c=predicted_colors)
plt.xlabel('progression')
plt.ylabel('mutational signature')

# Overlay the fitted cluster centres as black crosses; the high zorder keeps
# them drawn above the data points.
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="x",
    s=169,
    linewidths=3,
    color="black",
    zorder=10,
)

## Validation
We can now apply the classifier to the validation data.

In [None]:
# Assign each held-out validation point to one of the fitted clusters.
result = kmeans.predict(validation_data)

colors = ['orange', 'blue']
# With two clusters the labels are 0 and 1, so `label - 1` is -1 or 0:
# label 0 is drawn with colors[-1] ('blue') and label 1 with colors[0]
# ('orange').
predicted_colors = [colors[label - 1] for label in result]

plt.scatter(validation_data[:, 0], validation_data[:, 1], c=predicted_colors)
plt.xlabel('progression')
plt.ylabel('mutational signature')

## Prediction
After training and validation of the classifier, we can reuse it to process other data sets. 
It is uncommon to classify training and validation data, as those should be used only for building the classifier. Here, we apply the classifier to the remaining data points.

In [None]:
# Rows from index 250 onward: not part of the training slice (data[:200]) or
# the validation slice (data[200:250]).
remaining_data = data[250:]

# Cluster label for each of the remaining points.
prediction = kmeans.predict(remaining_data)

In [None]:
# `colors` is the two-entry list defined in an earlier cell. `label - 1` maps
# label 0 to colors[-1] and label 1 to colors[0].
predicted_colors = [colors[label - 1] for label in prediction]

plt.scatter(x=remaining_data[:, 0], y=remaining_data[:, 1], c=predicted_colors)
plt.xlabel('progression')
plt.ylabel('mutational signature')