In [1]:
import pandas as pd
from functools import reduce
import numpy as np
import gc
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
# Remove the cap on displayed columns so wide DataFrames are shown in full.
pd.set_option('display.max_columns', None)

### Read data from a feather binary file with Pandas

In [2]:
# Path of the Feather file holding the fact table (relative to this notebook).
# Reading Feather files requires the optional `pyarrow` dependency.
fact_data_path = "../data/finaldata/fact_data.feather"
data = pd.read_feather(fact_data_path)

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

### Select all numeric columns and fill all NA/NaN values with zeros

In [None]:
# Keep only the numeric columns using the public `select_dtypes` API rather than
# the private `DataFrame._get_numeric_data()` helper, which may change or
# disappear between pandas releases. Boolean columns are included as well
# because the private helper also treated them as numeric.
# Missing values are then replaced by 0 so the clustering below receives no NaNs.
tmp_data = data.select_dtypes(include=["number", "bool"]).fillna(0)
tmp_data.head()

## K Means with number of alerts and product volume

### Print product_volume column's statistics

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of product_volume.
tmp_data.loc[:, 'product_volume'].describe()

### Keep only number_of_alerts and product_volume columns

In [None]:
# Two-column view used for this section: the alert count and the product volume.
simpleDataProdVolume = tmp_data.filter(
    items=['number_of_alerts', 'product_volume'],
)

### Split the whole simple dataset into a training set (50%) and a testing set (50%)

In [None]:
# 50/50 train/test split. A fixed random_state makes the shuffle (and therefore
# every downstream cluster and figure) reproducible across kernel restarts;
# the seed matches the one passed to KMeans in this notebook.
train_data_prod_volume, test_data_prod_volume = train_test_split(
    simpleDataProdVolume, test_size=0.5, random_state=0
)

### Run K-means clustering method with n_clusters=4 and collect predicted labels

In [None]:
# Build the 4-cluster estimator, then fit it on both training columns.
kmeans_prod_volume = KMeans(n_clusters=4, random_state=0)
kmeans_prod_volume.fit(train_data_prod_volume)
# Cluster index assigned to each training row by this two-column fit.
labels_prod_volume = kmeans_prod_volume.labels_

In [None]:
# Single-feature matrix of shape (n_rows, 1): product_volume cast to integers.
product_volume_feature = (
    train_data_prod_volume['product_volume'].to_numpy().reshape(-1, 1).astype('int')
)
# Refit the same estimator on product_volume alone (this replaces the earlier
# two-column fit) and predict a cluster for every training row.
kmeans_prod_volume.fit(product_volume_feature)
y_kmeans_prod_volume = kmeans_prod_volume.predict(product_volume_feature)


### Collect and print K-means centers. Draw the training data in the figure.

In [None]:
# Cluster centres of the most recent fit of kmeans_prod_volume.
centers_prod_volume = kmeans_prod_volume.cluster_centers_
print(centers_prod_volume)
# Training rows, coloured by their predicted cluster.
plt.scatter(
    x=train_data_prod_volume['product_volume'],
    y=train_data_prod_volume['number_of_alerts'],
    c=y_kmeans_prod_volume,
    s=50,
    cmap='viridis',
)
plt.xlabel('product_volume')
plt.ylabel('number_of_alerts')
plt.title('number_of_alerts versus product_volume')


## K Means with number of alerts and min temperature

### All steps are almost the same as in the previous code blocks, except that the training data contains different columns. Note that the number of clusters (classes) can differ depending on which data column is clustered.

In [None]:
# Summary statistics of min_basin_temperature.
tmp_data.loc[:, 'min_basin_temperature'].describe()

In [None]:
# Two-column view used for this section: the alert count and the minimum basin temperature.
simpleDataMinTemp = tmp_data.filter(
    items=['number_of_alerts', 'min_basin_temperature'],
)

In [None]:
# 50/50 train/test split. A fixed random_state makes the shuffle (and therefore
# every downstream cluster and figure) reproducible across kernel restarts.
train_data_min_temp, test_data_min_temp = train_test_split(
    simpleDataMinTemp, test_size=0.5, random_state=0
)

In [None]:
# Build the 4-cluster estimator, then fit it on both training columns.
kmeans_min_temp = KMeans(n_clusters=4, random_state=0)
kmeans_min_temp.fit(train_data_min_temp)
# Cluster index assigned to each training row by this two-column fit.
labels_min_temp = kmeans_min_temp.labels_

In [None]:
# Single-feature matrix of shape (n_rows, 1): min_basin_temperature as floats.
min_temp_feature = (
    train_data_min_temp['min_basin_temperature'].to_numpy().reshape(-1, 1).astype('float')
)
# Refit the same estimator on that one feature (this replaces the earlier
# two-column fit) and predict a cluster for every training row.
kmeans_min_temp.fit(min_temp_feature)
y_kmeans_min_temp = kmeans_min_temp.predict(min_temp_feature)


In [None]:
# Cluster centres of the most recent fit of kmeans_min_temp.
centers_min_temp = kmeans_min_temp.cluster_centers_
print(centers_min_temp)
# Training rows, coloured by their predicted cluster.
plt.scatter(train_data_min_temp['min_basin_temperature'], train_data_min_temp['number_of_alerts'], c=y_kmeans_min_temp, s=50, cmap='viridis')
# The estimator was last fit on min_basin_temperature only, so each centre has
# just an x coordinate. Mark it with a vertical line instead of plotting the
# point (centre, centre), which would place a temperature value on the
# number_of_alerts axis and distort the y range.
for center_x in centers_min_temp.ravel():
    plt.axvline(center_x, color='black', alpha=0.5)
plt.xlabel('min_basin_temperature')
plt.ylabel('number_of_alerts')
plt.title('number_of_alerts versus min_basin_temperature')


## K Means with number of alerts and max temperature

### All steps are almost the same as in the previous code blocks, except that the training data contains different columns. Note that the number of clusters (classes) can differ depending on which data column is clustered.

In [None]:
# Summary statistics of max_basin_temperature.
tmp_data.loc[:, 'max_basin_temperature'].describe()

In [None]:
# Two-column view used for this section: the alert count and the maximum basin temperature.
simpleDataMaxTemp = tmp_data.filter(
    items=['number_of_alerts', 'max_basin_temperature'],
)

In [None]:
# 50/50 train/test split. A fixed random_state makes the shuffle (and therefore
# every downstream cluster and figure) reproducible across kernel restarts.
train_data_max_temp, test_data_max_temp = train_test_split(
    simpleDataMaxTemp, test_size=0.5, random_state=0
)

In [None]:
# Build the 5-cluster estimator, then fit it on both training columns.
kmeans_max_temp = KMeans(n_clusters=5, random_state=0)
kmeans_max_temp.fit(train_data_max_temp)
# Cluster index assigned to each training row by this two-column fit.
labels_max_temp = kmeans_max_temp.labels_

In [None]:
# Single-feature matrix of shape (n_rows, 1): max_basin_temperature as floats.
max_temp_feature = (
    train_data_max_temp['max_basin_temperature'].to_numpy().reshape(-1, 1).astype('float')
)
# Refit the same estimator on that one feature (this replaces the earlier
# two-column fit) and predict a cluster for every training row.
kmeans_max_temp.fit(max_temp_feature)
y_kmeans_max_temp = kmeans_max_temp.predict(max_temp_feature)

In [None]:
# Cluster centres of the most recent fit of kmeans_max_temp.
centers_max_temp = kmeans_max_temp.cluster_centers_
print(centers_max_temp)
# Training rows, coloured by their predicted cluster.
plt.scatter(train_data_max_temp['max_basin_temperature'], train_data_max_temp['number_of_alerts'], c=y_kmeans_max_temp, s=50, cmap='viridis')
# The estimator was last fit on max_basin_temperature only, so each centre has
# just an x coordinate. Mark it with a vertical line instead of plotting the
# point (centre, centre), which would place a temperature value on the
# number_of_alerts axis and distort the y range.
for center_x in centers_max_temp.ravel():
    plt.axvline(center_x, color='black', alpha=0.5)
plt.xlabel('max_basin_temperature')
plt.ylabel('number_of_alerts')
plt.title('number_of_alerts versus max_basin_temperature')



## K Means with number of alerts and temp out of range exceptions

### All steps are almost the same as in the previous code blocks, except that the training data contains different columns. Note that the number of clusters (classes) can differ depending on which data column is clustered.

In [None]:
# Summary statistics of number_of_temp_out_of_range_exceptions.
tmp_data.loc[:, 'number_of_temp_out_of_range_exceptions'].describe()

In [None]:
# Two-column view used for this section: the alert count and the number of
# temperature-out-of-range exceptions.
simpleDataBadTemp = tmp_data.filter(
    items=['number_of_alerts', 'number_of_temp_out_of_range_exceptions'],
)

In [None]:
# 50/50 train/test split. A fixed random_state makes the shuffle (and therefore
# every downstream cluster and figure) reproducible across kernel restarts.
train_data_bad_temp, test_data_bad_temp = train_test_split(
    simpleDataBadTemp, test_size=0.5, random_state=0
)

In [None]:
# Build the 2-cluster estimator, then fit it on both training columns.
kmeans_bad_temp = KMeans(n_clusters=2, random_state=0)
kmeans_bad_temp.fit(train_data_bad_temp)
# Cluster index assigned to each training row by this two-column fit.
labels_bad_temp = kmeans_bad_temp.labels_

In [None]:
# Single-feature matrix of shape (n_rows, 1): the exception counts as floats.
bad_temp_feature = (
    train_data_bad_temp['number_of_temp_out_of_range_exceptions']
    .to_numpy()
    .reshape(-1, 1)
    .astype('float')
)
# Refit the same estimator on that one feature (this replaces the earlier
# two-column fit) and predict a cluster for every training row.
kmeans_bad_temp.fit(bad_temp_feature)
y_kmeans_bad_temp = kmeans_bad_temp.predict(bad_temp_feature)


In [None]:
# Cluster centres of the most recent fit of kmeans_bad_temp.
centers_bad_temp = kmeans_bad_temp.cluster_centers_
print(centers_bad_temp)
# Training rows, coloured by their predicted cluster.
plt.scatter(train_data_bad_temp['number_of_temp_out_of_range_exceptions'], train_data_bad_temp['number_of_alerts'], c=y_kmeans_bad_temp, s=50, cmap='viridis')
# The estimator was last fit on number_of_temp_out_of_range_exceptions only, so
# each centre has just an x coordinate. Mark it with a vertical line instead of
# plotting the point (centre, centre), which would place an exception count on
# the number_of_alerts axis and distort the y range.
for center_x in centers_bad_temp.ravel():
    plt.axvline(center_x, color='black', alpha=0.5)
plt.xlabel('number_of_temp_out_of_range_exceptions')
plt.ylabel('number_of_alerts')
plt.title('number_of_alerts versus number_of_temp_out_of_range_exceptions')



## K Means with number of alerts and run duration

### All steps are almost the same as in the previous code blocks, except that the training data contains different columns. Note that the number of clusters (classes) can differ depending on which data column is clustered.

In [None]:
# Summary statistics of run_duration_minutes.
tmp_data.loc[:, 'run_duration_minutes'].describe()

In [None]:
# Two-column view used for this section: the alert count and the run duration in minutes.
simpleDataRunDuration = tmp_data.filter(
    items=['number_of_alerts', 'run_duration_minutes'],
)

In [None]:
# 50/50 train/test split. A fixed random_state makes the shuffle (and therefore
# every downstream cluster and figure) reproducible across kernel restarts.
train_data_run_duration, test_data_run_duration = train_test_split(
    simpleDataRunDuration, test_size=0.5, random_state=0
)

In [None]:
# Peek at the first ten training rows.
train_data_run_duration.head(n=10)

In [None]:
# Build the 7-cluster estimator, then fit it on both training columns.
kmeans_run_duration = KMeans(n_clusters=7, random_state=0)
kmeans_run_duration.fit(train_data_run_duration)

In [None]:
# Cluster index assigned to each fitted row by the most recent fit of kmeans_run_duration.
labels_run_duration = kmeans_run_duration.labels_

In [None]:
# Single-feature matrix of shape (n_rows, 1): run_duration_minutes as floats.
run_duration_feature = (
    train_data_run_duration['run_duration_minutes'].to_numpy().reshape(-1, 1).astype('float')
)
# Refit the same estimator on that one feature (this replaces the earlier
# two-column fit) and predict a cluster for every training row.
kmeans_run_duration.fit(run_duration_feature)
y_kmeans_run_duration = kmeans_run_duration.predict(run_duration_feature)

In [None]:
# Cluster centres of the most recent fit of kmeans_run_duration.
centers_run_duration = kmeans_run_duration.cluster_centers_
print(centers_run_duration)
# Training rows, coloured by their predicted cluster.
plt.scatter(
    x=train_data_run_duration['run_duration_minutes'],
    y=train_data_run_duration['number_of_alerts'],
    c=y_kmeans_run_duration,
    s=50,
    cmap='viridis',
)
plt.xlabel('run_duration_minutes')
plt.ylabel('number_of_alerts')
plt.title('number_of_alerts versus run_duration_minutes')



## K Means with number of alerts and procedure duration minutes

### All steps are almost the same as in the previous code blocks, except that the training data contains different columns. Note that the number of clusters (classes) can differ depending on which data column is clustered.

In [None]:
# Summary statistics of procedure_duration_minutes.
tmp_data.loc[:, 'procedure_duration_minutes'].describe()

In [None]:
# Two-column view used for this section: the alert count and the procedure duration in minutes.
simpleDataProcedureDuration = tmp_data.filter(
    items=['number_of_alerts', 'procedure_duration_minutes'],
)

In [None]:
# 50/50 train/test split. A fixed random_state makes the shuffle (and therefore
# every downstream cluster and figure) reproducible across kernel restarts.
train_data_procedure_duration, test_data_procedure_duration = train_test_split(
    simpleDataProcedureDuration, test_size=0.5, random_state=0
)

In [None]:
# Build the 3-cluster estimator, then fit it on both training columns.
kmeans_procedure_duration = KMeans(n_clusters=3, random_state=0)
kmeans_procedure_duration.fit(train_data_procedure_duration)

In [None]:
# Cluster index assigned to each fitted row by the most recent fit of kmeans_procedure_duration.
labels_procedure_duration = kmeans_procedure_duration.labels_

In [None]:
# Single-feature matrix of shape (n_rows, 1): procedure_duration_minutes as floats.
# NOTE(review): this section refits and predicts on the *test* split, whereas
# the other sections of this notebook use the training split — confirm this is
# intended.
procedure_duration_feature = (
    test_data_procedure_duration['procedure_duration_minutes']
    .to_numpy()
    .reshape(-1, 1)
    .astype('float')
)
# Refit the same estimator on that one feature (this replaces the earlier
# two-column fit) and predict a cluster for every row of the split.
kmeans_procedure_duration.fit(procedure_duration_feature)
y_kmeans_procedure_duration = kmeans_procedure_duration.predict(procedure_duration_feature)


In [None]:
# Cluster centres of the most recent fit of kmeans_procedure_duration.
centers_procedure_duration = kmeans_procedure_duration.cluster_centers_
print(centers_procedure_duration)
# Test-split rows, coloured by their predicted cluster.
plt.scatter(
    x=test_data_procedure_duration['procedure_duration_minutes'],
    y=test_data_procedure_duration['number_of_alerts'],
    c=y_kmeans_procedure_duration,
    s=50,
    cmap='viridis',
)
plt.xlabel('procedure_duration_minutes')
plt.ylabel('number_of_alerts')
plt.title('number_of_alerts versus procedure_duration_minutes')

