In [1]:

import pickle
import timeit
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from pre_processing import *
from utils import load_data
from clustering import KMeans

In [2]:
# Load the dataset pieces through the project helper `utils.load_data`.
# Of the six returned values, only X_train, y_train and X_cols are used in the
# cells that follow here.
# NOTE(review): presumably X_* are feature matrices, y_train the labels, X_cols
# the column names and *_idxs the row ids — confirm against utils.load_data.
X_train, X_test, y_train, X_cols, train_idxs, test_idxs = load_data()

In [3]:
# Preprocessing pipeline. Each step is a (transformer class, kwargs-or-None) pair.
# NOTE(review): presumably Pipeline instantiates each class with its kwargs and
# applies the steps in the listed order (the log printed by this cell lists
# them in this order) — confirm in pre_processing.Pipeline.
preprocessor_pipe = Pipeline(
    steps=[
        (FeatureDropper, None),
        (ToNaNFiller, None),
        (MissingValuesDropper, None),
        (ConstantFeaturesRemover, None),
        (FillImputer, {"strategy": "median"}),
        (PropPredictorsRemover, None),
        (Standardizer, {"with_nan": False}),
        (OneHotEncoder, {"drop_last": True}),
    ],
)
print(f"{X_train.shape=}")  # shape before preprocessing
start_preprocessing = timeit.default_timer()
# The pipeline receives a deep copy of X_cols, so the original X_cols object is
# not shared with it.
# NOTE: X_train is rebound to the transformed data. Re-running this cell would
# feed already-preprocessed data back into the pipeline; re-run the load_data()
# cell first.
X_train, X_train_cols = preprocessor_pipe.fit_transform(X_train, deepcopy(X_cols))
end_preprocessing = timeit.default_timer()
print(f"{X_train.shape=}")  # shape after preprocessing
print(f"{end_preprocessing-start_preprocessing=}")  # elapsed time of fit_transform, in seconds

X_train.shape=(328135, 321)
Processing step: FeatureDropper
Processing step: ToNaNFiller
Processing step: MissingValuesDropper
(67,)
Processing step: ConstantFeaturesRemover
Processing step: FillImputer
Processing step: PropPredictorsRemover
Processing step: Standardizer
Processing step: OneHotEncoder
X_train.shape=(328135, 670)
end_preprocessing-start_preprocessing=32.19036596500001


In [4]:
# K-means configuration for this run.
# n_clusters is 5 so that it agrees with the rest of the notebook: the
# checkpoint file name ("kmeans_5c_...") and the recorded outputs (cluster
# labels 0..4, centroid array of shape (5, 670)).
# NOTE(review): no random seed is passed here; if the project KMeans accepts
# one, set it so the clustering is reproducible.
kmeans = KMeans(n_clusters=5, max_iter=100, verbose=True)

In [5]:
# Fit only on the rows whose label equals -1 (boolean mask over X_train).
kmeans.fit(X_train[y_train == -1])
# Persist the fitted centroids together with last_centroid_idxs as one pickled tuple.
# NOTE(review): the file name presumably encodes the run settings ("5c" = 5
# clusters, "m1" = label -1); it should agree with the KMeans(...) arguments.
with open("kmeans_5c_m1_s2.pkl", "wb") as f:
    pickle.dump((kmeans.centroids, kmeans.last_centroid_idxs), f)

In [6]:
# Cluster sizes: the distinct values in last_centroid_idxs and how often each occurs.
# NOTE(review): presumably last_centroid_idxs holds the cluster index assigned
# to each fitted row — confirm in clustering.KMeans.
np.unique(kmeans.last_centroid_idxs, return_counts=True)

(array([0, 1, 2, 3, 4]), array([23481, 92516, 63127, 61878, 58158]))

In [7]:
# Shape of the fitted centroid array; the second dimension in the recorded
# output (670) matches the column count of X_train after preprocessing.
kmeans.centroids.shape

(5, 670)

In [8]:
# Pairwise Euclidean distances between centroids: one printed row per centroid,
# holding its distance to every centroid (0 at its own position).
# The loop bound comes from the centroid array itself rather than a hard-coded
# count, so the cell stays correct when the number of clusters changes.
for i in range(len(kmeans.centroids)):
    print(np.linalg.norm(kmeans.centroids - kmeans.centroids[i], axis=1))

[0.         6.55726268 7.23401881 7.45271627 6.98584155]
[6.55726268 0.         3.5231781  4.69302515 4.06758154]
[7.23401881 3.5231781  0.         3.86543096 5.65424628]
[7.45271627 4.69302515 3.86543096 0.         4.35627962]
[6.98584155 4.06758154 5.65424628 4.35627962 0.        ]
