In [84]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

# Unsupervised Learning with Cats and Dogs

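We will be using Mini-Batch K-Means as our unsupervised learning algorithm. Its output (the distance from each sample to every cluster centre) is then passed to a logistic regression classifier, so that the clustering can be scored against the known labels.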

To give a bit of background, the main idea of the Mini-Batch K-Means algorithm is to use small random batches of data of a fixed size, so that they can be stored in memory. In each iteration, a new random sample is drawn from the dataset and used to update the clusters, and this is repeated until convergence. Each mini-batch updates the clusters using a convex combination of the prototype values and the data, applying a learning rate that decreases with the number of iterations. This learning rate is the inverse of the number of data points assigned to a cluster during the process. As the number of iterations increases, the effect of new data is reduced, so convergence can be detected when no changes in the clusters occur over several consecutive iterations.

In [85]:
# SECURITY NOTE: allow_pickle=True lets np.load unpickle Python objects stored
# in the file, which can execute arbitrary code. Only load files you trust.
training_data = np.load(
    file='training_data3.npy',
    allow_pickle=True,
)

In [86]:
# Each record of training_data is indexed positionally: element 0 holds the
# features of a sample and element 1 holds its label.
X_train = np.array([record[0] for record in training_data])
Y_train = np.array([record[1] for record in training_data])

# Flatten every sample into a single feature row. The row count comes from the
# data itself and -1 lets NumPy infer the row width, so the cell no longer
# breaks when the dataset does not contain exactly 25000 samples of 2500 values
# (for that original shape the result is identical).
X_train = X_train.reshape(len(X_train), -1)

In [87]:
# Hold out 10% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_train,
    Y_train,
    test_size=0.1,
    random_state=42,
)

In [88]:
%%time
pipeline = Pipeline([
 ("std_scaler", StandardScaler()),
 ("pca", PCA(n_components=0.90, random_state=0,svd_solver='full')),("miniBatchKmeans",MiniBatchKMeans(n_clusters=8,random_state=42,batch_size=2000)),
 ("lg",LogisticRegression(solver='liblinear',penalty='l2'))])
pipeline.fit(X_train2,y_train2)
#x_train_r = pipeline.fit_transform(X_train2)
#x_test_r = pipeline.transform(X_test2)

CPU times: total: 1min 37s
Wall time: 27.6 s


Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca',
                 PCA(n_components=0.9, random_state=0, svd_solver='full')),
                ('miniBatchKmeans',
                 MiniBatchKMeans(batch_size=2000, random_state=42)),
                ('lg', LogisticRegression(solver='liblinear'))])

In [89]:
# Predict labels for the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X=X_test2)

In [90]:
# Micro-averaged precision of the held-out predictions.
# NOTE(review): for single-label targets, micro-averaged precision is the same
# number as plain accuracy - confirm this is the metric intended here.
precision = precision_score(
    y_true=y_test2,
    y_pred=y_pred,
    average='micro',
)
precision

0.5784

The resulting score (a micro-averaged precision of 0.5784 on the held-out split) is worse than what the supervised models we have used achieved, so we will not be using this pipeline as our final model for the test set.