In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [2]:
from future.utils import iteritems

from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn


In [3]:
class NaiveBayes(object):
    def fit(self, X, Y, smoothing=1e-2):
        self.gaussians = dict()
        self.priors = dict()
        N, D = X.shape
        labels = set(Y)
        for c in labels:
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': np.cov(current_x.T) + np.eye(D)*smoothing,
            }
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        N, D = X.shape
        K = len(self.gaussians)
        P = np.zeros((N, K))
        for c, g in iteritems(self.gaussians):
            mean, cov = g['mean'], g['var']
            P[:,c] = mvn.logpdf(X, mean=mean, cov=cov) + np.log(self.priors[c])
        return np.argmax(P, axis=1)

In [4]:
def get_data(path = 'data/fashion-mnist_train.csv', limit=None):
    print("Reading in and transforming data...")
    df = pd.read_csv(path)
    data = df.values
    np.random.shuffle(data)
    X = data[:, 1:] / 255.0 # data is from 0..255
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

In [8]:
#model = NaiveBayes()
#model = SVC()
    #model = KNeighborsClassifier(5)
    #model = BaggingClassifier(NaiveBayes(),
    #                         max_samples=1.0, max_features=1.0)
    
    #estimators = [
        #('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
     #('svr', LinearSVC(random_state=42)), ('svr2', SVC()), ('knn', KNeighborsClassifier(5)), ('logreg',LogisticRegression()) ]
    #model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    #model = RandomForestClassifier(n_estimators=20, random_state=42)
    #model = LinearSVC(random_state=42)

In [5]:
X, Y = get_data()
pca = PCA(n_components=300)
X = pca.fit_transform(X)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y)  

Reading in and transforming data...


In [6]:
X.shape

(60000, 300)

In [7]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y)  

In [8]:
Xtrain.shape

(45000, 300)

In [18]:
df = pd.read_csv('data/new_test.csv')
data = df.values
real_testX = data / 255.0 # data is from 0..255

In [19]:
real_testX.shape

(10000, 784)

In [20]:
real_testX = pca.transform(real_testX)
real_testX.shape

(10000, 300)

In [12]:

#Xtrain = preprocessing.scale(Xtrain)
#Xtest = preprocessing.scale(Xtest)

svm_clf = SVC(random_state=42)

# Create regularization penalty space
degree = [3, 5,6]

# Create regularization hyperparameter distribution using uniform distribution
C = [0.1,0.5,1,2,3,10, 100]
kernel  = ['rbf','poly', 'linear']
hyperparameters = [dict(C=C, degree=degree, kernel = kernel)] 

tuned_svm = GridSearchCV(svm_clf, hyperparameters, cv=3, verbose=1, n_jobs=-1)
#tuned_svm = svm_clf
    
t0 = datetime.now()
tuned_svm.fit(Xtrain, Ytrain)
best_logreg = tuned_svm

print("Training time:", (datetime.now() - t0))

#t0 = datetime.now()
#print("Train accuracy:", best_logreg.score(Xtrain, Ytrain))
#print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy:", best_logreg.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

Fitting 3 folds for each of 63 candidates, totalling 189 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 189 out of 189 | elapsed: 13.7min finished


Training time: 0:13:49.239417
Test accuracy: 0.8716
Time to compute test accuracy: 0:00:04.609141 Test size: 2500


In [15]:
best_svm = best_logreg.best_estimator_
best_svm

SVC(C=3, random_state=42)

In [10]:
model = best_svm

t0 = datetime.now()
best_logreg = model.fit(Xtrain, Ytrain)
print("Training time:", (datetime.now() - t0))

#t0 = datetime.now()
#print("Train accuracy:", model.score(Xtrain, Ytrain))
#print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy:", model.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

NameError: name 'best_svm' is not defined

In [None]:
X, Y = get_data(10000)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y)  
#Xtrain = preprocessing.scale(Xtrain)
#Xtest = preprocessing.scale(Xtest)

logreg = LogisticRegression()

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter distribution using uniform distribution
C = [0.1,0.5,1,2,3]
iterations  = [100,200, 50, 150]
hyperparameters = [dict(C=C, penalty=penalty, max_iter = iterations)] 

tuned_logreg = GridSearchCV(logreg, hyperparameters, cv=3, verbose=1, n_jobs=-1)
#tuned_logreg = LogisticRegression()
    
t0 = datetime.now()
best_logreg = tuned_logreg.fit(Xtrain, Ytrain)
print("Training time:", (datetime.now() - t0))

t0 = datetime.now()
print("Train accuracy:", best_logreg.score(Xtrain, Ytrain))
print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy:", best_logreg.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

In [44]:
knn = KNeighborsClassifier()
neighbors = [1,2,3,4,5,7]
hyperparameters = [{'n_neighbors': neighbors}]
tuned_knn = RandomizedSearchCV(knn, hyperparameters, random_state=1, n_iter=10, cv=4, verbose=1, n_jobs=-1)

    
t0 = datetime.now()
best_knn = tuned_knn.fit(Xtrain, Ytrain)
print("Training time:", (datetime.now() - t0))

t0 = datetime.now()
print("Train accuracy:", best_knn.score(Xtrain, Ytrain))
print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy:", best_knn.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.6min finished


Training time: 0:02:37.569246
Train accuracy: 0.8844
Time to compute train accuracy: 0:01:25.679305 Train size: 7500
Test accuracy: 0.818
Time to compute test accuracy: 0:00:28.331947 Test size: 2500


In [64]:
best_knn.best_estimator_

KNeighborsClassifier(n_neighbors=4)

In [54]:
best_logreg.best_params_

{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}

In [16]:
estimators = [('svr2', best_svm), ('logreg',LogisticRegression(C = 0.1, max_iter=100, penalty = 'l2')) ]
stacking = StackingClassifier(estimators=estimators, final_estimator=SVC())

t0 = datetime.now()
#stacking.fit(Xtrain, Ytrain)
stacking.fit(X, Y)
print("Training time:", (datetime.now() - t0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Training time: 0:02:03.493982


In [None]:
stackingdatetime.now()
print("Train accuracy:", stacking.score(Xtrain, Ytrain))
print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))


In [17]:
t0 = datetime.now()
print("Test accuracy:", stacking.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

Test accuracy: 0.9468
Time to compute test accuracy: 0:00:06.833413 Test size: 2500


In [21]:
preds = stacking.predict(real_testX)

In [22]:
preds

array([3, 6, 3, ..., 0, 7, 8])

In [23]:
k = list(zip(range(1,len(preds)+1), preds))

In [24]:
res = pd.DataFrame(k, columns = ["id", "label"])
res.head()

Unnamed: 0,id,label
0,1,3
1,2,6
2,3,3
3,4,6
4,5,1


In [25]:
res.to_csv("submits/subm_4.csv", index=False)

In [35]:
res.id

0           1
1           2
2           3
3           4
4           5
5           6
6           7
7           8
8           9
9          10
10         11
11         12
12         13
13         14
14         15
15         16
16         17
17         18
18         19
19         20
20         21
21         22
22         23
23         24
24         25
25         26
26         27
27         28
28         29
29         30
        ...  
9970     9971
9971     9972
9972     9973
9973     9974
9974     9975
9975     9976
9976     9977
9977     9978
9978     9979
9979     9980
9980     9981
9981     9982
9982     9983
9983     9984
9984     9985
9985     9986
9986     9987
9987     9988
9988     9989
9989     9990
9990     9991
9991     9992
9992     9993
9993     9994
9994     9995
9995     9996
9996     9997
9997     9998
9998     9999
9999    10000
Name: id, Length: 10000, dtype: int64

In [None]:
import pickle

filename = 'models/model_88acc_stacking_svc_linearsvc_logreg.sav'
pickle.dump(stacking, open(filename, 'wb'))

In [39]:
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

StackingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=4)),
                               ('logreg', LogisticRegression(C=0.1))],
                   final_estimator=LogisticRegression())

In [67]:
from matplotlib import pyplot as plt

label = 9
D = len(model.gaussians[label]['mean'])
var = np.zeros((D, D))
var += np.eye(D) * model.gaussians[label]['var']

sample = 255*np.random.multivariate_normal(model.gaussians[label]['mean'], var)

pixels = sample.reshape((28, 28))
plt.imshow(pixels, cmap='gray')
plt.show()

AttributeError: 'RandomizedSearchCV' object has no attribute 'gaussians'