In [55]:
#load packages
#%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier


from skimage.io import imread, imshow

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model

from tensorflow.keras.losses import SparseCategoricalCrossentropy


import matplotlib.pyplot as plt
import seaborn as sns

import glob

from ipynb.fs.defs.CNN import check_grey_imgs, transform_image

In [2]:
#path to data

TRAIN_LABELS_PATH = './data/street-view-getting-started-with-julia/TrainLabelsExtended.csv'
TRAIN_IMG_PATH = './data/street-view-getting-started-with-julia/trainResized/'
TEST_IMG_PATH = './data/street-view-getting-started-with-julia/testResized/'

In [3]:
train_df = pd.read_csv(TRAIN_LABELS_PATH)

### Label Encoding

In [4]:
train_df_enc = train_df.copy()

In [5]:
label_enc = LabelEncoder()
train_df_enc['Label'] = label_enc.fit_transform(train_df_enc['Class'])
train_df_enc

Unnamed: 0,ID,Class,Label
0,5944,H,17
1,11109,Y,34
2,11177,Y,34
3,1380,I,18
4,9773,F,15
...,...,...,...
18737,10112,K,20
18738,4076,S,28
18739,2999,S,28
18740,5404,h,43


In [6]:
replaced_labels = dict(enumerate(label_enc.classes_))
# replaced_labels

### Load transformed data

In [7]:
transformed_data = np.load('transformed_data.npz')
transformed_data.files

['X_train_', 'X_test_', 'y_train_', 'y_test_']

In [8]:
X_train_ = transformed_data[transformed_data.files[0]] 
X_test_ = transformed_data[transformed_data.files[1]]
y_train_ = transformed_data[transformed_data.files[2]].flatten()
y_test_ = transformed_data[transformed_data.files[3]].flatten()

### CNN model

In [9]:
num_classes = len(np.unique(y_train_))
num_classes

62

In [10]:
model = Sequential([
    layers.Conv2D(20, (3, 3), padding= 'same', activation='relu', input_shape=(20, 20, 3)),
    layers.BatchNormalization(),
    layers.Conv2D(20, (3, 3), padding= 'same', activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.2),
    layers.Conv2D(32, (3, 3), padding= 'same', activation='relu'),
    layers.BatchNormalization(),
#     layers.Conv2D(32, (3, 3), padding= 'same', activation='relu'),
#     layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.2),
    layers.Conv2D(64, (3, 3), padding= 'same', activation='relu'),
#     layers.Conv2D(64, (3, 3), padding= 'same', activation='relu'),
    layers.BatchNormalization(),
#     layers.Conv2D(64, (3, 3), padding= 'same', activation='relu'),
    layers.MaxPooling2D((2, 2), name='feature_layer'),
    layers.Dropout(0.2),
#     layers.Conv2D(128, (3, 3), padding= 'same', activation='relu'),
#     layers.Conv2D(128, (3, 3), padding= 'same', activation='relu'),
#     layers.MaxPooling2D((2, 2)),
    layers.Flatten(name='feature_layer2'),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
#     layers.Dropout(0.2),
    layers.Dense(num_classes)
])

In [11]:
model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy', 'categorical_accuracy'])

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 20, 20, 20)        560       
_________________________________________________________________
batch_normalization (BatchNo (None, 20, 20, 20)        80        
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 20, 20, 20)        3620      
_________________________________________________________________
batch_normalization_1 (Batch (None, 20, 20, 20)        80        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 10, 10, 20)        0         
_________________________________________________________________
dropout (Dropout)            (None, 10, 10, 20)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 10, 10, 32)        5

In [None]:
# history = model.fit(X_train_, y_train_, epochs=12,# batch_size=64,
#                     validation_data=(X_test_, y_test_))

### Get extracted features

In [13]:
extracted_imgs = np.load('extracted_imgs.npz')
extracted_imgs.files

['train_x_compr', 'test_x_compr']

In [14]:
train_x_compr = extracted_imgs[extracted_imgs.files[0]]
test_x_compr = extracted_imgs[extracted_imgs.files[1]]

print(train_x_compr.shape, test_x_compr.shape)

(14993, 256) (3749, 256)


### Build models

#### SVM

In [60]:
svm_clf = SVC(kernel='poly', degree=2, C=10, decision_function_shape='ovo', probability=True).fit(train_x_compr, y_train_)
#found with gridsearch

accuracy_poly = svm_clf.score(test_x_compr, y_test_)
print('Accuracy Polynomial Kernel:', accuracy_poly)

Accuracy Polynomial Kernel: 0.9431848492931448


#### KNN

In [19]:
knn_clf = KNeighborsClassifier(n_neighbors=num_classes, 
                               weights='distance', 
                               algorithm='kd_tree', 
                               n_jobs=-1).fit(train_x_compr, y_train_)

accuracy_knn = knn_clf.score(test_x_compr, y_test_)
print('Accuracy KNN:', accuracy_knn)

Accuracy KNN: 0.8591624433182182


#### Logistic Regression

In [33]:
logreg_clf = LogisticRegression(solver="liblinear", random_state=42, max_iter=150).fit(train_x_compr, y_train_)

accuracy_logreg = logreg_clf.score(test_x_compr, y_test_)
print('Accuracy logreg:', accuracy_logreg)

Accuracy logreg: 0.9141104294478528


In [24]:
y_train_

array([22, 42, 46, ..., 61, 16, 42])

#### NeuralNet

In [30]:
def build_nn():
    
    model = Sequential([
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
    #     layers.Dropout(0.2),
        layers.Dense(num_classes)
    ])
    
    model.compile(optimizer='adam',
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy', 'categorical_accuracy'])
    
    return model

In [37]:
nn_clf = build_nn()
history = nn_clf.fit(train_x_compr, y_train_, 
                    epochs = 6,
                    validation_data = (test_x_compr, y_test_))

Train on 14993 samples, validate on 3749 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [38]:
history.history.keys()

dict_keys(['loss', 'accuracy', 'categorical_accuracy', 'val_loss', 'val_accuracy', 'val_categorical_accuracy'])

In [43]:
print('Train accuracy:', history.history['accuracy'][-1], '\nValid accuracy:',  history.history['val_accuracy'][-1]) 

Train accuracy: 0.9785233 
Valid accuracy: 0.9253134


In [45]:
#wrapper call converts the keras model into a scikit-learn model
keras_clf = keras.wrappers.scikit_learn.KerasClassifier(
                            build_nn,
                            epochs=6,
                            verbose=False)

keras_clf._estimator_type = "classifier"

In [53]:
keras_clf.fit(train_x_compr, y_train_)
keras_clf.score(test_x_compr, y_test_)

0.9250467

### Voting classifier

In [61]:
voting = VotingClassifier(
             estimators=[('svm', svm_clf),
                         ('knn', knn_clf),
                         ('lr', logreg_clf),
                         ('keras',keras_clf)], 
             voting='soft',
             flatten_transform=True,
             verbose = True)

In [62]:
%%time
vote_clf = voting.fit(train_x_compr, y_train_)

[Voting] ...................... (1 of 4) Processing svm, total= 1.2min
[Voting] ...................... (2 of 4) Processing knn, total=   2.7s
[Voting] ....................... (3 of 4) Processing lr, total= 2.7min
[Voting] .................... (4 of 4) Processing keras, total=   8.5s
Wall time: 4min 5s


#### Accuracy

In [63]:

for clf in (svm_clf, knn_clf, logreg_clf, keras_clf, vote_clf):
    y_pred = clf.predict(test_x_compr)
    print(clf.__class__.__name__, accuracy_score(y_test_, y_pred))

SVC 0.9431848492931448
KNeighborsClassifier 0.8591624433182182
LogisticRegression 0.9141104294478528
KerasClassifier 0.9250466791144305
VotingClassifier 0.9333155508135503


In [77]:
# vote_clf.score(test_x_compr, y_test_)

In [74]:
cv = StratifiedKFold(n_splits=3, random_state=12, shuffle=True)

In [79]:
train_acc_arr = []
coun = 1
for k_train_index, k_val_index in cv.split(train_x_compr, y_train_):
    print(f'Fold {coun}')
    
    voting.fit(train_x_compr[k_train_index,:], y_train_[k_train_index])
    
    train_acc = voting.score(train_x_compr[k_val_index,:], y_train_[k_val_index])
    train_acc_arr.append(train_acc)
    
    print('-------------')
    coun +=1

Fold 1
[Voting] ...................... (1 of 4) Processing svm, total=  41.9s
[Voting] ...................... (2 of 4) Processing knn, total=   1.4s
[Voting] ....................... (3 of 4) Processing lr, total= 1.7min
[Voting] .................... (4 of 4) Processing keras, total=   5.6s
-------------
Fold 2
[Voting] ...................... (1 of 4) Processing svm, total=  41.1s
[Voting] ...................... (2 of 4) Processing knn, total=   1.6s
[Voting] ....................... (3 of 4) Processing lr, total= 1.7min
[Voting] .................... (4 of 4) Processing keras, total=   5.3s
-------------
Fold 3
[Voting] ...................... (1 of 4) Processing svm, total=  40.5s
[Voting] ...................... (2 of 4) Processing knn, total=   1.6s
[Voting] ....................... (3 of 4) Processing lr, total= 1.6min
[Voting] .................... (4 of 4) Processing keras, total=   5.2s
-------------


### Prepare data for submission

In [64]:
def get_imnames():
    txtfiles = []
    for file in glob.glob(TEST_IMG_PATH + "*.Bmp"):
        file = file.split('\\')[-1][:-4] #get only filename
        txtfiles.append(file)
        
    return txtfiles

txtfiles = get_imnames()
len(txtfiles)

6220

In [65]:
presub_df = pd.DataFrame()
presub_df['id'] = txtfiles
presub_df['id'] = pd.to_numeric(presub_df['id'])
presub_df['img'] = [TEST_IMG_PATH + str(id_) + '.Bmp' for id_ in presub_df['id'].values]
presub_df.sort_values(by=['id'], inplace=True)
presub_df.reset_index(inplace=True, drop=True)

#### Check grey imgs

In [66]:
grey_imgs = check_grey_imgs(presub_df[['img']])
grey_imgs

[]

In [67]:
X_sub = np.asarray([transform_image(i).tolist() for i in presub_df['img'].values])
X_sub.shape

(6220, 20, 20, 3)

### Extract features from X_sub

In [68]:
feature_extr_model = keras.models.load_model('./feature_extr_model')
feature_extr_model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_23_input (InputLayer) [(None, 20, 20, 3)]       0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 20, 20, 20)        560       
_________________________________________________________________
batch_normalization_15 (Batc (None, 20, 20, 20)        80        
_________________________________________________________________
conv2d_24 (Conv2D)           (None, 20, 20, 20)        3620      
_________________________________________________________________
batch_normalization_16 (Batc (None, 20, 20, 20)        80        
_________________________________________________________________
max_pooling2d_14 (MaxPooling (None, 10, 10, 20)        0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 10, 10, 20)        0   

In [69]:
X_sub_compr = feature_extr_model.predict(X_sub)
X_sub_compr.shape

(6220, 256)

### Make prediction

In [80]:
vote_sub_pred = vote_clf.predict(X_sub_compr)
vote_sub_labels_inv = label_enc.inverse_transform(vote_sub_pred)
vote_sub_labels_inv



array(['H', 'E', 'I', ..., 'R', 'N', 'M'], dtype=object)

### Create submission file

In [81]:
sub_df = pd.DataFrame()
sub_df['ID'] = presub_df['id']
sub_df['Class'] = vote_sub_labels_inv
sub_df.set_index('ID', inplace=True)
sub_df

Unnamed: 0_level_0,Class
ID,Unnamed: 1_level_1
6284,H
6285,E
6286,I
6287,d
6288,T
...,...
12499,0
12500,o
12501,R
12502,N


In [82]:
sub_df.to_csv('submission_cnn_ensemble.csv')