In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.python import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Flatten, Conv2D, Dropout, MaxPooling2D
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Activate plotly's offline notebook mode so that iplot() figures render inline.
# The imported helper is `init_notebook_mode`; `init_notebook_model` is undefined.
init_notebook_mode(connected=True)

ModuleNotFoundError: No module named 'tensorflow'

In [4]:
# --- Notebook configuration ---------------------------------------------
# Image geometry and number of target classes.
IMG_ROWS, IMG_COLS = 28, 28
NUM_CLASSES = 10

# Hold-out fraction and seed used for the train/validation split.
TEST_SIZE = 0.2
RANDOM_STATE = 2022

# Training hyper-parameters handed to model.fit.
NO_EPOCHS = 50
BATCH_SIZE = 128

# IS_LOCAL selects which of the two input directory layouts is used.
IS_LOCAL = False
PATH = "../input/fashionmnist/" if IS_LOCAL else "../input/"

# Show what the chosen input directory contains.
print(os.listdir(PATH))

FileNotFoundError: [Errno 2] No such file or directory: '../input/'

In [5]:
# Locations of the two CSV files inside the configured input directory.
train_file, test_file = (
    PATH + csv_name
    for csv_name in ("fashion-mnist_train.csv", "fashion-mnist_test.csv")
)

# Load both files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file)
)

FileNotFoundError: File b'../input/fashion-mnist_train.csv' does not exist

In [None]:
# Maps each class index used in the "label" column to its display name.
labels = dict(enumerate([
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle Boot",
]))

def get_classes_distribution(data):
    """Print the sample count and percentage share of every class in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "label" column whose values are keys of the
        notebook-level ``labels`` mapping.

    Returns
    -------
    None
        One line per class is printed, in the order produced by
        ``value_counts()`` (most frequent class first).
    """
    label_counts = data["label"].value_counts()
    total_samples = len(data)

    # Iterate over (class id, count) pairs directly; the original indexed an
    # undefined name (`label_count`) and raised NameError on the first class.
    for class_id, count in label_counts.items():
        percent = (count / total_samples) * 100
        print("{:<20}: {} or {}%".format(labels[class_id], count, percent))

# Print the per-class counts and percentages for the training frame.
get_classes_distribution(train_data)

In [None]:
def plot_label_per_class(data):
    """Draw a bar chart of the number of samples per class.

    Bars are ordered by descending frequency and each bar is annotated with
    the class name taken from the notebook-level ``labels`` mapping.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "label" column of class ids.
    """
    # Compute the ordering once and reuse it for both the plot and the labels.
    class_order = data["label"].value_counts().index

    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    # Pass the column by keyword and draw on the axes created above: recent
    # seaborn releases interpret the first positional argument as `data`.
    sns.countplot(x=data["label"], order=class_order, ax=ax)
    ax.set_title("Number of labels for each class")

    # Bars are drawn in `class_order`, so patches and class ids line up.
    for patch, class_id in zip(ax.patches, class_order):
        ax.annotate(labels[class_id], (patch.get_x(), patch.get_height() + 0.1))
    plt.show()
# Bar chart of the class counts in the training frame.
plot_label_per_class(train_data)

In [None]:
# Class distribution of the test-set images (counts and percentages).
get_classes_distribution(test_data)

In [None]:
# Bar chart of the class counts in the test frame.
plot_label_per_class(test_data)

In [None]:
# train set images
def sample_images_data(data, samples_per_class=4):
    """Collect the first few images of every class for plotting.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is "label" and whose remaining columns hold
        the IMG_ROWS * IMG_COLS pixel values of one image per row.
    samples_per_class : int, optional
        Maximum number of images taken per class (default 4).

    Returns
    -------
    tuple(list, list)
        ``(sample_images, sample_labels)``: images as
        ``(IMG_ROWS, IMG_COLS)`` arrays and the matching class ids, grouped
        by class in the key order of the notebook-level ``labels`` mapping.
    """
    sample_images = []
    sample_labels = []

    for class_id in labels.keys():
        samples = data[data["label"] == class_id].head(samples_per_class)
        # Column 0 is the label, columns 1.. are the pixels. The original
        # used the misspelled accessor `.ilov`, which raised AttributeError.
        pixel_rows = samples.iloc[:, 1:].values
        class_ids = samples.iloc[:, 0].values
        for pixel_row, label in zip(pixel_rows, class_ids):
            sample_images.append(pixel_row.reshape(IMG_ROWS, IMG_COLS))
            sample_labels.append(label)
    print("Total number of sample images to plot: ", len(sample_images))
    return sample_images, sample_labels
# Gather the per-class sample images and their labels from the training frame.
train_sample_images, train_sample_labels = sample_images_data(train_data)

In [None]:
def plot_sample_images(data_sample_images, data_sample_labels, cmap="Blues"):
    """Show the sample images in a grid, eight per row, titled by class name.

    Parameters
    ----------
    data_sample_images : sequence of 2-D arrays
        Images to display, in plotting order.
    data_sample_labels : sequence
        Class id of each image; looked up in the notebook-level ``labels``
        mapping for the subplot title.
    cmap : str, optional
        Matplotlib colormap name passed to ``imshow`` (default "Blues").
        The parameter was previously misspelled ``cmpa`` while the body read
        ``cmap``, so the function could not run.
    """
    n_cols = 8
    # Ceiling division: 40 images give the original 5 x 8 grid.
    n_rows = max(1, -(-len(data_sample_images) // n_cols))
    # squeeze=False keeps `ax` two-dimensional even for a single row.
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(16, 2 * n_rows), squeeze=False)

    # Hide every axis up front so unused grid cells stay blank.
    for axis in ax.ravel():
        axis.axis('off')

    for i, img in enumerate(data_sample_images):
        cell = ax[i // n_cols, i % n_cols]
        cell.imshow(img, cmap=cmap)
        cell.set_title(labels[data_sample_labels[i]])
    plt.show()
# "Greens" is the registered matplotlib colormap name; "Green" is not one.
plot_sample_images(train_sample_images, train_sample_labels, "Greens")

In [None]:
# Test set images: collect the per-class samples from the test frame and
# plot them with the default colormap.
test_sample_images, test_sample_labels = sample_images_data(test_data)
plot_sample_images(test_sample_images, test_sample_labels)

In [None]:
# data processing to prepare for the model
# reshape the columns from 784 to (28,28,1).
def data_processing(raw):
    """Turn a raw label+pixels frame into model-ready arrays.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with a ``label`` column first, followed by the pixel columns.

    Returns
    -------
    tuple
        ``(out_x, out_y)``: pixels reshaped to
        ``(n_samples, IMG_ROWS, IMG_COLS, 1)`` and divided by 255, and the
        labels one-hot encoded over ``NUM_CLASSES`` classes.
    """
    n_samples = raw.shape[0]

    # Everything after the first column is pixel data.
    pixels = raw.values[:, 1:].reshape(n_samples, IMG_ROWS, IMG_COLS, 1)
    out_x = pixels / 255

    out_y = keras.utils.to_categorical(raw.label, NUM_CLASSES)
    return out_x, out_y

In [None]:
# Process both the train_data and the test_data into (images, one-hot labels).
# X / y hold the full training set; they are split into train/validation later.
X, y = data_processing(train_data)
X_test, y_test = data_processing(test_data)

In [None]:
# Hold out TEST_SIZE of the training data as a validation set; RANDOM_STATE
# is passed as the split's seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [None]:
# Report the number of samples and the per-sample shape of each split.
print("Fashion MNIST train - rows:", X_train.shape[0], " columns:", X_train.shape[1:4])
print("Fashion MNIST valid - rows:", X_val.shape[0], " columns:", X_val.shape[1:4])
print("Fashion MNIST test - rows:", X_test.shape[0], " columns:", X_test.shape[1:4])

In [None]:
# Check the class imbalance of the resulting training set.
def plot_count_per_class(yd):
    """Draw a bar chart of how many items fall into each class id.

    Bars appear in class-id order (0 .. NUM_CLASSES - 1) and are annotated
    with the class name from the notebook-level ``labels`` mapping.

    Parameters
    ----------
    yd : array-like
        One class id per sample (e.g. the argmax of one-hot labels).
    """
    ydf = pd.DataFrame(yd)
    class_ids = np.arange(NUM_CLASSES)

    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    # Pass the column by keyword and draw on the axes created above: recent
    # seaborn releases interpret the first positional argument as `data`.
    sns.countplot(x=ydf[0], order=class_ids, ax=ax)
    ax.set_title("Number of items for each class")
    ax.set_xlabel("Category")

    # Bars are drawn in `class_ids` order, so patches and ids line up.
    for patch, class_id in zip(ax.patches, class_ids):
        ax.annotate(labels[class_id], (patch.get_x(), patch.get_height() + 0.1))
    plt.show()
    
def get_count_per_class(yd):
    """Print, for every class id found in ``yd``, its count and percentage.

    Parameters
    ----------
    yd : array-like
        One class id per sample; ids must be keys of the notebook-level
        ``labels`` mapping.
    """
    n_total = len(yd)
    # Frequency table of the first (only) column, most frequent class first.
    counts = pd.DataFrame(yd)[0].value_counts()

    for class_id, n_items in zip(counts.index, counts.values):
        share = (n_items / n_total) * 100
        print("{:<20s}:    {} or  {}%".format(labels[class_id], n_items, share))
        
# y_train is one-hot encoded; argmax along axis 1 recovers the class ids.
plot_count_per_class(np.argmax(y_train, axis=1))
get_count_per_class(np.argmax(y_train, axis=1))

In [None]:
# Validation set: imbalance check (argmax recovers class ids from one-hot labels).
plot_count_per_class(np.argmax(y_val, axis=1))
get_count_per_class(np.argmax(y_val, axis=1))

In [None]:
# Baseline CNN: three convolution stages followed by a dense classifier head.
# The imported class is `Sequential`; lowercase `sequential` is undefined.
model = Sequential()

# Convolution stage 1: 32 filters on the (IMG_ROWS, IMG_COLS, 1) input.
model.add(Conv2D(32,
                 kernel_size=(3, 3),
                 activation='relu',
                 kernel_initializer='he_normal',
                 input_shape=(IMG_ROWS, IMG_COLS, 1)))
model.add(MaxPooling2D((2, 2)))

# Convolution stage 2: 64 filters.
model.add(Conv2D(64,
                 kernel_size=(3, 3),
                 activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Convolution stage 3: 128 filters, then flatten for the dense head.
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(Flatten())
model.add(Dense(128, activation='relu'))

# Output layer: one softmax unit per class. model.add() takes a layer
# instance, so the unit count has to be wrapped in Dense(...).
model.add(Dense(NUM_CLASSES, activation='softmax'))

# Loss matches the one-hot labels produced by data_processing.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])

# Checking the model we initialized
model.summary()

In [None]:
# Plotting the model: write the architecture diagram to model.png, then
# render the same graph inline as SVG using the `dot` program.
plot_model(model, to_file='model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Running the model with the training set; the validation split is evaluated
# alongside. The value returned by fit() is kept in `train_model` and its
# `.history` attribute is read later by plot_accuracy_and_loss.
train_model = model.fit(X_train, y_train,
                       batch_size=BATCH_SIZE,
                       epochs=NO_EPOCHS,
                       verbose=1,
                       validation_data=(X_val, y_val))

In [None]:
# Test prediction accuracy: the model is compiled with a single metric
# ('accuracy'), so score[0] is printed as the loss and score[1] as the accuracy.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', score[0])
print('Test Accuracy:', score[1])

In [None]:
# plot the train and validation accuracy and loss
def create_trace(x, y, ylabel, color):
    """Build one plotly scatter trace drawn with both markers and lines.

    Parameters
    ----------
    x, y : sequence
        Coordinates of the points; ``x`` is also used as the hover text.
    ylabel : str
        Trace name shown in the legend.
    color : str
        Marker colour.

    Returns
    -------
    The object returned by ``go.Scatter``.
    """
    return go.Scatter(
        x=x,
        y=y,
        name=ylabel,
        marker=dict(color=color),
        # plotly's mode flags are "lines", "markers" and "text";
        # the singular "marker" is rejected as an invalid value.
        mode="markers+lines",
        text=x,
    )

def plot_accuracy_and_loss(train_model):
    """Plot training/validation accuracy and loss per epoch, side by side.

    Parameters
    ----------
    train_model : object with a ``history`` mapping
        The value returned by ``model.fit``; its ``history`` must contain
        ``loss``/``val_loss`` and either ``accuracy``/``val_accuracy`` or
        the older ``acc``/``val_acc`` keys.
    """
    hist = train_model.history

    # Depending on the Keras version the accuracy metric is recorded under
    # 'accuracy' or under the older 'acc'; accept either instead of raising
    # KeyError on one of them.
    acc_key = 'accuracy' if 'accuracy' in hist else 'acc'
    acc = hist[acc_key]
    val_acc = hist['val_' + acc_key]
    loss = hist['loss']
    val_loss = hist['val_loss']

    # Epochs are numbered from 1 on the x axis.
    epochs = list(range(1, len(acc) + 1))

    trace_ta = create_trace(epochs, acc, "Training accuracy", "Green")
    trace_va = create_trace(epochs, val_acc, "Validation accuracy", "Red")
    trace_tl = create_trace(epochs, loss, "Training Loss", "Blue")
    trace_vl = create_trace(epochs, val_loss, "Validation Loss", "Magenta")

    fig = tools.make_subplots(rows=1, cols=2,
                              subplot_titles=('Training and Validation Accuracy',
                                              'Training and Validation Loss'))
    # Left panel: accuracy curves; right panel: loss curves.
    fig.append_trace(trace_ta, 1, 1)
    fig.append_trace(trace_va, 1, 1)
    fig.append_trace(trace_tl, 1, 2)
    fig.append_trace(trace_vl, 1, 2)

    fig['layout']['xaxis'].update(title="Epoch")
    fig['layout']['xaxis2'].update(title="Epoch")
    # Both y axes are pinned to [0, 1]; loss values above 1 fall outside the view.
    fig['layout']['yaxis'].update(title="Accuracy", range=[0, 1])
    fig['layout']['yaxis2'].update(title="Loss", range=[0, 1])
    iplot(fig, filename="accuracy-loss")

# Accuracy and loss curves for the fit stored in `train_model`.
plot_accuracy_and_loss(train_model)

In [None]:
# Adding several dropout layers to help avoid overfitting.
# The layer stack mirrors the baseline model defined earlier in this
# notebook, with a Dropout layer after each convolution stage.
model = Sequential()

# Convolution stage 1.
model.add(Conv2D(32,
                 kernel_size=(3, 3),
                 activation='relu',
                 kernel_initializer='he_normal',
                 input_shape=(IMG_ROWS, IMG_COLS, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))

# Convolution stage 2.
model.add(Conv2D(64,
                 kernel_size=(3, 3),
                 activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# Convolution stage 3.
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(Dropout(0.3))

# The feature maps must be flattened before the classifier head; without
# Flatten the softmax layer emits one distribution per spatial position and
# its output no longer matches the (NUM_CLASSES,) one-hot targets.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(NUM_CLASSES, activation='softmax'))

# Fixed spellings: `categorical_crossentropy` and the `metrics` keyword.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Plotting the model: write the architecture diagram to model.png (the file
# name is the same as in the earlier plotting cell, so it is overwritten),
# then render the graph inline as SVG using the `dot` program.
plot_model(model, to_file='model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Running the new model with the same data and hyper-parameters as before.
# This rebinds `train_model`, replacing the earlier fit result.
train_model = model.fit(X_train, y_train,
                       batch_size=BATCH_SIZE,
                       epochs=NO_EPOCHS,
                       verbose=1,
                       validation_data=(X_val, y_val))

In [None]:
# Re-plot the training/validation accuracy and loss curves for the new model.
plot_accuracy_and_loss(train_model)

In [None]:
# Re-evaluate the test prediction accuracy with the new model.
# `score` is rebound; score[0] is printed as the loss, score[1] as the accuracy.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', score[0])
print('Test Accuracy:', score[1])

In [None]:
# Prediction for test data: take the argmax over the softmax output to get
# one class id per image. This replaces `model.predict_classes`, which newer
# tf.keras releases no longer provide.
predicted_classes = np.argmax(model.predict(X_test), axis=1)
# The result was originally bound to the misspelled name `prediced_classes`
# while later cells read `predicted_classes`; keep the old spelling as an
# alias for the code in this notebook that still refers to it.
prediced_classes = predicted_classes
# Ground-truth labels: first column of the test frame.
y_true = test_data.iloc[:, 0]

In [None]:
# Compare predictions with the true labels on (at most) the first 10000 test samples.
# NOTE: this rebinds the notebook-level name `y`, which the earlier
# `X, y = data_processing(train_data)` cell uses for the training targets;
# re-running the train/validation split after this cell would pick up these
# test labels instead.
p = predicted_classes[:10000]
y = y_true[:10000]
# Positions where prediction and label agree / disagree.
correct = np.nonzero(p==y)[0]
incorrect = np.nonzero(p!=y)[0]

In [None]:
# Number of test samples classified correctly and incorrectly.
print("Correct predicted classes:", correct.shape[0])
print("Incorrect predicted classes:",incorrect.shape[0])

In [None]:
# Row captions for the report: class index followed by its display name.
target_names = ["Class {} ({}) :".format(i, labels[i]) for i in range(NUM_CLASSES)]
# The keyword is `target_names`; the misspelled `target_naems` raised TypeError.
print(classification_report(y_true, predicted_classes, target_names=target_names))

In [None]:
# Visualize correctly and incorrectly classified images
def plot_images(data_index, cmap="Blues"):
    """Show up to 16 test images in a 4 x 4 grid with true and predicted class.

    Parameters
    ----------
    data_index : sequence of int
        Indices into ``X_test``; only the first 16 are plotted.
    cmap : str, optional
        Matplotlib colormap name passed to ``imshow`` (default "Blues").

    Notes
    -----
    Reads the notebook-level ``X_test``, ``y_true``, ``labels`` and the
    prediction array, which the prediction cell binds to the name
    ``prediced_classes`` (sic).
    """
    # plt.subplots (plural) creates the figure and the 4 x 4 axes array;
    # plt.subplot has no such signature and cannot be unpacked this way.
    fig, ax = plt.subplots(4, 4, figsize=(15, 15))

    # Hide every axis up front so cells beyond len(data_index) stay blank.
    for axis in ax.ravel():
        axis.axis('off')

    for i, indx in enumerate(data_index[:16]):
        cell = ax[i // 4, i % 4]
        cell.imshow(X_test[indx].reshape(IMG_ROWS, IMG_COLS), cmap=cmap)
        cell.set_title("True:{}  Pred:{}".format(labels[y_true[indx]], labels[prediced_classes[indx]]))
    plt.show()
# Correct => Green: first 16 correctly classified test images.
plot_images(correct, "Greens")
# Incorrect => Red: first 16 misclassified test images.
plot_images(incorrect, "Reds")

In [None]:
# Source: the Kaggle notebook this code is based on
# https://www.kaggle.com/code/gpreda/cnn-with-tensorflow-keras-for-fashion-mnist/notebook