In [166]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import bson
import base64
import os
import tqdm

In [167]:
# Constants
IMAGES_FOLDER = "data/images"
NUMBER_IMAGE_MINIMUM = 0
MAX_RECORD = 50000

## Read the BSON file and create a Pandas DataFrame out of it

In [168]:
categories_df = pd.read_csv("data/category_names.csv")
data = bson.decode_file_iter(open("data/train.bson", "rb"))

In [169]:
def encode_b64(data) :
    encoded = base64.b64encode(data)
    return encoded

def create_dataframe(data_gen) :
    ids = []
    category_ids = []
    imgs = []
    i=0
    counter = 0
    
    for c, d in enumerate(data_gen):
        i += 1
        
        if (i % (MAX_RECORD/10)) == 0:
            counter += 10
            print(str(counter) + "%")
        
        if i > MAX_RECORD:
            break
        product_id = d["_id"]
        category_id = d["category_id"]
        for img_dict in d["imgs"]:
            img = encode_b64(img_dict["picture"])
            decoded_pic = img.decode("utf-8")
            
            ids.append(product_id)
            category_ids.append(category_id)
            imgs.append(decoded_pic)
    return pd.DataFrame({"_id":ids, "category_id":category_ids, "picture":imgs})

In [170]:
df = create_dataframe(data)
df.to_csv("data/data.csv", index=False)

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%


**The data contained in the BSON file has been extracted in a Pandas DataFrame**

In [171]:
df.head(5)

Unnamed: 0,_id,category_id,picture
0,0,1000010653,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,1,1000010653,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,2,1000004079,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,3,1000004141,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,4,1000015539,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


## Exploration

The data is read with pandas from the csv that we extracted from the BSON file

In [172]:
data_df = pd.read_csv("data/data.csv")

In [173]:
def decode(data):
    arr = np.asarray(bytearray(data), dtype=np.uint8)
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 

In [174]:
data_df.head()

Unnamed: 0,_id,category_id,picture
0,0,1000010653,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,1,1000010653,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,2,1000004079,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,3,1000004141,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,4,1000015539,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [175]:
data_df.iloc[0]

_id                                                            0
category_id                                           1000010653
picture        /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
Name: 0, dtype: object

In [176]:
def get_category_from_level(categories_df, id, level):
    return categories_df[categories_df["category_id"] == 1000010653]["category_level1"].values[0]

In [177]:
print("Unique categories: ", len(categories_df['category_id'].unique()))
print("Unique level 1 categories: ", len(categories_df['category_level1'].unique()))
print("Unique level 2 categories: ", len(categories_df['category_level2'].unique()))
print("Unique level 3 categories: ", len(categories_df['category_level3'].unique()))

Unique categories:  5270
Unique level 1 categories:  49
Unique level 2 categories:  483
Unique level 3 categories:  5263


### Categories levels
There three levels of categories. The first level being more generic (less number of different categories) and the thrid level being more specific (higher number of different categories)

In [178]:
for level in range(1, 4):   
    num_category = len(categories_df['category_level' + str(level)].unique())
    print("Number of categories for the level {}: {}".format(str(level), str(num_category)))

Number of categories for the level 1: 49
Number of categories for the level 2: 483
Number of categories for the level 3: 5263


We can visualize the categories levels for the level 1 being ELECTRONIQUE. We can clearly see that for one level 1 category, there are a high number of level 3 categories

In [179]:
categories_df[categories_df["category_level1"] == "ELECTRONIQUE"].head(8)

Unnamed: 0,category_id,category_level1,category_level2,category_level3
2110,1000013222,ELECTRONIQUE,AUTOMATISME,AUTOMATE PROGRAMMABLE
2111,1000013224,ELECTRONIQUE,AUTOMATISME,C-CONTROL
2112,1000013228,ELECTRONIQUE,AUTOMATISME,MAINTENANCE A DISTANCE
2113,1000013230,ELECTRONIQUE,AUTOMATISME,MESURE ET REGULATION
2114,1000013232,ELECTRONIQUE,AUTOMATISME,MOTORISATION
2115,1000013234,ELECTRONIQUE,AUTOMATISME,SIGNALISATION
2116,1000013237,ELECTRONIQUE,BOITIERS ET COFFRETS,ACCESSOIRES BOITIERS
2117,1000013239,ELECTRONIQUE,BOITIERS ET COFFRETS,BOITIER D'INSTALLATION MURAL


In [180]:
keep_columns = ["picture", "category_level1"]
imgs_with_labels = data_df.merge(categories_df, left_on="category_id", right_on="category_id")[keep_columns]
imgs_with_labels

Unnamed: 0,picture,category_level1
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,TELEPHONIE - GPS
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,TELEPHONIE - GPS
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,TELEPHONIE - GPS
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,TELEPHONIE - GPS
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,TELEPHONIE - GPS
...,...,...
84133,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,ART DE LA TABLE - ARTICLES CULINAIRES
84134,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,INSTRUMENTS DE MUSIQUE
84135,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,INSTRUMENTS DE MUSIQUE
84136,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,INSTRUMENTS DE MUSIQUE


**We will focus on the first level for our image classifer**

### Keeping categories having enough images

In [181]:
df_count = data_df.groupby(['category_id'])['_id'].count()
df_category_count = pd.DataFrame(df_count)
df_category_count.columns = ["images_count"]
df_category_count.head()

Unnamed: 0_level_0,images_count
category_id,Unnamed: 1_level_1
1000000237,1
1000000249,13
1000000253,6
1000000271,9
1000000285,3


In [182]:
df_count_N = df_category_count[df_category_count["images_count"] >= NUMBER_IMAGE_MINIMUM]

In [183]:
# List of remaining categories (first level)
list_category_id_N = list(df_count_N.index.values)
print("Remaining categories having at least {} images: {}" .format(str(NUMBER_IMAGE_MINIMUM), len(list_category_id_N)))

Remaining categories having at least 0 images: 2886


## Creating the image data

### Creating ID folders

In [184]:
def create_category_id_folders(list_category_id, img_folder) :
    make_folder_if_not_exist(img_folder)
    for cat_id in list_category_id :
        dest_path = os.path.join(img_folder,str(cat_id))
        make_folder_if_not_exist(dest_path)
        
def make_folder_if_not_exist(img_folder):
    if not os.path.exists(img_folder):
        os.mkdir(img_folder)
categories = list(imgs_with_labels["category_level1"].unique())

# delete IMAGES_FOLDER if it exists
if not os.path.exists(IMAGES_FOLDER):
    os.remove(IMAGES_FOLDER)
create_category_id_folders(categories, IMAGES_FOLDER)

### Export images from their binary representation in their respective folders

In [185]:
def write_images_to_category_folder(data, img_folder) :
    i = 0
    counter = 0
    for j, d in data.iterrows() :
        
        if (i % (MAX_RECORD/10)) == 0:
            counter += 10
            print(str(counter) + "%")
            
        dest_path = os.path.join(img_folder,str(d['category_level1']),str(d['category_level1'])+'_'+str(i)+'.jpg')
        data_ = d['picture']
        imgdata = base64.b64decode(data_)
        with open(dest_path, 'wb') as f:
            f.write(imgdata)
            i+=1
            
write_images_to_category_folder(imgs_with_labels, IMAGES_FOLDER)

10%
20%
30%
40%
50%
60%
70%
80%
90%
100%
110%
120%
130%
140%
150%
160%
170%
