<a href="https://colab.research.google.com/github/Gabbryan/plane_classification/blob/main/notebooks/train_classification_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!curl -O https://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz
!tar xzf fgvc-aircraft-2013b.tar.gz
!mv fgvc-aircraft-2013b dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2625M  100 2625M    0     0  31.2M      0  0:01:23  0:01:23 --:--:-- 31.7M


## Imports

In [2]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.models import Sequential, load_model
from keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout
#from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

from PIL import Image

## Constants


In [6]:
# Folder holding the extracted annotation files and the images sub-folder
DATA_DIR = pathlib.Path('dataset', 'data')

# Image dimensions: square, 3 channels
# (presumably the model input size; TODO confirm against the cells that use them)
IMAGE_WIDTH = 128
IMAGE_HEIGHT = IMAGE_WIDTH  # height follows width
IMAGE_DEPTH = 3

In [7]:
# First, naive load: split every line on single spaces and keep the first two
# fields. Manufacturer names made of several words are cut to their first word
# (e.g. 'de Havilland' -> 'de'); a later cell re-reads the file to fix this.
manufacturer_df = pd.read_csv(
    DATA_DIR / 'images_manufacturer_train.txt',
    sep=' ',
    names=['image_id', 'manufacturer'],
    usecols=['image_id', 'manufacturer'],  # usecols for v1.4 compatibility
    dtype={'image_id': str},  # ids are strings, not ints
)

## Verify data



In [12]:
# Number of images per manufacturer; dropna=False also counts missing labels.
manufacturer_df['manufacturer'].value_counts(dropna=False)

Boeing         733
Airbus         434
Embraer        233
McDonnell      232
de             167
Canadair       134
Douglas        133
Cessna         133
British        133
Lockheed       102
Fokker         100
Dassault        67
Gulfstream      67
Beechcraft      67
Saab            67
Tupolev         66
ATR             66
Panavia         34
Antonov         34
Dornier         34
Yakovlev        34
Bombardier      33
Ilyushin        33
Fairchild       33
Piper           33
Cirrus          33
Eurofighter     33
Supermarine     33
Robin           33
Name: manufacturer, dtype: int64

In [13]:
# Count missing values in each column.
manufacturer_df.isna().sum()


image_id        0
manufacturer    0
dtype: int64

In [16]:
# Fail early if any id or label is missing.
assert manufacturer_df['image_id'].notna().all(), "Valeur manquante dans image_id"
assert manufacturer_df['manufacturer'].notna().all(), "Valeur manquante dans manufacturer"

## Deal with N columns (multi-word manufacturer names)


In [17]:
# Look for commas in the raw annotation file.
!grep ',' dataset/data/images_manufacturer_train.txt

In [18]:
# Search the file for the character T and show only the first three matching lines (head -3)
! grep 'T' dataset/data/images_manufacturer_train.txt | head -3

0724121 ATR
0619697 ATR
2243949 ATR


In [24]:
# wc counts elements (-l: lines, -c: bytes, -w: words); here: number of lines containing T
! grep 'T' dataset/data/images_manufacturer_train.txt | wc -l

132


In [25]:
# Keep only the first space-delimited field of each line and show the first three.
!cut -f 1 -d ' ' dataset/data/images_manufacturer_train.txt | head -3


1025794
1340192
0056978


In [53]:
# Reload the file with each line kept whole in a single string column 'all':
# with a tab separator the space-delimited lines are not split.
manufacturer_df = pd.read_csv(
    DATA_DIR / 'images_manufacturer_train.txt',
    sep='\t',
    names=['all'],
    dtype={'all': str},  # ids are strings, not ints
)

# str.partition(' ') cuts a string at its first space: (before, ' ', after).
# The part before the first space is the image id ...
manufacturer_df['image_id'] = manufacturer_df['all'].apply(lambda line: line.partition(' ')[0])

# ... and everything after it is the full (possibly multi-word) manufacturer name.
manufacturer_df['manufacturer'] = manufacturer_df['all'].apply(lambda line: line.partition(' ')[2])

In [54]:
# Inspect the rebuilt manufacturer column.
manufacturer_df['manufacturer']

0         Boeing
1         Boeing
2         Boeing
3         Boeing
4         Boeing
          ...   
3329    Yakovlev
3330    Yakovlev
3331    Yakovlev
3332    Yakovlev
3333    Yakovlev
Name: manufacturer, Length: 3334, dtype: object

In [55]:
# List the distinct manufacturer names to check multi-word names are kept whole.
manufacturer_df['manufacturer'].unique()

array(['Boeing', 'Airbus', 'ATR', 'Antonov', 'British Aerospace',
       'Beechcraft', 'Lockheed Corporation', 'Douglas Aircraft Company',
       'Canadair', 'Cessna', 'McDonnell Douglas', 'de Havilland', 'Robin',
       'Dornier', 'Embraer', 'Eurofighter', 'Lockheed Martin',
       'Dassault Aviation', 'Fokker', 'Bombardier Aerospace',
       'Gulfstream Aerospace', 'Ilyushin', 'Fairchild', 'Piper',
       'Cirrus Aircraft', 'Saab', 'Supermarine', 'Panavia', 'Tupolev',
       'Yakovlev'], dtype=object)

In [56]:
# Preview the first rows: raw line, image id and manufacturer.
manufacturer_df.head()

Unnamed: 0,all,image_id,manufacturer
0,1025794 Boeing,1025794,Boeing
1,1340192 Boeing,1340192,Boeing
2,0056978 Boeing,56978,Boeing
3,0698580 Boeing,698580,Boeing
4,0450014 Boeing,450014,Boeing


In [None]:
def build_image_database(path, images_dir='dataset/data/images'):
  """Build a pandas dataframe with target class and access path to images.

  Each line of the annotation file is expected to look like
  ``<image_id> <manufacturer name>``: the text before the first space is the
  image id, everything after it is the manufacturer, so multi-word names
  (e.g. 'de Havilland') are kept whole.

  Parameters
  ----------
  path (str or Path): annotation file to read, one image per line.
  images_dir (str or Path): folder containing the ``<image_id>.jpg`` files.
    Defaults to 'dataset/data/images'.

  Returns
  -------
  A pandas dataframe with columns 'all' (raw line), 'image_id' (str),
  'manufacturer' (str) and 'path' (pathlib.Path to the image file).
  """
  images_dir = pathlib.Path(images_dir)

  # A tab separator keeps each space-delimited line whole in one column
  # (assumes the file contains no tab); reading as str preserves the leading
  # zeros of the ids.
  manufacturer_df = pd.read_csv(path, sep='\t',
              names=['all'],
              dtype={'all': str},              # ids are not int but string
            )

  # Cut each line once at its first space: (image id, ' ', manufacturer name).
  fields = [line.partition(' ') for line in manufacturer_df['all']]
  manufacturer_df['image_id'] = [image_id for image_id, _, _ in fields]
  manufacturer_df['manufacturer'] = [name for _, _, name in fields]

  # The path column holds the location of the image file on disk.
  manufacturer_df['path'] = [images_dir / (image_id + '.jpg')
                             for image_id in manufacturer_df['image_id']]

  return manufacturer_df