# Data manipulation

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
# Load (or reload, if already loaded) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell runs, so edits to library code are picked up.
%autoreload 2
# Render matplotlib figures inline, directly below the cell that produced them.
%matplotlib inline

In [2]:
import os
import pandas as pd
import numpy  as np

In [3]:
# Fetch data from kaggle using kaggle-cli
# kg download -c dog-breed-identification -u Bobox214 -p XXXXX

In [4]:
# Directory holding the competition files; labels.csv and sample_submission.csv are read from here.
PATH = 'data'

## Labels.csv

`labels.csv` associates the ID of each training file with a breed.

In [5]:
# Load the training labels table from the data directory.
labels_csv = f'{PATH}/labels.csv'
labels = pd.read_csv(labels_csv)

In [6]:
# Peek at one randomly drawn row to see which columns the table has.
labels.sample(n=1)

Unnamed: 0,id,breed
9076,e4245709e4060e08146b5fe1af72385d,flat-coated_retriever


In [7]:
# Rows per breed; the length of that table is the number of distinct breeds.
breed_counts = labels['breed'].value_counts()
breed_counts.shape

(120,)

In [8]:
# Number of label rows for the least-represented breed.
breed_counts = labels['breed'].value_counts()
breed_counts.min()

66

In [9]:
# Number of label rows for the most-represented breed.
breed_counts = labels['breed'].value_counts()
breed_counts.max()

126

## sample_submission.csv

In [10]:
# Load the sample submission; its columns are reused later to build the real submission.
sample_sub_csv = f'{PATH}/sample_submission.csv'
sample_sub = pd.read_csv(sample_sub_csv)

In [11]:
# (rows, columns) of the sample submission: an `id` column plus one column per breed.
sample_sub.shape

(10357, 121)

In [12]:
# Show one randomly drawn row of the sample submission.
sample_sub.sample(n=1)

Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
8359,ceda04a783e5a886f3f0375e51570667,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,...,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333


## FastAi Image Classifier

In [13]:
# This file contains all the main external libs we'll use
from fastai.imports import *

In [14]:
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

`PATH` is the path to your data - if you use the recommended setup approaches from the lesson, you won't need to change this. `sz` is the size that the images will be resized to in order to ensure that the training runs quickly. We'll be talking about this parameter a lot during the course. Leave it at `224` for now.

In [15]:
# Rebinds PATH for the fastai cells below, this time with a trailing slash
# (it was 'data' earlier in the notebook), so f'{PATH}/...' now yields a doubled '/'.
PATH = "data/"
# Size the images are resized to when the transforms are built.
sz=224

In [16]:
# Choose the pretrained architecture once and derive the transforms from that same
# name, so changing `arch` cannot leave the transforms tied to a different model.
arch = resnet34
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)

In [17]:
# Build the fastai data object from the 'train' folder and labels.csv, registering
# the 'test' folder as the test set.
# os.path.join is used because PATH already ends with '/', so the previous
# f'{PATH}/labels.csv' expanded to 'data//labels.csv'.
data = ImageClassifierData.from_csv(
    PATH, 'train', os.path.join(PATH, 'labels.csv'),
    bs=64, tfms=tfms, suffix='.jpg', test_name='test',
)

In [18]:
# Create a learner on top of the pretrained `arch` network.
# NOTE(review): precompute=True presumably caches the frozen network's activations, in which
# case the augmentations configured in `tfms` would not affect training — confirm in the fastai docs.
learn = ConvLearner.pretrained(arch, data, precompute=True)

In [19]:
# Train with a fixed learning rate for two cycles (the log below shows epochs 0 and 1).
learning_rate = 0.01
n_cycles = 2
learn.fit(learning_rate, n_cycles)

epoch      trn_loss   val_loss   accuracy                   
    0      2.098472   1.046649   0.757031  
    1      1.099402   0.706769   0.808333                   



[0.706769, 0.808333333581686]

In [20]:
# Predict on the test set (is_test=True).
# NOTE(review): rows presumably follow the order of data.test_ds.fnames — confirm before pairing them with ids.
preds = learn.predict(is_test=True)

In [21]:
# Sanity check: the row count should match sample_sub, with one column per breed.
preds.shape

(10357, 120)

In [22]:
# Wrap the predictions in a DataFrame whose columns are the breed names taken from
# the sample submission (every column after the first, `id`).
# NOTE(review): np.exp implies `preds` holds log-probabilities — confirm.
breed_columns = sample_sub.columns[1:]
probabilities = np.exp(preds)
sub = pd.DataFrame(probabilities, columns=breed_columns)

In [23]:
# Spot-check one random row of the predicted probabilities.
sub.sample(n=1)

Unnamed: 0,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,beagle,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
1356,1.7e-05,1.4e-05,5.7e-05,0.921669,3e-06,2e-05,0.000367,6e-06,2e-06,9e-06,...,3e-06,9e-06,0.000435,1.2e-05,5.7e-05,8e-05,2e-06,6e-06,0.007638,5.3e-05


In [24]:
# Pair every prediction row with the id of the image it was computed from.
# The rows of `sub` follow data.test_ds.fnames (the order in which fastai listed the
# test folder), which is not guaranteed to match the row order of
# sample_submission.csv. The ids are therefore derived from the test file names
# rather than copied positionally from sample_sub['id'].
test_ids = [os.path.splitext(os.path.basename(fname))[0] for fname in data.test_ds.fnames]
# Guard against a stale or partial test folder before a submission is written.
assert set(test_ids) == set(sample_sub['id']), 'test file ids differ from sample_submission ids'
full_sub = pd.concat((pd.Series(test_ids, name='id'), sub), axis='columns')

In [25]:
# Write the submission without the DataFrame index, so the file holds only the
# `id` column and the breed columns.
# os.path.join avoids the doubled separator that f'{PATH}/...' produces now that
# PATH ends with '/' (it gave 'data//1_data_manipulation.csv').
full_sub.to_csv(os.path.join(PATH, '1_data_manipulation.csv'), index=False)

In [26]:
# Submit data to kaggle using kaggle-cli
# kg submit data/1_data_manipulation.csv -c dog-breed-identification -u Bobox214 -p XXXXX