#### Preparing data
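Run the next cells to download and prepare the datasets and utility functions. The downloads only run when the notebook is executed on Google Colab; elsewhere these cells do nothing.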

In [1]:
import sys
# Download the helper code and dataset only when running on Google Colab,
# which is detected by the `google.colab` module being loaded in this session.
download_data = 'google.colab' in sys.modules

In [2]:
# Colab only: fetch the project's init.py from GitHub, then import its helpers.
if download_data:
    # -O init.py overwrites any existing local copy; -q keeps wget silent.
    !wget --no-cache -O init.py -q https://raw.githubusercontent.com/DaielChom/ann_leaf_classification/master/init.py
    # The import must come after the download above, since init.py may not exist before it.
    from init import download_utils, unzip_leaf_dataset, dataset_dir
    # Presumably downloads the local utility code used later (local.lib) — confirm in init.py.
    download_utils(force_download=False)

In [3]:
if download_data:
    !mkdir ./local/datasets/
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1g5MPhz2YEW5nyuUqacQ_GVWpeDjVG-Cf' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1g5MPhz2YEW5nyuUqacQ_GVWpeDjVG-Cf" -O local/datasets/leaf.zip && rm -rf /tmp/cookies.txt

In [4]:
# Colab only: call the unzip helper imported from init.py in an earlier cell.
# Presumably it extracts the downloaded local/datasets/leaf.zip — confirm in init.py.
if download_data:
    unzip_leaf_dataset()

#### Notebook start

In [1]:
import os
import random
import numpy as np
import pandas as pd
import progressbar
import matplotlib.pyplot as plt

import local.lib as lib

from init import dataset_dir
from skimage import io
from skimage.transform import resize
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from importlib import reload
# Re-import local.lib so edits to the module are picked up without restarting the kernel.
reload(lib)

import warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Baseline submission


## 1. Get datasets

In [2]:
# Load the data; only the train/test arrays, their labels and the species mapping are kept.
# The remaining return values are discarded via `_`.
# NOTE(review): with split=1 the test arrays come back empty (see the shapes printed below);
# presumably `split` is the fraction of samples assigned to train — confirm in local.lib.
X_train, _,_, y_train, X_test, _,_, y_test, species, _, _, _ = lib.get_splitted_data(data_dir=dataset_dir, split=1, check_id_sets=True, verbose=1)

The intersection between train and test set is 0
There are 99 classes for the classification task.


In [3]:
# Sanity-check the shapes of the arrays returned by the loader.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((990, 194), (990,), (0, 194), (0,))

## 2. ML model

In [4]:
# Baseline classifier: linear discriminant analysis with scikit-learn's default settings.
model = LinearDiscriminantAnalysis()

In [5]:
# Fit on all of the loaded training samples.
model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [6]:
# Accuracy measured on the same samples the model was fitted on, so it is an optimistic
# figure. No held-out estimate is available here because the test arrays loaded above are empty.
tr_pred = model.predict(X_train)
train_accuracy = (tr_pred == y_train).mean()
train_accuracy

1.0

## 3. Get submission data

In [7]:
# Load the samples to be predicted for the submission, together with their ids.
# NOTE: this rebinds X_test, replacing the (empty) test array loaded in section 1.
X_test, _, _, submission_ids = lib.get_submission_data(data_dir=dataset_dir)

In [8]:
# Per-class probability estimates, one row per submission sample.
# In scikit-learn the columns follow the order of model.classes_.
proba = model.predict_proba(X_test)

In [10]:
# Make sure the output directory for submission files exists.
# exist_ok=True makes the call idempotent without a separate existence check.
os.makedirs(dataset_dir+"/submissions/", exist_ok=True)

In [12]:
# Build the submission table: one probability column per species, preceded by the sample id.
# NOTE(review): this assumes the order of species.keys() matches the column order of `proba`
# (i.e. model.classes_) — confirm against local.lib, otherwise probabilities are mislabeled.
submission = pd.DataFrame(proba, columns=list(species.keys()))
submission.insert(0, 'id', submission_ids)
# The former `submission.reset_index()` call was removed: it returned a new frame that was
# discarded, so it had no effect on `submission` or on the written file.
# index=False keeps the positional index out of the CSV.
submission.to_csv(dataset_dir+'/submissions/baseline.csv', index=False)
submission.head()

Unnamed: 0,id,Quercus_Crassifolia,Quercus_Agrifolia,Fagus_Sylvatica,Prunus_Avium,Quercus_x_Turneri,Eucalyptus_Urnigera,Quercus_Kewensis,Acer_Platanoids,Quercus_Hartwissiana,...,Cercis_Siliquastrum,Cornus_Macrophylla,Quercus_Phellos,Quercus_Brantii,Alnus_Maximowiczii,Sorbus_Aria,Salix_Intergra,Quercus_Crassipes,Pterocarya_Stenoptera,Tilia_Tomentosa
0,4,2.64809e-156,1.0,6.5400060000000005e-62,4.834572e-270,2.4238139999999998e-148,1.782032e-142,1.617913e-100,1.0017530000000001e-156,2.617869e-117,...,9.395233e-107,5.43251e-158,2.401802e-172,3.013098e-107,9.004461e-205,1.3953020000000002e-165,9.535157000000001e-118,2.186546e-150,3.617717e-143,1.497404e-103
1,7,3.103759e-116,1.4117269999999999e-111,8.93493e-77,4.584482e-157,1.046607e-67,2.555602e-69,2.5244119999999997e-35,1.043289e-59,6.482838000000001e-106,...,1.1221130000000001e-117,4.0789770000000006e-128,5.070281e-151,3.5366689999999996e-57,3.481799e-169,1.72971e-192,3.045398e-89,3.6679790000000005e-159,5.783611e-141,4.531027e-82
2,9,1.917798e-212,3.538238e-171,9.152259e-132,8.21683e-173,1.2065800000000001e-159,1.8380699999999998e-167,1.976879e-131,2.368988e-195,2.0409209999999998e-184,...,5.726368e-161,1.7917290000000002e-243,4.909446999999999e-236,4.70467e-150,1.3190279999999998e-100,2.394493e-227,1.147642e-161,1.8111819999999998e-236,1.386462e-115,5.877185e-92
3,12,9.013856999999999e-178,2.202597e-190,1.27128e-122,1.565585e-37,4.523027e-123,3.118825e-160,2.280348e-106,2.4294619999999998e-168,3.842818e-116,...,2.328514e-176,7.517263999999999e-195,7.934266e-211,1.952768e-102,1.121829e-112,6.786354e-206,1.1504540000000001e-163,2.4343850000000002e-262,3.3343680000000003e-144,1.383964e-97
4,13,1.566975e-219,6.822746000000001e-220,3.294463e-186,4.9613329999999994e-203,2.1126049999999997e-217,2.141278e-206,4.192888e-170,6.406200000000001e-194,4.087356e-216,...,9.487092e-222,5.79325e-256,5.801011e-293,1.1353870000000001e-175,1.0655739999999999e-63,1.054451e-190,1.438696e-192,3.3514619999999997e-239,1.736805e-125,1.938564e-99


## 4. Send submission
To submit, you need the [kaggle-api](https://github.com/Kaggle/kaggle-api) command-line tool installed and configured with your Kaggle credentials.

In [36]:
! kaggle competitions submit -c leaf-classification -f {dataset_dir+"/submissions/baseline.csv"} -m "baseline"

100%|███████████████████████████████████████| 1.26M/1.26M [00:02<00:00, 479kB/s]
Successfully submitted to Leaf Classification

![score](./imgs/baseline_submission.png)