In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Lift pandas' row-display limit so printed frames/series are never truncated.
pd.options.display.max_rows = None


In [None]:
# Read the labelled training table and the unlabelled Kaggle test table
# (pandas infers the zip compression from the file extension).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/leaf-classification/train.csv.zip',
        '/kaggle/input/leaf-classification/test.csv.zip',
    )
)
# Preview the first ten training rows.
train_data.head(n=10)


In [None]:
# Quick sanity checks on the training table: size, missing values, duplicates.
#shape
print(f'data contains {train_data.shape[0]} rows and {train_data.shape[1]} columns \n')
#missing data
print(f'missing data per column is \n {train_data.isna().sum()}')
#duplicate
# `duplicated()` returns a boolean Series (True for each repeated row), so its
# sum is the number of duplicated rows.
duplicated_data=train_data.duplicated()
print(f'Number of duplicated rows = {duplicated_data.sum()}')


In [None]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0') if use_cuda else torch.device('cpu')


In [None]:
#split into X and y to split to train and test
# Features are every column except the row id and the label.
X=train_data.drop(columns=['id','species'])
# Encode the species names as integer class ids. LabelEncoder expects a 1-D
# array, so pass the Series (not a one-column DataFrame). The fitted encoder is
# kept so `label_encoder.classes_` can map ids back to species names.
label_encoder=LabelEncoder()
y=label_encoder.fit_transform(train_data['species'])
# Stratified hold-out split so every species is represented in both parts.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

#to (train) tensors
X_train_tensor=torch.tensor(X_train.values)
y_train_tensor=torch.tensor(y_train)
train_tensor=TensorDataset(X_train_tensor,y_train_tensor)

#to (test) tensors
X_test_tensor=torch.tensor(X_test.values)
y_test_tensor=torch.tensor(y_test)
test_tensor=TensorDataset(X_test_tensor,y_test_tensor)

#train and test and kaggle test
batch_size=64
train_dataloader=DataLoader(train_tensor,batch_size=batch_size,shuffle=True)
# Evaluation does not benefit from shuffling; keep the hold-out order fixed.
test_dataloader=DataLoader(test_tensor,batch_size=batch_size,shuffle=False)
# A DataLoader indexes its dataset by integer position, which a raw DataFrame
# does not support (`df[0]` is a column lookup). Wrap the Kaggle features in a
# TensorDataset instead; each batch is a 1-tuple `(features,)`.
kaggle_features=torch.tensor(test_data.drop(columns='id').values)
test_kaggle_dataloader=DataLoader(TensorDataset(kaggle_features),batch_size=batch_size,shuffle=False)


In [None]:
# Rebind the full train / hold-out tensors to copies on the selected device.
X_train_tensor = X_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)


In [None]:
# Small fully connected classifier: 192 input features -> 99 class probabilities.
# NOTE(review): the hidden widths (5 and 9) are far narrower than the 99 output
# classes, which likely limits what the model can learn - consider widening.
net=nn.Sequential(nn.Linear(192,5),
                 nn.ReLU(),
                 nn.Linear(5,9),
                 nn.ReLU(),
                 nn.Linear(9,99),
                 # Normalise over the class dimension explicitly; relying on the
                 # implicit dim is deprecated and emits a warning.
                 # NOTE(review): because of this layer the network outputs
                 # probabilities, not logits - the training loss must expect that.
                 nn.Softmax(dim=1))
# The input tensors are built from float64 pandas values, so use float64 weights.
net=net.double()
net.to(device)


In [None]:
#optimizer and loss
class ProbabilityCrossEntropyLoss(nn.Module):
    """Cross-entropy for a model that already outputs class probabilities.

    `net` ends with a Softmax layer, so its outputs are probabilities.
    `nn.CrossEntropyLoss` expects raw logits and applies log-softmax itself;
    feeding it probabilities squashes the scores twice and flattens the
    gradients. This loss takes the log of the probabilities and applies the
    negative log-likelihood instead, which is the intended cross-entropy.

    Args:
        eps: lower clamp applied before the log to avoid `log(0) = -inf`.
    """

    def __init__(self,eps=1e-12):
        super().__init__()
        self.eps=eps

    def forward(self,probabilities,target):
        """Return the mean negative log-likelihood of `target` under `probabilities`.

        Args:
            probabilities: (batch, classes) tensor of class probabilities.
            target: (batch,) tensor of integer class ids.
        """
        log_probabilities=torch.log(probabilities.clamp_min(self.eps))
        return F.nll_loss(log_probabilities,target)


criterion=ProbabilityCrossEntropyLoss()
learning_rate=0.0001
momentum=0.9
optimizer=optim.SGD(net.parameters(),lr=learning_rate,momentum=momentum)


In [None]:
# Train for a fixed number of epochs, evaluating on the hold-out split after each one.
# Seeding here fixes the shuffling order of the training batches.
torch.manual_seed(42)

# Per-epoch average losses (sample-weighted), kept for later inspection/plotting.
train_losses=[]
test_losses=[]

epochs=10
for epoch in range(epochs):
    #Training
    net.train()
    train_loss=0.0
    for features,target in train_dataloader:
        features=features.to(device).double()
        target=target.to(device)
        optimizer.zero_grad()
        outputs=net(features)
        loss=criterion(outputs,target)
        loss.backward()
        optimizer.step()
        # `loss` is a batch mean; weight by batch size so the epoch value is a
        # true per-sample average even when the last batch is smaller.
        train_loss+=loss.item()*features.size(0)
    train_loss/=len(train_dataloader.dataset)
    train_losses.append(train_loss)

    #Evaluation
    net.eval()
    test_loss=0.0
    correct=0
    total=0
    with torch.no_grad():
        for features_t,target_t in test_dataloader:
            features_t=features_t.to(device).double()
            target_t=target_t.to(device)
            outputs_t=net(features_t)
            loss_t=criterion(outputs_t,target_t)
            # Weight by the size of THIS evaluation batch (not the last
            # training batch) so the average is correct.
            test_loss+=loss_t.item()*features_t.size(0)
            pred_t=outputs_t.argmax(dim=1)
            # Count evaluation samples, not training samples.
            total+=target_t.size(0)
            correct+=(pred_t==target_t).sum().item()
    test_loss/=len(test_dataloader.dataset)
    test_losses.append(test_loss)
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {(100 * correct / total):.2f}%')


In [None]:
# Keep the ids aside for the submission file; every other column is model input.
index = test_data['id']
test = torch.tensor(test_data.drop(columns='id').values).to(device)


In [None]:
# Species names used as the submission column headers, listed in sorted order
# (one genus per line). NOTE(review): this order must match the integer ids
# assigned to the training labels - verify against the label encoder.
classes = [
    'Acer_Capillipes', 'Acer_Circinatum', 'Acer_Mono', 'Acer_Opalus', 'Acer_Palmatum',
    'Acer_Pictum', 'Acer_Platanoids', 'Acer_Rubrum', 'Acer_Rufinerve', 'Acer_Saccharinum',
    'Alnus_Cordata', 'Alnus_Maximowiczii', 'Alnus_Rubra', 'Alnus_Sieboldiana', 'Alnus_Viridis',
    'Arundinaria_Simonii',
    'Betula_Austrosinensis', 'Betula_Pendula',
    'Callicarpa_Bodinieri',
    'Castanea_Sativa',
    'Celtis_Koraiensis',
    'Cercis_Siliquastrum',
    'Cornus_Chinensis', 'Cornus_Controversa', 'Cornus_Macrophylla',
    'Cotinus_Coggygria',
    'Crataegus_Monogyna',
    'Cytisus_Battandieri',
    'Eucalyptus_Glaucescens', 'Eucalyptus_Neglecta', 'Eucalyptus_Urnigera',
    'Fagus_Sylvatica',
    'Ginkgo_Biloba',
    'Ilex_Aquifolium', 'Ilex_Cornuta',
    'Liquidambar_Styraciflua',
    'Liriodendron_Tulipifera',
    'Lithocarpus_Cleistocarpus', 'Lithocarpus_Edulis',
    'Magnolia_Heptapeta', 'Magnolia_Salicifolia',
    'Morus_Nigra',
    'Olea_Europaea',
    'Phildelphus',
    'Populus_Adenopoda', 'Populus_Grandidentata', 'Populus_Nigra',
    'Prunus_Avium', 'Prunus_X_Shmittii',
    'Pterocarya_Stenoptera',
    'Quercus_Afares', 'Quercus_Agrifolia', 'Quercus_Alnifolia', 'Quercus_Brantii',
    'Quercus_Canariensis', 'Quercus_Castaneifolia', 'Quercus_Cerris', 'Quercus_Chrysolepis',
    'Quercus_Coccifera', 'Quercus_Coccinea', 'Quercus_Crassifolia', 'Quercus_Crassipes',
    'Quercus_Dolicholepis', 'Quercus_Ellipsoidalis', 'Quercus_Greggii', 'Quercus_Hartwissiana',
    'Quercus_Ilex', 'Quercus_Imbricaria', 'Quercus_Infectoria_sub', 'Quercus_Kewensis',
    'Quercus_Nigra', 'Quercus_Palustris', 'Quercus_Phellos', 'Quercus_Phillyraeoides',
    'Quercus_Pontica', 'Quercus_Pubescens', 'Quercus_Pyrenaica', 'Quercus_Rhysophylla',
    'Quercus_Rubra', 'Quercus_Semecarpifolia', 'Quercus_Shumardii', 'Quercus_Suber',
    'Quercus_Texana', 'Quercus_Trojana', 'Quercus_Variabilis', 'Quercus_Vulcanica',
    'Quercus_x_Hispanica', 'Quercus_x_Turneri',
    'Rhododendron_x_Russellianum',
    'Salix_Fragilis', 'Salix_Intergra',
    'Sorbus_Aria',
    'Tilia_Oliveri', 'Tilia_Platyphyllos', 'Tilia_Tomentosa',
    'Ulmus_Bergmanniana',
    'Viburnum_Tinus', 'Viburnum_x_Rhytidophylloides',
    'Zelkova_Serrata',
]


In [None]:
# Predict class probabilities for the Kaggle test features.
# Inference only: force eval mode and skip autograd bookkeeping. The result is
# already on the same device as `net`, so no extra `.to(device)` is needed.
net.eval()
with torch.no_grad():
    output=net(test)


In [None]:
# Build the submission table: one column per species (named from `classes`)
# holding the network outputs, with the test-row ids as the leading column.
submission = pd.DataFrame(data=output.detach().cpu().numpy(), columns=classes)
submission.insert(loc=0, column='id', value=index)
submission.to_csv('submission.csv', index=False)
