In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
import cv2
import os

In [2]:
# Ask OpenMP-backed libraries to use a single thread via the environment.
# NOTE(review): this cell runs after the imports in the first cell; OpenMP
# runtimes commonly read this variable when they are loaded — confirm that
# setting it here still takes effect.
os.environ["OMP_NUM_THREADS"] = "1"

In [3]:
# Load the train and test tables from their CSV files.
train_csv_path, test_csv_path = "leaf-classification/train.csv", "leaf-classification/test.csv"
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [4]:
def zero_pad_image(img, target_height, target_width):
    """Pad `img` with zeros on the bottom and right up to a target size.

    Args:
        img: Image array; only its first two dimensions (rows, columns) are read.
        target_height: Minimum number of rows in the result.
        target_width: Minimum number of columns in the result.

    Returns:
        The padded image. A dimension that already meets or exceeds its target
        receives no padding (nothing is cropped).
    """
    rows, cols = img.shape[:2]
    # A non-positive difference means the image is already large enough on that axis.
    extra_rows = target_height - rows if target_height > rows else 0
    extra_cols = target_width - cols if target_width > cols else 0
    top, left = 0, 0  # content stays anchored at the top-left corner
    return cv2.copyMakeBorder(
        img,
        top,
        extra_rows,
        left,
        extra_cols,
        borderType=cv2.BORDER_CONSTANT,
        value=0,
    )

In [5]:
# Scan every file in the image folder and record the largest height and width
# seen, so that all images can later be padded to one common size.
max_height = 0
max_width = 0
folder = "leaf-classification/images/"
for filename in os.listdir(folder):
    img = cv2.imread(os.path.join(folder, filename), 0)  # 0 -> load as grayscale
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded (e.g. a stray non-image file in the folder); skip it rather
        # than crash on `.shape`.
        continue
    height, width = img.shape[:2]
    max_height = max(max_height, height)
    max_width = max(max_width, width)

In [6]:
# Target size for the downscaled images: the largest observed dimensions
# divided by 30 (integer division).
height, width = (largest // 30 for largest in (max_height, max_width))

In [7]:
# Display the downscaled target height.
height

36

In [8]:
# Display the downscaled target width.
width

56

In [9]:
def extract_hog_features(image):
    """Compute a HOG descriptor for `image` and return it as a single-row array.

    Args:
        image: Image passed unchanged to `cv2.HOGDescriptor.compute`.

    Returns:
        A 2-D numpy array with exactly one row holding the flattened HOG
        feature vector.
    """
    # 32x32 window, 16x16 blocks moved in steps of 8, 8x8 cells, 9 orientation bins.
    hog_params = {
        "_winSize": (32, 32),
        "_blockSize": (16, 16),
        "_blockStride": (8, 8),
        "_cellSize": (8, 8),
        "_nbins": 9,
    }
    descriptor = cv2.HOGDescriptor(**hog_params)

    flat_vector = descriptor.compute(image).flatten()
    # Add a leading axis so callers can stack one row per image.
    return flat_vector[np.newaxis, :]

In [10]:
# Build the HOG feature matrix for the training images: one row per id in df_train.
# Rows are collected in a list and stacked once, which avoids re-copying the
# whole matrix on every iteration and removes the hard-coded feature width.
train_feature_rows = []
for image_id in df_train["id"]:
    image_path = "leaf-classification/images/" + str(image_id) + ".jpg"
    img = cv2.imread(image_path, 0)  # 0 -> load as grayscale
    if img is None:
        # cv2.imread signals a missing or undecodable file by returning None.
        raise FileNotFoundError("could not read image: " + image_path)
    img = zero_pad_image(img, max_height, max_width)
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (width, height), interpolation=cv2.INTER_NEAREST)
    train_feature_rows.append(extract_hog_features(img))
# Cast to float64 to keep the dtype the zeros-seeded array used to have.
# Note: np.vstack raises ValueError if df_train has no rows.
df_images_train = np.vstack(train_feature_rows).astype(np.float64)

In [11]:
# Build the HOG feature matrix for the test images: one row per id in df_test.
# Rows are collected in a list and stacked once, which avoids re-copying the
# whole matrix on every iteration and removes the hard-coded feature width.
test_feature_rows = []
for image_id in df_test["id"]:
    image_path = "leaf-classification/images/" + str(image_id) + ".jpg"
    img = cv2.imread(image_path, 0)  # 0 -> load as grayscale
    if img is None:
        # cv2.imread signals a missing or undecodable file by returning None.
        raise FileNotFoundError("could not read image: " + image_path)
    img = zero_pad_image(img, max_height, max_width)
    # cv2.resize takes the target size as (width, height).
    img = cv2.resize(img, (width, height), interpolation=cv2.INTER_NEAREST)
    test_feature_rows.append(extract_hog_features(img))
# Cast to float64 to keep the dtype the zeros-seeded array used to have.
# Note: np.vstack raises ValueError if df_test has no rows.
df_images_test = np.vstack(test_feature_rows).astype(np.float64)

In [12]:
# Join the tabular training columns (minus "id") with the HOG feature columns,
# side by side, row for row.
train_tabular = df_train.drop(columns=["id"])
train_hog = pd.DataFrame(df_images_train)
df_stacked_train = pd.concat([train_tabular, train_hog], axis=1)
# The HOG columns get integer labels; convert every label to str so the
# frame has uniformly string column names.
df_stacked_train.columns = df_stacked_train.columns.astype(str)

In [13]:
# Join the tabular test columns (minus "id") with the HOG feature columns,
# side by side, row for row.
test_tabular = df_test.drop(columns=["id"])
test_hog = pd.DataFrame(df_images_test)
df_stacked_test = pd.concat([test_tabular, test_hog], axis=1)
# The HOG columns get integer labels; convert every label to str so the
# frame has uniformly string column names.
df_stacked_test.columns = df_stacked_test.columns.astype(str)

In [14]:
# Separate the target column from the feature columns.
X = df_stacked_train.drop(labels="species", axis="columns")
y = df_stacked_train.loc[:, "species"]

In [15]:
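# Alternative setup kept for reference: train on the tabular columns of df_train only (no HOG image features).
# y = df_train["species"]
# X = df_train.drop(columns=["species", "id"])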

In [16]:
# Fit the final model on the full training set with default hyperparameters.
# Despite the variable name, this is scikit-learn's HistGradientBoostingClassifier.
lgbm_est = HistGradientBoostingClassifier()
lgbm_est.fit(X=X, y=y)

In [17]:
# Class labels learned by the fitted model; used below to name the
# probability columns of the submission.
class_names = lgbm_est.classes_

In [18]:
# Per-class probabilities for every test row, from the model fitted on
# tabular + HOG features.
yhat_proba = lgbm_est.predict_proba(X=df_stacked_test)
# Tabular-only alternative, kept for reference:
# yhat_proba = lgbm_est.predict_proba(df_test.drop(columns=["id"]))

In [19]:
# Assemble the submission: the test ids followed by one probability column per
# class, in the order of class_names. The probability block is built in a
# single DataFrame call instead of inserting the columns one at a time.
proba_frame = pd.DataFrame(yhat_proba, columns=class_names, index=df_test.index)
submission = pd.concat([df_test[["id"]], proba_frame], axis=1)

In [20]:
# Preview the submission table (an id column plus one probability column per class).
submission

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
0,4,6.927753e-05,0.000055,5.520992e-05,3.174960e-04,6.472397e-05,6.575389e-05,4.997025e-05,5.039980e-05,6.588939e-05,...,0.000050,0.000143,6.909726e-04,4.777825e-05,7.336787e-05,5.521796e-05,4.738555e-05,1.017612e-03,2.184020e-04,4.865267e-05
1,7,4.101836e-06,0.000004,4.424081e-04,3.221350e-05,1.617803e-04,4.811424e-06,6.515726e-05,4.645627e-06,4.350808e-06,...,0.000004,0.000010,4.003843e-06,3.764994e-06,4.097412e-06,5.720897e-06,3.806902e-06,4.326663e-06,3.695812e-06,7.728176e-06
2,9,1.049349e-04,0.968648,1.220921e-04,3.488774e-04,1.354056e-04,1.025333e-04,1.223950e-04,1.939368e-04,4.969197e-04,...,0.000071,0.000126,8.537644e-05,8.032813e-05,1.293319e-04,9.671330e-05,1.246131e-04,1.019872e-04,7.882713e-05,3.709122e-04
3,12,6.652261e-05,0.000063,5.883284e-05,7.282443e-05,1.225482e-04,7.760996e-05,8.769682e-05,1.879543e-04,1.043104e-04,...,0.000065,0.000059,5.410579e-05,5.093823e-05,5.539737e-05,8.886457e-05,9.294619e-05,5.141265e-05,5.000023e-05,9.161189e-05
4,13,8.000323e-05,0.000114,5.561227e-05,3.356700e-05,4.369890e-05,4.448232e-05,8.546386e-05,3.403703e-05,5.254025e-04,...,0.000022,0.000037,4.065251e-05,3.230951e-05,9.743049e-05,8.442769e-04,5.888348e-05,2.152484e-04,3.172391e-05,3.286901e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,1576,5.617695e-06,0.998853,6.532698e-06,3.629647e-05,1.214071e-05,5.484488e-06,4.496632e-06,8.261241e-06,1.537090e-05,...,0.000004,0.000007,4.571859e-06,4.301201e-06,6.747078e-06,5.173620e-06,6.669432e-06,4.341304e-06,4.221931e-06,2.326439e-05
590,1577,4.292461e-05,0.000096,3.358167e-05,5.704559e-05,3.934278e-05,3.826016e-05,3.038425e-05,2.913039e-04,6.075733e-05,...,0.000030,0.000034,3.660638e-05,2.909987e-05,9.233016e-05,3.362297e-05,2.881295e-05,3.430812e-05,2.851393e-05,4.244600e-05
591,1579,2.212681e-04,0.014272,2.342771e-04,2.417747e-04,2.342781e-04,2.589817e-04,2.119153e-04,4.219045e-04,2.342587e-04,...,0.000212,0.000234,2.154338e-04,3.338530e-04,2.264371e-04,2.344442e-04,2.113877e-04,2.047223e-04,2.532181e-04,2.356729e-04
592,1580,1.237427e-03,0.000956,9.148415e-02,9.906728e-04,2.237609e-03,2.208174e-03,8.289053e-04,8.724081e-04,9.172456e-04,...,0.000830,0.001083,8.439142e-04,7.936620e-04,1.247614e-03,1.200703e-03,8.023200e-04,4.904343e-03,9.241415e-04,2.161712e-03


In [21]:
# Write the predictions to disk without the DataFrame index column.
output_path = "submission.csv"
submission.to_csv(output_path, index=False)

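Validation run: hold out 30% of the training data and compare accuracy and log loss on the training part and on the held-out part.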

In [22]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
split_kwargs = {"test_size": 0.3, "random_state": 666}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_kwargs)

In [23]:
# Fit a separate model, with the same default settings, on the training part
# of the hold-out split only.
lgbm_est_val = HistGradientBoostingClassifier()
lgbm_est_val.fit(X=X_train, y=y_train)

In [24]:
# Hard labels and class probabilities on the rows the validation model was fitted on.
yhat_train, yhat_proba_train = (
    lgbm_est_val.predict(X_train),
    lgbm_est_val.predict_proba(X_train),
)

In [25]:
# Accuracy on the training part of the split.
train_accuracy = accuracy_score(y_true=y_train, y_pred=yhat_train)
train_accuracy

1.0

In [26]:
# Log loss on the training part of the split.
train_log_loss = log_loss(y_train, yhat_proba_train)
train_log_loss

0.00012907255118423352

In [27]:
# Hard labels and class probabilities on the held-out validation rows.
yhat_val, yhat_proba_val = (
    lgbm_est_val.predict(X_val),
    lgbm_est_val.predict_proba(X_val),
)

In [28]:
# Accuracy on the held-out validation rows.
val_accuracy = accuracy_score(y_true=y_val, y_pred=yhat_val)
val_accuracy

0.9427609427609428

In [29]:
# Log loss on the held-out validation rows.
# The split is not stratified, so y_val is not guaranteed to contain every
# class the model was trained on. Without `labels`, log_loss infers the class
# set from y_val and raises when it does not match the number of probability
# columns. Passing the model's own class list keeps the columns aligned; the
# result is unchanged whenever all classes are present in y_val.
log_loss(y_val, yhat_proba_val, labels=lgbm_est_val.classes_)

0.29586131655228515