# Part 1: Data Loading

### Import libraries

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Load training data

In [9]:
train_X = pd.read_csv("../data/binary/X_train.csv")
train_y = pd.read_csv("../data/binary/Y_train.csv")

### Load testing data

In [10]:
test_x = pd.read_csv("../data/binary/X_test.csv")

# Part 2: Data cleaning & create new input features

In [12]:
print("Training set input size = {}, Training set output size = {}".format(train_X.shape, train_y.shape))

Training set input size = (62209, 964), Training set output size = (62209, 1)


Training data summary:
* There are 62,209 rows (images)
* Input columns:
    * first 900 columns = HoG (Histogram of oriented Gradients) extracted from the image (10×10 px cells, 9 orientations, 2×2 blocks).
    * next 16 columns drawn from a normal distribution (µ = 0.5, σ = 2)
    * last 48 columns correspond to 3 colour histograms extracted from the same image, one for each channel (RGB), with 16 bins per channel.
* Output column: class ID (output) for each sample row from X train.csv.

In [44]:
train_X_HoG = train_X.iloc[:, :900]
train_X_normal_dist = train_X.iloc[:, 900:916]
train_X_colour_hists = train_X.iloc[:, 916:]

# 3. Data Visualisation & Analysis

### HoG (Histogram of oriented Gradients)

In [34]:
train_X_HoG.head(5)

Unnamed: 0,1.082472824855012783e-01,4.364114731206997395e-03,6.291975982563380483e-03,1.234358034835941059e-02,1.254194945357323121e-01,2.113676959431135538e-01,3.083075331002742558e-01,3.083075331002742558e-01.1,4.771379763706801169e-02,1.472457237228413651e-01,...,8.686412898108195058e-02,2.316062194613806025e-01.8,1.022186704613833354e-02,1.110932227296936997e-01,3.618340662736342639e-02,1.512644653294893393e-01,1.667434036650611295e-01,2.164031214774310430e-01,2.316062194613806025e-01.9,7.255405245155327476e-02
0,0.245888,0.057265,0.054288,0.073985,0.245888,0.143398,0.113087,0.208721,0.173131,0.245888,...,0.018032,0.204783,0.059727,0.248219,0.248219,0.248219,0.12105,0.067763,0.037096,0.025164
1,0.218803,0.107196,0.056075,0.108894,0.143207,0.079305,0.068837,0.147177,0.089514,0.166596,...,0.232862,0.232918,0.04863,0.105441,0.028698,0.191464,0.218813,0.232918,0.232918,0.092948
2,0.199052,0.099943,0.109036,0.095986,0.218288,0.20551,0.218288,0.113477,0.128925,0.218288,...,0.161302,0.234259,0.134242,0.185844,0.164967,0.234259,0.064108,0.043844,0.234259,0.119083
3,0.234368,0.163826,0.135215,0.234368,0.21965,0.100049,0.056125,0.065694,0.148874,0.234368,...,0.243317,0.243317,0.047693,0.084939,0.123729,0.113036,0.243317,0.194783,0.243317,0.136803
4,0.242391,0.242391,0.205771,0.166023,0.19092,0.12131,0.073872,0.099532,0.142877,0.242391,...,0.227183,0.227183,0.108024,0.141204,0.104388,0.157016,0.114653,0.088492,0.101651,0.187334


In [32]:
train_X_HoG.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62209 entries, 0 to 62208
Columns: 900 entries, 1.082472824855012783e-01 to 7.255405245155327476e-02
dtypes: float64(900)
memory usage: 427.2 MB


### Normal distribution

In [36]:
train_X_normal_dist.head(5)

Unnamed: 0,-1.280041182269348932e+00,1.238382237576325373e+00,2.661622440272609413e+00,-2.942722974904070643e+00,2.566506617912386323e-01,-8.475581233028577355e-01,-1.442717711274470460e-01,-3.628990213276483523e+00,-3.172231510441116953e-01,2.649821696601030396e+00,5.004708088939684885e+00,-4.185131733644973373e+00,-3.196605170193895784e-01,-4.037816158333373817e-01,8.370982363360532919e-01,6.744666618434232985e-01
0,1.734251,3.321642,-1.756527,3.417488,-1.973764,-0.026902,2.733768,1.420462,0.212293,0.360131,1.912276,6.072646,-3.138425,-1.144073,-0.793034,-2.189273
1,-1.218798,-0.706921,-0.970352,1.446063,1.069938,4.533852,2.815861,-0.491769,-0.232733,1.098961,-0.329036,2.557483,-1.199582,0.998466,-1.331319,1.782689
2,-2.192398,-1.735968,2.031139,0.725424,-0.305053,-1.314722,-1.477061,3.391574,3.039285,-0.780832,0.132886,2.852817,4.059947,-0.46299,3.677201,-4.173188
3,-0.691028,-4.928038,-2.162437,1.651693,0.31429,2.053126,6.163571,1.326052,-2.152279,3.103169,-1.078835,-0.374443,1.33932,-0.335326,0.810662,-0.089223
4,2.373451,-0.442523,1.532078,-0.055638,2.619595,-2.02882,0.683255,-0.453235,0.937196,-2.464585,-0.722632,-2.191061,2.198349,2.634078,1.433194,2.380293


In [37]:
train_X_normal_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62209 entries, 0 to 62208
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   -1.280041182269348932e+00  62209 non-null  float64
 1   1.238382237576325373e+00   62209 non-null  float64
 2   2.661622440272609413e+00   62209 non-null  float64
 3   -2.942722974904070643e+00  62209 non-null  float64
 4   2.566506617912386323e-01   62209 non-null  float64
 5   -8.475581233028577355e-01  62209 non-null  float64
 6   -1.442717711274470460e-01  62209 non-null  float64
 7   -3.628990213276483523e+00  62209 non-null  float64
 8   -3.172231510441116953e-01  62209 non-null  float64
 9   2.649821696601030396e+00   62209 non-null  float64
 10  5.004708088939684885e+00   62209 non-null  float64
 11  -4.185131733644973373e+00  62209 non-null  float64
 12  -3.196605170193895784e-01  62209 non-null  float64
 13  -4.037816158333373817e-01  62209 non-null  flo

### Colour histogram

In [39]:
train_X_colour_hists.head(5)

Unnamed: 0,5.700000000000000000e+02,9.570000000000000000e+02,4.480000000000000000e+02,2.140000000000000000e+02,1.790000000000000000e+02,2.270000000000000000e+02,2.430000000000000000e+02,1.970000000000000000e+02,1.520000000000000000e+02,9.500000000000000000e+01,...,1.290000000000000000e+02,1.600000000000000000e+02,2.640000000000000000e+02,2.330000000000000000e+02,1.440000000000000000e+02,1.210000000000000000e+02,1.170000000000000000e+02,8.000000000000000000e+01,3.500000000000000000e+01,2.300000000000000000e+01
0,26.0,99.0,263.0,450.0,656.0,552.0,525.0,349.0,252.0,186.0,...,338.0,277.0,199.0,122.0,94.0,26.0,11.0,8.0,3.0,1.0
1,9.0,42.0,143.0,324.0,668.0,494.0,447.0,407.0,368.0,344.0,...,440.0,351.0,218.0,124.0,63.0,21.0,9.0,7.0,6.0,3.0
2,11.0,83.0,104.0,486.0,1003.0,545.0,724.0,191.0,199.0,88.0,...,871.0,427.0,578.0,292.0,75.0,102.0,81.0,30.0,33.0,10.0
3,4.0,37.0,69.0,112.0,159.0,101.0,145.0,187.0,707.0,1299.0,...,166.0,285.0,605.0,1011.0,822.0,192.0,39.0,14.0,8.0,5.0
4,622.0,1485.0,718.0,450.0,221.0,47.0,12.0,11.0,3.0,5.0,...,978.0,272.0,162.0,134.0,12.0,7.0,2.0,2.0,1.0,1.0


In [38]:
train_X_colour_hists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62209 entries, 0 to 62208
Data columns (total 48 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   5.700000000000000000e+02    62209 non-null  float64
 1   9.570000000000000000e+02    62209 non-null  float64
 2   4.480000000000000000e+02    62209 non-null  float64
 3   2.140000000000000000e+02    62209 non-null  float64
 4   1.790000000000000000e+02    62209 non-null  float64
 5   2.270000000000000000e+02    62209 non-null  float64
 6   2.430000000000000000e+02    62209 non-null  float64
 7   1.970000000000000000e+02    62209 non-null  float64
 8   1.520000000000000000e+02    62209 non-null  float64
 9   9.500000000000000000e+01    62209 non-null  float64
 10  8.700000000000000000e+01    62209 non-null  float64
 11  1.160000000000000000e+02    62209 non-null  float64
 12  5.500000000000000000e+01    62209 non-null  float64
 13  2.900000000000000000e+01    622

### Class ID

In [41]:
train_y.head(5)

Unnamed: 0,background
0,background
1,background
2,background
3,background
4,background


In [40]:
train_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62209 entries, 0 to 62208
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   background  62209 non-null  object
dtypes: object(1)
memory usage: 486.1+ KB


# Part 4: Prepare inputs

# Parts 5/6: Select & train classification models

# Part 7: Evaluating & comparing models performance 

# Part 8: Critical discussion of the results, approach and methods