# Pattern Recognition pipeline
The entire pipeline should be runnable from here.

### Genes data
**Raw genes data**
- Data is expected to be in *raw_data/genes/data.csv*
- Labels are expected to be in *raw_data/genes/labels.csv*

### Image data
**Raw image data**
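- Animal images are expected to be in *raw_data/BigCats/[Animal]/*; note that the Load Data cell points the second loader (`getImageData`, whose output feeds K-Means) at *../Data-211216/Data/BigCats/* instead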

### Feature selection
- Feature selection functions can be found in *feature_selection/[function].py*

### Classification
- Classification functions can be found in *classification/[function].py*

### Clustering
- Clustering functions can be found in *clustering/[function].py*

# Code


In [1]:
## Import ALL required imports
# Every name used by the cells below is expected to be imported in this one cell.

# Ignore SKlearn's deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Data imports (project-local loaders)
from raw_data.data_functions import load_num_data, load_img_data
from raw_data.data_load_MI_SVM import getImageData

# Feature selection imports
from feature_selection.pca import pca
from feature_selection.MI import MI_feature_select
from feature_selection.fourier_transform import ft_on_img_data
from feature_selection.edge_detection import edge_detection

# Classification imports
from classification.knn import knn, knn_gridsearch, knn_cross_validation
from classification.svm import svm_genes
from classification.decision_tree import decision_tree, cross_val_decision_tree

# Clustering imports
from clustering.kmeans import kmeans_train
# NOTE(review): the import below is disabled, yet the Fuzzy C-Means cell calls
# fuzzy_c_means(...). As far as this notebook shows, that cell fails with
# NameError on a fresh kernel unless it imports the name itself or this line
# is re-enabled.
# from clustering.fuzzy_c_means import fuzzy_c_means

# Misc imports
from sklearn.model_selection import train_test_split

## Load Data

In [2]:
# Genes data: feature table plus labels.
print("Loading num data...")
num_data, num_labels = load_num_data()

# Image data used by the k-NN and feature-extraction cells.
print("Loading img data...")
img_data, img_labels = load_img_data()

# Second image loader; its output is what the K-Means cell consumes.
# It is pointed at its own directory rather than at raw_data/.
print("Loading img2 data...")
img_data_k, img_labels_k = getImageData(
    "../Data-211216/Data/BigCats/"
)

Loading num data...
Loading img data...
Loading img2 data...
data load succ


## Feature Extraction

### PCA

In [3]:
# Run the project's PCA helper on every column of num_data except the first
# (presumably the first column is not a feature — TODO confirm).
pca_num_data = pca(num_data.iloc[:, 1:])

### MI

In [4]:

# Mutual-information feature selection on the genes data with threshold 0.5.
mi_num_data = MI_feature_select(num_data, num_labels, thre=0.5)

feature selection start
Feature drop


  y = column_or_1d(y, warn=True)


479 features whose MI score over threshold are selected from 20531 features
feature selection end


### SIFT

### Fourier

In [15]:
# Build the Fourier-based image features with the project's ft_on_img_data
# helper; the result feeds the "Fourier" k-NN cells and Fuzzy C-Means below.
ft_img_data = ft_on_img_data(img_data)

### Contour Images

In [3]:
# Run the project's edge_detection helper on the raw image data.
# NOTE(review): edge_img_data is not consumed by any later cell visible in
# this notebook.
edge_img_data = edge_detection(img_data)

## Classification

### k-NN - Genes - 80/20 split
**Raw data (best k = 15 [mean train score 0.998090, mean test score 0.998437])**

In [6]:
# Stratified 80/20 train/test split of the raw genes data (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    num_data,
    num_labels,
    test_size=0.2,
    random_state=42,
    stratify=num_labels,
)

# Grid-search k over the odd values 1, 3, ..., 19 on the training part only.
# y_train[1] takes the entry keyed 1 of the labels (presumably the class
# column — TODO confirm).
knn_gridsearch(x_train, y_train[1], list(range(1, 21, 2)))

# Disabled follow-up: fit with the chosen k and score on the held-out part.
# NOTE: k=5 below does not match the "best k = 15" recorded in the heading.
# acc, f1, pred_labels = knn(x_train, y_train[1], x_test, y_test[1], k=5)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
0,1,1.0,0.998437,3.116707,0.205894,0.487645,0.015434,{'n_neighbors': 1},1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,3,0.999826,0.998437,3.221874,0.226822,0.518735,0.040854,{'n_neighbors': 3},1.0,1.0,...,1.0,1.0,1.0,0.998264,1.0,1.0,1.0,1.0,1.0,0.000521
2,5,0.999826,0.998437,3.275557,0.138798,0.516436,0.025425,{'n_neighbors': 5},1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000521
3,7,0.999479,0.998437,3.478792,0.097929,0.560121,0.029238,{'n_neighbors': 7},1.0,1.0,...,0.998264,1.0,1.0,0.998264,1.0,1.0,1.0,1.0,1.0,0.000796
4,9,0.998437,0.998437,3.355431,0.139151,0.537029,0.036287,{'n_neighbors': 9},1.0,1.0,...,0.998264,0.998264,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.000521
5,11,0.999132,0.998437,3.489189,0.499169,0.539128,0.054879,{'n_neighbors': 11},1.0,1.0,...,0.998264,0.998264,1.0,0.998264,1.0,1.0,1.0,0.998264,1.0,0.000868
7,15,0.99809,0.998437,3.601953,0.501884,0.560422,0.068605,{'n_neighbors': 15},1.0,1.0,...,0.996528,0.998264,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.996528,0.000935
6,13,0.998437,0.996875,3.259862,0.134616,0.53373,0.018484,{'n_neighbors': 13},1.0,1.0,...,0.998264,0.998264,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.000521
8,17,0.998437,0.996875,3.329639,0.213463,0.543327,0.091084,{'n_neighbors': 17},1.0,1.0,...,0.998264,0.998264,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.000521
9,19,0.998437,0.996875,3.165292,0.154157,0.520335,0.031321,{'n_neighbors': 19},1.0,1.0,...,0.998264,0.998264,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.000521


**PCA (best k = 5 [mean train score 0.998437, mean test score 0.998437])**

In [5]:
# Stratified 80/20 train/test split of the PCA-reduced genes data (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    pca_num_data,
    num_labels,
    test_size=0.2,
    random_state=42,
    stratify=num_labels,
)

# Grid-search k over the odd values 1, 3, ..., 19 on the training part only.
knn_gridsearch(x_train, y_train[1], list(range(1, 21, 2)))

# Disabled follow-up: fit with the chosen k and score on the held-out part.
# NOTE: k=7 below does not match the "best k = 5" recorded in the heading.
# acc, f1, pred_labels = knn(x_train, y_train[1], x_test, y_test[1], k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
1,3,0.999826,0.998437,0.010297,0.0009,0.010697,0.000458,{'n_neighbors': 3},1.0,1.0,...,1.0,1.0,1.0,0.998264,1.0,1.0,1.0,1.0,1.0,0.000521
2,5,0.998437,0.998437,0.010897,0.0017,0.011196,0.001248,{'n_neighbors': 5},1.0,1.0,...,0.998264,0.998264,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.000521
0,1,1.0,0.996875,0.013602,0.007062,0.015389,0.009383,{'n_neighbors': 1},0.984375,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,11,0.997222,0.996875,0.010997,0.001095,0.012296,0.001269,{'n_neighbors': 11},1.0,1.0,...,0.996528,0.996528,0.998264,0.996528,0.998264,0.998264,0.996528,0.996528,0.996528,0.000851
6,13,0.996701,0.996875,0.011497,0.001359,0.012996,0.001548,{'n_neighbors': 13},1.0,1.0,...,0.996528,0.996528,0.996528,0.996528,0.998264,0.996528,0.996528,0.996528,0.996528,0.000521
7,15,0.996701,0.996875,0.011197,0.001249,0.013096,0.00192,{'n_neighbors': 15},1.0,1.0,...,0.996528,0.996528,0.996528,0.996528,0.998264,0.998264,0.996528,0.994792,0.996528,0.000935
8,17,0.996875,0.996875,0.010697,0.0011,0.011796,0.000748,{'n_neighbors': 17},1.0,1.0,...,0.996528,0.996528,0.996528,0.996528,0.998264,0.998264,0.996528,0.996528,0.996528,0.000694
9,19,0.996354,0.996875,0.011696,0.001005,0.012797,0.000749,{'n_neighbors': 19},1.0,1.0,...,0.996528,0.996528,0.996528,0.996528,0.996528,0.996528,0.996528,0.996528,0.994792,0.000521
3,7,0.997917,0.995313,0.010097,0.000943,0.011896,0.0013,{'n_neighbors': 7},0.984375,1.0,...,0.998264,0.998264,0.998264,0.994792,0.998264,0.998264,0.998264,0.998264,0.998264,0.001042
4,9,0.996875,0.995313,0.012396,0.001743,0.013495,0.002246,{'n_neighbors': 9},0.984375,1.0,...,0.998264,0.996528,0.998264,0.996528,0.996528,0.996528,0.996528,0.996528,0.994792,0.001042


**Mutual Information (best k = 5 [mean train score 0.998437, mean test score 0.998437])**

In [7]:
# Stratified 80/20 train/test split of the MI-selected genes data (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    mi_num_data,
    num_labels,
    test_size=0.2,
    random_state=42,
    stratify=num_labels,
)

# Grid-search k over the odd values 1, 3, ..., 19 on the training part only.
knn_gridsearch(x_train, y_train[1], list(range(1, 21, 2)))

# Disabled follow-up: fit with the chosen k and score on the held-out part.
# NOTE: k=7 below does not match the "best k = 5" recorded in the heading.
# acc, f1, pred_labels = knn(x_train, y_train[1], x_test, y_test[1], k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
0,1,1.0,0.998437,0.078874,0.005681,0.016495,0.001118,{'n_neighbors': 1},1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,3,0.998785,0.998437,0.076375,0.004714,0.015895,0.001043,{'n_neighbors': 3},1.0,1.0,...,0.998264,1.0,1.0,0.998264,0.998264,1.0,0.998264,0.998264,0.998264,0.000796
2,5,0.998437,0.998437,0.076176,0.005191,0.016495,0.001118,{'n_neighbors': 5},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
3,7,0.998437,0.998437,0.077775,0.006143,0.016295,0.001187,{'n_neighbors': 7},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
4,9,0.998437,0.998437,0.075376,0.005218,0.016495,0.001359,{'n_neighbors': 9},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
5,11,0.998437,0.998437,0.075575,0.00427,0.015695,0.00064,{'n_neighbors': 11},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
6,13,0.998437,0.998437,0.076375,0.004629,0.016595,0.001113,{'n_neighbors': 13},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
7,15,0.998437,0.998437,0.077775,0.006733,0.016694,0.001417,{'n_neighbors': 15},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
8,17,0.998437,0.998437,0.077175,0.006445,0.016194,0.001077,{'n_neighbors': 17},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521
9,19,0.998437,0.998437,0.073876,0.001758,0.015395,0.000663,{'n_neighbors': 19},1.0,1.0,...,0.998264,1.0,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.998264,0.000521


### k-NN - Genes - Cross Validation
**Raw data (best k = 3, 5, 7, 9 [mean train score 0.998752, mean test score 0.998750])**

In [8]:
# Grid-search k over the odd values 1, 3, ..., 19 on the complete raw genes data.
knn_gridsearch(num_data, num_labels[1], list(range(1, 21, 2)))

# Disabled follow-up: cross-validated evaluation with the chosen k.
# acc, f1 = knn_cross_validation(num_data, num_labels[1], k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
0,1,1.0,0.99875,3.825881,0.113776,0.631999,0.058487,{'n_neighbors': 1},1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,3,0.998752,0.99875,4.107092,0.119092,0.647494,0.023441,{'n_neighbors': 3},1.0,1.0,...,0.998613,0.998613,0.998613,0.998613,1.0,0.998613,0.998613,0.998613,0.998613,0.000416
2,5,0.998752,0.99875,3.846475,0.085008,0.603008,0.025455,{'n_neighbors': 5},1.0,1.0,...,0.998613,0.998613,0.998613,0.998613,1.0,0.998613,0.998613,0.998613,0.998613,0.000416
3,7,0.998752,0.99875,3.900857,0.109242,0.617904,0.032141,{'n_neighbors': 7},1.0,1.0,...,0.998613,0.998613,0.998613,0.998613,1.0,0.998613,0.998613,0.998613,0.998613,0.000416
4,9,0.998752,0.99875,4.027417,0.171806,0.610106,0.048673,{'n_neighbors': 9},1.0,1.0,...,0.998613,0.998613,0.998613,0.998613,1.0,0.998613,0.998613,0.998613,0.998613,0.000416
5,11,0.998335,0.9975,3.857471,0.119032,0.634398,0.052177,{'n_neighbors': 11},1.0,1.0,...,0.997226,0.998613,0.998613,0.997226,1.0,0.998613,0.998613,0.997226,0.998613,0.000832
6,13,0.998474,0.9975,3.900657,0.133504,0.627301,0.013241,{'n_neighbors': 13},1.0,1.0,...,0.997226,0.998613,0.998613,0.998613,1.0,0.998613,0.998613,0.997226,0.998613,0.000747
7,15,0.998058,0.9975,3.772199,0.113435,0.59831,0.019411,{'n_neighbors': 15},1.0,1.0,...,0.997226,0.997226,0.998613,0.997226,1.0,0.997226,0.998613,0.997226,0.998613,0.00092
8,17,0.998335,0.99625,3.739209,0.089963,0.593411,0.040772,{'n_neighbors': 17},1.0,1.0,...,0.997226,0.997226,0.998613,0.997226,1.0,0.998613,0.998613,0.998613,0.998613,0.000832
9,19,0.997642,0.99625,3.907655,0.168925,0.599809,0.037628,{'n_neighbors': 19},1.0,1.0,...,0.997226,0.997226,0.998613,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000888


**PCA (best k = 3 [mean train score 0.998613, mean test score 0.997500])**

In [9]:
# Grid-search k over the odd values 1, 3, ..., 19 on the complete PCA-reduced data.
knn_gridsearch(pca_num_data, num_labels[1], list(range(1, 21, 2)))

# Disabled follow-up: cross-validated evaluation with the chosen k.
# NOTE: k=7 below does not match the "best k = 3" recorded in the heading.
# acc, f1 = knn_cross_validation(pca_num_data, num_labels[1], k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
1,3,0.998613,0.9975,0.011197,0.000979,0.012696,0.0009,{'n_neighbors': 3},1.0,1.0,...,0.998613,0.998613,0.998613,0.998613,1.0,0.998613,0.998613,0.998613,0.998613,0.000621
0,1,1.0,0.99625,0.011996,0.001095,0.012796,0.001249,{'n_neighbors': 1},1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,7,0.997226,0.99625,0.012196,0.001077,0.013795,0.0014,{'n_neighbors': 7},1.0,1.0,...,0.995839,0.997226,0.997226,0.995839,1.0,0.997226,0.997226,0.997226,0.997226,0.001074
7,15,0.996532,0.99625,0.012696,0.001187,0.015295,0.001676,{'n_neighbors': 15},1.0,1.0,...,0.995839,0.995839,0.995839,0.995839,0.998613,0.997226,0.994452,0.997226,0.997226,0.001118
2,5,0.997087,0.995,0.011696,0.000781,0.013595,0.001113,{'n_neighbors': 5},1.0,0.9875,...,0.994452,0.997226,0.997226,0.997226,0.998613,0.997226,0.997226,0.997226,0.997226,0.000971
4,9,0.997364,0.995,0.012096,0.0013,0.013595,0.00128,{'n_neighbors': 9},1.0,0.9875,...,0.995839,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000971
5,11,0.997226,0.995,0.014395,0.003006,0.020193,0.015133,{'n_neighbors': 11},1.0,0.9875,...,0.995839,0.997226,0.997226,0.997226,1.0,0.997226,0.995839,0.997226,0.997226,0.001074
6,13,0.996394,0.995,0.015895,0.0075,0.017194,0.003428,{'n_neighbors': 13},1.0,0.9875,...,0.995839,0.994452,0.995839,0.995839,0.998613,0.997226,0.994452,0.997226,0.997226,0.001271
8,17,0.995977,0.995,0.013296,0.0011,0.015695,0.000781,{'n_neighbors': 17},1.0,1.0,...,0.995839,0.995839,0.997226,0.995839,0.998613,0.994452,0.994452,0.995839,0.995839,0.001152
9,19,0.995145,0.995,0.014296,0.002864,0.015395,0.002058,{'n_neighbors': 19},1.0,1.0,...,0.994452,0.995839,0.995839,0.994452,0.997226,0.994452,0.994452,0.994452,0.995839,0.000931


**Mutual Information (best k = 5-19 [mean train score 0.997503, mean test score 0.997500])**

In [10]:
# Grid-search k over the odd values 1, 3, ..., 19 on the complete MI-selected data.
knn_gridsearch(mi_num_data, num_labels[1], list(range(1, 21, 2)))

# Disabled follow-up: cross-validated evaluation with the chosen k.
# (The earlier draft of this line passed num_data; the MI section should
# evaluate mi_num_data.)
# acc, f1 = knn_cross_validation(mi_num_data, num_labels[1], k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
0,1,1.0,0.9975,0.101168,0.009461,0.019993,0.001341,{'n_neighbors': 1},1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,3,0.99778,0.9975,0.091771,0.006522,0.018694,0.001615,{'n_neighbors': 3},1.0,1.0,...,0.997226,0.997226,0.997226,0.998613,1.0,0.997226,0.998613,0.997226,0.997226,0.00092
2,5,0.997503,0.9975,0.09437,0.006871,0.019994,0.001183,{'n_neighbors': 5},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
3,7,0.997503,0.9975,0.091271,0.007415,0.019194,0.001399,{'n_neighbors': 7},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
4,9,0.997503,0.9975,0.088372,0.005498,0.018394,0.001199,{'n_neighbors': 9},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
5,11,0.997503,0.9975,0.088971,0.006571,0.018594,0.00128,{'n_neighbors': 11},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
6,13,0.997503,0.9975,0.096069,0.007406,0.020593,0.0018,{'n_neighbors': 13},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
7,15,0.997503,0.9975,0.091571,0.006958,0.019094,0.001374,{'n_neighbors': 15},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
8,17,0.997503,0.9975,0.09267,0.007482,0.019394,0.001854,{'n_neighbors': 17},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832
9,19,0.997503,0.9975,0.09317,0.00869,0.019294,0.001552,{'n_neighbors': 19},1.0,1.0,...,0.997226,0.997226,0.997226,0.997226,1.0,0.997226,0.997226,0.997226,0.997226,0.000832


### k-NN - BigCats - 80/20 split
**Raw data (best k = 3 [mean train score 0.520405, mean test score 0.235714])**

In [13]:
# Stratified 80/20 train/test split of the raw image data (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    img_data,
    img_labels,
    test_size=0.2,
    random_state=42,
    stratify=img_labels,
)

# Grid-search k over the odd values 1, 3, ..., 19 on the training part only.
# The image labels are passed as-is (no [1] indexing, unlike the genes cells).
knn_gridsearch(x_train, y_train, list(range(1, 21, 2)))

# Disabled follow-up: fit with the chosen k and score on the held-out part.
# NOTE: k=5 below does not match the "best k = 3" recorded in the heading.
# acc, f1, pred_labels = knn(x_train, y_train, x_test, y_test, k=5)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
1,3,0.520405,0.235714,0.140355,0.00642,0.146554,0.006198,{'n_neighbors': 3},0.214286,0.071429,...,0.54918,0.557377,0.5,0.54918,0.52459,0.536585,0.520325,0.544715,0.504065,0.038803
4,9,0.360323,0.228571,0.140555,0.003877,0.149453,0.008379,{'n_neighbors': 9},0.285714,0.071429,...,0.368852,0.418033,0.352459,0.393443,0.385246,0.357724,0.365854,0.382114,0.300813,0.039876
6,13,0.330868,0.192857,0.143954,0.006146,0.149053,0.004368,{'n_neighbors': 13},0.142857,0.071429,...,0.303279,0.409836,0.336066,0.327869,0.295082,0.365854,0.300813,0.365854,0.308943,0.036642
2,5,0.396242,0.192308,0.139556,0.006018,0.144354,0.00269,{'n_neighbors': 5},0.285714,0.071429,...,0.418033,0.434426,0.368852,0.418033,0.377049,0.390244,0.373984,0.398374,0.422764,0.02456
0,1,0.980388,0.191758,0.142555,0.011017,0.148353,0.007253,{'n_neighbors': 1},0.142857,0.142857,...,0.983607,0.97541,0.97541,0.991803,0.97541,0.98374,0.98374,0.97561,0.98374,0.005444
8,17,0.300646,0.182967,0.143554,0.006709,0.152052,0.006889,{'n_neighbors': 17},0.214286,0.142857,...,0.278689,0.336066,0.286885,0.360656,0.278689,0.268293,0.260163,0.365854,0.317073,0.039222
5,11,0.332474,0.176923,0.140955,0.003032,0.153652,0.014075,{'n_neighbors': 11},0.142857,0.071429,...,0.327869,0.368852,0.254098,0.344262,0.336066,0.325203,0.333333,0.406504,0.317073,0.037217
7,15,0.307224,0.176374,0.139355,0.002199,0.152952,0.010589,{'n_neighbors': 15},0.142857,0.142857,...,0.286885,0.311475,0.286885,0.368852,0.311475,0.292683,0.268293,0.317073,0.308943,0.025775
3,7,0.397061,0.17033,0.151252,0.021376,0.151152,0.009365,{'n_neighbors': 7},0.142857,0.0,...,0.393443,0.418033,0.377049,0.434426,0.409836,0.406504,0.398374,0.422764,0.357724,0.025886
9,19,0.300673,0.168132,0.140355,0.003954,0.151052,0.003832,{'n_neighbors': 19},0.214286,0.0,...,0.327869,0.295082,0.295082,0.344262,0.286885,0.284553,0.276423,0.317073,0.300813,0.021196


**SIFT (best k = TBD — no SIFT features are computed in this notebook yet)**

**Fourier (best k = 17 [mean train score 0.361922, mean test score 0.300549])**

In [16]:
# Stratified 80/20 train/test split of the Fourier image features (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    ft_img_data,
    img_labels,
    test_size=0.2,
    random_state=42,
    stratify=img_labels,
)

# Grid-search k over the odd values 1, 3, ..., 19 on the training part only.
knn_gridsearch(x_train, y_train, list(range(1, 21, 2)))

# Disabled follow-up: fit with the chosen k and score on the held-out part.
# NOTE: k=5 below does not match the "best k = 17" recorded in the heading.
# acc, f1, pred_labels = knn(x_train, y_train, x_test, y_test, k=5)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
8,17,0.361922,0.300549,0.15645,0.018784,0.151552,0.028315,{'n_neighbors': 17},0.357143,0.285714,...,0.344262,0.393443,0.401639,0.344262,0.336066,0.365854,0.390244,0.365854,0.333333,0.024136
0,1,0.980388,0.3,0.150152,0.019958,0.144554,0.011897,{'n_neighbors': 1},0.428571,0.214286,...,0.983607,0.97541,0.97541,0.991803,0.97541,0.98374,0.98374,0.97561,0.98374,0.005444
4,9,0.446901,0.293407,0.150153,0.00802,0.142655,0.008623,{'n_neighbors': 9},0.428571,0.285714,...,0.442623,0.467213,0.491803,0.409836,0.442623,0.447154,0.447154,0.439024,0.447154,0.020142
9,19,0.370925,0.286264,0.157051,0.018289,0.15765,0.027581,{'n_neighbors': 19},0.357143,0.285714,...,0.377049,0.401639,0.377049,0.377049,0.344262,0.382114,0.390244,0.349593,0.349593,0.018113
7,15,0.334986,0.285714,0.138656,0.007319,0.137456,0.009241,{'n_neighbors': 15},0.357143,0.285714,...,0.327869,0.368852,0.344262,0.311475,0.344262,0.341463,0.357724,0.325203,0.292683,0.020899
1,3,0.611942,0.274176,0.148353,0.007836,0.146354,0.01365,{'n_neighbors': 3},0.428571,0.285714,...,0.598361,0.614754,0.647541,0.57377,0.614754,0.626016,0.601626,0.617886,0.585366,0.021729
6,13,0.367593,0.271978,0.139156,0.006952,0.131858,0.002299,{'n_neighbors': 13},0.357143,0.214286,...,0.352459,0.401639,0.368852,0.327869,0.327869,0.390244,0.406504,0.382114,0.357724,0.026324
3,7,0.459123,0.271429,0.145054,0.005716,0.143655,0.009442,{'n_neighbors': 7},0.285714,0.285714,...,0.45082,0.434426,0.459016,0.467213,0.467213,0.479675,0.479675,0.422764,0.487805,0.020178
2,5,0.541617,0.265934,0.144454,0.003201,0.140956,0.005863,{'n_neighbors': 5},0.214286,0.142857,...,0.54918,0.540984,0.557377,0.508197,0.52459,0.593496,0.536585,0.536585,0.560976,0.024386
5,11,0.406857,0.257143,0.148253,0.008244,0.141755,0.007881,{'n_neighbors': 11},0.357143,0.214286,...,0.42623,0.368852,0.442623,0.418033,0.385246,0.439024,0.398374,0.398374,0.398374,0.02263


### k-NN - BigCats - Cross Validation
**Raw data (best k = 7 [mean train score 0.384967, mean test score 0.247059])**

In [17]:
# Grid-search k over the odd values 1, 3, ..., 19 on the complete raw image data.
knn_gridsearch(img_data, img_labels, list(range(1, 21, 2)))

# Disabled follow-up: cross-validated evaluation with the chosen k.
# acc, f1 = knn_cross_validation(img_data, img_labels, k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
3,7,0.384967,0.247059,0.149253,0.011676,0.151152,0.005911,{'n_neighbors': 7},0.294118,0.294118,...,0.392157,0.346405,0.424837,0.346405,0.366013,0.372549,0.418301,0.379085,0.398693,0.026217
2,5,0.461438,0.235294,0.161848,0.033436,0.175144,0.028622,{'n_neighbors': 5},0.235294,0.294118,...,0.444444,0.503268,0.464052,0.418301,0.470588,0.431373,0.477124,0.503268,0.509804,0.037455
1,3,0.510458,0.223529,0.152252,0.018606,0.174144,0.021969,{'n_neighbors': 3},0.294118,0.176471,...,0.496732,0.509804,0.51634,0.496732,0.51634,0.496732,0.529412,0.509804,0.529412,0.011855
4,9,0.376471,0.223529,0.147053,0.01268,0.151452,0.005461,{'n_neighbors': 9},0.352941,0.352941,...,0.366013,0.352941,0.437908,0.366013,0.352941,0.372549,0.385621,0.366013,0.379085,0.023237
5,11,0.370588,0.223529,0.143354,0.011125,0.151452,0.005141,{'n_neighbors': 11},0.352941,0.294118,...,0.346405,0.411765,0.366013,0.392157,0.339869,0.346405,0.379085,0.359477,0.372549,0.02227
6,13,0.343791,0.217647,0.140555,0.004736,0.152651,0.005966,{'n_neighbors': 13},0.294118,0.294118,...,0.313725,0.392157,0.372549,0.333333,0.313725,0.352941,0.366013,0.359477,0.326797,0.027451
7,15,0.335948,0.217647,0.142055,0.004741,0.154051,0.010725,{'n_neighbors': 15},0.411765,0.235294,...,0.313725,0.372549,0.339869,0.326797,0.339869,0.320261,0.366013,0.366013,0.300654,0.023961
8,17,0.342484,0.211765,0.144354,0.006451,0.15855,0.013135,{'n_neighbors': 17},0.176471,0.235294,...,0.287582,0.339869,0.405229,0.320261,0.333333,0.333333,0.352941,0.352941,0.346405,0.028218
9,19,0.343137,0.211765,0.140555,0.00361,0.152652,0.007333,{'n_neighbors': 19},0.411765,0.176471,...,0.300654,0.385621,0.359477,0.359477,0.294118,0.326797,0.352941,0.339869,0.320261,0.031379
0,1,0.972549,0.205882,0.148053,0.013296,0.15555,0.011408,{'n_neighbors': 1},0.176471,0.235294,...,0.980392,0.973856,0.986928,0.96732,0.96732,0.96732,0.96732,0.980392,0.96732,0.007039


**SIFT (best k = TBD — no SIFT features are computed in this notebook yet)**

**Fourier (best k = 13 [mean train score 0.425490, mean test score 0.382353])**

In [18]:
# Grid-search k over the odd values 1, 3, ..., 19 on the complete Fourier features.
knn_gridsearch(ft_img_data, img_labels, list(range(1, 21, 2)))

# Disabled follow-up: cross-validated evaluation with the chosen k.
# NOTE: k=7 below does not match the "best k = 13" recorded in the heading.
# acc, f1 = knn_cross_validation(ft_img_data, img_labels, k=7)

Unnamed: 0,param_n_neighbors,mean_train_score,mean_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,...,split1_train_score,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,std_train_score
6,13,0.42549,0.382353,0.152852,0.013099,0.148053,0.006976,{'n_neighbors': 13},0.470588,0.235294,...,0.437908,0.437908,0.418301,0.418301,0.431373,0.418301,0.405229,0.437908,0.431373,0.01072
4,9,0.473203,0.364706,0.147154,0.004307,0.144954,0.009463,{'n_neighbors': 9},0.411765,0.294118,...,0.457516,0.464052,0.437908,0.496732,0.464052,0.503268,0.457516,0.477124,0.529412,0.027138
7,15,0.418954,0.364706,0.158649,0.011239,0.149053,0.007117,{'n_neighbors': 15},0.470588,0.235294,...,0.405229,0.431373,0.411765,0.437908,0.385621,0.424837,0.385621,0.45098,0.437908,0.020966
8,17,0.410458,0.358824,0.155451,0.006958,0.154451,0.008013,{'n_neighbors': 17},0.352941,0.176471,...,0.424837,0.405229,0.411765,0.411765,0.398693,0.405229,0.392157,0.431373,0.411765,0.010858
9,19,0.400654,0.358824,0.153252,0.007278,0.148653,0.011641,{'n_neighbors': 19},0.411765,0.235294,...,0.418301,0.392157,0.398693,0.411765,0.398693,0.392157,0.379085,0.398693,0.411765,0.010956
3,7,0.504575,0.352941,0.147754,0.006568,0.146753,0.009618,{'n_neighbors': 7},0.294118,0.235294,...,0.542484,0.51634,0.470588,0.496732,0.45098,0.522876,0.470588,0.529412,0.535948,0.029636
0,1,0.972549,0.335294,0.147156,0.011482,0.146852,0.015977,{'n_neighbors': 1},0.235294,0.411765,...,0.980392,0.973856,0.986928,0.96732,0.96732,0.96732,0.96732,0.980392,0.96732,0.007039
2,5,0.552288,0.335294,0.143555,0.004385,0.141655,0.007705,{'n_neighbors': 5},0.411765,0.294118,...,0.581699,0.555556,0.529412,0.54902,0.509804,0.542484,0.575163,0.562092,0.575163,0.02133
5,11,0.464052,0.329412,0.15845,0.015984,0.144654,0.011337,{'n_neighbors': 11},0.411765,0.176471,...,0.464052,0.457516,0.470588,0.477124,0.464052,0.464052,0.444444,0.45098,0.490196,0.012401
1,3,0.581046,0.288235,0.147254,0.005254,0.140955,0.0054,{'n_neighbors': 3},0.235294,0.235294,...,0.581699,0.555556,0.542484,0.601307,0.601307,0.562092,0.555556,0.607843,0.607843,0.023647


### Naive Bayes

#### 80/20 split

#### K-fold

### Decision Trees

#### 80/20 split

In [7]:
# Evaluate the decision tree on each genes representation in turn.
# A single loop replaces three copy-pasted stanzas; the printed text and the
# final value left in `accuracy` (the MI run) are the same as before.
for genes_variant, genes_features in (
    ("raw", num_data),
    ("PCA", pca_num_data),
    ("MI", mi_num_data),
):
    print(f"using decision tree classifier on genes {genes_variant}")
    accuracy = decision_tree(genes_features, num_labels)
    print(f"test accuracy: {accuracy}")



using decision tree classifier on genes raw
test accuracy: 0.9875776397515528
using decision tree classifier on genes PCA
test accuracy: 0.906832298136646
using decision tree classifier on genes MI
test accuracy: 0.9875776397515528


#### K-fold

In [8]:
# Number of cross-validation folds.
k = 5

# Cross-validate the decision tree on each genes representation in turn.
# A single loop replaces three copy-pasted stanzas; the printed text is
# unchanged. `accuracy` is whatever cross_val_decision_tree returns; only its
# .mean() and .std() are used here.
for genes_variant, genes_features in (
    ("raw", num_data),
    ("PCA", pca_num_data),
    ("MI", mi_num_data),
):
    print(f"using decision tree classifier on genes {genes_variant} with {k}-fold")
    accuracy = cross_val_decision_tree(genes_features, num_labels, cv=k)
    print("%0.2f test accuracy with a standard deviation of %0.2f" % (accuracy.mean(), accuracy.std()))

using decision tree classifier on genes raw with 5-fold
0.96 test accuracy with a standard deviation of 0.02
using decision tree classifier on genes PCA with 5-fold
0.92 test accuracy with a standard deviation of 0.01
using decision tree classifier on genes MI with 5-fold
0.98 test accuracy with a standard deviation of 0.01


### Logistic Regression

#### 80/20 split

#### K-fold

### SVM

#### 80/20 split

In [7]:
# Train and evaluate the project's SVM helper on the MI-selected genes data.
# NOTE(review): the recorded output mentions "new model saved", so the helper
# apparently persists a model as a side effect — confirm where it writes.
svm_genes(mi_num_data, num_labels)

svm train start
new model saved
accuracy on the training subset:1.000
accuracy on the test subset:0.994


  y = column_or_1d(y, warn=True)


#### K-fold

## Clustering

### K-Means

In [3]:
# Cluster the images returned by getImageData (the second loader in the
# Load Data cell) with the project's kmeans_train helper.
kmeans_train(img_data_k, img_labels_k)

score: -680557997.8520824
MI_evaluate_score_of_test: 0.22804604671270068
----------------k= 0 ----------------
['Tiger' 'Cheetah' 'Tiger' 'Tiger' 'Tiger' 'Lion' 'Tiger' 'Cheetah'
 'Jaguar' 'Cheetah' 'Cheetah' 'Lion' 'Tiger' 'Tiger' 'Leopard' 'Lion']
----------------k= 1 ----------------
['Jaguar' 'Jaguar' 'Jaguar' 'Tiger' 'Cheetah' 'Cheetah' 'Tiger' 'Leopard'
 'Cheetah' 'Cheetah' 'Jaguar' 'Lion' 'Lion' 'Cheetah' 'Jaguar' 'Leopard'
 'Lion' 'Leopard' 'Jaguar' 'Jaguar' 'Cheetah' 'Lion' 'Tiger' 'Jaguar'
 'Leopard']
----------------k= 2 ----------------
['Leopard' 'Lion' 'Tiger' 'Tiger' 'Tiger' 'Lion' 'Cheetah' 'Leopard'
 'Lion' 'Lion' 'Jaguar']
----------------k= 3 ----------------
['Lion' 'Tiger' 'Lion' 'Tiger' 'Cheetah' 'Tiger' 'Leopard' 'Jaguar'
 'Leopard' 'Tiger' 'Cheetah' 'Lion' 'Jaguar' 'Lion' 'Cheetah' 'Cheetah'
 'Leopard' 'Lion' 'Leopard' 'Lion' 'Leopard' 'Lion' 'Leopard' 'Tiger'
 'Tiger']
----------------k= 4 ----------------
['Lion' 'Jaguar' 'Cheetah' 'Tiger' 'Lion' 'Cheetah' 'Ch

### Fuzzy C-Means

In [10]:
# The notebook's import cell has this import commented out, so bring the
# function into scope here; without it this cell raises NameError on a
# fresh kernel (Restart & Run All).
from clustering.fuzzy_c_means import fuzzy_c_means

# Run Fuzzy C-Means on every data representation, printing a banner before
# each run. The second argument (3) is passed through unchanged from the
# original calls.
# NOTE(review): the recorded output shows the same partition coefficient
# (~0.04) and entropy (~0.464) for all five inputs, which is worth checking.
for banner, dataset in (
    ("RAW NUM", num_data),
    ("PCA NUM", pca_num_data),
    ("MI NUM", mi_num_data),
    ("RAW IMG", img_data),
    ("FT IMG", ft_img_data),
):
    print(banner)
    fuzzy_c_means(dataset, 3)



RAW NUM
fcm.partition_coefficient=0.040000000000048164
fcm.partition_entropy_coefficient=0.46438561897729874
PCA NUM
fcm.partition_coefficient=0.040000000000009125
fcm.partition_entropy_coefficient=0.4643856189774396
MI NUM
fcm.partition_coefficient=0.040000000000133616
fcm.partition_entropy_coefficient=0.46438561897699054
RAW IMG
fcm.partition_coefficient=0.04000000000005647
fcm.partition_entropy_coefficient=0.4643856189772687
FT IMG
fcm.partition_coefficient=0.040000000000025245
fcm.partition_entropy_coefficient=0.46438561897738134


-----------------------------------------------------
*end*