# Import modules

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import ipywidgets as widgets
%matplotlib inline
from IPython.display import HTML
import cv2
from pathlib import Path
import pandas as pd
from scipy.special import softmax
from scipy.spatial.distance import cdist
HTML('../style/course.css') #apply general CSS

%load_ext autoreload
%autoreload 2

In [2]:
from utils import (imshow, hsv_inrange_mask, crop_image, get_edges_mask, get_files_dict_list, filter_data, select_top_by_class,
                    histogram_distances, smallest_distance_label, confusion_matrix_df, create_evaluation_df, largest_occupancy_label, 
                    focal_spread_mask, mlib_rgb_to_cv2_rgb, hsv_percentile_mask, distances_dict_to_df, print_execution_time,
                    ColumnNames,IntermediateColumnNames, plot_hue_sat_val_histograms, plot_images)
from predictor import ColorPredictor

In [3]:
import importlib
import utils
import predictor

# Reload the local modules so that edits made to utils.py / predictor.py after the
# kernel started are picked up.
# NOTE(review): `%autoreload 2` is already enabled in the first cell, so this manual
# reload looks redundant -- confirm before removing.
importlib.reload(utils)
importlib.reload(predictor)
# Re-bind the names after the reload.
# NOTE(review): the star import makes it hard to see which names come from utils.
from utils import *
from predictor import ColorPredictor

# Contents

## 1) Demonstration of ColorPredictor class

Overview of method, adjusting hyperparameters, inference and evaluation

## 2) Histograms

Mean and standard deviations of histograms for a particular class

## 3) The process

Visualization of the process and discussion of design decisions

## 4) Incorrect predictions

Visualizations of incorrect labels

## 5) Random hyperparameter search

## 6) Some machine learning

Using features extracted from the ColorPredictor to enhance predictions

# Demo of ColorPredictor class

This class predicts the overall color of objects, in this case, cars.
There are two main approaches.
The first finds the color with the most occupancy by determining all the pixels that lie in the HSV range for a particular color.
This is a fully hard-coded unsupervised method.

The second method uses the H, S and V histograms of an image.
It requires knowledge of the average and standard deviations of the histograms for each color.
It is therefore a supervised method that requires training.
The number of saved values that are stored to a file are : n_colors x 3 x n_bins x 2.
It compares the histograms on an image to the average histograms of each color, and chooses
the most similar color.
The average square distance between each bin is normalized by the color std values.

The formula is:
dist_for_color = sqrt( sum(  ((b_color - b_img)/ b_color_std )**2 )),
where the sum is computed over all (b)ins, which usually is 256 * 3 = 768 bins.
The standard deviations are also weighted, where each of the 9 colors get 3 weights, one for H S and V.
This way we can pay more or less attention to a channel, e.g. less attention to hue when considering black.

Another approach would be to research HSV or RGB histograms for e.g. blue objects in general, and then use the same formula as before.
This has an advantage, since the average histograms in my method contains background pixels, which is not desired.
This way the histograms are not learned from the train set, but hardcoded and should be more robust to new scenarios.
However, the general histogram may not apply well to cars.
( This is a method I still want to test )


For both methods we use the same image processing which consists of cropping, edge detection, saturation and value clipping and focal scaling. These will be explained later.

When predicting labels for each image, a label is produced for each method.
Both methods are therefore evaluated and are completely independent.
This comes with the downside of additional execution time in the case that we are only
interested in one method.

One feature per method per color is extracted, totaling 18 features per image ( we have 9 colors).
Calculating the features takes up most of the execution time.
Classifying an image is as simple as selecting the color with the highest/lowest value.
Parts of the image are ignored, such as edges, before the features are calculated. 

The HSV range method gets an overall F1 score of 74% for both test and train set.
The histogram  method gets an overall F1 score of 80% for the train set and 72% for the test set.

These features can also be fed into other ML classifiers.
We see up to an 8% increase in accuracy

Lastly, I assumed that the images can have different sizes, so I handle one image at a time.
A better way would be to load a batch of e.g. 32 images and process them all together, which should be
easy for HSV range checking and numpy methods where we can specify the axis.

In [4]:
# Hyperparameters of the classifier; they are stored as class attributes of ColorPredictor.
# Maybe skip reading this cell for now and come back to it later.
# A grid search can be performed to choose better hyperparameters.

# [lower, upper] HSV bounds per color, written on a 0-255 scale and divided by 255 below.
# These ranges were found by looking at histograms of images, they can definitely be improved.
# NOTE(review): 'red' has a lower hue (245) that is larger than its upper hue (10), i.e. the
# range wraps around the hue axis -- presumably the range check handles this; confirm in utils.
ColorPredictor.hsv_ranges = hsv_ranges = {
    color_name: np.array(bounds) / 255.0
    for color_name, bounds in {
        'black':  [[0, 0, 0], [255, 255, 50]],
        'blue':   [[140, 50, 50], [180, 255, 255]],
        'brown':  [[10, 25, 25], [30, 255, 255]],
        'green':  [[55, 25, 25], [140, 255, 255]],
        'pink':   [[180, 25, 50], [245, 255, 255]],
        'red':    [[245, 50, 50], [10, 255, 255]],
        'silver': [[110, 0, 75], [180, 100, 220]],
        'white':  [[0, 0, 200], [255, 50, 255]],
        'yellow': [[30, 50, 50], [55, 255, 255]],
    }.items()
}

# One weight for each of H, S and V, per color
ColorPredictor.hsv_color_weights = {
    color_name: np.array(channel_weights)
    for color_name, channel_weights in {
        'black':  [1.2, 0.8, 1.2],
        'blue':   [0.8, 0.9, 1.0],
        'brown':  [1.0, 1.1, 1.1],
        'green':  [0.8, 0.9, 1.0],
        'pink':   [1.1, 1.0, 1.0],
        'red':    [1.0, 0.9, 0.9],
        'silver': [1.0, 1.2, 0.9],
        'white':  [0.8, 0.9, 1.2],
        'yellow': [0.8, 1.1, 1.0],
    }.items()
}

# Image pre-processing
ColorPredictor.crop_ratio = 0.07  # cars are usually in the center of image
ColorPredictor.edges_val = 160  # used by cv2.canny
ColorPredictor.edges_blur_ksize = 3  # blurs edges with kernel of size 3
ColorPredictor.focal_spread_min_val = 0.1  # pixels get weighted with distance to the center, with the furthest pixel having 0.1

# Pixel filtering
ColorPredictor.sat_percentile_min = 8  # all pixels with a saturation lower than the 8th percentile are ignored
ColorPredictor.val_percentile_min = 8  # all pixels with a value lower than the 8th percentile are ignored

# Histograms
ColorPredictor.bins = 256  # number of bins for histograms



In [5]:
"""     Loads a list of dictionaries containing file paths, and true labels if available.
        Checks whether the average histograms of each color should be calculated for inference later, i.e. should
        the model be trained? Also tries to load these weights from a file.
        Processes each file by extracting some features for each image (the bulk of the execution time).
        Lastly performs training if the user has requested it and labeled data is available.
        Training is really quick since it is just calculating an average and std for each color for each channel.
        So we have 3 x 256 x 2 trained values for each color."""

# Set top = 0 to evaluate the full dataset.
top = 10

# save_intermediate_images = True is only for demonstration, debugging and development

# The constructor will generate the features for each image; this part takes the longest: 70 sec for the entire dataset,
# 190 seconds if save_intermediate_images = True
clr_predictor = ColorPredictor(Path('./dataset'),
                               Path('./'),
                               train_if_not_trained=True,
                               train_regardless=False, # set this to true to retrain 
                               top=top, 
                               save_intermediate_images = True
                              )

trainable?:  True  trained?:  True  training queued?:  False
Execution time for 'process_files': 191.438541 seconds


Now we created a dataframe containing labels for both methods. 
Then we evaluate it, giving two evaluation dataframes per method.
One for a confusion matrix, another for F1, recall and precision scores

In [6]:
# Infers by looking at the smallest distance for histograms, and the largest occupancy rate for HSV ranges.
# This df contains the train set, test set and unlabeled images.
# During evaluation we separate the train set from the test set and ignore the unlabeled images.
df_inferred = clr_predictor.infer(save_to_file=False)

"""     Calculates the confusion matrix and saves to file. 
        Calculates precision, recall and F1 score for each class and saves to file.
        Seperate files are stored for the train and test set.
        Seperate files are stored for the HSV range method and histogram method.
        """
# NOTE(review): save_to_file=False is passed here, so presumably no files are written by this call.
eval_dfs, eval_dfs_names = clr_predictor.evaluate(df_inferred, save_to_file=False)

Execution time for 'infer': 3.377001 seconds


In [7]:
# Let's look at how we store the predictions and some of the color features
df_demo = df_inferred
if clr_predictor.save_intermediate_images:
    # Drop the intermediate columns so that only labels and per-color features are displayed
    df_demo = df_demo.drop(columns=ColorPredictor.intermediate_columns_to_drop())

# We have 9 colors.
# 23 columns = file + true_color + dataset + hist method label + occupancy method label
#              + 9 * color occupancy + 9 * color hist
#
# print(df_demo)
df_demo

Unnamed: 0,file,true_color,dataset,black_occupancy,blue_occupancy,brown_occupancy,green_occupancy,pink_occupancy,red_occupancy,silver_occupancy,...,pred_hist_color,black_dist,blue_dist,brown_dist,green_dist,pink_dist,red_dist,silver_dist,white_dist,yellow_dist
0,dataset\train\black\2289261.jpg,black,train,0.614714,0.000109,0.000000,0.000122,0.000058,0.000057,0.326741,...,black,23.851600,66.417659,159.665521,187.454093,122.282122,191.222392,44.306277,57.642947,194.325317
1,dataset\train\black\2289271.jpg,black,train,0.408058,0.002289,0.000714,0.006826,0.003818,0.000484,0.293821,...,black,14.655955,24.226095,26.512584,34.493292,30.980882,44.065179,16.927640,18.325853,35.509598
2,dataset\train\black\2429402.jpg,black,train,0.619234,0.083900,0.222249,0.030295,0.006690,0.004751,0.082401,...,black,20.443563,46.937965,32.425545,53.332836,36.826812,62.490569,33.483860,45.665926,58.869159
3,dataset\train\black\257213_share1.jpg,black,train,0.588055,0.004377,0.002878,0.009587,0.000000,0.000000,0.006056,...,black,24.750873,53.997414,119.552524,42.050988,155.754956,156.347114,42.056981,47.858057,91.104464
4,dataset\train\black\26188161.jpg,black,train,0.510914,0.000000,0.000015,0.000151,0.000011,0.000330,0.146159,...,black,18.028144,35.040003,27.872283,35.908789,36.239849,95.283291,25.397878,30.416170,41.826449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3504,dataset\test\yellow\eorg7m1z49bnggxvjyrqi54t31...,yellow,test,0.071137,0.001722,0.806029,0.062081,0.000000,0.000000,0.073378,...,yellow,63.043876,36.777931,49.817238,57.213382,67.592393,56.668988,115.082469,87.472430,28.834948
3505,dataset\test\yellow\es_gl000336923_11.jpg,yellow,test,0.170140,0.244442,0.005077,0.008224,0.000448,0.000000,0.013954,...,blue,126.652933,30.961796,82.563858,36.019303,68.318435,51.557655,105.290807,93.449041,33.023889
3506,dataset\test\yellow\es_gl000480512qhs6661_0011...,yellow,test,0.000000,0.000000,0.010280,0.000299,0.003101,0.000000,0.006583,...,yellow,206.805944,75.609858,166.280388,83.998520,216.549524,74.621712,164.271436,163.347478,32.705319
3507,dataset\test\yellow\everyone-is-asking-for1.jpg,yellow,test,0.082527,0.078564,0.014231,0.017108,0.001748,0.000344,0.014084,...,yellow,309.485273,55.221919,116.898694,55.288016,78.938083,93.045223,248.635900,295.409480,21.201528


In [8]:
# Let's look at what is stored in one row.
# We will visualize the masks shortly
df_inferred.iloc[0]

file                                                   dataset\train\black\2289261.jpg
true_color                                                                       black
dataset                                                                          train
black_occupancy                                                               0.614714
blue_occupancy                                                             0.000108969
brown_occupancy                                                                      0
green_occupancy                                                            0.000122182
pink_occupancy                                                             5.77665e-05
red_occupancy                                                              5.73425e-05
silver_occupancy                                                              0.326741
white_occupancy                                                             0.00466734
yellow_occupancy                           

In [9]:
# Print every evaluation dataframe together with its name
for eval_df, eval_df_name in zip(eval_dfs, eval_dfs_names):
    print(eval_df_name, '\n --------------------------------')
    print(eval_df)
    print('--------------------------------')

# We see that the occupancy method is consistent for the train and test set
#
#        F1 scores
#        Hist       Occupancy
# Train  0.809624   0.75483
# Test   0.722054   0.767372

train_occupancy_scores.csv 
 --------------------------------
                 black  blue  brown  green  pink  red  silver  white  yellow  \
black              338    32      7      5     0    0      41      8       0   
blue                23   384      0     31     0    0       5      0       0   
brown               38     2    108      0     1   12       6      3       0   
green               10     0      1    118     0    0       1      0      24   
pink                25     3      3      1   297   66      10     19       0   
red                 22     3      2      0     6  358       3      1       0   
silver              27    13      8      2     0    0     127     39       1   
white               20     2      9      1     0    0      72    205       1   
yellow               9     0     75      0     0    0       2      3     214   
Total Predicted    512   439    213    158   304  436     267    278     240   

                 Total True  
black                 431.0

# Mean and standard deviation of each histogram of each color

In [12]:

# Create a slider widget with the color names as options
hist_color_slider = widgets.SelectionSlider(options=ColorPredictor.color_names,
                                 value='red',
                                 description='Color:')


# Define a function to plot the trained mean / standard-deviation histograms of the selected color
def plot_color_histograms(color):
    """Plot the mean and the standard-deviation histograms stored for `color`.

    Reads clr_predictor.hsv_hist_means_stds and passes entries 0, 1 and 2 of the
    per-color 'means' (then 'stds') to plot_hue_sat_val_histograms -- presumably
    the H, S and V histograms, in that order.
    """
    means = clr_predictor.hsv_hist_means_stds['means'][color]
    stds = clr_predictor.hsv_hist_means_stds['stds'][color]
    plot_hue_sat_val_histograms(means[0], means[1], means[2], title='Mean Histogram')
    plot_hue_sat_val_histograms(stds[0], stds[1], stds[2], title='Standard Deviation Histograms')
    

# Use the interact function to link the slider and the plot function
widgets.interact(plot_color_histograms, color=hist_color_slider)

interactive(children=(SelectionSlider(description='Color:', index=5, options=('black', 'blue', 'brown', 'green…

<function __main__.plot_color_histograms(color)>

# Visualize the process

The process_image(img) method extracts image features such as histograms and color occupancy.
3 histograms are calculated, one for H, S and V channels.
A mask according to each color is generated using the hard-coded hsv_ranges.
The weighted average of the mask determines the occupancy of that color.

These features can be used however desired in a later stage.
In this case, the average histogram for each color is used to
determine the similarity of the image histograms with each color.
This average is obtained during training from the train set, as visualized earlier. 

The pixels with the lowest saturation and value are ignored, since most cars
have some black and white pixels due to reflections, the car outline;
and very dark regions such as shadows, wheels.

Edges are also ignored, since these tend to be wheels as well as other things
such as mirrors, the grid at the front of the car, the inside of the car.
The edges are also blurred to increase the number of ignored pixels.


Since most cars take up almost the whole image, we use focal scaling, where
pixels further from the center have a lower weight.
We use a linear scale, where the pixel furthest away has a weight of 0.1
The image is also cropped for the same reason.

We combine the masks from focal scaling, edges and sat_val clipping.

Please see predictor.ColorPredictor.process_image(self, img) for the implementation details

In the code below we can see how the difference between the black mask and full black mask
successfully gave less weight to black pixels in the wheels. The same is true for white.

In [13]:
# Slider to pick the true color of the images to browse
color_slider = widgets.SelectionSlider(options=ColorPredictor.color_names,
                                 value='red',
                                 description='Color:')

# Slider to pick the n-th image of that color.
# The range is fixed up front, so it can be longer than the number of images of a color;
# showcase_image() checks the index before using it.
file_slider = widgets.SelectionSlider(options=np.arange(top if top > 0 else 100),
                                      value=0,
                                      description='File:')


def showcase_image(color_name, index):
    """Show the labels, intermediate images, histograms and color masks of one image.

    Parameters
    ----------
    color_name : str
        True color used to filter df_inferred.
    index : int
        Position of the image within the rows that have that true color.

    Prints a message and returns without plotting when `index` is out of range
    for the selected color or when the intermediate columns are not present in
    the row (instead of raising IndexError / KeyError inside the widget callback).
    """
    color_df = filter_data(df_inferred, {ColumnNames.true_label : color_name})
    if not 0 <= index < len(color_df):
        print(f'Only {len(color_df)} images with true color {color_name}, '
              f'index {index} is out of range')
        return
    row = color_df.iloc[index]

    preprocessing_columns = [
        IntermediateColumnNames.original,
        IntermediateColumnNames.cropped,
        IntermediateColumnNames.sv_mask,
        IntermediateColumnNames.edges_mask,
        IntermediateColumnNames.mask,
        IntermediateColumnNames.total_mask
    ]
    histogram_columns = [
        IntermediateColumnNames.hue_histogram,
        IntermediateColumnNames.sat_histogram,
        IntermediateColumnNames.val_histogram
    ]
    color_mask_columns = [f'{col}_mask' for col in ColorPredictor.color_names]
    color_mask_columns += [f'{col}_mask_full' for col in ColorPredictor.color_names]

    # Everything plotted below is read from the row, so check that the columns exist first
    missing_columns = [column
                       for column in preprocessing_columns + histogram_columns + color_mask_columns
                       if column not in row.index]
    if missing_columns:
        print(f'Cannot showcase image, missing columns: {missing_columns}')
        return

    true_color = row[ColumnNames.true_label]
    pred_hist_label = row[ColumnNames.pred_hist_color]
    pred_occupancy_label = row[ColumnNames.pred_occupancy_color]
    print(f'True color: {true_color}, Histogram Label: {pred_hist_label}, Occupancy Label: {pred_occupancy_label}')

    # Preprocessing
    plot_images(n_columns=3, **{column: row[column] for column in preprocessing_columns})

    # Histograms
    plot_hue_sat_val_histograms(row[IntermediateColumnNames.hue_histogram],
                                row[IntermediateColumnNames.sat_histogram],
                                row[IntermediateColumnNames.val_histogram],
                                title='HSV Histogram')

    # Color masks
    plot_images(n_columns=6, **{column: row[column] for column in color_mask_columns})


widgets.interact(showcase_image, color_name=color_slider, index=file_slider)


interactive(children=(SelectionSlider(description='Color:', index=5, options=('black', 'blue', 'brown', 'green…

<function __main__.showcase_image(color_name, index)>

# Focus on incorrect prediction

In [14]:

# Only rows that carry a true label can be judged; rows without one are left out.
# NOTE(review): assumes unlabeled images have a missing (None/NaN) true label -- confirm.
has_true_label = df_inferred[ColumnNames.true_label].notna()

# Histogram mistakes
#df_mistakes = df_inferred[has_true_label & (df_inferred[ColumnNames.true_label] != df_inferred[ColumnNames.pred_hist_color])]

# Occupancy mistakes
df_mistakes = df_inferred[has_true_label
                          & (df_inferred[ColumnNames.true_label] != df_inferred[ColumnNames.pred_occupancy_color])]


def showcase_mislabeling(index):
    """Show the labels, a subset of the intermediate images and the color masks of one mislabeled image.

    Parameters
    ----------
    index : int
        Position of the image within df_mistakes.
    """
    row = df_mistakes.iloc[index]

    true_color = row[ColumnNames.true_label]
    pred_hist_label = row[ColumnNames.pred_hist_color]
    pred_occupancy_label = row[ColumnNames.pred_occupancy_color]
    print(f'True color: {true_color}, Histogram Label: {pred_hist_label}, Occupancy Label: {pred_occupancy_label}')

    # Preprocessing

    images_column_names = [
        #IntermediateColumnNames.original,
        IntermediateColumnNames.cropped,
        #IntermediateColumnNames.sv_mask,
        IntermediateColumnNames.edges_mask,
        #IntermediateColumnNames.mask,
        IntermediateColumnNames.total_mask
    ]
    plot_images(n_columns=3, **{column: row[column] for column in images_column_names})

    # Histograms

#     plot_hue_sat_val_histograms(row[IntermediateColumnNames.hue_histogram],
#                                 row[IntermediateColumnNames.sat_histogram],
#                                 row[IntermediateColumnNames.val_histogram],
#                                 title='HSV Histogram')

    # Color masks

    images_column_names = [f'{col}_mask' for col in ColorPredictor.color_names]
    images_column_names += [f'{col}_mask_full' for col in ColorPredictor.color_names]
    plot_images(n_columns=6, **{column: row[column] for column in images_column_names})


if len(df_mistakes) > 0:
    file_slider_mitakes = widgets.SelectionSlider(options=np.arange(len(df_mistakes)),
                                          value=0,
                                          description='File:')
    widgets.interact(showcase_mislabeling, index=file_slider_mitakes)
else:
    # A selection slider needs at least one option, so there is nothing to browse
    print('No mislabeled images to show')


interactive(children=(SelectionSlider(description='File:', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …

<function __main__.showcase_mislabeling(index)>

# Random Search color HSV weight

We can do the same for the other hyperparameters

In [92]:
def all_ones():
    """Return a weight mapping that gives every color in ColorPredictor.color_names the weights (1, 1, 1)."""
    return {color_name: (1, 1, 1) for color_name in ColorPredictor.color_names}

def normalize_hsv_color_weights(hsv_color_weights):
    """Scale the weight vector of every color to unit Euclidean length.

    Parameters
    ----------
    hsv_color_weights : dict
        Maps a color name to a sequence of numeric weights (one per channel).
        The dict is updated in place.

    Returns
    -------
    dict
        The same dict object, with every value replaced by a float numpy array
        of Euclidean norm 1.

    Raises
    ------
    ValueError
        If a weight vector has a zero or non-finite norm. Dividing by it would
        produce NaN/inf weights that silently corrupt every distance computed
        with them. The dict is left untouched in that case.
    """
    normalized = {}
    for color_name, weights in hsv_color_weights.items():
        arr = np.asarray(weights, dtype=float)
        norm = np.sqrt(np.sum(arr ** 2))
        if not np.isfinite(norm) or norm == 0:
            raise ValueError(
                f"Cannot normalize weights for color {color_name!r}: norm is {norm}")
        normalized[color_name] = arr / norm

    # Only write back once every vector has been validated, so a failure
    # never leaves the dict half-normalized.
    hsv_color_weights.update(normalized)
    return hsv_color_weights

In [93]:
# from itertools import product

# Define hyperparameter search space (within the range 0.8 to 1.2)
search_space = np.linspace(0.8, 1.2, num=5)
# print(search_space)

# Seeded generator, so that a search can be reproduced after a kernel restart
search_rng = np.random.default_rng(0)

# Initialize best metric and best weights.
# Starting from -inf guarantees that the first valid score is always kept.
best_overall_f1 = -np.inf
best_weights = None

# Perform random search
num_iterations = 2  # Adjust as needed
for _ in range(num_iterations):
    # Draw one (H, S, V) weight triple per color
    sampled_weights = {color: search_rng.choice(search_space, 3)
                       for color in ColorPredictor.color_names}

    # Normalize ?
    #sampled_weights = normalize_hsv_color_weights(sampled_weights)

    ColorPredictor.hsv_color_weights = sampled_weights
    # save_to_file=False, like every other call in this notebook: a search should not write predictions to disk
    df_temp = clr_predictor.infer(save_to_file=False)
    # Separate names, so that the eval_dfs / eval_dfs_names of the earlier cells are not overwritten
    search_eval_dfs, search_eval_dfs_names = clr_predictor.evaluate(df_temp, save_to_file=False)
    # NOTE(review): index 3 is presumably the scores frame of the histogram method -- check
    # search_eval_dfs_names[3]; if it is a test-set frame, the weights are being tuned on test data.
    # .iloc[-1] reads the last row by position (label-indexed Series no longer accept [-1]).
    overall_f1 = search_eval_dfs[3]['Overall'].iloc[-1]

    # Update best metric and best weights if needed
    if overall_f1 > best_overall_f1:
        best_overall_f1 = overall_f1
        best_weights = sampled_weights

print("Best weights (random search):", best_weights)
print("Best metric value:", best_overall_f1)

Execution time for 'infer': 2.441002 seconds
Execution time for 'infer': 2.473000 seconds
Execution time for 'infer': 2.323000 seconds
Execution time for 'infer': 2.377999 seconds
Execution time for 'infer': 2.351000 seconds
Execution time for 'infer': 2.302000 seconds
Execution time for 'infer': 2.313998 seconds
Execution time for 'infer': 2.509999 seconds
Execution time for 'infer': 2.324000 seconds
Execution time for 'infer': 2.323997 seconds
Execution time for 'infer': 2.313000 seconds
Execution time for 'infer': 2.302999 seconds
Execution time for 'infer': 2.326000 seconds
Execution time for 'infer': 2.314000 seconds
Execution time for 'infer': 2.300001 seconds
Execution time for 'infer': 2.290001 seconds
Execution time for 'infer': 2.307000 seconds
Execution time for 'infer': 2.314000 seconds
Execution time for 'infer': 2.308998 seconds
Execution time for 'infer': 2.315000 seconds
Execution time for 'infer': 2.314000 seconds
Execution time for 'infer': 2.295001 seconds
Execution 

In [94]:
# Let's evaluate the new weights

# best_weights =  {'black': array([1.2, 0.8, 1.2]), 'blue': array([0.8, 0.9, 1. ]),
#                  'brown': array([1. , 1.1, 1.1]), 'green': array([0.8, 0.9, 1. ]),
#                  'pink': array([1.1, 1. , 1. ]), 'red': array([1. , 0.9, 0.9]),
#                  'silver': array([1. , 1.2, 0.9]), 'white': array([0.8, 0.9, 1.2]),
#                  'yellow': array([0.8, 1.1, 1. ])}
# NOTE(review): best_weights comes from the random-search cell and is None if no
# iteration improved on the initial score -- confirm it is set before running this cell.
ColorPredictor.hsv_color_weights = best_weights

df_temp = clr_predictor.infer(save_to_file=False)
eval_dfs, eval_dfs_names = clr_predictor.evaluate(df_temp, save_to_file=False)
#        old weights     new weights
# train  0.809624        0.832806
# test   0.722054        0.767372


# Print every evaluation dataframe together with its name
for eval_df, eval_df_name in zip(eval_dfs, eval_dfs_names):
    print(eval_df_name, '\n --------------------------------')
    print(eval_df)
    print('--------------------------------')

Execution time for 'infer': 2.361999 seconds
train_occupancy_scores.csv 
 --------------------------------
                 black  blue  brown  green  pink  red  silver  white  yellow  \
black              338    32      7      5     0    0      41      8       0   
blue                23   384      0     31     0    0       5      0       0   
brown               38     2    108      0     1   12       6      3       0   
green               10     0      1    118     0    0       1      0      24   
pink                25     3      3      1   297   66      10     19       0   
red                 22     3      2      0     6  358       3      1       0   
silver              27    13      8      2     0    0     127     39       1   
white               20     2      9      1     0    0      72    205       1   
yellow               9     0     75      0     0    0       2      3     214   
Total Predicted    512   439    213    158   304  436     267    278     240   

            

# Try some machine learning stuff

Now we transform the features with PCA or CFS and use those to train:
logistic regression, SVM, RFC, KNN, NB, LDA

In [86]:
from sklearn.preprocessing import LabelEncoder
from copy import deepcopy

# Names of the per-color feature columns produced by the two methods
dist_columns_names = [f'{color}_dist' for color in ColorPredictor.color_names]
occupancy_columns_names = [f'{color}_occupancy' for color in ColorPredictor.color_names]

# Try different combinations of feature columns
columns = occupancy_columns_names
#columns = dist_columns_names 
#columns = occupancy_columns_names

# Train set (copied, because prediction columns are added to df_train in later cells)
df_train = deepcopy(df_inferred[df_inferred['dataset'] == 'train'])
X_train = np.array(df_train[columns])
print(X_train.shape)
#X_train = np.array(df[columns])
#X_train[:, 0:9] = 20 / X_train[:, 0:9]

y_train = np.array(df_train['true_color'])
print(y_train.shape)

# Test set (copied, because prediction columns are added to df_test in later cells)
df_test = deepcopy(df_inferred[df_inferred['dataset'] == 'test'])
X_test = np.array(df_test[columns])
print(X_test.shape)
#X_train = np.array(df[columns])
#X_train[:, 0:9] = 20 / X_train[:, 0:9]

y_test = np.array(df_test['true_color'])
print(y_test.shape)


# Encode class labels as integers (only the train labels are encoded here)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)


(2847, 9)
(2847,)
(662, 9)
(662,)


In [None]:
# Some Results

#       F1 scores

#  Raw scores from before
#       histograms  Occupancy 
# Train  0.809624   0.75483
# Test   0.722054   0.767372

#       features = 9 = only occupancy
#       logisticregr   RFC       SVM        KNN         NB          LDA 
# Train  0.809975      0.99      0.816649   0.834211    0.789603    0.791008
# Test   0.811178      0.806647  0.817221   0.805136    0.797583    0.796073
# max test score = 0.817

#       features = 9 =only histograms
#       logisticregr   RFC       SVM        KNN         NB          LDA 
# Train 0.855286       0.994731  0.874254   0.776256    0.525465    0.554619
# Test  0.767372       0.750755  0.768882   0.714502    0.510574    0.515106
# max test score = 0.768

#       features = 18 = both
#       logisticregr   RFC       SVM        KNN         NB          LDA 
# Train 0.855638       0.994731  0.874254   0.776256    0.761503    0.802248
# Test  0.767372       0.809668  0.768882   0.714502    0.731118    0.802248
# max test score = 0.809

## CFS

In [88]:
from sklearn.feature_selection import SelectKBest, f_classif
# Fit the feature selector on the train set, using the encoded labels.
# SelectKBest keeps the k features with the highest f_classif (ANOVA F-value) score.
# NOTE(review): when X_train has exactly num_features columns, k=num_features keeps
# every feature and the selection has no effect.
num_features = 9
cfs_selector = SelectKBest(score_func=f_classif, k=num_features)
X_train_cfs = cfs_selector.fit_transform(X_train, y_train_encoded)

# Apply the selection learned on the train set to the test set
X_test_cfs = cfs_selector.transform(X_test)

## PCA

In [89]:
from sklearn.decomposition import PCA
# Fit PCA on the train features only (no labels are passed to it),
# then apply the same projection to the test features
num_components = 9
pca = PCA(n_components=num_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

## Classifiers

In [91]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# The pipeline performs the PCA step itself, so it is fitted on the RAW features.
# Feeding it X_train_pca would apply PCA a second time to already transformed data.
# A new PCA instance is created inline so that the `pca` object fitted in the PCA
# cell is not rebound or re-fitted here.
clf = LogisticRegression()
pipe_pca = Pipeline([('pca', PCA(n_components=9)),  # Adjust the number of components
                     ('logistic', clf)])
pipe_pca.fit(X_train, y_train)

# Predictions on train set
train_predictions_pca = pipe_pca.predict(X_train)
df_train['pred_pca_log_label'] = train_predictions_pca
print(create_evaluation_df(df_train,'true_color', 'pred_pca_log_label'))

# Predictions on test set
test_predictions_pca = pipe_pca.predict(X_test)
df_test['pred_pca_log_label'] = test_predictions_pca
print(create_evaluation_df(df_test,'true_color', 'pred_pca_log_label'))

           Precision    Recall  F1-score   Overall
black       0.744235  0.823666  0.781938       NaN
blue        0.902004  0.914221  0.908072       NaN
brown       0.658960  0.670588  0.664723       NaN
green       0.848276  0.798701  0.822742       NaN
pink        0.933518  0.794811  0.858599       NaN
red         0.838710  0.921519  0.878166       NaN
silver      0.526882  0.451613  0.486352       NaN
white       0.743827  0.777419  0.760252       NaN
yellow      0.902685  0.887789  0.895175       NaN
Precision        NaN       NaN       NaN  0.809975
Recall           NaN       NaN       NaN  0.809975
F1-score         NaN       NaN       NaN  0.809975
           Precision    Recall  F1-score   Overall
black       0.735043  0.877551  0.800000       NaN
blue        0.876289  0.850000  0.862944       NaN
brown       0.666667  0.400000  0.500000       NaN
green       0.736842  0.800000  0.767123       NaN
pink        0.900990  0.910000  0.905473       NaN
red         0.936170  0.880000 

### Random Forest

In [82]:
from sklearn.ensemble import RandomForestClassifier

# Choose ONE feature representation; it is used for fitting AND for predicting.
# Fitting on one representation and predicting on another only works by accident,
# when both happen to have the same columns in the same order.
X_train_rf, X_test_rf = X_train, X_test              # raw features
#X_train_rf, X_test_rf = X_train_pca, X_test_pca     # PCA features
#X_train_rf, X_test_rf = X_train_cfs, X_test_cfs     # selected features

# Create a random forest classifier; random_state makes the scores reproducible
rf = RandomForestClassifier(n_estimators=10, random_state=0)  # Adjust hyperparameters
rf.fit(X_train_rf, y_train)

# Predictions on train set
# (variable names are kept from the earlier version of this cell; the features
#  are whatever X_train_rf / X_test_rf point to)
train_predictions_cfs_rf = rf.predict(X_train_rf)
df_train['pred_rf_label'] = train_predictions_cfs_rf
print(create_evaluation_df(df_train,'true_color', 'pred_rf_label'))

# Predictions on test set
test_predictions_cfs_rf = rf.predict(X_test_rf)
df_test['pred_rf_label'] = test_predictions_cfs_rf
print(create_evaluation_df(df_test,'true_color', 'pred_rf_label'))

           Precision    Recall  F1-score   Overall
black       0.988453  0.993039  0.990741       NaN
blue        0.995485  0.995485  0.995485       NaN
brown       1.000000  0.988235  0.994083       NaN
green       0.993548  1.000000  0.996764       NaN
pink        0.995294  0.997642  0.996466       NaN
red         0.997468  0.997468  0.997468       NaN
silver      0.986175  0.986175  0.986175       NaN
white       0.996753  0.990323  0.993528       NaN
yellow      1.000000  1.000000  1.000000       NaN
Precision        NaN       NaN       NaN  0.994731
Recall           NaN       NaN       NaN  0.994731
F1-score         NaN       NaN       NaN  0.994731
           Precision    Recall  F1-score   Overall
black       0.690265  0.795918  0.739336       NaN
blue        0.893617  0.840000  0.865979       NaN
brown       0.677419  0.525000  0.591549       NaN
green       0.896552  0.742857  0.812500       NaN
pink        0.894231  0.930000  0.911765       NaN
red         0.967391  0.890000 

### Support Vector Machine

In [67]:
from sklearn.svm import SVC

# Linear-kernel SVM (kept linear for simplicity), trained on the reduced
# `_pca` feature matrix.
svm = SVC(kernel='linear').fit(X_train_pca, y_train)

# Score both splits with the fitted model
train_predictions_pca_svm = svm.predict(X_train_pca)
test_predictions_pca_svm = svm.predict(X_test_pca)

# Train set: store the predicted labels and print the evaluation table
df_train['pred_pca_svm_label'] = train_predictions_pca_svm
print(create_evaluation_df(df_train, 'true_color', 'pred_pca_svm_label'))

# Test set: store the predicted labels and print the evaluation table
df_test['pred_pca_svm_label'] = test_predictions_pca_svm
print(create_evaluation_df(df_test, 'true_color', 'pred_pca_svm_label'))

           Precision    Recall  F1-score   Overall
black       0.771008  0.851508  0.809261       NaN
blue        0.947005  0.927765  0.937286       NaN
brown       0.804734  0.800000  0.802360       NaN
green       0.940789  0.928571  0.934641       NaN
pink        0.931707  0.900943  0.916067       NaN
red         0.946565  0.941772  0.944162       NaN
silver      0.634615  0.608295  0.621176       NaN
white       0.831126  0.809677  0.820261       NaN
yellow      0.973597  0.973597  0.973597       NaN
Precision        NaN       NaN       NaN  0.874254
Recall           NaN       NaN       NaN  0.874254
F1-score         NaN       NaN       NaN  0.874254
           Precision    Recall  F1-score   Overall
black       0.678571  0.775510  0.723810       NaN
blue        0.836735  0.820000  0.828283       NaN
brown       0.588235  0.500000  0.540541       NaN
green       0.827586  0.685714  0.750000       NaN
pink        0.807692  0.840000  0.823529       NaN
red         0.946237  0.880000 

### K-NN

In [83]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN classifier with 10 neighbours (tune n_neighbors as needed), trained on
# the `_cfs` feature matrices. To try the PCA features instead, replace
# X_train_cfs / X_test_cfs with X_train_pca / X_test_pca in all three calls.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train_cfs, y_train)

# Predict both splits with the fitted model
train_predictions_cfs_knn = knn.predict(X_train_cfs)
test_predictions_cfs_knn = knn.predict(X_test_cfs)

# Train set: store the predicted labels and print the evaluation table
df_train['pred_cfs_knn_label'] = train_predictions_cfs_knn
print(create_evaluation_df(df_train, 'true_color', 'pred_cfs_knn_label'))

# Test set: store the predicted labels and print the evaluation table
df_test['pred_cfs_knn_label'] = test_predictions_cfs_knn
print(create_evaluation_df(df_test, 'true_color', 'pred_cfs_knn_label'))

           Precision    Recall  F1-score   Overall
black       0.611615  0.781903  0.686354       NaN
blue        0.833698  0.860045  0.846667       NaN
brown       0.769231  0.588235  0.666667       NaN
green       0.863014  0.818182  0.840000       NaN
pink        0.828829  0.867925  0.847926       NaN
red         0.900000  0.911392  0.905660       NaN
silver      0.506494  0.359447  0.420485       NaN
white       0.732852  0.654839  0.691652       NaN
yellow      0.892361  0.848185  0.869712       NaN
Precision        NaN       NaN       NaN  0.776256
Recall           NaN       NaN       NaN  0.776256
F1-score         NaN       NaN       NaN  0.776256
           Precision    Recall  F1-score   Overall
black       0.492857  0.704082  0.579832       NaN
blue        0.766990  0.790000  0.778325       NaN
brown       0.542857  0.475000  0.506667       NaN
green       0.724138  0.600000  0.656250       NaN
pink        0.806122  0.790000  0.797980       NaN
red         0.900990  0.910000 

### Naive Bayes

In [84]:
from sklearn.naive_bayes import GaussianNB

# Author's note: this model does worse when given fewer feature columns.

# Gaussian Naive Bayes classifier fitted on the `_cfs` feature matrix
nb = GaussianNB().fit(X_train_cfs, y_train)

# Evaluate the train split first, then the test split. `predictions_cfs_nb` is
# rebound on each pass, so it holds the test-set predictions after the cell runs.
for _eval_df, _eval_X in ((df_train, X_train_cfs), (df_test, X_test_cfs)):
    predictions_cfs_nb = nb.predict(_eval_X)
    _eval_df['pred_cfs_nb_label'] = predictions_cfs_nb
    print(create_evaluation_df(_eval_df, 'true_color', 'pred_cfs_nb_label'))

           Precision    Recall  F1-score   Overall
black       0.895349  0.357309  0.510779       NaN
blue        0.929864  0.927765  0.928814       NaN
brown       0.391645  0.882353  0.542495       NaN
green       0.877419  0.883117  0.880259       NaN
pink        0.919897  0.839623  0.877928       NaN
red         0.880196  0.911392  0.895522       NaN
silver      0.400000  0.221198  0.284866       NaN
white       0.557692  0.841935  0.670951       NaN
yellow      0.938907  0.963696  0.951140       NaN
Precision        NaN       NaN       NaN  0.761503
Recall           NaN       NaN       NaN  0.761503
F1-score         NaN       NaN       NaN  0.761503
           Precision    Recall  F1-score   Overall
black       0.735294  0.255102  0.378788       NaN
blue        0.870968  0.810000  0.839378       NaN
brown       0.314607  0.700000  0.434109       NaN
green       0.736842  0.800000  0.767123       NaN
pink        0.871287  0.880000  0.875622       NaN
red         0.946809  0.890000 

### LDA

In [85]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA classifier fitted on the reduced `_pca` feature matrix
lda = LinearDiscriminantAnalysis().fit(X_train_pca, y_train)

# Evaluate the train split first, then the test split. `predictions_pca_lda` is
# rebound on each pass, so it holds the test-set predictions after the cell runs.
for _eval_df, _eval_X in ((df_train, X_train_pca), (df_test, X_test_pca)):
    predictions_pca_lda = lda.predict(_eval_X)
    _eval_df['pred_pca_lda_label'] = predictions_pca_lda
    print(create_evaluation_df(_eval_df, 'true_color', 'pred_pca_lda_label'))

           Precision    Recall  F1-score   Overall
black       0.775982  0.779582  0.777778       NaN
blue        0.925234  0.893905  0.909300       NaN
brown       0.576037  0.735294  0.645995       NaN
green       0.787500  0.818182  0.802548       NaN
pink        0.970238  0.768868  0.857895       NaN
red         0.835214  0.936709  0.883055       NaN
silver      0.475862  0.635945  0.544379       NaN
white       0.809886  0.687097  0.743455       NaN
yellow      0.916968  0.838284  0.875862       NaN
Precision        NaN       NaN       NaN  0.802248
Recall           NaN       NaN       NaN  0.802248
F1-score         NaN       NaN       NaN  0.802248
           Precision    Recall  F1-score   Overall
black       0.761905  0.816327  0.788177       NaN
blue        0.891304  0.820000  0.854167       NaN
brown       0.538462  0.525000  0.531646       NaN
green       0.714286  0.857143  0.779221       NaN
pink        0.956044  0.870000  0.910995       NaN
red         0.937500  0.900000 