# Llegim totes les imatges del dataset

In [1]:
import os

def get_image_files_by_folder(root_folder, image_extensions=('jpg', 'jpeg', 'png', 'gif', 'bmp')):
    """Group the image files found under ``root_folder`` by folder name.

    Walks ``root_folder`` recursively and, for every directory that directly
    contains at least one image, stores the full paths of those images under
    the directory's base name.

    Args:
        root_folder: Directory to scan (subdirectories included).
        image_extensions: Extensions treated as images, with or without the
            leading dot; matching is case-insensitive.

    Returns:
        dict mapping folder base name -> list of image file paths. Paths
        coming from one directory are sorted; directories without images are
        omitted. Directories that share a base name are merged into one entry.
    """
    allowed = {ext.lower().lstrip('.') for ext in image_extensions}
    image_files_by_folder = {}

    # Walk through the root folder and its subdirectories
    for folder_path, _, files in os.walk(root_folder):
        # splitext only reports a real extension, so an extension-less file
        # that happens to be called "jpg" is no longer mistaken for an image.
        image_files = sorted(
            os.path.join(folder_path, file)
            for file in files
            if os.path.splitext(file)[1].lower().lstrip('.') in allowed
        )
        if not image_files:
            continue

        # normpath drops a trailing separator so the root itself gets a
        # proper name instead of an empty string.
        folder_name = os.path.basename(os.path.normpath(folder_path))
        # Extend rather than assign: same-named folders at different depths
        # must not silently overwrite each other.
        image_files_by_folder.setdefault(folder_name, []).extend(image_files)

    return image_files_by_folder

In [2]:
# Root folder of the training split (path relative to the working directory).
train_dataset_path = "data/train"
# Map each folder name to the list of image paths found inside it.
images_to_load = get_image_files_by_folder(train_dataset_path)
# Display the folder names that were found; they are used as class labels below.
images_to_load.keys()

dict_keys(['bedroom', 'Coast', 'Forest', 'Highway', 'industrial', 'Insidecity', 'kitchen', 'livingroom', 'Mountain', 'Office', 'OpenCountry', 'store', 'Street', 'Suburb', 'TallBuilding'])

# Cream el dataframe, TODO: COM ORGANITZAR IMPORTS? TOT A DALT? PER FUNCIO?

In [3]:
import os
import pandas as pd
from skimage import io
from sklearn.feature_extraction import image

def read_and_flatten_images(image_list):
    """Load every image in ``image_list`` and flatten each one.

    Each path is read with ``io.imread`` and the loaded array is flattened.
    The returned list keeps the order of ``image_list``.
    """
    return [io.imread(path).flatten() for path in image_list]

In [4]:
# Build one DataFrame per class and concatenate them all in a single call.
# Growing `image_df` with pd.concat inside the loop re-copies every row
# accumulated so far on each iteration.
class_frames = []
for label, image_list in images_to_load.items():
    image_data = read_and_flatten_images(image_list)
    next_part = pd.DataFrame(image_data)
    next_part['label'] = label  # every row from this folder gets the folder name as label
    class_frames.append(next_part)

# pd.concat raises on an empty list, so keep the original empty-frame result
# when no images were found.
image_df = pd.concat(class_frames, ignore_index=True) if class_frames else pd.DataFrame()

In [5]:
# TODO: read https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
# Sanity check: one row per image; columns are the flattened pixel values plus the 'label' column.
image_df.shape

(1500, 40001)

# Cream els conjunts de test i d'entrenament

In [6]:
from sklearn.model_selection import train_test_split

# TODO: Is this necessary? We already have a test set. This would be a validation set "for overfitting" :DD
# Separate the pixel columns from the label column before splitting.
X_all = image_df.drop(columns='label')
y_all = image_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.10,
    random_state=42,
)

# Entrenam el model

In [7]:
from sklearn.metrics import precision_score, make_scorer
from sklearn.svm import SVC

# Baseline classifier: linear-kernel SVC with C=1.0 and a fixed random_state.
svm = SVC(C=1.0, kernel='linear', random_state=42)
# Fit on the training split; the returned estimator is what the cell displays.
svm.fit(X_train, y_train)

SVC(kernel='linear', random_state=42)

# Donam una predicció

In [8]:
# Predict on the held-out split with the baseline SVC.
y_predicted = svm.predict(X_test)

# Report the micro-averaged precision of the baseline predictions.
print(f"La precisició es de: {precision_score(y_test, y_predicted, average='micro')}")

La precisició es de: 0.20666666666666667


# Cercam els millors paràmetres del model

In [9]:
# Show all the parameters of the model
svm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

def showResults(grid_result, number_to_show = 5):
    """Print the best hyper-parameters and the top-scoring candidates of a search.

    Args:
        grid_result: Fitted search object exposing ``best_score_``,
            ``best_params_`` and ``cv_results_`` (with the keys
            ``mean_test_score``, ``std_test_score`` and ``params``).
        number_to_show: Maximum number of candidates to list, ordered from
            highest to lowest mean test score.
    """
    print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    results = sorted(zip(means, stds, params), key=lambda x: x[0], reverse=True)
    # `results` is already a list, so plain slicing replaces itertools.islice,
    # which was never imported and raised NameError here.
    for mean, stdev, param in results[:max(0, number_to_show)]:
        print(f"{mean} ({stdev}) with: {param}")

In [12]:
# Search space for the hyper-parameter search: only the kernel is varied for now;
# the commented-out entries are further candidate values that are currently disabled.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #'degree': [2, 4, 6, 8],
    #'coef0': [0.5, 1, 10],
    #'gamma': [0.1, 1, 100, 1000],
    #'C': [0.1, 1, 100, 1000],
}

# Scorer wrapping precision_score with average='micro'; passed as `scoring` to the search.
precion_micro_scorer = make_scorer(precision_score, average='micro')

In [None]:
# GridSearchCV can be very slow, since fitting the model already takes a long time on its own.
# NOTE(review): the original note said "random forest", but the estimator searched below is the SVC `svm`.
# We use an experimental variant that works by splitting up the training runs.

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# 10 stratified folds repeated 3 times, i.e. 30 fits per candidate.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 900
Fitting 30 folds for each of 4 candidates, totalling 120 fits


In [None]:
# Slow approach

#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
#grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0,verbose=10)
#grid_result = grid_search.fit(X_train, y_train)

Fitting 30 folds for each of 768 candidates, totalling 23040 fits


In [None]:
# Print the best score/params followed by the top-ranked candidates (5 by default).
showResults(grid_result)

In [None]:
# Evaluate the best estimator found by the search on the held-out split.
model = grid_result.best_estimator_

pred = model.predict(X_test)
# Score the tuned model's own predictions; scoring `y_predicted` here would
# only repeat the baseline SVC's result instead of measuring `model`.
print(f"La precisició es de: {precision_score(y_test, pred, average='micro')}")