# Llegim totes les imatges del dataset

In [1]:
import os

def get_image_files_by_folder(root_folder, image_extensions=('jpg', 'jpeg', 'png', 'gif', 'bmp')):
    image_files_by_folder = {}

    # Walk through the root folder and its subdirectories
    for folder_path, _, files in os.walk(root_folder):
        folder_name = os.path.basename(folder_path)
        image_files = []

        for file in files:
            file_extension = file.lower().split('.')[-1]
            if file_extension in image_extensions:
                image_files.append(os.path.join(folder_path, file))

        if image_files:
            image_files_by_folder[folder_name] = image_files

    return image_files_by_folder

In [2]:
train_dataset_path = "data/train"
images_to_load = get_image_files_by_folder(train_dataset_path)
images_to_load.keys()

dict_keys(['bedroom', 'Coast', 'Forest', 'Highway', 'industrial', 'Insidecity', 'kitchen', 'livingroom', 'Mountain', 'Office', 'OpenCountry', 'store', 'Street', 'Suburb', 'TallBuilding'])

# Cream el dataframe

In [5]:
import os
import pandas as pd
from skimage.feature import hog
from skimage import io
from sklearn.feature_extraction import image

def read_and_flatten_images(image_list):
    image_data = []
    
    for image_path in image_list:
        img = io.imread(image_path)
        hog_vectors = hog(img, orientations=8, pixels_per_cell=(20, 20),
                          cells_per_block=(10, 10), channel_axis=None)
        image_data.append(hog_vectors)
    
    return image_data

In [6]:
image_df = pd.DataFrame()
for label, image_list in images_to_load.items():
    image_data = read_and_flatten_images(images_to_load[label])
    next_part = pd.DataFrame(image_data)
    next_part['label'] = label
    image_df = pd.concat([image_df, next_part], ignore_index=True)

In [7]:
image_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,791,792,793,794,795,796,797,798,799,label
1495,0.006665,0.004312,0.012914,0.011915,0.0298,0.00926,0.010586,0.004804,0.005135,0.003837,...,0.001792,0.006399,0.002687,0.004501,0.005098,0.010985,0.004937,0.005745,0.001913,TallBuilding
1496,0.006697,0.004072,0.004563,0.001776,0.007644,0.00143,0.001808,0.00229,0.004555,0.002243,...,0.009001,0.011838,0.004963,0.009943,0.014632,0.020164,0.012421,0.014866,0.007037,TallBuilding
1497,0.000572,0.000677,0.004774,0.000564,0.017476,0.001701,0.000761,0.0,0.000135,0.0,...,0.005332,0.018837,0.007991,0.01997,0.014601,0.020679,0.011769,0.006496,0.012003,TallBuilding
1498,0.017734,0.008054,0.003554,0.001317,0.002457,0.000543,0.002226,0.002022,0.178328,0.060599,...,0.001235,0.004061,0.002386,0.006471,0.008394,0.00551,0.002986,0.004494,0.001402,TallBuilding
1499,0.006909,0.002044,0.001812,0.005448,0.046775,0.095771,0.027003,0.002273,0.008878,0.036699,...,0.002123,0.005973,0.005315,0.024412,0.027895,0.021045,0.003958,0.002821,0.001787,TallBuilding


In [8]:
# TODO: read https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
image_df.shape

(1500, 801)

# Cream els conjunts de test i d'entrenament

In [9]:
from sklearn.model_selection import train_test_split

# TODO: Es necesari? tenim un conjunt de test ja. Això seria un conjunt de validacio "per fer overfitting" :DD
X_train, X_test, y_train, y_test = train_test_split(image_df.drop('label', axis=1), image_df['label'], test_size=0.10, random_state=42)

# Entrenam el model

In [10]:
from sklearn.metrics import precision_score, make_scorer
from sklearn.svm import SVC

svm = SVC(C=1.0, kernel='poly', random_state=42)
svm.fit(X_train, y_train)

SVC(kernel='poly', random_state=42)

# Donam una predicció

In [11]:
y_predicted = svm.predict(X_test)

print(f"La precisició es de: {precision_score(y_test, y_predicted, average='micro')}")

La precisició es de: 0.62


# Cercam els millors paràmetres del model

In [16]:
# Mostram tots el parametres del model
svm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.5,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import itertools

# Funció que permet mostrar els millor hiper parametres
def showResults(grid_result, number_to_show = 5):
    print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    results = sorted(zip(means, stds, params), key=lambda x: x[0], reverse=True)
    for mean, stdev, param in itertools.islice(results, number_to_show):
        print(f"{mean} ({stdev}) with: {param}")

In [13]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #'degree': [2, 4, 6, 8],
    #'coef0': [0.5, 1, 10],
    #'gamma': [0.1, 1, 100, 1000],
    #'C': [0.1, 1, 100, 1000],
}

precion_micro_scorer = make_scorer(precision_score, average='micro')

In [14]:
# GridSearchCV pot ser molt lent ja que el random forest necesita ja un temps elevat inicialment
# Farem sevir una variant experimental que treballa dividint els entrenaments

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 900
Fitting 30 folds for each of 4 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 2
n_resources: 900
Fitting 30 folds for each of 2 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 1
n_resources: 900
Fitting 30 folds for each of 1 candidates, totalling 30 fits


In [17]:
showResults(grid_result)

Best: 0.5755555555555556 using {'kernel': 'poly'}
0.5755555555555556 (0.04955771461830439) with: {'kernel': 'poly'}
0.5740740740740741 (0.04930240382901848) with: {'kernel': 'poly'}
0.5733333333333334 (0.041928805031362704) with: {'kernel': 'linear'}
0.572962962962963 (0.05161453629214303) with: {'kernel': 'poly'}
0.5696296296296296 (0.0467840968137426) with: {'kernel': 'linear'}


In [22]:
model = grid_result.best_estimator_
pred = model.predict(X_test)
print(f"La precisició es de: {precision_score(y_test, y_predicted, average='micro')}")

# Cercam els millor parametres per poly

In [32]:
svm = SVC(C=1.0, kernel='poly', random_state=42)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)
print(f"La precisició abans es de: {precision_score(y_test, pred, average='weighted')}")

La precisició abans es de: 0.5790054945054944


In [26]:
param_grid = {
    'degree': [1, 2, 3, 4, 6, 8],
    'coef0': [0.5, 1, 10]
}

precion_micro_scorer = make_scorer(precision_score, average='micro')

In [27]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 18
n_resources: 900
Fitting 30 folds for each of 18 candidates, totalling 540 fits
----------
iter: 1
n_candidates: 9
n_resources: 900
Fitting 30 folds for each of 9 candidates, totalling 270 fits
----------
iter: 2
n_candidates: 5
n_resources: 900
Fitting 30 folds for each of 5 candidates, totalling 150 fits
----------
iter: 3
n_candidates: 3
n_resources: 900
Fitting 30 folds for each of 3 candidates, totalling 90 fits
----------
iter: 4
n_candidates: 2
n_resources: 900
Fitting 30 folds for each of 2 candidates, totalling 60 fits


In [28]:
showResults(grid_result)

Best: 0.5814814814814814 using {'coef0': 0.5, 'degree': 3}
0.5914814814814815 (0.04614196498616674) with: {'coef0': 10, 'degree': 8}
0.5907407407407407 (0.04408029834667594) with: {'coef0': 10, 'degree': 4}
0.5907407407407407 (0.04058049292499619) with: {'coef0': 10, 'degree': 3}
0.5903703703703703 (0.04533672221659069) with: {'coef0': 1, 'degree': 3}
0.59 (0.04673275743291799) with: {'coef0': 0.5, 'degree': 3}


In [33]:
model = grid_result.best_estimator_

model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.5,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
pred = model.predict(X_test)
print(f"La precisició es de: {precision_score(y_test, pred, average='weighted')}")

La precisició es de: 0.5533333333333333


# Cercam els valors de C 

In [33]:
svm = SVC(C=1.0, kernel='poly', degree=3, coef0=0.5, random_state=42)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)
print(f"La precisició abans es de: {precision_score(y_test, pred, average='weighted')}")

La precisició abans es de: 0.573478798978799


In [13]:
param_grid = {
    #'gamma': [0.1, 1, 100, 1000],
    'C': [0.1, 1, 100, 1000],
}

precion_micro_scorer = make_scorer(precision_score, average='micro')

In [18]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 900
Fitting 30 folds for each of 4 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 2
n_resources: 900
Fitting 30 folds for each of 2 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 1
n_resources: 900
Fitting 30 folds for each of 1 candidates, totalling 30 fits


In [19]:
showResults(grid_result)

Best: 0.5692592592592592 using {'C': 1000}
0.5796296296296297 (0.03785214176883973) with: {'C': 100}
0.5796296296296297 (0.03785214176883973) with: {'C': 1000}
0.5755555555555555 (0.05972502146755923) with: {'C': 1}
0.5755555555555555 (0.05972502146755923) with: {'C': 100}
0.5755555555555555 (0.05972502146755923) with: {'C': 1000}


In [27]:
model = grid_result.best_estimator_

model.get_params()

{'C': 1000,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.5,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
pred = model.predict(X_test)
print(f"La precisició es de: {precision_score(y_test, pred, average='weighted')}")

La precisició es de: 0.573478798978799
