# Llegim totes les imatges del dataset

In [1]:
import os

def get_image_files_by_folder(root_folder, image_extensions=('jpg', 'jpeg', 'png', 'gif', 'bmp')):
    image_files_by_folder = {}

    # Walk through the root folder and its subdirectories
    for folder_path, _, files in os.walk(root_folder):
        folder_name = os.path.basename(folder_path)
        image_files = []

        for file in files:
            file_extension = file.lower().split('.')[-1]
            if file_extension in image_extensions:
                image_files.append(os.path.join(folder_path, file))

        if image_files:
            image_files_by_folder[folder_name] = image_files

    return image_files_by_folder

In [2]:
train_dataset_path = "data/train"
images_to_load = get_image_files_by_folder(train_dataset_path)
images_to_load.keys()

dict_keys(['bedroom', 'Coast', 'Forest', 'Highway', 'industrial', 'Insidecity', 'kitchen', 'livingroom', 'Mountain', 'Office', 'OpenCountry', 'store', 'Street', 'Suburb', 'TallBuilding'])

# Cream el dataframe

In [36]:
import os
import pandas as pd
from skimage.feature import hog
from skimage import io
from sklearn.feature_extraction import image
from sklearn.decomposition import PCA
import time

def read_and_flatten_images(image_list):
    start_time = time.time()
    image_data = []
    
    for image_path in image_list:
        img = io.imread(image_path)
        pca_img = pca_reduction(img)
        hog_vectors = hog(img, orientations=8, pixels_per_cell=(10, 10),
                          cells_per_block=(5, 5), channel_axis=None)
        image_data.append(hog_vectors)
    
    end_time = time.time()
    print("Read + HOG time : ", end_time - start_time, " seconds")

    return image_data

def pca_reduction(input_data):
    start_time = time.time()
    pca = PCA(100)
    pca.fit(input_data)
    pca_data = pca.transform(input_data)
    end_time = time.time()
    print("PCA time : ", end_time - start_time, " seconds")
    return pca_data

In [37]:
image_df = pd.DataFrame()
for label, image_list in images_to_load.items():
    image_data = pca_reduction(read_and_flatten_images(images_to_load[label]))
    next_part = pd.DataFrame(image_data)
    next_part['label'] = label
    image_df = pd.concat([image_df, next_part], ignore_index=True)

Read + HOG time :  0.7747225761413574  seconds
PCA time :  0.654597282409668  seconds
Read + HOG time :  0.7965993881225586  seconds
PCA time :  0.574228048324585  seconds
Read + HOG time :  0.8286890983581543  seconds
PCA time :  0.5525014400482178  seconds
Read + HOG time :  0.7882201671600342  seconds
PCA time :  0.5575082302093506  seconds
Read + HOG time :  0.8167576789855957  seconds
PCA time :  0.5575056076049805  seconds
Read + HOG time :  0.8498179912567139  seconds
PCA time :  0.5550074577331543  seconds
Read + HOG time :  0.8083004951477051  seconds
PCA time :  0.5780367851257324  seconds
Read + HOG time :  0.8131923675537109  seconds
PCA time :  0.5519602298736572  seconds
Read + HOG time :  0.8158466815948486  seconds
PCA time :  0.539013147354126  seconds
Read + HOG time :  0.810218334197998  seconds
PCA time :  0.5630369186401367  seconds
Read + HOG time :  0.7988951206207275  seconds
PCA time :  0.5373702049255371  seconds
Read + HOG time :  0.8246369361877441  seconds


In [38]:
image_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,label
1495,-1.11733,-3.063767,-2.259401,-0.607102,2.390871,-2.539157,-0.695627,-0.074136,-1.294186,-1.891609,...,1.472413,0.870271,-0.114253,0.16213,0.269653,-0.158497,0.588908,-0.827581,1.079593e-14,TallBuilding
1496,-0.219208,-2.508322,0.348019,-3.306508,-0.279225,-1.135064,-1.488556,-0.318074,-1.477387,2.230991,...,1.747337,-0.316738,-0.151355,0.402712,-0.943636,0.806713,0.141501,0.552801,-9.165201e-15,TallBuilding
1497,-0.265521,-1.021326,2.910333,-3.872835,0.442327,-0.406851,-2.402198,2.023114,-2.490353,2.96015,...,0.036137,-0.109164,-0.479233,0.316661,0.09362,-0.399336,0.004073,0.25943,-2.796647e-15,TallBuilding
1498,-1.280339,-1.68808,0.824262,3.26396,0.699227,-0.198142,0.77815,-0.626669,1.319214,-1.037937,...,-0.271657,-0.553134,-0.030931,-0.076642,-1.123909,-0.52,-0.677214,-0.31372,-7.033775e-15,TallBuilding
1499,-1.802656,-1.562697,1.591577,0.549694,-0.204915,-1.391799,6.844907,1.153691,-0.134616,-0.700748,...,0.028836,-0.100821,-0.298116,-0.219455,-0.176442,-0.225566,-0.151665,-0.232655,7.139038e-15,TallBuilding


In [39]:
# TODO: read https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
image_df.shape

(1500, 101)

# Cream els conjunts de test i d'entrenament

In [40]:
from sklearn.model_selection import train_test_split

# TODO: Es necesari? tenim un conjunt de test ja. Això seria un conjunt de validacio "per fer overfitting" :DD
X_train, X_test, y_train, y_test = train_test_split(image_df.drop('label', axis=1), image_df['label'], test_size=0.10, random_state=42)

# Entrenam el model

In [41]:
from sklearn.metrics import precision_score, make_scorer
from sklearn.svm import SVC

svm = SVC(C=1.0, kernel='poly', random_state=42)
svm.fit(X_train, y_train)

SVC(kernel='poly', random_state=42)

# Donam una predicció

In [42]:
y_predicted = svm.predict(X_test)

print(f"La precisició es de: {precision_score(y_test, y_predicted, average='micro')}")

La precisició es de: 0.08666666666666667


# Cercam els millors paràmetres del model

In [16]:
# Mostram tots el parametres del model
svm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.5,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import itertools

# Funció que permet mostrar els millor hiper parametres
def showResults(grid_result, number_to_show = 5):
    print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    results = sorted(zip(means, stds, params), key=lambda x: x[0], reverse=True)
    for mean, stdev, param in itertools.islice(results, number_to_show):
        print(f"{mean} ({stdev}) with: {param}")

In [13]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #'degree': [2, 4, 6, 8],
    #'coef0': [0.5, 1, 10],
    #'gamma': [0.1, 1, 100, 1000],
    #'C': [0.1, 1, 100, 1000],
}

precion_micro_scorer = make_scorer(precision_score, average='micro')

In [14]:
# GridSearchCV pot ser molt lent ja que el random forest necesita ja un temps elevat inicialment
# Farem sevir una variant experimental que treballa dividint els entrenaments

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 900
Fitting 30 folds for each of 4 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 2
n_resources: 900
Fitting 30 folds for each of 2 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 1
n_resources: 900
Fitting 30 folds for each of 1 candidates, totalling 30 fits


In [17]:
showResults(grid_result)

Best: 0.5755555555555556 using {'kernel': 'poly'}
0.5755555555555556 (0.04955771461830439) with: {'kernel': 'poly'}
0.5740740740740741 (0.04930240382901848) with: {'kernel': 'poly'}
0.5733333333333334 (0.041928805031362704) with: {'kernel': 'linear'}
0.572962962962963 (0.05161453629214303) with: {'kernel': 'poly'}
0.5696296296296296 (0.0467840968137426) with: {'kernel': 'linear'}


In [22]:
model = grid_result.best_estimator_
pred = model.predict(X_test)
print(f"La precisició es de: {precision_score(y_test, y_predicted, average='micro')}")

# Cercam els millor parametres per poly

In [32]:
svm = SVC(C=1.0, kernel='poly', random_state=42)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)
print(f"La precisició abans es de: {precision_score(y_test, pred, average='weighted')}")

La precisició abans es de: 0.5790054945054944


In [26]:
param_grid = {
    'degree': [1, 2, 3, 4, 6, 8],
    'coef0': [0.5, 1, 10]
}

precion_micro_scorer = make_scorer(precision_score, average='micro')

In [27]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 18
n_resources: 900
Fitting 30 folds for each of 18 candidates, totalling 540 fits
----------
iter: 1
n_candidates: 9
n_resources: 900
Fitting 30 folds for each of 9 candidates, totalling 270 fits
----------
iter: 2
n_candidates: 5
n_resources: 900
Fitting 30 folds for each of 5 candidates, totalling 150 fits
----------
iter: 3
n_candidates: 3
n_resources: 900
Fitting 30 folds for each of 3 candidates, totalling 90 fits
----------
iter: 4
n_candidates: 2
n_resources: 900
Fitting 30 folds for each of 2 candidates, totalling 60 fits


In [28]:
showResults(grid_result)

Best: 0.5814814814814814 using {'coef0': 0.5, 'degree': 3}
0.5914814814814815 (0.04614196498616674) with: {'coef0': 10, 'degree': 8}
0.5907407407407407 (0.04408029834667594) with: {'coef0': 10, 'degree': 4}
0.5907407407407407 (0.04058049292499619) with: {'coef0': 10, 'degree': 3}
0.5903703703703703 (0.04533672221659069) with: {'coef0': 1, 'degree': 3}
0.59 (0.04673275743291799) with: {'coef0': 0.5, 'degree': 3}


In [33]:
model = grid_result.best_estimator_

model.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.5,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
pred = model.predict(X_test)
print(f"La precisició es de: {precision_score(y_test, pred, average='weighted')}")

La precisició es de: 0.5533333333333333


# Cercam els valors de C 

In [33]:
svm = SVC(C=1.0, kernel='poly', degree=3, coef0=0.5, random_state=42)
svm.fit(X_train, y_train)
pred = svm.predict(X_test)
print(f"La precisició abans es de: {precision_score(y_test, pred, average='weighted')}")

La precisició abans es de: 0.573478798978799


In [13]:
param_grid = {
    #'gamma': [0.1, 1, 100, 1000],
    'C': [0.1, 1, 100, 1000],
}

precion_micro_scorer = make_scorer(precision_score, average='micro')

In [18]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
grid_search = HalvingGridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=12, cv=cv, scoring=precion_micro_scorer,error_score=0, factor=2,aggressive_elimination=True, verbose=10)
grid_result = grid_search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 1
min_resources_: 900
max_resources_: 1350
aggressive_elimination: True
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 900
Fitting 30 folds for each of 4 candidates, totalling 120 fits
----------
iter: 1
n_candidates: 2
n_resources: 900
Fitting 30 folds for each of 2 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 1
n_resources: 900
Fitting 30 folds for each of 1 candidates, totalling 30 fits


In [19]:
showResults(grid_result)

Best: 0.5692592592592592 using {'C': 1000}
0.5796296296296297 (0.03785214176883973) with: {'C': 100}
0.5796296296296297 (0.03785214176883973) with: {'C': 1000}
0.5755555555555555 (0.05972502146755923) with: {'C': 1}
0.5755555555555555 (0.05972502146755923) with: {'C': 100}
0.5755555555555555 (0.05972502146755923) with: {'C': 1000}


In [27]:
model = grid_result.best_estimator_

model.get_params()

{'C': 1000,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.5,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
pred = model.predict(X_test)
print(f"La precisició es de: {precision_score(y_test, pred, average='weighted')}")

La precisició es de: 0.573478798978799
