This model training notebook is a step forward from the learnings of the notebook `Model_Training_RAPIDS.ipynb` to reduce the load on our computational resources.

#### **Key Steps Taken in this notebook:**

- We are no longer using the NVIDIA RAPIDS & CuPy libraries; all of the work now runs on the CPU. The aim is to reduce the computational load to the point where a GPU is not required.
- Reduced the train data size from 80% to 70%.
- Reduced the outer loop folds of nested cross validation from 5 to 3 while keeping the inner loop folds the same as 3.
- Saving intermediate models generated by each complete iteration of RandomizedSearchCV to avoid redundant iterations when retraining the same models.
- Freeing up memory using Python's garbage collection module (`gc`) after each complete iteration of the outer cross validation loop.
- Deleting variables not in use as & whenever required.

#### **Model Evaluation after performing nested cross validation:**

From the classification reports generated by our nested cross validation, we can see that both the Support Vector Machine (SVM) models and Random Forest models have an accuracy and macro average F1 score around 50%.

This poor performance of the models has been analyzed in `MODEL_PERFORMANCE.md`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import uniform, loguniform, randint
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import classification_report

from joblib import dump
import os

In [None]:
import os
import cv2
import pandas as pd
#import cupy as cp  # CuPy for GPU-based NumPy operations
import numpy as np
import tensorflow as tf
import scipy
from skimage.feature import local_binary_pattern
from skimage.filters import gabor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.preprocessing.image import ImageDataGenerator

**Loading the saved data and labels**

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used below live under that mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the feature matrix, labels and group ids from their .npz archives


# Paths to the .npz files
output_dir = '/content/drive/My Drive/Fabric Detection Project/Extracted Features'
data_path = os.path.join(output_dir, 'X.npz')
labels_path = os.path.join(output_dir, 'y.npz')
groups_path = os.path.join(output_dir, 'groups.npz')


# np.load on an .npz file returns an NpzFile that keeps the archive open until
# it is closed. Each `with` block reads the one array we need and then closes
# the file, instead of leaving three open handles around for the whole session.
with np.load(data_path) as data:
    X = data['data']

with np.load(labels_path) as labels:
    y = labels['labels']

# A separate name is used for the archive so that `groups` only ever refers to
# the array of group ids, never to the archive object.
with np.load(groups_path) as groups_archive:
    groups = groups_archive['groups']

In [None]:
# Report the Python type of X, y and groups, one per line
print(
    f"Type of X {type(X)}",
    f"Type of y {type(y)}",
    f"Type of groups {type(groups)}",
    sep="\n",
)

Type of X <class 'numpy.ndarray'>
Type of y <class 'numpy.ndarray'>
Type of groups <class 'numpy.ndarray'>


In [None]:
# Report the element dtype of X, y and groups, one per line
print(
    f"Dtype of X {X.dtype}",
    f"Dtype of y {y.dtype}",
    f"Dtype of groups {groups.dtype}",
    sep="\n",
)

Dtype of X float16
Dtype of y uint8
Dtype of groups int16


In [None]:
# Report the shapes of the feature matrix, the labels and the group ids
print(
    f"Dataset shape: {X.shape}",
    f"Labels shape: {y.shape}",
    f"Groups shape: {groups.shape}",
    sep="\n",
)

Dataset shape: (17875, 49162)
Labels shape: (17875,)
Groups shape: (17875,)


In [None]:
# Checking the in-memory size of X in GB (decimal gigabytes: nbytes / 1e9)
print(f"Size(GB) of X {X.nbytes/1e9}")

Size(GB) of X 1.7575415


In [None]:
import numpy as np

# True when at least one element of X differs from its floor, i.e. has a fractional part
has_decimals = np.any(np.floor(X) != X)

if has_decimals:
    print("Array contains non-zero decimals.")
else:
    # Every value is a whole number, so X is re-stored with an integer dtype
    X = X.astype(np.int32)
    print("Array converted to int32 without decimals.")

# Size of X after the (possible) conversion
print(f"Size(GB) of X {X.nbytes/1e9}")


Array contains non-zero decimals.
Size(GB) of X 7.030166


In [None]:
# Number of distinct values present anywhere in X
np.unique(X).size

5822

In [None]:
# Smallest and largest finite values representable in float32
float32_min, float32_max = np.finfo(np.float32).min, np.finfo(np.float32).max

# Report whether any element of X falls below the minimum or above the maximum
print(
    "Array contains values outside the float32 range."
    if np.any(X < float32_min) or np.any(X > float32_max)
    else "All values are within the float32 range."
)


All values are within the float32 range.


In [None]:
# Converting the X to float32 as all the values are within range of float32
# NOTE(review): no conversion statement follows this comment in the cell, so X
# keeps its current dtype here — confirm whether `X = X.astype(np.float32)` was intended.


3.4028235e+38

In [None]:
# Changing the dtype of groups into int32 and printing its size (nbytes / 1e9, i.e. GB) before and after
# NOTE(review): the dtype cell above reports groups as int16; if that still holds
# when this cell runs, the cast widens the array rather than shrinking it — confirm.
print(f"Size of groups before changing to int32 is {groups.nbytes/1e9}")
groups = groups.astype(np.int32)
print(f"Size of groups after changing to int32 is {groups.nbytes/1e9}")

Size of groups before changing to int32 is 0.000143
Size of groups after changing to int32 is 7.15e-05


**Encoding the target variable**

In [None]:
# Manual label -> integer code mapping.
# The key 'linin' is spelled exactly as the label appears in the stored data
# (see the value counts printed in the next cell), so it must not be "corrected".
category_mapping = { 'corduroy': 1, 'cotton': 2, 'denim': 3, 'linin': 4, 'wool': 5}

# Wrap the labels in a pandas Series so the mapping can be applied element-wise
y_series = pd.Series(y)

# Map categories to numbers
mapped_categories = y_series.map(category_mapping)

# Series.map returns NaN for every label that is not a key of the mapping.
# Fail loudly instead of letting NaN labels flow into training; this also
# catches an accidental re-run of this cell on already-encoded labels.
unmapped_labels = y_series[mapped_categories.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Labels missing from category_mapping: {list(unmapped_labels)}"
    )

y = np.array(mapped_categories)


#print(y)
print(y.shape)

(17875,)


In [None]:
# Class counts before and after encoding, printed back to back
print(y_series.value_counts(), mapped_categories.value_counts(), sep="\n")

cotton      3630
linin       3585
denim       3575
corduroy    3550
wool        3535
Name: count, dtype: int64
2    3630
4    3585
3    3575
1    3550
5    3535
Name: count, dtype: int64


In [None]:
# Drop the temporary Series used for label encoding
del y_series, mapped_categories

- **Splitting the original dataset into train (70%) and test (30%) using GroupShuffleSplit to ensure that all images of the same group end up either in the train set or in the test set, never in both.**

In [None]:
# Single group-aware shuffle split with a 0.3 test fraction
# (the train share was reduced from 80% to 70%)
gss = GroupShuffleSplit(test_size=0.3, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# Features
X_train = X[train_idx]
X_test = X[test_idx]

# Labels
y_train = y[train_idx]
y_test = y[test_idx]

# Group ids of the training rows, needed by the group-aware CV splitters below
groups_train = groups[train_idx]


## Nested cross-validation setup

###  Define Outer Loop: Model Evaluation

In [None]:
# Outer (model evaluation) splitter: group-aware K-fold with 3 folds
outer_cv = GroupKFold(n_splits=3) # Reduced the outer loop folds from 5 to 3

**Split Training Data into Outer Training and Validation Folds**

In [None]:
import gc
# outer_train_idx: indices of the training rows in the current outer fold
# outer_val_idx: indices of the validation rows in the current outer fold

# The loop runs once per outer fold, i.e. 3 times, because outer_cv is defined
# with n_splits=3.
#
# NOTE: start_inner_loop() and the helpers it calls read X_outer_train,
# y_outer_train, groups_outer_train, X_outer_val, y_outer_val and `count` as
# module-level globals, so these names must not be changed here, and the cells
# that define those functions must have been executed before this cell runs.
#
# `count` is the 1-based fold number; it is used to name the cached searches
# and the saved models.
for count, (outer_train_idx, outer_val_idx) in enumerate(
        outer_cv.split(X_train, y_train, groups=groups_train), start=1):
    # Train data for the current fold
    X_outer_train = X_train[outer_train_idx] # Training features
    y_outer_train = y_train[outer_train_idx] # Training labels
    groups_outer_train = groups_train[outer_train_idx] # Training groups

    # Validation data for the current fold
    X_outer_val = X_train[outer_val_idx] # Validation features
    y_outer_val = y_train[outer_val_idx] # Validation labels

    start_inner_loop()

    # Release the per-fold copies before the next fold allocates its own
    del X_outer_train, y_outer_train, groups_outer_train
    del X_outer_val, y_outer_val
    gc.collect()  # Collect garbage to free memory

Loading cached model
Best params for SVM Model: {'classifier__C': 0.8358887124992451, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf', 'pca__n_components': 3000}
Best scoring(F1) for SVM Model: 0.4401678657074341
              precision    recall  f1-score   support

           1       0.33      0.77      0.46       820
           2       0.45      0.46      0.46       805
           3       0.94      0.64      0.76       855
           4       0.48      0.37      0.42       870
           5       0.45      0.11      0.17       820

    accuracy                           0.47      4170
   macro avg       0.53      0.47      0.45      4170
weighted avg       0.53      0.47      0.45      4170

Loading cached model
Best params for RF Model: {'classifier__max_depth': 20, 'classifier__n_estimators': 264, 'pca__n_components': 3000}
Best scoring(F1) for RF Model: 0.5235103763222304
              precision    recall  f1-score   support

           1       0.87      0.35      0.50   



```
# NESTED CROSS VALIDATION RESULTS
Loading cached model
Best params for SVM Model: {'classifier__C': 0.8358887124992451, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf',
 'pca__n_components': 3000}
Best scoring(F1) for SVM Model: 0.4401678657074341
              precision    recall  f1-score   support

           1       0.33      0.77      0.46       820
           2       0.45      0.46      0.46       805
           3       0.94      0.64      0.76       855
           4       0.48      0.37      0.42       870
           5       0.45      0.11      0.17       820

    accuracy                           0.47      4170
   macro avg       0.53      0.47      0.45      4170
weighted avg       0.53      0.47      0.45      4170

Loading cached model
Best params for RF Model: {'classifier__max_depth': 20, 'classifier__n_estimators': 264, 'pca__n_components': 3000}
Best scoring(F1) for RF Model: 0.5235103763222304
              precision    recall  f1-score   support

           1       0.87      0.35      0.50       820
           2       0.50      0.77      0.60       805
           3       0.64      0.86      0.74       855
           4       0.51      0.24      0.33       870
           5       0.35      0.45      0.39       820

    accuracy                           0.53      4170
   macro avg       0.57      0.53      0.51      4170
weighted avg       0.57      0.53      0.51      4170

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting model
Best params for SVM Model: {'classifier__C': 86.63620998244761, 'classifier__gamma': 'scale',
'classifier__kernel': 'rbf', 'pca__n_components': 1000}
Best scoring(F1) for SVM Model: 0.4754196642685851
              precision    recall  f1-score   support

           1       0.42      0.51      0.46       820
           2       0.49      0.54      0.51       805
           3       0.81      0.71      0.76       855
           4       0.47      0.44      0.46       860
           5       0.35      0.32      0.33       830

    accuracy                           0.50      4170
   macro avg       0.51      0.50      0.50      4170
weighted avg       0.51      0.50      0.50      4170

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting model
Best params for RF Model: {'classifier__max_depth': 20, 'classifier__n_estimators': 217,
'pca__n_components': 1000}
Best scoring(F1) for RF Model: 0.5054808893841479
              precision    recall  f1-score   support

           1       0.93      0.35      0.50       820
           2       0.50      0.76      0.60       805
           3       0.65      0.87      0.74       855
           4       0.64      0.34      0.44       860
           5       0.39      0.49      0.43       830

    accuracy                           0.56      4170
   macro avg       0.62      0.56      0.54      4170
weighted avg       0.62      0.56      0.54      4170

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting model
Best params for SVM Model: {'classifier__C': 12.662571614120008, 'classifier__gamma': 'scale',
'classifier__kernel': 'rbf', 'pca__n_components': 2000}
Best scoring(F1) for SVM Model: 0.4821342925659473
              precision    recall  f1-score   support

           1       0.45      0.59      0.51       820
           2       0.45      0.51      0.48       800
           3       0.90      0.73      0.81       860
           4       0.46      0.42      0.44       860
           5       0.43      0.36      0.39       830

    accuracy                           0.52      4170
   macro avg       0.54      0.52      0.53      4170
weighted avg       0.54      0.52      0.53      4170

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting model
Best params for RF Model: {'classifier__max_depth': 20, 'classifier__n_estimators': 285,
 'pca__n_components': 2000}
Best scoring(F1) for RF Model: 0.5198941496646005
              precision    recall  f1-score   support

           1       0.65      0.36      0.47       820
           2       0.52      0.73      0.61       800
           3       0.67      0.88      0.76       860
           4       0.55      0.33      0.41       860
           5       0.38      0.43      0.41       830

    accuracy                           0.55      4170
   macro avg       0.55      0.55      0.53      4170
weighted avg       0.55      0.55      0.53      4170

```



### Inner Loop: Hyperparameter Tuning

**Define Inner Cross-Validation Loop**

- Using GroupKFold: For hyperparameter tuning within the outer training fold.

In [None]:
def start_inner_loop():
    """Tune, evaluate and save the SVM and RF models for the current outer fold.

    For each of the two model families this runs (or reloads) the randomized
    search, prints the best parameters/score, prints the classification report
    on the outer validation fold and dumps the best estimator to the Models
    directory on Drive as ``svm_<count>.joblib`` / ``rf_<count>.joblib``.

    Reads the module-level global ``count`` (current fold number) for the file
    names; the helper functions it calls read the fold data from globals too.
    Returns nothing.
    """
    models_dir = "/content/drive/My Drive/Fabric Detection Project/Models"
    # exist_ok avoids the separate exists() check and its check-then-create gap
    os.makedirs(models_dir, exist_ok=True)

    # Defining inner cross-validation strategy
    inner_cv = GroupKFold(n_splits=3)

    # Creating pipelines and parameters grids for SVM and RF models
    pipeline_svm, pipeline_rf = create_pipelines()
    param_grid_svm, param_grid_rf = create_param_grids()

    # Performing RandomSearchCV for SVM
    random_search_svm = randomized_search_svm(pipeline_svm,param_grid_svm,inner_cv)
    best_model_svm = get_best_model_svm(random_search_svm)
    evaluate_best_model_svm(best_model_svm)
    dump(best_model_svm, os.path.join(models_dir, f'svm_{count}.joblib'))
    # The search object holds a reference to its best estimator, so both names
    # have to go before the RF search starts for the memory to be reclaimable.
    del best_model_svm, random_search_svm

    # Performing RandomSearchCV for RF
    random_search_rf = randomized_search_rf(pipeline_rf,param_grid_rf,inner_cv)
    best_model_rf = get_best_model_rf(random_search_rf)
    evaluate_best_model_rf(best_model_rf)
    dump(best_model_rf, os.path.join(models_dir, f'rf_{count}.joblib'))
    del best_model_rf, random_search_rf



**Create Pipelines Including Preprocessing and Classifier**
- For SVM and Random Forest separately
- Include Preprocessing Steps:
   - Scaling (StandardScaler)
   - Principal Component Analysis (with number of components as hyperparameter)

In [None]:
# Pipelines for the Support Vector Machine and Random Forest classifiers

def create_pipelines(random_state=42):
    """Build the two preprocessing + classifier pipelines.

    Both pipelines share the same steps, named ``scaler`` (StandardScaler),
    ``pca`` (PCA) and ``classifier`` (SVC or RandomForestClassifier). The step
    names are referenced by the ``pca__...`` / ``classifier__...`` keys of the
    parameter grids, so they must not be renamed.

    Parameters
    ----------
    random_state : int or None, default=42
        Seed passed to PCA and to the random forest so that repeated runs give
        the same fitted models. Pass None for unseeded behaviour.

    Returns
    -------
    tuple
        ``(pipeline_svm, pipeline_rf)``
    """
    # Pipeline for Support Vector Machine Classifier
    pipeline_svm = Pipeline([
        ("scaler", StandardScaler()),
        # PCA can use a randomized SVD solver; seeding keeps it repeatable
        ("pca", PCA(random_state=random_state)),
        ("classifier", SVC())
    ])

    # Pipeline for Random Forest Classifier
    pipeline_rf = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(random_state=random_state)),
        # Bootstrap sampling and feature selection in the forest are random
        ("classifier", RandomForestClassifier(random_state=random_state))
    ])

    return pipeline_svm, pipeline_rf

In [None]:
# Search spaces for the SVM and Random Forest pipelines
def create_param_grids():
    """Return the hyperparameter search spaces as ``(param_grid_svm, param_grid_rf)``.

    Keys follow the ``<step>__<parameter>`` naming convention, with ``pca``
    and ``classifier`` as the step names. List values are discrete choices;
    the scipy.stats objects are distributions to sample from.
    """
    # Support Vector Machine search space
    svm_space = {}
    svm_space['pca__n_components'] = [1000, 2000, 3000]
    svm_space['classifier__C'] = loguniform(1e-3, 1e3)
    svm_space['classifier__kernel'] = ['rbf']
    svm_space['classifier__gamma'] = ['scale', 'auto']

    # Random Forest search space
    rf_space = {}
    rf_space['pca__n_components'] = [1000, 2000, 3000]
    rf_space['classifier__n_estimators'] = randint(100, 300)
    rf_space['classifier__max_depth'] = [10, 20]

    return svm_space, rf_space


**Define Hyperparameter grids**

**Perform RandomizedSearch with Cross-validation for Hyperparameter Tuning**

In [None]:
from joblib import dump, load


def randomized_search_svm(pipeline_svm, param_grid_svm, inner_cv):
    """Run the randomized hyperparameter search for the SVM pipeline, or reload it from cache.

    The fitted search is cached on Drive as ``random_search_svm_<count>.joblib``
    where ``count`` is the module-level fold counter. The cache is keyed by the
    fold number only: a cached file is reused even if the data, the pipeline or
    the search settings have changed since it was written, so delete the file
    to force a refit.

    Reads the globals ``count``, ``X_outer_train``, ``y_outer_train`` and
    ``groups_outer_train``.

    Parameters
    ----------
    pipeline_svm : estimator
        Pipeline to tune.
    param_grid_svm : dict
        Parameter distributions to sample from.
    inner_cv : cross-validation splitter
        Splitter for the inner loop; it receives ``groups_outer_train``.

    Returns
    -------
    The fitted (or reloaded) RandomizedSearchCV object.
    """
    cache_dir = '/content/drive/My Drive/Fabric Detection Project/model_cache_svm'
    os.makedirs(cache_dir, exist_ok=True)

    cached_search = os.path.join(cache_dir, f'random_search_svm_{count}.joblib')
    if os.path.exists(cached_search):
        print("Loading cached model")
        return load(cached_search)

    random_search_svm = RandomizedSearchCV(
        estimator=pipeline_svm,
        param_distributions=param_grid_svm,
        cv=inner_cv,
        # Same metric as the RF search and as the "Best scoring(F1)" printout;
        # without it the search would optimise the estimator's default score.
        scoring="f1_macro",
        n_iter=3,
        n_jobs=-1,  # Uses all processors
        random_state=42,  # Reproducible sampling of the candidates
        verbose=4
    )

    print("Fitting model")
    random_search_svm.fit(X_outer_train, y_outer_train, groups=groups_outer_train)
    dump(random_search_svm, cached_search)

    return random_search_svm


In [None]:
def get_best_model_svm(random_search_svm):
    """Print the winning SVM parameters and score, then return the refitted best estimator.

    ``random_search_svm`` must expose ``best_estimator_``, ``best_params_``
    and ``best_score_``.
    """
    winner, winning_params, winning_score = (
        random_search_svm.best_estimator_,
        random_search_svm.best_params_,
        random_search_svm.best_score_,
    )

    print(f"Best params for SVM Model: {winning_params}")
    print(f"Best scoring(F1) for SVM Model: {winning_score}")

    return winner

In [None]:
def randomized_search_rf(pipeline_rf, param_grid_rf, inner_cv):
    """Run the randomized hyperparameter search for the RF pipeline, or reload it from cache.

    The fitted search is cached on Drive as ``random_search_rf_<count>.joblib``
    where ``count`` is the module-level fold counter. The cache is keyed by the
    fold number only: a cached file is reused even if the data, the pipeline or
    the search settings have changed since it was written, so delete the file
    to force a refit.

    Reads the globals ``count``, ``X_outer_train``, ``y_outer_train`` and
    ``groups_outer_train``.

    Parameters
    ----------
    pipeline_rf : estimator
        Pipeline to tune.
    param_grid_rf : dict
        Parameter distributions to sample from.
    inner_cv : cross-validation splitter
        Splitter for the inner loop; it receives ``groups_outer_train``.

    Returns
    -------
    The fitted (or reloaded) RandomizedSearchCV object.
    """
    # The directory name says "svm" but it is shared with the SVM search; the
    # file names differ, so the two caches do not collide. The path is kept
    # unchanged so that RF searches cached by earlier runs are still found.
    cache_dir = '/content/drive/My Drive/Fabric Detection Project/model_cache_svm'
    os.makedirs(cache_dir, exist_ok=True)

    cached_search = os.path.join(cache_dir, f'random_search_rf_{count}.joblib')
    if os.path.exists(cached_search):
        print("Loading cached model")
        return load(cached_search)

    random_search_rf = RandomizedSearchCV(
        estimator=pipeline_rf,
        param_distributions=param_grid_rf,
        cv = inner_cv,
        scoring="f1_macro",
        n_iter=3,
        n_jobs=-1,  # Uses all processors
        random_state=42,  # Reproducible sampling of the candidates
        verbose=4
    )

    print("Fitting model")
    random_search_rf.fit(X_outer_train, y_outer_train, groups=groups_outer_train)
    dump(random_search_rf, cached_search)

    return random_search_rf

In [None]:
def get_best_model_rf(random_search_rf):
    """Print the winning RF parameters and score, then return the refitted best estimator.

    ``random_search_rf`` must expose ``best_estimator_``, ``best_params_``
    and ``best_score_``.
    """
    winner, winning_params, winning_score = (
        random_search_rf.best_estimator_,
        random_search_rf.best_params_,
        random_search_rf.best_score_,
    )

    print(f"Best params for RF Model: {winning_params}")
    print(f"Best scoring(F1) for RF Model: {winning_score}")

    return winner

**Evaluate Best Model from RandomizedSearch on Outer Validation Fold**

In [None]:
# SVM: report on the outer validation fold
def evaluate_best_model_svm(best_model_svm):
    """Predict on the outer validation fold with the SVM model and print the classification report.

    Reads the validation features and labels from the module-level globals
    ``X_outer_val`` and ``y_outer_val``. Returns nothing.
    """
    svm_predictions = best_model_svm.predict(X_outer_val)
    report = classification_report(y_outer_val, svm_predictions)
    print(report)


In [None]:
# RF: report on the outer validation fold
def evaluate_best_model_rf(best_model_rf):
    """Predict on the outer validation fold with the RF model and print the classification report.

    Reads the validation features and labels from the module-level globals
    ``X_outer_val`` and ``y_outer_val``. Returns nothing.
    """
    rf_predictions = best_model_rf.predict(X_outer_val)
    report = classification_report(y_outer_val, rf_predictions)
    print(report)

**STEPS 9 ARE PENDING**