In this notebook we perform nested cross-validation using only a Random Forest model. We have also narrowed the task down to two categories: corduroy and denim.


Hyperparameters:

**PCA `n_components` is set to `None` (i.e. the maximum number of components is kept)**.

**n_estimators** = (500,600)

Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import uniform, loguniform, randint
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import classification_report

from joblib import dump
from joblib import load
import os

In [2]:
import os
import cv2
import pandas as pd
#import cupy as cp  # CuPy for GPU-based NumPy operations
import numpy as np
import tensorflow as tf
import scipy

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras.preprocessing.image import ImageDataGenerator

**Loading the saved data and labels**

In [1]:
# Mount Google Drive at /content/drive; the data, model and cache paths used below all live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the feature matrix, labels and group ids from the .npz archives on Drive

# Paths to the .npz files
output_dir = '/content/drive/My Drive/Fabric Detection Project/Extracted_Features_New'
data_path = os.path.join(output_dir, 'X.npz')
labels_path = os.path.join(output_dir, 'y.npz')
groups_path = os.path.join(output_dir, 'groups.npz')

# np.load on an .npz returns a lazy NpzFile that keeps the archive open until it
# is closed. Indexing it reads the array into memory, so each archive is opened
# in a `with` block and released as soon as its array has been extracted.
with np.load(data_path) as data:
    X = data['data']
with np.load(labels_path) as labels:
    y = labels['labels']
# The archive gets its own name so that `groups` is only ever bound to the array.
with np.load(groups_path) as groups_archive:
    groups = groups_archive['groups']

In [5]:
# Confirm that X, y and groups were each loaded as NumPy arrays (one line per object)
print(
    f"Type of X {type(X)}",
    f"Type of y {type(y)}",
    f"Type of groups {type(groups)}",
    sep="\n",
)

Type of X <class 'numpy.ndarray'>
Type of y <class 'numpy.ndarray'>
Type of groups <class 'numpy.ndarray'>


In [6]:
# Report the element dtype of each loaded array (one line per array)
print(
    f"Dtype of X {X.dtype}",
    f"Dtype of y {y.dtype}",
    f"Dtype of groups {groups.dtype}",
    sep="\n",
)

Dtype of X float16
Dtype of y uint8
Dtype of groups int16


In [7]:
# Report the dimensions of the features, the labels and the group ids (one line each)
print(
    f"Dataset shape: {X.shape}",
    f"Labels shape: {y.shape}",
    f"Groups shape: {groups.shape}",
    sep="\n",
)

Dataset shape: (7175, 49162)
Labels shape: (7175,)
Groups shape: (7175,)


In [8]:
# Checking the size of X in GB (raw array bytes divided by 1e9, i.e. decimal gigabytes)
print(f"Size(GB) of X {X.nbytes/1e9}")

Size(GB) of X 0.7054747


- **Splitting the original dataset into train (~70%) and test (~30%) using GroupShuffleSplit (`test_size=0.3`), which ensures that all images of the same group end up in either the train or the test dataset, never in both.**

In [9]:
# One group-aware hold-out split with a fixed seed (test_size=0.3, random_state=42).
gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# Training side: the group ids are kept because the cross-validation below splits by group.
X_train, y_train, groups_train = X[train_idx], y[train_idx], groups[train_idx]

# Held-out side: only the features and labels are kept.
X_test, y_test = X[test_idx], y[test_idx]


## Nested cross-validation setup

###  Define Outer Loop: Model Evaluation

In [None]:
# Outer loop of the nested cross-validation: 2 group-wise folds.
outer_cv = GroupKFold(n_splits=2)

**Split Training Data into Outer Training and Validation Folds**

In [None]:
import gc
# Outer loop of the nested cross-validation over the training split.
#
# NOTE: start_inner_loop() and the helpers it calls are defined in later cells of
# this notebook, so those cells must be executed before this one. They take no
# data arguments: they read count, X_outer_train, y_outer_train,
# groups_outer_train, X_outer_val and y_outer_val from the notebook's global
# namespace, which is why these names are (re)bound here on every iteration.
#
# outer_train_idx: Indices for the training data in the current fold
# outer_val_idx: Indices for the validation data in the current fold

# The loop runs once per fold produced by outer_cv (created with n_splits=2, i.e. twice).
# count is the 1-based fold number; it is used in the saved model / cache file names.
count=1
for outer_train_idx, outer_val_idx in outer_cv.split(X_train, y_train, groups=groups_train):
    # Train data for the current fold
    X_outer_train = X_train[outer_train_idx] # Training features
    y_outer_train = y_train[outer_train_idx] # Training labels
    groups_outer_train = groups_train[outer_train_idx] # Training groups (needed by the inner GroupKFold)

    # Validation data for the current fold
    X_outer_val = X_train[outer_val_idx] # Validation features
    y_outer_val = y_train[outer_val_idx] # Validation labels

    # Tune on the outer-train part, report on the outer-validation part, save the model
    start_inner_loop()
    # Drop the per-fold copies before the next iteration builds new ones
    del X_outer_train
    del y_outer_train
    del groups_outer_train
    del X_outer_val
    del y_outer_val
    gc.collect()  # Collect garbage to free memory
    count+=1

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting model
Best params for RF Model: {'classifier__max_depth': 39, 'classifier__min_samples_split': 148, 'classifier__n_estimators': 565}
Best scoring(F1) for RF Model: 0.8888050116509408
              precision    recall  f1-score   support

           1       0.94      0.78      0.85      1230
           2       0.82      0.95      0.88      1280

    accuracy                           0.87      2510
   macro avg       0.88      0.87      0.87      2510
weighted avg       0.88      0.87      0.87      2510

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Fitting model
Best params for RF Model: {'classifier__max_depth': 38, 'classifier__min_samples_split': 128, 'classifier__n_estimators': 600}
Best scoring(F1) for RF Model: 0.8623728850478684
              precision    recall  f1-score   support

           1       0.94      0.80      0.87      1225
           2       0.84      0.95      0.89      1285

    accur

### Inner Loop: Hyperparameter Tuning

**Define Inner Cross-Validation Loop**

- Using GroupKFold: For hyperparameter tuning within the outer training fold.

In [None]:
def start_inner_loop():
    """Tune, evaluate and save a Random Forest pipeline for the current outer fold.

    Steps: build the pipeline and its search space, run the randomized search with
    a 3-fold GroupKFold as inner CV, print the classification report of the best
    estimator on the outer validation fold, and dump that estimator to
    ``rf_{count}.joblib`` inside the iter_3 model directory.

    Takes no arguments: ``count`` (the outer fold number) is read from the
    notebook's global namespace here, and the helpers called below read the
    current outer-fold arrays from globals as well, so this function is meant to
    be called from inside the outer cross-validation loop.
    """
    model_dir = "/content/drive/My Drive/Fabric Detection Project/Models/Random_Forest_Models/iter_3"
    # exist_ok=True replaces the separate exists() check and does not fail if the
    # directory is created between the check and the call.
    os.makedirs(model_dir, exist_ok=True)

    # Defining inner cross-validation strategy
    inner_cv = GroupKFold(n_splits=3)

    # Creating the pipeline and the parameter search space for the RF model
    pipeline_rf = create_pipelines()
    param_grid_rf = create_param_grids()

    # Performing the randomized search (or loading it from cache) for RF
    random_search_rf = randomized_search_rf(pipeline_rf, param_grid_rf, inner_cv)
    best_model_rf = get_best_model_rf(random_search_rf)
    evaluate_best_model_rf(best_model_rf)

    # Persist the best estimator of this outer fold
    dump(best_model_rf, os.path.join(model_dir, f"rf_{count}.joblib"))



**Create Pipelines Including Preprocessing and Classifier**
- For Random Forest only (no SVM pipeline is built in this notebook)
- Include Preprocessing Steps:
   - Scaling (StandardScaler)
   - Principal Component Analysis (with number of components as hyperparameter)

In [None]:
# Pipeline factory for the Random Forest classifier

def create_pipelines():
    """Return a new, unfitted scaler -> PCA -> Random Forest pipeline.

    The step names ("scaler", "pca", "classifier") matter: the search space
    addresses the forest's hyperparameters through the ``classifier__`` prefix.
    PCA is created with ``n_components=None`` and the forest with its library
    defaults.
    """
    steps = [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=None)),
        ("classifier", RandomForestClassifier()),
    ]
    return Pipeline(steps)


In [None]:
# Creating Parameter Grids
def create_param_grids(random_state=None, mean=550, std_dev=40, low=500, high=600, size=1000):
  """Build the randomized-search space for the Random Forest pipeline.

  ``classifier__n_estimators`` is a pool of ``size`` integers: draws from a
  normal distribution N(mean, std_dev**2), clipped to ``[low, high]`` and then
  truncated to int. ``classifier__max_depth`` and
  ``classifier__min_samples_split`` are scipy ``randint`` distributions over
  30..39 and 100..199 respectively (the upper bound of ``randint`` is exclusive).

  Args:
    random_state: Optional int seed. When given, the n_estimators pool is drawn
      from a dedicated ``np.random.RandomState`` so repeated calls return the
      same pool. When None (default) the global NumPy RNG is used, exactly as
      before this parameter existed.
    mean: Centre of the normal distribution.
    std_dev: Standard deviation of the normal distribution (controls spread).
    low: Lower bound enforced by clipping (inclusive).
    high: Upper bound enforced by clipping (inclusive).
    size: Number of candidate values in the pool.

  Returns:
    dict mapping pipeline parameter names to the pool / distributions above.
  """
  # mean and std_dev were originally picked with Wolfram Alpha:
  # https://www.wolframalpha.com/input?i=X%7EN%28450%2C40%5E2%29+P%28300%3C%3DX%3C%3D500%29
  # Note that the linked query is for N(450, 40^2) on [300, 500], not for the
  # current defaults.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # Generate a normal distribution of float values
  values = rng.normal(loc=mean, scale=std_dev, size=size)

  # Clip the values to ensure they lie within the range [low, high].
  # NOTE: clipping maps every draw outside the range onto the bound itself, so
  # the two end values are over-represented in the pool (with the defaults each
  # bound is only 1.25 standard deviations away from the mean).
  values = np.clip(values, low, high)

  # Convert the float values to integers (truncation)
  n_estimators = values.astype(int)

  # Search space for the Random Forest classifier step of the pipeline
  param_grid_rf = {
      'classifier__n_estimators': n_estimators,
      'classifier__max_depth': randint(30, 40),
      'classifier__min_samples_split': randint(100, 200)
  }

  return param_grid_rf


**Define Hyperparameter grids**

**Perform RandomizedSearch with Cross-validation for Hyperparameter Tuning**

In [None]:
def randomized_search_rf(pipeline_rf, param_grid_rf, inner_cv, random_state=None):
    """Run the inner-loop randomized search for the current outer fold, or load it from cache.

    The fitted search object is cached per outer fold as
    ``random_search_rf_{count}.joblib`` in the iter_3 cache directory. If that
    file already exists it is loaded and returned without any fitting.

    Reads ``count``, ``X_outer_train``, ``y_outer_train`` and
    ``groups_outer_train`` from the notebook's global namespace.

    Args:
        pipeline_rf: Estimator (pipeline) to tune.
        param_grid_rf: Parameter lists / distributions to sample from.
        inner_cv: Cross-validation splitter for the inner loop; the outer-train
            groups are passed to ``fit`` so it can split by group.
        random_state: Optional seed forwarded to ``RandomizedSearchCV`` so the
            sampled candidates are repeatable. The default (None) keeps the
            previous unseeded behaviour.

    Returns:
        The fitted ``RandomizedSearchCV`` object (freshly fitted or loaded).
    """
    cache_dir = '/content/drive/My Drive/Fabric Detection Project/model_cache_rf/iter_3'
    os.makedirs(cache_dir, exist_ok=True)

    cached_search = os.path.join(cache_dir, f'random_search_rf_{count}.joblib')
    if os.path.exists(cached_search):
        # NOTE(review): the cache is keyed by the fold number only. If the data,
        # the pipeline or the search space change, delete the cached file to
        # force a new search. joblib files are pickles: only load files that
        # this project wrote itself.
        print("Loading cached model")
        return load(cached_search)

    # Built only when there is no cached result to load
    random_search_rf = RandomizedSearchCV(
        estimator=pipeline_rf,
        param_distributions=param_grid_rf,
        cv=inner_cv,
        scoring="f1_macro",
        n_iter=3,
        n_jobs=-1,  # Uses all processors
        verbose=4,
        random_state=random_state,
    )

    # Announce the fit before it starts; it is the long-running step
    print("Fitting model")
    random_search_rf.fit(X_outer_train, y_outer_train, groups=groups_outer_train)
    dump(random_search_rf, cached_search)

    return random_search_rf

In [None]:
def get_best_model_rf(random_search_rf):
    """Print the winning parameters and score of a fitted search and return its best estimator.

    Reads ``best_estimator_``, ``best_params_`` and ``best_score_`` from
    ``random_search_rf``, prints the latter two, and returns the first.
    """
    search = random_search_rf
    # All three attributes are read up front, before anything is printed
    winner, winner_params, winner_score = (
        search.best_estimator_,
        search.best_params_,
        search.best_score_,
    )

    print(f"Best params for RF Model: {winner_params}")
    print(f"Best scoring(F1) for RF Model: {winner_score}")

    return winner

**Evaluate Best Model from RandomizedSearch on Outer Validation Fold**

In [None]:
# Outer-fold evaluation of the tuned RF model
def evaluate_best_model_rf(best_model_rf):
    """Print a classification report for ``best_model_rf`` on the outer validation fold.

    ``X_outer_val`` and ``y_outer_val`` are read from the notebook's global
    namespace; nothing is returned.
    """
    predictions = best_model_rf.predict(X_outer_val)
    report = classification_report(y_outer_val, predictions)
    print(report)

**STEP 9 IS PENDING**

### Step 9: Final Model Training on Entire Training Data

We need a final pipeline for feature engineering on the whole training data. And then we will test the model on the test data.

In [10]:
# Final model: the same scaler -> PCA -> Random Forest layout as the tuning
# pipeline, with fixed settings.
# NOTE(review): the forest settings equal the "Best params" printed for the second
# outer fold. n_components=2510 is hard-coded; presumably the number of components
# PCA kept on an outer training fold. Confirm it is still intended when fitting
# on the full training split.
final_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2510)),
    ("classifier", RandomForestClassifier(
        n_estimators=600,
        max_depth=38,
        min_samples_split=128,
    )),
])

In [None]:
# Best params for RF Model: {'classifier__max_depth': 38, 'classifier__min_samples_split': 128, 'classifier__n_estimators': 600}

In [11]:
# Fit on entire training data (the full X_train / y_train split rather than a single CV fold)
final_pipeline.fit(X_train, y_train)

### Step 10: Evaluation on Test Data

#### 10.1 Predict on Test Data

In [12]:
# Predict labels for the held-out test split with the fitted final pipeline
y_test_pred = final_pipeline.predict(X_test)

#### 10.2 Compute Performance Metrics

In [13]:
# Per-class precision / recall / F1 on the held-out test split.
# A separate accuracy_score call is not needed: the report already has an accuracy row.
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           1       0.96      0.74      0.83      1095
           2       0.78      0.97      0.86      1060

    accuracy                           0.85      2155
   macro avg       0.87      0.85      0.85      2155
weighted avg       0.87      0.85      0.85      2155

