In [1]:
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from joblib import dump

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


ModuleNotFoundError: No module named 'matplotlib'

In [2]:
# CUDA libraries
import cupy as cp 
import cudf
from cuml.preprocessing import StandardScaler
import dask_ml.model_selection as dcv
from cuml.ensemble import RandomForestClassifier
from cunl.svm import SVC


ModuleNotFoundError: No module named 'dask_ml'

**Loading the saved data and labels**

In [10]:
# Load from the .npz files

loaded_data = np.load('Extracted_features\\data.npz')
loaded_labels = np.load('Extracted_features\\labels.npz')
loaded_groups = np.load('Extracted_features\\groups.npz')

X = loaded_data["data"]
y = loaded_labels["labels"]
groups = loaded_groups["groups"]

In [11]:
# Checking the data type of X,y and groups
print(f"Type of X {type(X)}")
print(f"Type of y {type(y)}")
print(f"Type of groups {type(groups)}")

Type of X <class 'numpy.ndarray'>
Type of y <class 'numpy.ndarray'>
Type of groups <class 'numpy.ndarray'>


In [12]:
# Checking the shape of the dataset and the labels
print(f"Dataset shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"Groups shape: {groups.shape}")

Dataset shape: (17780, 49162)
Labels shape: (17780,)
Groups shape: (17780,)


In [13]:
print(y)

['cotton' 'cotton' 'cotton' ... 'wool' 'wool' 'wool']


**Encoding the target variable**

In [14]:
# Example array with categories
categories = ['linen', 'cotton', 'wool', 'denim', 'corduroy']

# Create a dictionary for manual mapping
category_mapping = { 'corduroy': 1, 'cotton': 2, 'denim': 3, 'linin': 4, 'wool': 5}

# Convert to pandas Series (optional if already in pandas)
y_series = pd.Series(y)

# Map categories to numbers
mapped_categories = y_series.map(category_mapping)
y = np.array(mapped_categories)


print(y)
print(y.shape)

[2 2 2 ... 5 5 5]
(17780,)


In [15]:
print(y_series.value_counts())
print(mapped_categories.value_counts())

linin       3585
denim       3575
corduroy    3550
cotton      3535
wool        3535
Name: count, dtype: int64
4    3585
3    3575
1    3550
2    3535
5    3535
Name: count, dtype: int64


In [16]:
# Since X,y and groups are numpy arrays , let's convert them to cupy arrays
X = cp.array(X)
y = cp.array(y)
groups= cp.array(groups)

  return _core.array(obj, dtype, copy, order, subok, ndmin, blocking)


In [17]:
# Checking the data type of X,y and groups
print(f"Type of X {type(X)}")
print(f"Type of y {type(y)}")
print(f"Type of groups {type(groups)}")

Type of X <class 'cupy.ndarray'>
Type of y <class 'cupy.ndarray'>
Type of groups <class 'cupy.ndarray'>


- **Splitting the original dataset into train(80%) and test(20%) using GroupShuffleSplit to ensure that the same group of images are either in the train or the test dataset only.**

In [21]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(cp.asnumpy(X), cp.asnumpy(y), groups=cp.asnumpy(groups)))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
groups_train = groups[train_idx]


OutOfMemoryError: Out of memory allocating 5,592,669,184 bytes (allocated so far: 6,993,131,008 bytes).

## Nested cross-validation setup

###  Define Outer Loop: Model Evaluation

In [12]:
outer_cv = GroupKFold(n_splits=5)

**Split Training Data into Outer Training and Validation Folds**

In [30]:
# outer_train_idx: Indices for the training data in the current fold
# outer_val_idx: Indices for the testing data in the current fold

# The following loops runs 5 times as the no of splits in outer_cv has been defined as 5
count=1
for outer_train_idx, outer_val_idx in outer_cv.split(cp.asnumpy(X_train), cp.asnumpy(y_train), groups=cp.asnumpy(groups_train)):
    # Train data for the current fold
    X_outer_train = X_train[outer_train_idx] # Training features
    y_outer_train = y_train[outer_train_idx] # Training lables
    groups_outer_train = groups_train[outer_train_idx] # Training groups

    # Validation data for the current fold
    X_outer_val = X_train[outer_val_idx] # Validation features
    y_outer_val = y_train[outer_val_idx] # Validation labels

    svm_model, rf_model = start_inner_loop()

    if not os.path.exists("Models"):
        os.makedirs("Models")

    dump(svm_model, f'Models\\svm_{count}.joblib')
    dump(rf_model,f"Models\\rf_{count}.joblib")
    count+=1

MemoryError: Unable to allocate 2.78 GiB for an array with shape (7585, 49162) and data type float64

### Inner Loop: Hyperparameter Tuning

**Define Inner Cross-Validation Loop**

- Using GroupKFold: For hyperparameter tuning within the outer training fold.

In [26]:
def start_inner_loop():

    # Defining inner cross-validation strategy
    inner_cv = GroupKFold(n_splits=3)

    # Creating pipelines and parameters grids for SVM and RF models
    pipeline_svm, pipeline_rf = create_pipelines()
    param_grid_svm, param_grid_rf = create_param_grids()

    # Performing RandomSearchCV for SVM and RF
    random_search_svm = randomized_search_svm(pipeline_svm,param_grid_svm,inner_cv)
    random_search_rf = randomized_search_rf(pipeline_rf,param_grid_rf,inner_cv)

    # Get the best models of SVM and RF
    best_model_svm = get_best_model_svm(random_search_svm)
    best_model_rf = get_best_model_rf(random_search_rf)

    evaluate_best_model_svm(best_model_svm)
    evaluate_best_model_rf(best_model_rf)

    return best_model_svm, best_model_rf
    

**Create Pipelines Including Preprocessing and Classifier**
- For SVM and Random Forest separately
- Include Preprocessing Steps:
   - Scaling (StandardScaler)
   - Principal Component Analysis (with number of components as hyperparameter)

In [17]:
# Pipeline for Support Vector Machine Classifier

def create_pipelines():
    pipeline_svm = Pipeline([
        ("scaler", StandardScaler()),
        ("pca",PCA()),
        ("classifier", SVC())
    ])

    # Pipeline for Random Forest Classifier
    pipeline_rf = Pipeline([
        ("scaler", StandardScaler()),
        ("pca",PCA()),
        ("classifier", RandomForestClassifier())
    ])

    return pipeline_svm, pipeline_rf

**Define Hyperparameter grids**

In [18]:
# Parameter grid for SVM
def create_param_grids():
    param_grid_svm = {
        'pca__n_components': [1000,2000,3000],
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__kernel': ['rbf'],
        'classifier__gamma': ['scale', 'auto']
    }


    # Parameter grid for Random Forest Classifier
    param_grid_rf = {
        'pca__n_components': [1000,2000,3000],
        'classifier__n_estimators': [100, 300],
        'classifier__max_depth': [10, 20]
    }

    return param_grid_svm, param_grid_rf


**Perform RandomizedSearch with Cross-validation for Hyperparameter Tuning**

In [28]:
# RandomziedSearchCV for SVM

def randomized_search_svm(pipeline_svm,param_grid_svm,inner_cv):
    random_search_svm = dcv.RandomizedSearchCV(
        estimator=pipeline_svm,
        param_distributions=param_grid_svm,
        cv = inner_cv.split(cp.asnumpy(X_outer_train),cp.asnumpy(y_outer_train), groups=cp.asnumpy(groups_outer_train)),
        scoring="f1_macro",
        n_iter=5,
        n_jobs=-1 # Uses all processors
    )

    random_search_svm.fit(X_outer_train, y_outer_train)

    return random_search_svm

In [21]:
def get_best_model_svm(random_search_svm):
    best_model_svm = random_search_svm.best_estimator_
    best_params_svm = random_search_svm.best_params_
    best_scores_svm = random_search_svm.best_score_

    print(f"Best params for SVM Model: {best_params_svm}")
    print(f"Best scoring(F1) for SVM Model: {best_scores_svm}")

    return best_model_svm

In [29]:
# RandomziedSearchCV for Random Forest Classifier
def randomized_search_rf(pipeline_rf, param_grid_rf, inner_cv):
    random_search_rf = dcv.RandomizedSearchCV(
        estimator=pipeline_rf,
        param_distributions=param_grid_rf,
        cv = inner_cv.split(cp.asnumpy(X_outer_train),cp.asnumpy(y_outer_train), groups=cp.asnumpy(groups_outer_train)),
        scoring="f1_macro",
        n_iter=5,
        n_jobs=-1 # Uses all processors
    )

    random_search_rf.fit(X_outer_train, y_outer_train)

    return random_search_rf

In [23]:
def get_best_model_rf(random_search_rf):
    best_model_rf = random_search_rf.best_estimator_
    best_params_rf = random_search_rf.best_params_
    best_scores_rf = random_search_rf.best_score_

    print(f"Best params for RF Model: {best_params_rf}")
    print(f"Best scoring(F1) for RF Model: {best_scores_rf}")

    return best_model_rf

**Evaluate Best Model from RandomizedSearch on Outer Validation Fold**

In [24]:
# For SVM Model
def evaluate_best_model_svm(best_model_svm):
    y_outer_val_pred_svm = best_model_svm.predict(X_outer_val)
    print(classification_report(y_outer_val,y_outer_val_pred_svm))


In [25]:
# For RF Model
def evaluate_best_model_rf(best_model_rf):
    y_outer_val_pred_rf = best_model_rf.predict(X_outer_val)
    print(classification_report(y_outer_val,y_outer_val_pred_rf))

**STEPS 9 ARE PENDING**