In [36]:
import numpy as np
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from torch.utils.data import DataLoader
from PIL import Image
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_curve,
    auc,
)


In [1]:
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Kaggle API credentials used by the
# download cell further down — confirm, and consider moving the import into the
# main import cell so a fresh Restart & Run All sees it first.
from dotenv import load_dotenv
load_dotenv()

True

## Metrics for the model

In [37]:
def metrics(y_test, y_pred, title):
    """Print a short classification report for one model.

    Accuracy is printed first, followed by F1, recall and precision, each of
    the latter three computed with ``average='weighted'``.

    Args:
        y_test: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.
        title: Name of the model, used as the heading of the report.
    """
    print(f"{title} metrics: ")
    print("Accuracy: ", accuracy_score(y_test, y_pred))

    # The remaining scores share the same call signature, so drive them from a table.
    weighted_scorers = (
        ("F1", f1_score),
        ("Recall", recall_score),
        ("Precision", precision_score),
    )
    for label, scorer in weighted_scorers:
        print(label, scorer(y_test, y_pred, average='weighted'))
def plot_multiclass_roc_auc(y_true, y_pred_proba, n_classes):
    """Plot one-vs-rest ROC curves, one per class, on a single figure.

    Class ``i`` is treated as the positive class against all others, and
    column ``i`` of ``y_pred_proba`` is used as its score. Labels are
    therefore expected to be the integers ``0 .. n_classes - 1``.

    Args:
        y_true: Ground-truth integer labels (array-like, one per sample).
        y_pred_proba: Per-class scores, array-like of shape
            ``(n_samples, >= n_classes)``.
        n_classes: Number of classes (and score columns) to plot.
    """
    # Coerce to arrays: with a plain Python list `y_true == i` is a single
    # bool instead of an element-wise mask, and `[:, i]` is not supported.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)

    # Compute every curve before touching matplotlib so a failure in
    # roc_curve does not leave an empty figure behind.
    curves = []
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_true == i, y_pred_proba[:, i])
        curves.append((i, fpr, tpr, auc(fpr, tpr)))

    plt.figure()
    for i, fpr, tpr, roc_auc in curves:
        plt.plot(fpr, tpr, label=f"Class {i}: AUC = {roc_auc:.2f}")

    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Tasa de falsos positivos')
    plt.ylabel('Tasa de verdaderos positivos')
    plt.title('Curva ROC multiclase')
    plt.legend(loc="lower right")
    plt.show()

# Download dataset

In [31]:
# Download the dataset archive into the current working directory using the
# Kaggle CLI. NOTE(review): this needs the `kaggle` command on PATH and valid
# API credentials — presumably provided through the .env file loaded above; confirm.
!kaggle datasets download -d pratik2901/multiclass-weather-dataset 

Downloading multiclass-weather-dataset.zip to /home/davidh/Documentos/Uni/DataScience/Clasificación
 99%|█████████████████████████████████████▍| 90.0M/91.4M [00:05<00:00, 13.0MB/s]
100%|██████████████████████████████████████| 91.4M/91.4M [00:05<00:00, 16.1MB/s]


In [None]:
!unzip multiclass-weather-dataset.zip 
!rm multiclass-weather-dataset.zip
!mv "Multi-class Weather Dataset" dataset

# Load CNN model to extract features

In [20]:
# Feature extractor: an ImageNet-pretrained ResNet18 rebuilt from all of its
# child modules except the last one, so the network outputs pooled features
# instead of class logits. `.eval()` returns the module itself and switches
# it to inference mode.
model = torch.nn.Sequential(
    *list(models.resnet18(pretrained=True, progress=True).children())[:-1]
).eval()

def extract_features(loader, model):
    """Run ``model`` over every batch of ``loader`` and collect flat features.

    Args:
        loader: Iterable yielding ``(images, labels)`` pairs of tensors whose
            first dimension is the batch dimension.
        model: Callable mapping a batch of images to a tensor of outputs.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. ``features`` has one
        row per sample, holding that sample's model output flattened to 1-D;
        ``labels`` holds the matching labels in the same order.
    """
    features = []
    labels = []
    # Inference only: a single no_grad context covers the whole pass.
    with torch.no_grad():
        for images, image_labels in loader:
            outputs = model(images)
            # reshape (unlike view) also handles non-contiguous outputs, and
            # .cpu() makes the NumPy conversion work for tensors on any device.
            flat = outputs.reshape(outputs.size(0), -1)
            features.extend(flat.cpu().numpy())
            labels.extend(image_labels.cpu().numpy())
    return np.array(features), np.array(labels)


# Prepare dataset

In [33]:
size = 28
path = 'dataset'
batch_size=32
seed = 42  # fixed so the train/test split is identical after a kernel restart

# Resize the shorter image side to `size`, crop the central size x size patch,
# convert to a tensor and normalise each channel.
# NOTE(review): the std values (~0.02) are unusually small and the ImageNet
# weights were trained with ImageNet statistics on larger inputs — confirm
# that these numbers and the 28px resolution are intentional.
transform = transforms.Compose([
    transforms.Resize(size),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize([0.4001, 0.4313, 0.4275], [0.0229, 0.0205, 0.0189])
])
dataset = datasets.ImageFolder(root=path, transform=transform)

# Split sample *indices* rather than the dataset object itself: passing the
# dataset to train_test_split would index (and therefore decode) every image
# up front. Stratifying on the folder-derived targets keeps the class
# proportions equal in both partitions.
train_idx, test_idx = train_test_split(
    np.arange(len(dataset)),
    test_size=0.2,
    stratify=dataset.targets,
    random_state=seed,
)
train_data = torch.utils.data.Subset(dataset, train_idx.tolist())
test_data = torch.utils.data.Subset(dataset, test_idx.tolist())

# No shuffling: the loaders only feed a one-off feature extraction, and a
# fixed row order keeps the downstream cross-validation folds reproducible.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

train_features, train_labels = extract_features(train_loader, model)
valid_features, valid_labels = extract_features(test_loader, model)

X_train = train_features
y_train = train_labels
X_test = valid_features
y_test = valid_labels
# Take the class count from the dataset, not from the labels that happen to
# be present in the test partition.
n_classes = len(dataset.classes)

# SVC

In [None]:
# Hyper-parameter search space for the support vector classifier.
# NOTE(review): gamma is ignored by the linear kernel, so those combinations
# are fitted redundantly; kept as-is because the next cell reads the 'gamma' key.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear', 'poly'],
}

# Exhaustive 5-fold cross-validated search using every available core.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(
    "Support Vector Machine",
    "Best hyperparameters:",
    grid_search.best_params_,
    "Best score:",
    grid_search.best_score_,
    sep="\n",
)

In [None]:
# Rebuild the SVM from the winning grid-search configuration.
# best_params_ contains only the keys that were searched (C, gamma, kernel);
# every other setting, e.g. decision_function_shape, keeps its SVC default.
# probability=True is required for predict_proba, which the ROC cell below
# relies on (it makes fitting slower because of the extra internal calibration).
svcClassifier = SVC(**grid_search.best_params_, probability=True)
svcClassifier.fit(X_train, y_train)
y_predSVC = svcClassifier.predict(X_test)
metrics(y_test, y_predSVC, "SVC")

In [None]:
# One-vs-rest ROC curves for the SVM. predict_proba is only available on an
# SVC that was constructed with probability=True.
plot_multiclass_roc_auc(
    y_true=y_test,
    y_pred_proba=svcClassifier.predict_proba(X_test),
    n_classes=n_classes,
)

# Decision Tree

In [None]:
# Hyper-parameter search space for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 8, 12, 20, 50, 120, 150],
    'splitter': ['best', 'random'],
}

# Exhaustive 5-fold cross-validated search using every available core.
dt_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
dt_grid_search.fit(X_train, y_train)

print(
    "Decision Tree",
    "Best hyperparameters:",
    dt_grid_search.best_params_,
    "Best score:",
    dt_grid_search.best_score_,
    sep="\n",
)

In [None]:
# Evaluate the tuned decision tree on the held-out set.
# With the default refit=True, GridSearchCV has already retrained the best
# configuration on all of X_train, so that estimator is reused directly
# instead of running the whole grid search a second time. It is also the
# model behind dt_grid_search.predict_proba, so these metrics and the ROC
# plot describe the same fitted tree.
dt_classifier = dt_grid_search.best_estimator_
y_pred_DT = dt_classifier.predict(X_test)
metrics(y_test, y_pred_DT, "Decision Tree")

In [None]:
# One-vs-rest ROC curves for the decision tree, scored with the grid search's
# refitted best estimator.
plot_multiclass_roc_auc(
    y_true=y_test,
    y_pred_proba=dt_grid_search.predict_proba(X_test),
    n_classes=n_classes,
)

# Random Forest

In [None]:
# Hyper-parameter search space for the random forest.
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500, 1000],
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 8, 12, 20, 50, 120, 150],
}

# Exhaustive 5-fold cross-validated search using every available core.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)

print(
    "Random Forest",
    "Best hyperparameters:",
    rf_grid_search.best_params_,
    "Best score:",
    rf_grid_search.best_score_,
    sep="\n",
)

In [None]:
# Rebuild the forest from the winning grid-search configuration. best_params_
# holds exactly the searched keys (n_estimators, criterion, max_depth), so it
# can be unpacked straight into the constructor.
rf_classifier = RandomForestClassifier(**rf_grid_search.best_params_)
rf_classifier.fit(X_train, y_train)

# Score the refitted forest on the held-out set.
y_pred_RF = rf_classifier.predict(X_test)
metrics(y_test, y_pred_RF, "Random Forest")

In [None]:
# One-vs-rest ROC curves for the random forest. The scores must come from the
# forest fitted above, not from the decision-tree grid search.
plot_multiclass_roc_auc(y_test, rf_classifier.predict_proba(X_test), n_classes)