<a href="https://colab.research.google.com/github/Alberto-San/ExperimentosMonografia/blob/main/LOF_tunning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import pandas as pd 
import numpy as np 

def load_data(path="color_statistics.csv", class_label="class", path_label="image_path"):
  """Read the statistics CSV and split its rows into one feature table per class.

  Args:
    path: CSV file to read. Defaults to the file used by this notebook.
    class_label: Name of the column holding the class of each row.
    path_label: Name of the column holding the image path of each row; it is
      dropped together with `class_label` so only the remaining columns are kept.

  Returns:
    A tuple `(data, labels)` where `labels` lists the distinct values of the
    class column in order of first appearance, and `data[label]["table"]` is
    the DataFrame of rows belonging to `label` without the two dropped columns.
  """
  table = pd.read_csv(path)
  labels = list(table[class_label].drop_duplicates().values)

  # Same two columns are removed for every class, so build the list once.
  columns_to_drop = [class_label, path_label]

  data = {}
  for label in labels:
    rows_of_label = table[table[class_label] == label]
    data[label] = {"table": rows_of_label.drop(columns=columns_to_drop)}

  return data, labels

In [77]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

def find_best_hyperparameters(data, label, n_neighbors_values=None, contamination_values=None):
  """Grid-search LocalOutlierFactor settings for one class and store the winner.

  For every (n_neighbors, contamination) pair an LOF model is fitted on the
  class table and asked to predict that same table. The predictions are
  compared, via mean squared error, against a vector of ones, and the pair
  with the lowest error is kept (the first one encountered wins ties).

  Args:
    data: Mapping where `data[label]["table"]` is a DataFrame of features.
    label: Key of the class to tune.
    n_neighbors_values: Candidate `n_neighbors` values. Defaults to
      `[5, 10, 15, 20]`.
    contamination_values: Candidate `contamination` values. Defaults to
      `0.05, 0.10, ..., 0.45`.

  Returns:
    The same `data` mapping, with `data[label]["best_params"]` set to a dict
    holding `best_n_neighbors`, `best_contamination` and `best_mse`.
  """
  if n_neighbors_values is None:
    n_neighbors_values = [5, 10, 15, 20]
  if contamination_values is None:
    contamination_values = [value / 100 for value in range(5, 50, 5)]
  # Materialise so a one-shot iterable survives being walked once per n_neighbors.
  contamination_values = list(contamination_values)

  X = data[label]["table"].to_numpy()

  # Reference vector meaning "every sample is an inlier"; it does not depend on
  # the hyperparameters, so build it once instead of once per grid point.
  all_inliers = np.ones(len(X))

  best_n_neighbors = None
  best_contamination = None
  best_mse = np.inf

  for n_neighbors in n_neighbors_values:
    for contamination in contamination_values:
      # NOTE(review): scikit-learn documents that with novelty=True, predict
      # should only be used on new unseen data, not on the training set.
      # Here the model predicts the very data it was fitted on - confirm
      # this is intended.
      lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination, novelty=True)
      lof.fit(X)
      y_pred = lof.predict(X)

      # NOTE(review): assuming predict yields +1/-1 as documented by
      # scikit-learn, this error is 4 * (share of samples flagged as
      # outliers), so the search favours settings that flag the fewest points.
      mse = mean_squared_error(all_inliers, y_pred)

      # Strict comparison: on equal error the earlier grid point is kept.
      if mse < best_mse:
        best_n_neighbors = n_neighbors
        best_contamination = contamination
        best_mse = mse

  print("Label is {}".format(label))
  print("Best n_neighbors:", best_n_neighbors)
  print("Best contamination:", best_contamination)
  print("Best mean squared error:", best_mse)

  data[label]["best_params"] = {
      "best_n_neighbors": best_n_neighbors,
      "best_contamination": best_contamination,
      "best_mse": best_mse
  }
  return data

# Build the per-class feature tables from the CSV.
data, labels = load_data()

# Tune LOF for every class. find_best_hyperparameters writes its result under
# data[label]["best_params"] and returns the same mapping it was given.
for label in labels:
  data = find_best_hyperparameters(data, label)

Label is im_Superficial-Intermediate
Best n_neighbors: 5
Best contamination: 0.05
Best mean squared error: 0.11552346570397112
Label is im_Dyskeratotic
Best n_neighbors: 5
Best contamination: 0.05
Best mean squared error: 0.11316113161131611
Label is im_Parabasal
Best n_neighbors: 5
Best contamination: 0.05
Best mean squared error: 0.09148665819567979
Label is im_Metaplastic
Best n_neighbors: 5
Best contamination: 0.05
Best mean squared error: 0.10592686002522068
Label is im_Koilocytotic
Best n_neighbors: 5
Best contamination: 0.05
Best mean squared error: 0.11636363636363636
