In [8]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import OPTICS
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
import pandas as pd


data = pd.read_csv("./dimension_reduction_datasets/t-SNEmaxabsScaler-KNN.csv")
X = data[["PCA1", "PCA2"]]

# Define a custom silhouette score function
def custom_silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) > 1:
        silhouette_avg = silhouette_score(X, labels)
        return silhouette_avg
    else:
        return 0;

# Create an OPTICS model
optics = OPTICS()

# Define a parameter grid for tuning
param_grid = {
    'min_samples': [5, 10, 20],
    'xi': [0.05, 0.1, 0.2],
    'cluster_method': ['xi', 'dbscan']
}

# Create a GridSearchCV object with the custom silhouette scorer
grid_search = GridSearchCV(optics, param_grid, scoring=custom_silhouette_scorer, cv=5)

# Fit the GridSearchCV to your data
grid_search.fit(X)

# Get the best parameters
best_params = grid_search.best_params_

# Get the best OPTICS model
best_optics = grid_search.best_estimator_

# Perform clustering with the best model
cluster_labels = best_optics.fit_predict(X)

# Evaluate cluster quality using silhouette score
if len(set(cluster_labels)) > 1:
    silhouette_avg = silhouette_score(X, cluster_labels)
    
    print("Best Parameters:", best_params)
    print("Silhouette Score:", silhouette_avg)


Best Parameters: {'cluster_method': 'xi', 'min_samples': 10, 'xi': 0.05}
Silhouette Score: -0.31275662746160615


In [2]:
import numpy as np
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.datasets import make_moons
from sklearn.metrics import silhouette_score

# Generate some example data (you can replace this with your dataset)
data, _ = make_moons(n_samples=300, noise=0.05, random_state=42)

# Define a range of hyperparameters to search
eps_values_dbscan = [0.1, 0.2, 0.3]
min_samples_values_dbscan = [2, 3, 5, 7]

min_samples_values_optics = [2, 3, 5, 7]
xi_values_optics = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5]

best_dbscan_silhouette_score = -1
best_dbscan_eps = None
best_dbscan_min_samples = None

best_optics_silhouette_score = -1
best_optics_min_samples = None
best_optics_xi = None

# Hyperparameter tuning for DBSCAN
for eps in eps_values_dbscan:
    for min_samples in min_samples_values_dbscan:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(data)
        
        silhouette = silhouette_score(data, dbscan_labels)
        
        if silhouette > best_dbscan_silhouette_score:
            best_dbscan_silhouette_score = silhouette
            best_dbscan_eps = eps
            best_dbscan_min_samples = min_samples

print("DBSCAN Hyperparameter Tuning Results:")
print(f"Best DBSCAN Silhouette Score: {best_dbscan_silhouette_score}")
print(f"Best EPS: {best_dbscan_eps}")
print(f"Best Min Samples: {best_dbscan_min_samples}")

# Hyperparameter tuning for OPTICS
for min_samples in min_samples_values_optics:
    for xi in xi_values_optics:
        optics = OPTICS(min_samples=min_samples, xi=xi)
        optics_labels = optics.fit_predict(data)
        
        silhouette = silhouette_score(data, optics_labels)
        
        if silhouette > best_optics_silhouette_score:
            best_optics_silhouette_score = silhouette
            best_optics_min_samples = min_samples
            best_optics_xi = xi

print("\nOPTICS Hyperparameter Tuning Results:")
print(f"Best OPTICS Silhouette Score: {best_optics_silhouette_score}")
print(f"Best Min Samples: {best_optics_min_samples}")
print(f"Best Xi: {best_optics_xi}")


Traceback (most recent call last):
  File "C:\Users\awotoroe\.conda\envs\jupyter_env\lib\site-packages\sklearn\model_selection\_validation.py", line 808, in _score
    scores = scorer(estimator, X_test)
  File "C:\Users\awotoroe\AppData\Local\Temp\ipykernel_13388\137391377.py", line 13, in custom_silhouette_scorer
    silhouette_avg = silhouette_score(X, labels)
  File "C:\Users\awotoroe\.conda\envs\jupyter_env\lib\site-packages\sklearn\utils\_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\awotoroe\.conda\envs\jupyter_env\lib\site-packages\sklearn\metrics\cluster\_unsupervised.py", line 130, in silhouette_score
    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
  File "C:\Users\awotoroe\.conda\envs\jupyter_env\lib\site-packages\sklearn\utils\_param_validation.py", line 184, in wrapper
    return func(*args, **kwargs)
  File "C:\Users\awotoroe\.conda\envs\jupyter_env\lib\site-packages\sklearn\metrics\cluster\_unsupervis

Best Parameters: {'cluster_method': 'xi', 'min_samples': 10, 'xi': 0.05}
Silhouette Score: -0.31275662746160615


In [15]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN, OPTICS
from sklearn.datasets import make_moons
from sklearn.metrics import silhouette_score
import altair as alt

# Generate some example data (you can replace this with your dataset)
data, _ = make_moons(n_samples=300, noise=0.05, random_state=42)

# Define a range of hyperparameters to search
eps_values_dbscan = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
min_samples_values_dbscan = [3, 4, 5, 6, 7]

min_samples_values_optics = [3, 4, 5, 6, 7]
xi_values_optics = [0.01, 0.1, 0.2, 0.3, 0.5]

# Lists to store Silhouette Scores
dbscan_scores = []
optics_scores = []

# Hyperparameter tuning for DBSCAN
for eps in eps_values_dbscan:
    for min_samples in min_samples_values_dbscan:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_labels = dbscan.fit_predict(data)
        
        silhouette = silhouette_score(data, dbscan_labels)
        dbscan_scores.append((eps, min_samples, silhouette))

dbscan_scores = np.array(dbscan_scores)

# Hyperparameter tuning for OPTICS
for min_samples in min_samples_values_optics:
    for xi in xi_values_optics:
        optics = OPTICS(min_samples=min_samples, xi=xi)
        optics_labels = optics.fit_predict(data)
        
        silhouette = silhouette_score(data, optics_labels)
        optics_scores.append((min_samples, xi, silhouette))

optics_scores = np.array(optics_scores)

# Create DataFrames for Altair
dbscan_df = pd.DataFrame(dbscan_scores, columns=['EPS', 'Min Samples', 'Silhouette Score'])
optics_df = pd.DataFrame(optics_scores, columns=['Min Samples', 'Xi', 'Silhouette Score'])

# Create Altair line charts for DBSCAN and OPTICS
dbscan_chart = alt.Chart(dbscan_df).mark_line().encode(
    x='EPS',
    y='Silhouette Score',
    color='Min Samples:N'
).properties(
    title='DBSCAN Silhouette Scores',
    width=400,
    height=300
)

optics_chart = alt.Chart(optics_df).mark_line().encode(
    x='Xi',
    y='Silhouette Score',
    color='Min Samples:N'
).properties(
    title='OPTICS Silhouette Scores',
    width=400,
    height=300
)

# Combine the charts
combined_charts = dbscan_chart | optics_chart

combined_charts


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [29]:
import pandas as pd

# Example JSON data
data = {
    'device': [], 
    'other': [
        {
            'weight': 'b', 
            'mainterm': {'content': 'adult'}, 
            'ancestor': ['adult', 'groups by age', 'groups by age and sex']
        }, 
        {
            'weight': 'b', 
            'mainterm': {'content': 'early diagnosis'}, 
            'ancestor': ['diagnosis', 'diagnostic procedure', 'early diagnosis', 'medical procedures', 'procedures', 'procedures, parameters and devices']
        }, 
        {
            'weight': 'b', 
            'mainterm': {'content': 'female'}, 
            'ancestor': ['female', 'groups by age and sex', 'groups by sex']
        },
    ]
}

# Initialize variables to store 'domain' and 'other_domain' values
domain_values = []
other_domain_values = []

# Iterate over rows in the 'other' list
for row in data['other']:
    mainterm_content = row['mainterm']['content']
    ancestor_list = row['ancestor']
    
    # Accumulate 'domain' values in a list
    domain_values.append(mainterm_content)
    
    # Accumulate 'other_domain' values as a list of dictionaries
    other_domain_values.append({'ancestor': ancestor_list})

# Create a DataFrame with accumulated values
df_result = pd.DataFrame({
    'domain': [', '.join(domain_values)],
    'other_domain': [other_domain_values]
})

# Display the resulting DataFrame
df_result


Unnamed: 0,domain,other_domain
0,"adult, early diagnosis, female","[{'ancestor': ['adult', 'groups by age', 'grou..."
