**Cluster1 Silhouette Coefficient**

In [None]:
import knime.scripting.io as knio
import pandas as pd
from sklearn.metrics import silhouette_score

# Read the first input port of the node into a pandas DataFrame.
table = knio.input_tables[0].to_pandas()

# Fail fast if either feature column or the cluster label column is absent.
required_columns = ['Count*(trip time)', 'Mean(trip time)', 'Cluster']
missing_columns = [name for name in required_columns if name not in table.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

# Remember the incoming size, then keep only rows that are complete in the
# required columns.
original_row_count = len(table)
complete_rows = table.dropna(subset=required_columns)

# Cap the working set at 10,000 rows; the fixed seed keeps the sample
# reproducible between executions.
if len(complete_rows) > 10000:
    complete_rows = complete_rows.sample(n=10000, random_state=42)

# Split into the two-column feature matrix and the label vector.
labels = complete_rows['Cluster']
features = complete_rows[['Count*(trip time)', 'Mean(trip time)']]

# A silhouette score is undefined for fewer than two distinct labels.
distinct_labels = set(labels)
if len(distinct_labels) < 2:
    raise ValueError("Silhouette Score requires at least 2 clusters.")

# Any failure inside scikit-learn is re-raised as a RuntimeError carrying
# the original message.
try:
    score = silhouette_score(features, labels)
except Exception as e:
    raise RuntimeError(f"Error calculating Silhouette Score: {e}")

# Size of every cluster in the (possibly sampled) data, keyed by label.
cluster_sizes = labels.value_counts().to_dict()

# One summary row: the score plus the bookkeeping consumed downstream.
summary_row = {
    "Silhouette Score": score,
    "Number of Clusters": len(cluster_sizes),
    "Cluster Sizes": str(cluster_sizes),
    "Processed Rows": len(complete_rows),
    "Original Rows": original_row_count,  # size before dropping NaNs
}
output_df = pd.DataFrame([summary_row])

# Hand the summary to the node's first output port.
knio.output_tables[0] = knio.Table.from_pandas(output_df)

**Cluster1 Python View**

In [None]:
import knime.scripting.io as knio
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_samples
import ast
import numpy as np

# Render a view of the silhouette summary table. When the input also carries
# 'Feature Matrix' and 'Labels' columns, per-sample silhouette plots are
# drawn as well; otherwise only the row-count and cluster-size charts appear.

# Load the data from the input table
df = knio.input_tables[0].to_pandas()

# Validate the summary columns up front so a missing column is reported by
# name (same style as the sibling script cells) instead of a bare KeyError.
required_columns = ['Silhouette Score', 'Number of Clusters', 'Cluster Sizes', 'Processed Rows', 'Original Rows']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

# Extract relevant columns
silhouette_scores = df["Silhouette Score"]
cluster_sizes = df["Cluster Sizes"]
processed_rows = df["Processed Rows"]
original_rows = df["Original Rows"]


def _plot_row_counts(axis, processed, original):
    """Draw a bar chart comparing total processed rows with total original rows."""
    axis.bar(['Processed Rows', 'Original Rows'], [processed.sum(), original.sum()], color='lightgreen')
    axis.set_title('Processed Rows vs Original Rows')
    axis.set_ylabel('Number of Rows')


def _plot_cluster_sizes(axis, size_column):
    """Draw one box per input row from its stringified {label: size} dict."""
    sizes_per_row = [list(ast.literal_eval(size).values()) for size in size_column]
    axis.boxplot(sizes_per_row)
    axis.set_title('Cluster Sizes')
    axis.set_xlabel('Clusters')
    axis.set_ylabel('Size')


# Check if 'Feature Matrix' and 'Labels' columns exist
if 'Feature Matrix' in df.columns and 'Labels' in df.columns:
    # Each 'Feature Matrix' cell is parsed as a Python literal.
    # NOTE(review): presumably one feature vector per row - confirm upstream.
    X = np.array([ast.literal_eval(row) for row in df['Feature Matrix']])
    labels = np.array(df['Labels'])

    # Compute the silhouette value of every sample
    silhouette_values = silhouette_samples(X, labels)

    fig, ax = plt.subplots(2, 2, figsize=(14, 10))

    # Silhouette plot: one sorted band per cluster, stacked with a 10-unit gap
    y_lower = 10
    unique_labels = np.unique(labels)
    silhouette_avg_per_cluster = []
    for cluster_id in unique_labels:
        # Boolean mask instead of a Python-level zip/filter over all samples
        cluster_values = np.sort(silhouette_values[labels == cluster_id])
        y_upper = y_lower + len(cluster_values)
        ax[0, 0].fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_values, alpha=0.7)
        ax[0, 0].text(-0.05, y_lower + 0.5 * len(cluster_values), f'Cluster {cluster_id}')
        y_lower = y_upper + 10
        silhouette_avg_per_cluster.append(np.mean(cluster_values))
    ax[0, 0].set_title('Silhouette Plot for Each Cluster')
    ax[0, 0].set_xlabel('Silhouette Coefficient')
    ax[0, 0].set_ylabel('Cluster')
    # Reference line at the mean of the reported overall score(s)
    ax[0, 0].axvline(x=silhouette_scores.mean(), color='red', linestyle='--')

    _plot_row_counts(ax[0, 1], processed_rows, original_rows)
    _plot_cluster_sizes(ax[1, 0], cluster_sizes)

    # Per-sample silhouette values, coloured by cluster label
    ax[1, 1].scatter(range(len(silhouette_values)), silhouette_values, c=labels, cmap='viridis', alpha=0.7)
    ax[1, 1].set_title('Silhouette Distribution')
    ax[1, 1].set_xlabel('Sample Index')
    ax[1, 1].set_ylabel('Silhouette Coefficient')

    # Separate figure for the per-cluster average silhouette value
    fig2, ax2 = plt.subplots(figsize=(10, 6))
    ax2.bar(unique_labels, silhouette_avg_per_cluster, color='skyblue')
    ax2.set_title('Average Silhouette Value per Cluster')
    ax2.set_xlabel('Cluster')
    ax2.set_ylabel('Average Silhouette Value')

    # Tighten BOTH figures explicitly: plt.tight_layout() only affects the
    # current figure, which is fig2 at this point.
    fig.tight_layout()
    fig2.tight_layout()

    # NOTE(review): knio.view is given a list of figures here - confirm that
    # the installed KNIME Python integration accepts a list as a view object.
    knio.output_view = knio.view([fig, fig2])
else:
    # Without 'Feature Matrix' and 'Labels', draw only the summary charts
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))

    _plot_row_counts(ax[0], processed_rows, original_rows)
    _plot_cluster_sizes(ax[1], cluster_sizes)

    fig.tight_layout()

    # Assign the figure to the output_view variable
    knio.output_view = knio.view(fig)

**Cluster2 Silhouette Coefficient**

In [None]:
import knime.scripting.io as knio
import pandas as pd
from sklearn.metrics import silhouette_score

# Compute the silhouette score of the trip clustering on the coordinate and
# distance features, and emit a one-row summary table.

# Load input table
df = knio.input_tables[0].to_pandas()

# Input Validation: Check required columns
feature_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng', 'trip_distance_km']
required_columns = feature_columns + ['Cluster']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

# Handle NaN values: Drop rows with missing data
original_row_count = len(df)
df = df.dropna(subset=required_columns)
print(f"Dropped {original_row_count - len(df)} rows due to missing data.")

# The geometry column is not used for scoring, so drop it if present
if "geometry" in df.columns:
    df = df.drop(columns=["geometry"])

# Convert 'Cluster' to integer ids. String labels must end in digits (the
# trailing digits become the id); an already-numeric column is used as-is.
raw_clusters = df['Cluster']
if pd.api.types.is_numeric_dtype(raw_clusters):
    cluster_ids = raw_clusters
else:
    # expand=False yields a Series (the default returns a DataFrame)
    cluster_ids = raw_clusters.str.extract(r'(\d+)$', expand=False)
    unparsed = cluster_ids.isna()
    if unparsed.any():
        # Report offending labels instead of failing inside astype(int)
        examples = raw_clusters[unparsed].unique()[:5].tolist()
        raise ValueError(f"Cannot extract a numeric cluster id from labels such as: {examples}")
df = df.assign(Cluster=cluster_ids.astype(int))

# Sampling to reduce dataset size if needed (fixed seed for reproducibility)
if len(df) > 10000:
    df = df.sample(n=10000, random_state=42)
    print("Sampling applied: Dataset reduced to 10,000 rows.")
else:
    print(f"No sampling needed: {len(df)} rows retained.")

# Extract Features and Labels
features = df[feature_columns]
labels = df['Cluster']

# Check if we have enough data for silhouette computation
unique_clusters = set(labels)
if len(unique_clusters) < 2:
    raise ValueError(f"Silhouette Score requires at least 2 clusters. Found clusters: {unique_clusters}")

# Cluster sizes are computed once and reused for logging and output
cluster_sizes = labels.value_counts().to_dict()
num_clusters = len(cluster_sizes)
print(f"Cluster sizes: {cluster_sizes}")

# Calculate Silhouette Score; chain the cause so the original traceback is kept
try:
    score = silhouette_score(features, labels)
except Exception as e:
    raise RuntimeError(f"Error calculating Silhouette Score: {e}") from e
print(f"Silhouette Score calculated: {score}")

# Create Output Table
output_df = pd.DataFrame({
    "Silhouette Score": [score],
    "Number of Clusters": [num_clusters],
    "Cluster Sizes": [str(cluster_sizes)],
    "Processed Rows": [len(df)],
    "Original Rows": [original_row_count],  # Size before dropping NaNs
})

# Output the result
knio.output_tables[0] = knio.Table.from_pandas(output_df)

# Print the output dataframe for debugging purposes
print(output_df.to_string(index=False))


**Cluster2 Python View**

In [None]:
import knime.scripting.io as knio
import pandas as pd
import matplotlib.pyplot as plt
import ast

# Visualize the first row of a silhouette summary table: the overall score
# as a horizontal bar and the per-cluster sizes as a bar chart.

# Load input table
df = knio.input_tables[0].to_pandas()

# Input Validation: Check required columns
required_columns = ['Silhouette Score', 'Number of Clusters', 'Cluster Sizes', 'Processed Rows', 'Original Rows']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

# Handle NaN values: Drop rows with missing data
original_row_count = len(df)
df = df.dropna(subset=required_columns)
print(f"Dropped {original_row_count - len(df)} rows due to missing data.")

# Without this guard an empty table fails below with an opaque IndexError
if df.empty:
    raise ValueError("No complete summary row available after dropping rows with missing data.")

# Extract key metrics from the first row (only that row is visualized)
score = df['Silhouette Score'].iloc[0]
num_clusters = df['Number of Clusters'].iloc[0]
processed_rows = df['Processed Rows'].iloc[0]
original_rows = df['Original Rows'].iloc[0]

# 'Cluster Sizes' arrives as the string form of a {label: size} dict
cluster_sizes = df['Cluster Sizes'].iloc[0]
if isinstance(cluster_sizes, str):
    cluster_sizes = ast.literal_eval(cluster_sizes)

# Create Output Table
output_df = pd.DataFrame({
    "Silhouette Score": [score],
    "Number of Clusters": [num_clusters],
    "Cluster Sizes": [str(cluster_sizes)],
    "Processed Rows": [processed_rows],
    "Original Rows": [original_rows],
})

# Visualization: Plotting
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Silhouette Score as Horizontal Bar Chart
ax[0].barh(['Silhouette Score'], [score], color='skyblue')
ax[0].set_xlim(-1, 1)  # Silhouette Score range is [-1, 1]
ax[0].set_title('Silhouette Score')
ax[0].set_xlabel('Score')
ax[0].grid(axis='x', linestyle='--', alpha=0.7)

# Cluster Sizes Plot. Labels are stringified so integer cluster ids are
# drawn as categories (one tick per cluster), not on a continuous axis.
cluster_labels = [str(label) for label in cluster_sizes]
cluster_values = list(cluster_sizes.values())
ax[1].bar(cluster_labels, cluster_values, color='orange')
ax[1].set_title('Cluster Sizes')
ax[1].set_xlabel('Clusters')
ax[1].set_ylabel('Number of Points')

# Adjust layout
fig.tight_layout()

# Assign the output table and view
# NOTE(review): the other script cells assign knio.output_tables[0]; here the
# whole attribute is rebound. Confirm this node has a table output port and
# that rebinding is picked up by KNIME before changing it.
knio.output_tables = [knio.Table.from_pandas(output_df)]
knio.output_view = knio.view(fig)


**Self Organizing Maps (implemented with KMeans clustering)**

In [None]:
import knime.scripting.io as knio

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Cluster the combined trip dataset with KMeans (10 clusters) on min-max
# normalized features, save a scatter plot and two CSV files, and output the
# table with the new 'Cluster' column.

# Load the combined cluster dataset
df = knio.input_tables[0].to_pandas()
features = ['Count*(trip time)', 'Mean(trip time)', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'trip_distance_km']

# Force a float matrix so non-numeric input fails here, and map missing
# values to NaN so they can be reported clearly.
data = df[features].to_numpy(dtype=float, na_value=np.nan)
if np.isnan(data).any():
    raise ValueError("Feature columns contain missing values; KMeans cannot cluster rows with NaN.")

# Min-max normalize each feature to [0, 1]. A constant feature has zero
# range; dividing by 1 instead maps it to 0 rather than producing NaN.
col_min = data.min(axis=0)
col_range = data.max(axis=0) - col_min
col_range[col_range == 0] = 1.0
data = (data - col_min) / col_range

# Initialize and train the KMeans
kmeans = KMeans(n_clusters=10, random_state=0, n_init=10)
df['Cluster'] = kmeans.fit_predict(data)

# Visualize the KMeans clusters; the plot is saved to a file, not shown
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x='start_lat', y='start_lng', hue='Cluster', data=df, palette='viridis', ax=ax)
ax.set_title("KMeans Cluster Visualization")
ax.set_xlabel("Start Latitude")
ax.set_ylabel("Start Longitude")
fig.savefig("kmeans_cluster_visualization.png")
# Close the figure so repeated executions do not accumulate open figures
plt.close(fig)

# Per-cluster mean of every numeric column
cluster_assignments = df.groupby('Cluster').mean(numeric_only=True)

# The geometry column is dropped before writing/outputting the table
if "geometry" in df.columns:
    df = df.drop(columns=["geometry"])

# Save CSV files in KNIME workspace
workspace_path = knio.flow_variables["knime.workspace"]
output_csv_path = f"{workspace_path}/kmeans_clustered_results.csv"
cluster_summary_output_path = f"{workspace_path}/cluster_summary.csv"

df.to_csv(output_csv_path, index=False)
cluster_assignments.to_csv(cluster_summary_output_path, index=True)

# Output the modified dataframe
knio.output_tables[0] = knio.Table.from_pandas(df)

# Debugging: Print the cluster assignments
print(cluster_assignments)



**Combined Visualizations Python View**

In [None]:
import knime.scripting.io as knio
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the table to visualize from the node's first input port.
df = knio.input_tables[0].to_pandas()

# One 2x2 figure; name each panel after the chart it will hold.
fig, ax = plt.subplots(2, 2, figsize=(16, 12))
(bars_ax, locations_ax), (count_box_ax, mean_box_ax) = ax

# Top-left: both trip-time metrics as grouped bars, one group per table row.
metrics = ['Count*(trip time)', 'Mean(trip time)']
df[metrics].plot.bar(ax=bars_ax, color=['skyblue', 'orange'])
bars_ax.set_title('Cluster Characteristics')
bars_ax.set_xlabel('Clusters')
bars_ax.set_ylabel('Values')
bars_ax.legend(loc='upper left')

# Top-right: row positions coloured by cluster, legend placed outside the axes.
sns.scatterplot(
    data=df,
    x='lon',
    y='lat',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=locations_ax,
)
locations_ax.set_title('Cluster Locations')
locations_ax.set_xlabel('Longitude')
locations_ax.set_ylabel('Latitude')
locations_ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')

# Bottom row: one box plot per metric to expose outliers.
boxplot_specs = [
    (count_box_ax, 'Count*(trip time)', 'pastel', 'Outlier Detection: Count*(trip time)'),
    (mean_box_ax, 'Mean(trip time)', 'muted', 'Outlier Detection: Mean(trip time)'),
]
for panel, column, palette_name, panel_title in boxplot_specs:
    sns.boxplot(data=df[[column]], ax=panel, palette=palette_name)
    panel.set_title(panel_title)
    panel.set_ylabel(column)

# Tighten spacing, then publish the figure as the node's view.
fig.tight_layout()
knio.output_view = knio.view(fig)