In [14]:
import os
import pandas as pd
import plotly.express as px

# Custom color palette (your provided palette)
color_choices = [
    "#FFB6C1",  # 0
    "#90EE90",  # 1
    "#ADD8E6",  # 2
    "#FFFF00",  # 3
    "#808080",  # 4
    "#A52A2A",  # 5
    "#FF0000",  # 6
    "#8A2BE2",  # 7
    "#FF00FF",  # 8
    "#FFA500",  # 9
    "#00FFFF",  # 10
    "#FFD700",  # 11
    "#DA70D6",  # 12
    "#7FFF00",  # 13
    "#DC143C",  # 14
    "#00FA9A",  # 15
    "#008080",  # 16
    "#FF6347",  # 17
    "#708090",  # 18
    "#F0E68C",  # 19
]

# Function to assign consistent colors to clusters
def assign_cluster_colors(clusters):
    # Convert cluster values to integers and map them to the palette
    cluster_colors = {str(cluster): color_choices[int(float(cluster))] for cluster in sorted(clusters, key=lambda x: float(x))}
    return cluster_colors

# Function to map user clusters to the palette
def assign_cluster_colors_for_user(user_clusters):
    # Convert cluster values from strings (e.g., '0.0') to integers
    cluster_colors = {
        str(cluster): color_choices[int(float(cluster))] for cluster in sorted(user_clusters, key=lambda x: float(x))
    }
    return cluster_colors

# ========== 1) READ & VALIDATE DATA ==========
nonstat = pd.read_csv("/home/azureuser/cloudfiles/code/Users/Akshayanivashini.ChandrasekarVijayalakshmi/skinly_thailand_corrected/clustering/non-statinary_data_with_clusters.csv")

required_cols = {"Cluster", "user_id", "Distance_to_Centroid"}
missing_cols = required_cols - set(nonstat.columns)
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Ensure 'Cluster' is treated as strings
nonstat["Cluster"] = nonstat["Cluster"].astype(str)

# ========== 2) FILTER DATA: 100 CLOSEST POINTS PER CLUSTER ==========
# Sort by Cluster and Distance_to_Centroid
nonstat_sorted = nonstat.sort_values(by=["Cluster", "Distance_to_Centroid"], ascending=[True, True])

# Keep only the first 100 points for each cluster
nonstat_filtered = nonstat_sorted.groupby("Cluster").head(100)
specific_user_id = 4383
# ========== 3) CALCULATE UNIQUE USERS AND TOTAL DATA POINTS ==========
# All points
all_cluster_stats = nonstat.groupby("Cluster").agg(
    Unique_Users=("user_id", "nunique"),
    Total_Data_Points=("user_id", "size")
).reset_index()

# Centroid logic (100 closest points)
centroid_cluster_stats = nonstat_filtered.groupby("Cluster").agg(
    Unique_Users=("user_id", "nunique"),
    Total_Data_Points=("user_id", "size")
).reset_index()

# Assign colors to clusters
all_cluster_colors = assign_cluster_colors(all_cluster_stats["Cluster"].unique())
centroid_cluster_colors = assign_cluster_colors(centroid_cluster_stats["Cluster"].unique())

# ========== 6) ANALYZE DATA WITH AND WITHOUT CENTROID LOGIC ==========
specific_user_id = 4383  # Replace with your specific user_id

# Data without filtering (full dataset)
user_data_full = nonstat[nonstat["user_id"] == specific_user_id]

# Data with filtering (centroid logic applied)
user_data_filtered = nonstat_filtered[nonstat_filtered["user_id"] == specific_user_id]

# Get the exact clusters present for the user
user_clusters_full = user_data_full["Cluster"].unique() if not user_data_full.empty else []
user_clusters_filtered = user_data_filtered["Cluster"].unique() if not user_data_filtered.empty else []

# Map clusters to the palette only for the clusters present for the user
cluster_colors_full = assign_cluster_colors_for_user(user_clusters_full)
cluster_colors_filtered = assign_cluster_colors_for_user(user_clusters_filtered)






# ========== 6) CREATE PLOTS ==========
# Plot 1: All Points
fig_all = px.bar(
    all_cluster_stats,
    x="Cluster",
    y="Unique_Users",
    color="Cluster",
    title="Unique Users Per Cluster (All Points)",
    labels={"Cluster": "Cluster", "Unique_Users": "Number of Unique Users"},
    template="plotly_white",
    hover_data={"Total_Data_Points": True},
    color_discrete_map=all_cluster_colors
)

# Update layout
fig_all.update_layout(
    xaxis=dict(title="Cluster", tickangle=45),
    yaxis=dict(title="Number of Unique Users"),
    showlegend=False
)

# Show the plot
fig_all.show()

# Plot 2: Centroid Logic
fig_centroid = px.bar(
    centroid_cluster_stats,
    x="Cluster",
    y="Unique_Users",
    color="Cluster",
    title="Unique Users Per Cluster (100 Closest Points to Centroid)",
    labels={"Cluster": "Cluster", "Unique_Users": "Number of Unique Users"},
    template="plotly_white",
    hover_data={"Total_Data_Points": True},
    color_discrete_map=centroid_cluster_colors
)

# Update layout
fig_centroid.update_layout(
    xaxis=dict(title="Cluster", tickangle=45),
    yaxis=dict(title="Number of Unique Users"),
    showlegend=False
)

# Show the plot
fig_centroid.show()
# ========== 7) CREATE A HISTOGRAM ==========
fig = px.histogram(
    clusters_per_user,
    x="Visited_Clusters",
    nbins=20,
    title="Number of Clusters Visited per User",
    labels={"Visited_Clusters": "Number of Clusters Visited", "count": "Number of Users"},
    template="plotly_white"
)

# Update layout for better readability
fig.update_layout(
    xaxis=dict(title="Number of Clusters Visited"),
    yaxis=dict(title="Number of Users"),
    bargap=0.2  # Adjust bar gap
)

# Show the plot
fig.show()

# ========== 4) CALCULATE CLUSTERS VISITED BY EACH USER ==========
# Group by user_id and count unique clusters visited
clusters_per_user = (
    nonstat_filtered.groupby("user_id")["Cluster"]
    .nunique()
    .reset_index()
)
clusters_per_user.columns = ["user_id", "Visited_Clusters"]

# Sort by number of visited clusters in descending order
clusters_per_user = clusters_per_user.sort_values(by="Visited_Clusters", ascending=False)

# Print the top users who visited the most clusters
print("Top users who visited the most clusters:")
print(clusters_per_user.head())



# ========== 5) CALCULATE CLUSTERS VISITED BY EACH USER ==========
# Group by user_id and count unique clusters visited
clusters_per_usernonstat = (
    nonstat.groupby("user_id")["Cluster"]
    .nunique()
    .reset_index()
)
clusters_per_usernonstat.columns = ["user_id", "Visited_Clusters"]

# Sort by number of visited clusters in descending order
clusters_per_usernonstat = clusters_per_usernonstat.sort_values(by="Visited_Clusters", ascending=False)

# Print the top users who visited the most clusters
print("Top users who visited the most clusters:")
print(clusters_per_usernonstat.head())


# ========== 8) CREATE A HISTOGRAM ==========
fignonstat = px.histogram(
    clusters_per_usernonstat,
    x="Visited_Clusters",
    nbins=20,
    title="Number of Clusters Visited per User",
    labels={"Visited_Clusters": "Number of Clusters Visited", "count": "Number of Users"},
    template="plotly_white"
)

# Update layout for better readability
fignonstat.update_layout(
    xaxis=dict(title="Number of Clusters Visited"),
    yaxis=dict(title="Number of Users"),
    bargap=0.2  # Adjust bar gap
)

# Show the plot
fignonstat.show()


# ========== 4) VISUALIZE DATA (WITHOUT CENTROID LOGIC) ==========
if not user_data_full.empty:
    # Plot 1: Data points per cluster (full dataset)
    cluster_counts_full = user_data_full["Cluster"].value_counts().reset_index()
    cluster_counts_full.columns = ["Cluster", "Data_Point_Count"]
    
    # Calculate total data points
    total_points_full = cluster_counts_full["Data_Point_Count"].sum()
    
    fig1_full = px.bar(
        cluster_counts_full,
        x="Cluster",
        y="Data_Point_Count",
        title=f"Cluster Distribution for User {specific_user_id} (Full Dataset)",
        labels={"Cluster": "Cluster", "Data_Point_Count": "Number of Data Points"},
        template="plotly_white",
        color="Cluster",
        color_discrete_map=cluster_colors_full
    )
    
    # Add annotation for total data points
    fig1_full.add_annotation(
        text=f"Total Data Points: {total_points_full}",
        xref="paper", yref="paper",
        x=0.5, y=1.1, showarrow=False,
        font=dict(size=14, color="black")
    )
    
    fig1_full.show()

    # Plot 2: Distance to centroid (full dataset)
    fig2_full = px.box(
        user_data_full,
        x="Cluster",
        y="Distance_to_Centroid",
        title=f"Distance to Centroid Distribution for User {specific_user_id} (Full Dataset)",
        labels={"Cluster": "Cluster", "Distance_to_Centroid": "Distance to Centroid"},
        template="plotly_white",
        color="Cluster",
        color_discrete_map=cluster_colors_full
    )
    fig2_full.show()

# ========== 5) VISUALIZE DATA (WITH CENTROID LOGIC) ==========
if not user_data_filtered.empty:
    # Plot 3: Data points per cluster (filtered dataset)
    cluster_counts_filtered = user_data_filtered["Cluster"].value_counts().reset_index()
    cluster_counts_filtered.columns = ["Cluster", "Data_Point_Count"]
    
    # Calculate total data points
    total_points_filtered = cluster_counts_filtered["Data_Point_Count"].sum()
    
    fig1_filtered = px.bar(
        cluster_counts_filtered,
        x="Cluster",
        y="Data_Point_Count",
        title=f"Cluster Distribution for User {specific_user_id} (Filtered Dataset)",
        labels={"Cluster": "Cluster", "Data_Point_Count": "Number of Data Points"},
        template="plotly_white",
        color="Cluster",
        color_discrete_map=cluster_colors_filtered
    )
    
    # Add annotation for total data points
    fig1_filtered.add_annotation(
        text=f"Total Data Points: {total_points_filtered}",
        xref="paper", yref="paper",
        x=0.5, y=1.1, showarrow=False,
        font=dict(size=14, color="black")
    )
    
    fig1_filtered.show()

    # Plot 4: Distance to centroid (filtered dataset)
    fig2_filtered = px.box(
        user_data_filtered,
        x="Cluster",
        y="Distance_to_Centroid",
        title=f"Distance to Centroid Distribution for User {specific_user_id} (Filtered Dataset)",
        labels={"Cluster": "Cluster", "Distance_to_Centroid": "Distance to Centroid"},
        template="plotly_white",
        color="Cluster",
        color_discrete_map=cluster_colors_filtered
    )
    fig2_filtered.show()



    # Filter data for the specific user
user_data = nonstat[nonstat["user_id"] == specific_user_id]

# Sort by date to analyze chronological transitions
user_data_sorted = user_data.sort_values(by="date")

# Add a shifted column to identify transitions
user_data_sorted["Previous_Cluster"] = user_data_sorted["Cluster"].shift(1)
user_data_sorted["Transition"] = user_data_sorted["Previous_Cluster"] + " → " + user_data_sorted["Cluster"]

# Count transitions between clusters
transition_counts = user_data_sorted["Transition"].value_counts()

# Display the transition counts
print(f"Transition counts for User {specific_user_id}:")
print(transition_counts)

# Optional: Calculate time spent in each cluster
user_data_sorted["date"] = pd.to_datetime(user_data_sorted["date"])  # Ensure date is in datetime format
user_data_sorted["Time_Difference"] = user_data_sorted["date"].diff()

# Aggregate time spent per cluster
time_spent = user_data_sorted.groupby("Cluster")["Time_Difference"].sum()
print(f"\nTime spent in each cluster for User {specific_user_id}:")
print(time_spent)


fig22 = px.scatter(
    user_data_sorted,
    x="date",
    y="Cluster",
    title=f"Cluster Transitions Over Time for User {specific_user_id}",
    labels={"date": "Date", "Cluster": "Cluster"},
    color="Cluster",
    template="plotly_white"
)
fig22.show()

Top users who visited the most clusters:
    user_id  Visited_Clusters
35     4383                 3
45    11382                 3
10     3527                 2
57    14574                 2
16     3740                 2
Top users who visited the most clusters:
     user_id  Visited_Clusters
494    11382                 4
377     4383                 4
114     3527                 3
631    14574                 3
733    17342                 3


Transition counts for User 4383:
0.0 → 0.0    417
4.0 → 4.0    106
3.0 → 3.0    102
2.0 → 2.0     29
2.0 → 3.0      1
3.0 → 4.0      1
4.0 → 0.0      1
Name: Transition, dtype: int64

Time spent in each cluster for User 4383:
Cluster
0.0   1459 days
2.0     42 days
3.0    128 days
4.0    153 days
Name: Time_Difference, dtype: timedelta64[ns]


In [9]:
import plotly.io as pio

# Directory to save the plots
output_dir = "/home/azureuser/cloudfiles/code/Users/Akshayanivashini.ChandrasekarVijayalakshmi/skinly_thailand_corrected/clustering/plots/plotlyhtml"

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# Save the plots to the specified directory
pio.write_html(fig_all, os.path.join(output_dir, "fig_all.html"))
pio.write_html(fig_centroid, os.path.join(output_dir, "fig_centroid.html"))
pio.write_html(fig, os.path.join(output_dir, "histogram.html"))
pio.write_html(fignonstat, os.path.join(output_dir, "fignonstat.html"))
pio.write_html(fig1_full, os.path.join(output_dir, "fig1_full.html"))
pio.write_html(fig2_full, os.path.join(output_dir, "fig2_full.html"))
pio.write_html(fig1_filtered, os.path.join(output_dir, "fig1_filtered.html"))
pio.write_html(fig2_filtered, os.path.join(output_dir, "fig2_filtered.html"))
pio.write_html(fig22, os.path.join(output_dir, "fig22.html"))


In [16]:
! pip install -U kaleido




In [19]:
import plotly.io as pio
import os

# Directory to save the plots
output_dir = "/home/azureuser/cloudfiles/code/Users/Akshayanivashini.ChandrasekarVijayalakshmi/skinly_thailand_corrected/clustering/plots/plotlyhtml"

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# List of figures to save
figures = {
    "Unique_Users_Per_Cluster_All_Points": fig_all,
    "Unique_Users_Per_Cluster_Centroid": fig_centroid,
    "Clusters_Visited_Per_User_Histogram": fig,
    "Clusters_Visited_Per_User_Histogram_Nonstat": fignonstat,
    "Cluster_Distribution_User_Full": fig1_full,
    "Distance_To_Centroid_User_Full": fig2_full,
    "Cluster_Distribution_User_Filtered": fig1_filtered,
    "Distance_To_Centroid_User_Filtered": fig2_filtered,
    "Cluster_Transitions_Over_Time_User": fig22
}

# Save the plots as PNG files with descriptive names
for name, fig in figures.items():
    try:
        pio.write_image(fig, os.path.join(output_dir, f"{name}.png"))
        print(f"Saved: {name}.png")
    except Exception as e:
        print(f"Error saving {name}.png: {e}")

Error saving Unique_Users_Per_Cluster_All_Points.png: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Error saving Unique_Users_Per_Cluster_Centroid.png: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Error saving Clusters_Visited_Per_User_Histogram.png: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Error saving Clusters_Visited_Per_User_Histogram_Nonstat.png: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Error saving Cluster_Distribution_User_Full.png: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Error saving Distance_To_Centroid_User_Full.png: 
Imag

In [20]:
import plotly.io as pio

# Test if kaleido is available
try:
    pio.kaleido.scope.default_format = "png"
    print("Kaleido is available for image export.")
except Exception as e:
    print("Kaleido is not available:", e)

Kaleido is not available: 'NoneType' object has no attribute 'default_format'


In [21]:
import plotly.io as pio
import plotly.offline as pyo
import os

# Directory to save the plots
output_dir = "/home/azureuser/cloudfiles/code/Users/Akshayanivashini.ChandrasekarVijayalakshmi/skinly_thailand_corrected/clustering/plots/plotlyhtml"

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# List of figures to save
figures = {
    "Unique_Users_Per_Cluster_All_Points": fig_all,
    "Unique_Users_Per_Cluster_Centroid": fig_centroid,
    "Clusters_Visited_Per_User_Histogram": fig,
    "Clusters_Visited_Per_User_Histogram_Nonstat": fignonstat,
    "Cluster_Distribution_User_Full": fig1_full,
    "Distance_To_Centroid_User_Full": fig2_full,
    "Cluster_Distribution_User_Filtered": fig1_filtered,
    "Distance_To_Centroid_User_Filtered": fig2_filtered,
    "Cluster_Transitions_Over_Time_User": fig22
}

# Save the plots as HTML files with descriptive names
for name, fig in figures.items():
    try:
        pyo.plot(fig, filename=os.path.join(output_dir, f"{name}.html"), auto_open=False)
        print(f"Saved: {name}.html")
    except Exception as e:
        print(f"Error saving {name}.html: {e}")

Saved: Unique_Users_Per_Cluster_All_Points.html
Saved: Unique_Users_Per_Cluster_Centroid.html
Saved: Clusters_Visited_Per_User_Histogram.html
Saved: Clusters_Visited_Per_User_Histogram_Nonstat.html
Saved: Cluster_Distribution_User_Full.html
Saved: Distance_To_Centroid_User_Full.html
Saved: Cluster_Distribution_User_Filtered.html
Saved: Distance_To_Centroid_User_Filtered.html
Saved: Cluster_Transitions_Over_Time_User.html
