In [1]:
import pandas as pd
import plotly.graph_objects as go

# Directory that holds the two input CSVs. Defined once so the notebook can be
# pointed at a different location by editing a single line instead of every
# read_csv call.
DATA_DIR = "C:/Users/alime/Dropbox/PC/Documents/Coding/2023/ProductHunt_EDA_and_Unspervised_Sentiment_Analysis/Stats_Code/Data_Wrangling"

# Topic table; the rest of the notebook reads its 'Topic' and 'Cluster' columns.
data = pd.read_csv(f"{DATA_DIR}/Topics_Clusters.csv")

# read_csv already returns a DataFrame; the wrapper is kept so that `data` and
# `df_topics` remain two distinct objects, as before.
df_topics = pd.DataFrame(data)

# Topic groups. An in-memory example of what a collection of groups looks like:
# topic_groups = [
#     ['Safari Extensions', 'API', 'WordPress'],
#     ['AI', 'Marketing'],
#     ['Business', 'Blockchain', 'AI']
# ]
# NOTE(review): the column layout of Topics_Groups.csv is not visible from this
# notebook - confirm it before relying on row/column semantics downstream.
topic_groups = pd.read_csv(f"{DATA_DIR}/Topics_Groups.csv")



# Function to calculate the overlap between clusters based on topic groups
def calculate_cluster_overlap(topic_groups, df_topics):
    """Count how often two different clusters share a topic group.

    For every group, each unordered pair of topics found in ``df_topics`` is
    mapped to the pair of clusters those topics belong to, and a counter for
    that cluster pair is incremented.

    Parameters
    ----------
    topic_groups : iterable of iterables, or pandas.DataFrame
        Collection of topic groups. A DataFrame is read row by row, one group
        per row (assumed layout - TODO confirm against the CSV that feeds
        this). Cells that match no topic, such as NaN padding or an index
        column, are ignored.
    df_topics : pandas.DataFrame
        Must provide 'Topic' and 'Cluster' columns. When a topic appears more
        than once, the cluster of its first row is used.

    Returns
    -------
    dict
        Maps ``frozenset({cluster_a, cluster_b})`` to the number of topic
        pairs linking the two clusters. Pairs whose topics fall in the same
        cluster are skipped: they would collapse to a one-element key that
        cannot be unpacked into two endpoints.

    Raises
    ------
    KeyError
        If ``df_topics`` lacks the 'Topic' or 'Cluster' column.
    """
    # Iterating a DataFrame directly yields its column labels, not its rows,
    # so rows have to be unpacked explicitly.
    if isinstance(topic_groups, pd.DataFrame):
        groups = topic_groups.itertuples(index=False, name=None)
    else:
        groups = topic_groups

    # Build the topic -> cluster lookup once instead of rescanning the frame
    # for every pair; keep='first' mirrors taking the first matching row.
    first_rows = df_topics.dropna(subset=['Topic']).drop_duplicates(subset='Topic', keep='first')
    topic_to_cluster = dict(zip(first_rows['Topic'], first_rows['Cluster']))

    cluster_overlap = {}
    for group in groups:
        # Only topics present in df_topics can contribute to a pair.
        clusters = [topic_to_cluster[topic] for topic in group if topic in topic_to_cluster]
        for i, cluster1 in enumerate(clusters):
            for cluster2 in clusters[i + 1:]:
                if cluster1 == cluster2:
                    # A cluster does not overlap with itself.
                    continue
                key = frozenset((cluster1, cluster2))
                cluster_overlap[key] = cluster_overlap.get(key, 0) + 1

    return cluster_overlap

# Calculate the number of topics in each cluster
cluster_sizes = df_topics['Cluster'].value_counts().to_dict()

# Calculate the overlap between clusters based on topic groups
cluster_overlap = calculate_cluster_overlap(topic_groups, df_topics)

# Number of topic groups; used to express each overlap as a share of all groups
n_groups = len(topic_groups)

# Create a Plotly scatter plot
fig = go.Figure()

# Add one marker per cluster: x is the cluster label, y is its topic count,
# and the marker diameter grows with the topic count (10 px per topic).
for cluster, size in cluster_sizes.items():
    fig.add_trace(go.Scatter(
        x=[cluster],
        y=[size],
        mode='markers',
        marker=dict(size=size*10, sizemode='diameter'),
        text=cluster,
        name=cluster
    ))

# Draw one horizontal line per overlapping cluster pair; its width is the
# overlap count as a percentage of the number of topic groups.
for clusters, overlap in cluster_overlap.items():
    if len(clusters) != 2:
        # A key built from two topics of the same cluster holds a single
        # element and cannot be unpacked into two endpoints - nothing to draw.
        continue
    cluster1, cluster2 = clusters
    if cluster1 in cluster_sizes and cluster2 in cluster_sizes:
        # The line is placed at the mean height of the two cluster markers
        avg_size = (cluster_sizes[cluster1] + cluster_sizes[cluster2]) / 2
        overlap_size = overlap / n_groups * 100
        fig.add_trace(go.Scatter(
            x=[cluster1, cluster2],
            y=[avg_size, avg_size],
            mode='lines',
            line=dict(width=overlap_size),
            hoverinfo='none',
            showlegend=False
        ))

# Set layout properties
fig.update_layout(
    title='Overlap between Clusters based on Topic Groups',
    xaxis=dict(title='Clusters'),
    yaxis=dict(title='Number of Topics in Cluster'),
    showlegend=True
)

# Show the plot
fig.show()

In [None]:

# Function to calculate the overlap between clusters based on topic groups
def calculate_cluster_overlap(topic_groups, df_topics):
    """Count how often two different clusters share a topic group.

    For every group, each unordered pair of topics found in ``df_topics`` is
    mapped to the pair of clusters those topics belong to, and a counter for
    that cluster pair is incremented.

    Parameters
    ----------
    topic_groups : iterable of iterables, or pandas.DataFrame
        Collection of topic groups. A DataFrame is read row by row, one group
        per row (assumed layout - TODO confirm against the CSV that feeds
        this). Cells that match no topic, such as NaN padding or an index
        column, are ignored.
    df_topics : pandas.DataFrame
        Must provide a 'Topic' column. When the 'Cluster' column is absent no
        overlap can be computed and an empty dict is returned. When a topic
        appears more than once, the cluster of its first row is used.

    Returns
    -------
    dict
        Maps ``frozenset({cluster_a, cluster_b})`` to the number of topic
        pairs linking the two clusters. Pairs whose topics fall in the same
        cluster are skipped: they would collapse to a one-element key that
        cannot be unpacked into two endpoints.

    Raises
    ------
    KeyError
        If ``df_topics`` has a 'Cluster' column but no 'Topic' column.
    """
    # Without cluster labels there is nothing to pair up.
    if 'Cluster' not in df_topics.columns:
        return {}

    # Iterating a DataFrame directly yields its column labels, not its rows,
    # so rows have to be unpacked explicitly.
    if isinstance(topic_groups, pd.DataFrame):
        groups = topic_groups.itertuples(index=False, name=None)
    else:
        groups = topic_groups

    # Build the topic -> cluster lookup once instead of rescanning the frame
    # for every pair; keep='first' mirrors taking the first matching row.
    first_rows = df_topics.dropna(subset=['Topic']).drop_duplicates(subset='Topic', keep='first')
    topic_to_cluster = dict(zip(first_rows['Topic'], first_rows['Cluster']))

    cluster_overlap = {}
    for group in groups:
        # Only topics present in df_topics can contribute to a pair.
        clusters = [topic_to_cluster[topic] for topic in group if topic in topic_to_cluster]
        for i, cluster1 in enumerate(clusters):
            for cluster2 in clusters[i + 1:]:
                if cluster1 == cluster2:
                    # A cluster does not overlap with itself.
                    continue
                key = frozenset((cluster1, cluster2))
                cluster_overlap[key] = cluster_overlap.get(key, 0) + 1

    return cluster_overlap

# Calculate the number of topics in each cluster
cluster_sizes = df_topics['Cluster'].value_counts().to_dict()

# Calculate the overlap between clusters based on topic groups
cluster_overlap = calculate_cluster_overlap(topic_groups, df_topics)

# Number of topic groups; used to express each overlap as a share of all groups
n_groups = len(topic_groups)

# Create a graph with nodes representing the clusters: x is the cluster label,
# y is its topic count, and the marker diameter grows with the topic count
# (10 px per topic).
nodes = []
for cluster, size in cluster_sizes.items():
    nodes.append(go.Scatter(
        x=[cluster],
        y=[size],
        mode='markers',
        marker=dict(size=size*10, sizemode='diameter'),
        text=cluster,
        name=cluster
    ))

# Create edges between clusters based on the overlap; the edge width is the
# overlap count as a percentage of the number of topic groups.
edges = []
for clusters, overlap in cluster_overlap.items():
    if len(clusters) != 2:
        # A key built from two topics of the same cluster holds a single
        # element and cannot be unpacked into two endpoints - no edge to draw.
        continue
    cluster1, cluster2 = clusters
    if cluster1 in cluster_sizes and cluster2 in cluster_sizes:
        # The edge is placed at the mean height of the two cluster nodes
        avg_size = (cluster_sizes[cluster1] + cluster_sizes[cluster2]) / 2
        overlap_size = overlap / n_groups * 100
        edges.append(go.Scatter(
            x=[cluster1, cluster2],
            y=[avg_size, avg_size],
            mode='lines',
            line=dict(width=overlap_size),
            hoverinfo='none',
            showlegend=False
        ))

# Set layout properties
layout = go.Layout(
    title='Overlap between Clusters based on Topic Groups',
    xaxis=dict(title='Clusters'),
    yaxis=dict(title='Number of Topics in Cluster'),
    showlegend=False
)

# Create the figure with nodes and edges (nodes first, edges drawn after them)
fig = go.Figure(data=nodes + edges, layout=layout)

# Show the plot
fig.show()