In [None]:
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.5-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.5-py3-none-any.whl (23 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m145.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (7

In [None]:
%%writefile app.py
import streamlit as st
import numpy as np
import pandas as pd
import plotly.express as px
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import warnings

# Suppress every warning category for the whole app process so library
# warnings do not clutter the Streamlit log. Note that this also hides
# deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

# Function to add random feedback column
def add_feedback_column(df):
    """Ensure ``df`` has a ``feedback`` text column.

    If the frame already contains a ``feedback`` column it is returned
    untouched, so real customer feedback is never replaced by placeholder
    text. Otherwise a ``feedback`` column is added in place, with one
    randomly chosen canned sentence per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to augment. Modified in place when the column is missing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, guaranteed to have a ``feedback`` column.
    """
    # Only synthesize feedback when the dataset does not provide its own.
    if 'feedback' in df.columns:
        return df

    feedback_texts = [
        "Great service!", "Not satisfied", "Loved the products",
        "Service can improve", "Had a wonderful experience"
    ]
    df['feedback'] = [random.choice(feedback_texts) for _ in range(len(df))]
    return df

# Title of the Streamlit App
st.title("Mall Customer Segmentation")


@st.cache_resource
def _load_sentence_model(model_name):
    """Load a Sentence-BERT model once and reuse it across script reruns.

    Streamlit re-executes this script on every interaction; without the
    cache the model would be re-instantiated each time.
    """
    return SentenceTransformer(model_name)


# File Upload Section
uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])

# Everything below runs only once a CSV has been uploaded. The flow is:
# preview -> feedback column -> EDA -> scaling + text embeddings ->
# K sweep (elbow / silhouette) -> final KMeans -> PCA plots -> profiling ->
# export.
if uploaded_file is not None:
    # Read the uploaded file
    df = pd.read_csv(uploaded_file)
    st.write("Dataset Preview:")
    st.dataframe(df.head())

    try:
        # Add feedback column if not present
        df = add_feedback_column(df)

        # Check if feedback column is added and display it
        st.write("Columns in the dataset after adding feedback:")
        st.write(df.columns)  # Check if 'feedback' column is added

        # Display the first few rows of the updated dataframe
        st.write("Updated Dataset Preview with Feedback Column:")
        st.dataframe(df.head())

    except Exception as e:
        # Best effort: the pipeline falls back to dummy embeddings below.
        st.error(f"Error adding feedback column: {str(e)}")

    # EDA Section
    st.subheader("Missing Values and Data Types")
    st.write(df.isnull().sum())
    st.write(df.dtypes)

    numeric_df = df.select_dtypes(include=['number'])

    # Create a correlation matrix
    corr_matrix = numeric_df.corr()

    # Plot the heatmap using Plotly
    st.subheader("Correlation Heatmap")
    fig = px.imshow(corr_matrix, text_auto=True, color_continuous_scale='RdBu_r', title='Correlation Heatmap')
    st.plotly_chart(fig)

    # Feature Scaling: keep only numeric columns that have no missing values.
    features = numeric_df.dropna(axis=1)
    n_samples = len(features)

    # silhouette_score needs at least 2 clusters and fewer clusters than
    # samples, so fewer than 3 rows (or no usable numeric column) cannot be
    # clustered. Stop with a readable message instead of a traceback.
    if features.shape[1] == 0 or n_samples < 3:
        st.error("Clustering needs at least one numeric column without missing values and at least 3 rows.")
        st.stop()

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Optional: Text feature extraction using Sentence-BERT (if feedback column exists)
    if 'feedback' in df.columns:
        model = _load_sentence_model('paraphrase-MiniLM-L6-v2')
        # Coerce to plain strings so missing or non-text entries cannot break encoding.
        feedback_texts = df['feedback'].fillna('').astype(str).tolist()
        text_embeddings = model.encode(feedback_texts)
        st.write(f"Text Embedding Shape: {np.array(text_embeddings).shape}")
    else:
        st.warning("No 'feedback' column found. Using dummy embeddings.")
        text_embeddings = np.zeros((df.shape[0], 300))

    # Combine Numerical and Text Features
    combined_features = np.hstack([scaled_features, text_embeddings])

    # Finding Optimal K (KMeans)
    wcss = []
    silhouette_scores = []
    # Sweep K from 2 to 10, but never ask for as many clusters as samples.
    max_k = min(10, n_samples - 1)
    K = range(2, max_k + 1)

    for k in K:
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
        kmeans.fit(combined_features)
        wcss.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(combined_features, kmeans.labels_))

    # Elbow and Silhouette Plot
    st.subheader("K-Means Clustering Evaluation Metrics")
    fig = make_subplots(rows=1, cols=2, subplot_titles=('Elbow Method', 'Silhouette Score'))

    # Elbow Method (WCSS)
    fig.add_trace(go.Scatter(x=list(K), y=wcss, mode='lines+markers', marker=dict(color='blue'), name='WCSS'), row=1, col=1)
    # Silhouette Score
    fig.add_trace(go.Scatter(x=list(K), y=silhouette_scores, mode='lines+markers', marker=dict(color='green'), name='Silhouette Score'), row=1, col=2)

    fig.update_layout(title_text='K-Means Clustering Evaluation Metrics', width=1000, height=400, showlegend=False)
    fig.update_xaxes(title_text="K (Number of Clusters)", row=1, col=1)
    fig.update_yaxes(title_text="WCSS", row=1, col=1)
    fig.update_xaxes(title_text="K (Number of Clusters)", row=1, col=2)
    fig.update_yaxes(title_text="Silhouette Score", row=1, col=2)

    st.plotly_chart(fig)

    # Perform K-Means Clustering
    # Default of 4 clusters (update based on evaluation plots), clamped so
    # tiny datasets do not request more clusters than the sweep allowed.
    optimal_k = min(4, max_k)
    kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42)
    cluster_labels = kmeans.fit_predict(combined_features)
    df['KMeans_Cluster'] = cluster_labels

    # PCA for 2D Visualization
    pca = PCA(n_components=2)
    pca_components = pca.fit_transform(combined_features)

    st.subheader("PCA Explained Variance")
    for idx, var in enumerate(pca.explained_variance_ratio_):
        st.write(f"Component {idx+1}: {var*100:.2f}%")

    # Plot Cumulative Variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(1, len(cumulative_variance)+1)), y=cumulative_variance, mode='lines+markers', line=dict(dash='dash', color='royalblue'), marker=dict(size=8), name='Cumulative Variance'))
    fig.update_layout(title='Cumulative Explained Variance by PCA Components', xaxis_title='Number of PCA Components', yaxis_title='Cumulative Variance Explained', width=700, height=500, template='plotly_white')
    st.plotly_chart(fig)

    # Plot Clusters
    pca_df = pd.DataFrame({'PCA1': pca_components[:, 0], 'PCA2': pca_components[:, 1], 'Cluster': cluster_labels})
    fig = px.scatter(pca_df, x='PCA1', y='PCA2', color='Cluster', title='Customer Segmentation (KMeans + PCA)', color_continuous_scale='viridis', width=800, height=600)
    fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')))
    fig.update_layout(legend_title_text='Cluster', template='plotly_white')
    st.plotly_chart(fig)

    # Cluster Profiling
    st.subheader("Cluster Profiling")
    profile = df.groupby('KMeans_Cluster').mean(numeric_only=True)
    st.write(profile)

    # Cluster Size Distribution
    cluster_counts = df['KMeans_Cluster'].value_counts().sort_index()
    cluster_counts_df = pd.DataFrame({'Cluster': cluster_counts.index, 'Count': cluster_counts.values})

    fig = px.bar(cluster_counts_df, x='Cluster', y='Count', color='Cluster', title='Number of Customers per Cluster', color_continuous_scale='viridis', width=800, height=500)
    fig.update_layout(xaxis_title='Cluster', yaxis_title='Number of Customers', template='plotly_white')
    st.plotly_chart(fig)

    # 3D Scatter Plot of Cluster Counts
    st.subheader("3D Scatter Plot of Cluster Counts")

    fig = px.scatter_3d(
        cluster_counts_df,
        x='Cluster',
        y='Count',
        z=[0] * len(cluster_counts_df),  # Z axis as 0
        color='Count',
    )

    fig.update_layout(
        title='3D Scatter Plot of Cluster Counts',
        scene=dict(
            xaxis_title='Cluster',
            yaxis_title='Count',
            zaxis_title='Z'
        ),
        width=800,
        height=600
    )

    st.plotly_chart(fig)

    # Pairplot of Features Colored by Cluster
    st.subheader("Pairplot of Features by Cluster")

    pairplot_data = features.copy()
    pairplot_data['Cluster'] = cluster_labels

    sns_plot = sns.pairplot(pairplot_data, hue='Cluster', palette='viridis')
    # Work with the grid's own figure rather than pyplot's global state, so
    # the right figure is titled, rendered and released.
    pairplot_fig = sns_plot.figure
    pairplot_fig.suptitle('Pairplot by Cluster', y=1.02)

    st.pyplot(pairplot_fig)
    plt.close(pairplot_fig)  # Release the figure after displaying

    # Save Segmented Dataset
    st.subheader("Save Segmented Data")
    df.to_csv('customers_segmented.csv', index=False)
    st.write("Data saved as 'customers_segmented.csv'. You can download it below:")

    # Add download button for segmented data
    st.download_button(label="Download Segmented Data", data=df.to_csv(index=False), file_name="customers_segmented.csv", mime="text/csv")


Writing app.py


In [None]:
from pyngrok import ngrok

# Replace 'your_authtoken_here' with your actual ngrok authtoken
ngrok.set_auth_token("2vqRRRi5nGA5kfKqiOB8KaVyVz4_4PahZDHo9iGoaaLi6uBXg")

# Open a ngrok tunnel to the Streamlit app
public_url = ngrok.connect(8501)
print('Streamlit app is live at:', public_url)

# Run Streamlit in background
!streamlit run app.py &

Streamlit app is live at: NgrokTunnel: "https://43b3-34-125-9-124.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.9.124:8501[0m
[0m
2025-04-29 07:46:06.165284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745912766.180536     991 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745912766.185616     991 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been r