In [1]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import plotly.graph_objects as go
from pandas.plotting import parallel_coordinates
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.decomposition import PCA
import scipy.cluster.hierarchy as sch
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
# NOTE(review): absolute, machine-specific Windows path -- the script only
# runs where this exact file location exists.
DATA_CSV = r"C:\Users\my computer\Documents\Data Science\Projects\P342-Cluster\df_imputed.csv"

# Load the CSV into the working frame used by every later cell.
data = pd.read_csv(DATA_CSV)

In [3]:
# Remove the 'Encoded_Countries' column so it is not part of the feature set
# that gets scaled and clustered below.
data = data.drop(columns=['Encoded_Countries'])

In [4]:
# Learn per-column mean/std from the dataset, then produce the standardized
# array `d` (a NumPy array with the same column order as `data`).
scaler = StandardScaler()
scaler.fit(data)
d = scaler.transform(data)

In [5]:
# Wrap the standardized array `d` in a DataFrame with the original column
# names. The previous code passed the unscaled `data` here, which left the
# scaler output unused and fed raw-magnitude features into the clustering.
df = pd.DataFrame(data=d, columns=data.columns)

In [12]:
# Page title (widgets below render in the main page body, not a sidebar).
st.title("Agglomerative Clustering Model Deployment")

# Collect one numeric value per feature column from the user.
st.subheader("Enter the values")
user_inputs = {
    feature: st.number_input(f"Enter value for {feature}")
    for feature in data.columns
}

# Single-row frame holding the raw values exactly as the user typed them.
user_df = pd.DataFrame([user_inputs], columns=data.columns)

# Scale the user row with the scaler that was already fitted on the dataset.
# `transform` (not `fit_transform`) is required: refitting on a single row
# makes each column's mean equal to the row itself, so every input would be
# mapped to an all-zero vector regardless of what the user entered.
d1 = scaler.transform(user_df)
user_dfsc = pd.DataFrame(d1, columns=data.columns)

# Reference rows on the same standardized scale as the user row, derived here
# from `data` and the fitted `scaler` so both halves of the concatenation are
# guaranteed to be comparable.
reference_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns)

# Dataset rows first, user row last -- later code relies on index -1 being
# the user-entered point.
combined_df = pd.concat([reference_scaled, user_dfsc], ignore_index=True)

# Project onto the first two principal components for the 2-D scatter plot.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(combined_df)

# Initialize the Agglomerative Clustering model
model = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='complete')

# Display the user-entered (unscaled) data in a table
st.write("User-entered data:")
st.write(user_df)
    
if st.button("Predict"):
    # Agglomerative clustering has no separate predict step, so the model is
    # fitted on dataset + user row together; the user row is the last one.
    labels = model.fit_predict(combined_df)
    user_label = labels[-1]

    # Display the number of clusters
    st.write("Number of clusters:", model.n_clusters_)

    # Display the cluster assigned to the user-entered row.
    # NOTE(review): the descriptions below assume a fixed meaning for labels
    # 0 and 1 -- confirm they still match the cluster profiles whenever the
    # data or preprocessing changes.
    st.write("Cluster assigned for the user entered data:", user_label)
    if user_label == 0:
        st.write("Countries in Cluster 0 exhibit higher birth rates, slightly higher business tax rates, lower CO2 emissions, more bureaucratic processes for starting businesses, lower energy usage, lower GDP, lower health expenditure per capita and as a percentage of GDP, more hours required for tax compliance, higher infant mortality rates, lower internet usage, higher lending interest rates, lower female life expectancies, lower mobile phone adoption, and generally lower population and tourism figures")
    elif user_label == 1:
        st.write("Countries in Cluster 1 are characterized by lower birth rates, slightly lower business tax rates, higher CO2 emissions, simpler processes for starting businesses, higher energy usage, higher GDP, higher health expenditure per capita and as a percentage of GDP, fewer hours required for tax compliance, lower infant mortality rates, higher internet usage, lower lending interest rates, higher female life expectancies, higher mobile phone adoption, and generally higher population and tourism figures")

    # Scatter plot of the first two principal components. An explicit Figure
    # is passed to st.pyplot (instead of the pyplot module) and closed
    # afterwards so figures do not accumulate across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_pca[:, 0], df_pca[:, 1], c=labels, cmap='rainbow')
    ax.scatter(df_pca[-1, 0], df_pca[-1, 1], c='yellow', marker='X', s=100, label="User Entered Data Point")
    ax.set_xlabel("First Principal Component")
    ax.set_ylabel("Second Principal Component")
    ax.set_title("Agglomerative Clustering - Scatter Plot using PCA(2 Components)")
    ax.legend()
    st.pyplot(fig)
    plt.close(fig)

    # Per-cluster mean and standard deviation of every feature; columns are
    # collected in a dict and the frame is built once.
    summary_columns = {}
    for cluster_label in range(model.n_clusters_):
        cluster_data = combined_df[labels == cluster_label]
        summary_columns[f"Cluster {cluster_label} Mean"] = cluster_data.mean()
        summary_columns[f"Cluster {cluster_label} Std"] = cluster_data.std()
    cluster_summary = pd.DataFrame(summary_columns)

    # Display the summary statistics
    st.write("Summary Statistics for Each Cluster:")
    st.write(cluster_summary)

    # Export: a plain st.button nested inside the "Predict" branch can never
    # run its body, because clicking it reruns the script with "Predict"
    # reporting False. A download button carries its payload at render time,
    # so the file is delivered on click. The export contains every clustered
    # row (user row last) together with its assigned cluster label.
    # NOTE(review): depending on the Streamlit version, clicking may trigger a
    # rerun that hides the results above until "Predict" is pressed again.
    export_df = combined_df.assign(Cluster=labels)
    st.download_button(
        label="Export Cluster Data to CSV",
        data=export_df.to_csv(index=False).encode("utf-8"),
        file_name="cluster_data.csv",
        mime="text/csv",
    )