In [32]:
import pandas as pd
from sklearn.cluster import KMeans
import joblib
import streamlit as st
import numpy as np
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, Float, String
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
def load_data(url=None):
    """Read the housing CSV into a DataFrame.

    Parameters
    ----------
    url : str or path-like, optional
        Location of the CSV, passed straight to ``pandas.read_csv``. When
        omitted, the copy hosted in the 4GeeksAcademy k-means tutorial
        repository is downloaded.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, unmodified.
    """
    if url is None:
        # Default source used throughout this notebook.
        url = "https://raw.githubusercontent.com/4GeeksAcademy/k-means-project-tutorial/main/housing.csv"
    return pd.read_csv(url)
# Bind `df` in this cell so the preview works on a fresh kernel; nothing else
# in the notebook assigns it at module scope.
df = load_data()
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [35]:
# Page header: the app title followed by a one-sentence summary of what it does
st.title(body="Housing Data Clustering")
st.write(
    "This app clusters housing data using KMeans."
)



In [20]:
# Declarative base that every ORM model in this notebook inherits from.
# It must be created exactly once, before the models that subclass it.
Base = declarative_base()


class HousingData(Base):
    """ORM model mapped to the ``housing_data`` table.

    Each attribute below is one column; ``id`` is the integer primary key and
    every other column is stored as a float.

    NOTE(review): these column names differ from the columns of the CSV used
    elsewhere in this notebook (``MedInc``, ``HouseAge``, ``AveRooms``, ...).
    Confirm which schema is intended before persisting rows through this model.
    """

    __tablename__ = 'housing_data'

    id = Column(Integer, primary_key=True)
    longitude = Column(Float)
    latitude = Column(Float)
    housing_median_age = Column(Float)
    total_rooms = Column(Float)
    total_bedrooms = Column(Float)
    population = Column(Float)
    households = Column(Float)
    median_income = Column(Float)
    median_house_value = Column(Float)


In [22]:
# Train K-Means model
def train_kmeans(X, n_clusters=3, random_state=42):
    """Fit a K-Means model on ``X`` and return the fitted estimator.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix handed to ``KMeans.fit``.
    n_clusters : int, default 3
        Number of clusters to form.
    random_state : int or None, default 42
        Seed for centroid initialisation. A fixed seed keeps cluster labels
        stable between runs; pass ``None`` for non-deterministic behaviour.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model (``labels_`` and ``cluster_centers_`` populated).
    """
    # n_init is set explicitly because its default differs between
    # scikit-learn releases; pinning it keeps results comparable.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    return kmeans.fit(X)

In [34]:
# Data cleaning
def clean_data(df):
    """Return a new DataFrame containing only the rows of ``df`` with no missing values.

    The input frame is left untouched and the surviving rows keep their
    original index labels.
    """
    return df.dropna(axis=0, how="any")

In [33]:

# Streamlit app
def main():
    """Render the interactive clustering page.

    Loads the housing data, drops incomplete rows, fits K-Means on income and
    coordinates with a cluster count chosen in the sidebar, then shows the
    labelled rows, a longitude/latitude scatter plot coloured by cluster, and
    a per-cluster preview table.
    """
    # Columns the model is trained on; reused for the results table below.
    feature_cols = ['MedInc', 'Latitude', 'Longitude']

    st.title("Housing Data K-Means Clustering App")

    # Load and display data
    df = load_data()
    st.subheader("Raw Housing Data")
    st.write(df.head())

    df_cleaned = clean_data(df)

    # Sidebar controls for K-Means
    st.sidebar.title("K-Means Settings")
    st.sidebar.subheader("Select number of clusters")
    num_clusters = st.sidebar.slider("Number of Clusters", 2, 10, 3)

    X = df_cleaned[feature_cols]

    # Train the model
    model = train_kmeans(X, n_clusters=num_clusters)

    # Attach labels on a new frame instead of assigning into the dropna()
    # result, which avoids pandas' SettingWithCopyWarning and leaves
    # df_cleaned unchanged.
    clustered = df_cleaned.assign(Cluster=model.labels_)

    st.subheader("K-Means Clustering Results")
    st.write(clustered[feature_cols + ['Cluster']].head())

    # Plotting the clusters
    st.subheader("Cluster Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(data=clustered, x='Longitude', y='Latitude', hue='Cluster', palette='viridis', ax=ax)
    # Title the explicit Axes rather than relying on pyplot's "current axes".
    ax.set_title("Clusters based on Longitude and Latitude")
    st.pyplot(fig)
    # Streamlit re-executes the script on every interaction; close the figure
    # so pyplot does not keep one open figure per rerun.
    plt.close(fig)

    st.subheader("Explore the Clusters")
    # Sorted so the options appear in a stable, ascending order.
    cluster_options = sorted(clustered['Cluster'].unique())
    cluster_choice = st.selectbox("Select a cluster to explore", cluster_options)
    st.write(clustered[clustered['Cluster'] == cluster_choice].head())


# Run the app
if __name__ == "__main__":
    main()



In [9]:
# Select relevant features
# `df_cleaned` is only a local variable inside main(), so it is rebuilt here to
# keep this cell runnable on a fresh kernel.
df_cleaned = clean_data(load_data())
X = df_cleaned[['MedInc', 'Latitude', 'Longitude']]

In [10]:
# Train the model
# Seed and n_init are pinned so the cluster ids stored in the saved model are
# the same on every run and across scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)

In [11]:
# Persist the fitted estimator to disk so it can be reloaded for prediction
joblib.dump(value=kmeans, filename='kmeans_model.pkl')

['kmeans_model.pkl']

In [12]:
# Read the previously saved estimator back from disk.
# The file is a pickle: only load artifacts this notebook produced itself.
model = joblib.load(filename='kmeans_model.pkl')

In [13]:
# Heading shown above the prediction form
st.title(body="Housing Clustering Application")

2024-10-18 23:29:50.771 
  command:

    streamlit run /home/vscode/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [14]:
# Numeric inputs for the three features the model was trained on.
# Each widget is bounded by min_value/max_value and moves in steps of 0.1.
median_income = st.number_input(
    label="Median Income",
    min_value=0.0,
    max_value=20.0,
    step=0.1,
)
latitude = st.number_input(
    label="Latitude",
    min_value=32.0,
    max_value=42.0,
    step=0.1,
)
longitude = st.number_input(
    label="Longitude",
    min_value=-125.0,
    max_value=-114.0,
    step=0.1,
)

2024-10-18 23:29:58.955 Session state does not function when running a script without `streamlit run`


In [15]:
# Predict cluster
if st.button("Predict Cluster"):
    # The model was fitted on a DataFrame with named columns, so the single
    # input row is given the same names in the same order. This keeps the
    # feature order explicit and avoids scikit-learn's
    # "X does not have valid feature names" warning.
    user_input = pd.DataFrame(
        [[median_income, latitude, longitude]],
        columns=['MedInc', 'Latitude', 'Longitude'],
    )
    cluster = model.predict(user_input)
    st.write(f"The data belongs to cluster: {cluster[0]}")

