In [None]:
import pandas as pd

import umap

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Read 'cleaned_data.csv' (path relative to the notebook's working directory)
# into the DataFrame that the rest of the notebook works on.
df = pd.read_csv('cleaned_data.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics using pandas' `describe` defaults.
df.describe()

In [None]:
# Linear (Pearson) correlation between floor area and listing price.
correlation = df['Floor_area'].corr(other=df['Price_in_taka'], method='pearson')

print(f"Pearson correlation coefficient between Floor_area and Price_in_taka: {correlation}")

In [None]:
# Scatter of price against floor area with the fitted regression line in red.
reg_fig, reg_ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='Floor_area',
    y='Price_in_taka',
    scatter_kws={'s': 50},
    line_kws={'color': 'red'},
    ax=reg_ax,
)

reg_ax.set_xlabel('Floor Area (sq ft)')
reg_ax.set_ylabel('Price in Taka')
reg_ax.set_title('Scatter Plot with Regression Line: Floor Area vs Price in Taka')

plt.show()

In [None]:
# Standardise the four housing features with StandardScaler.
#
# The features are selected by name instead of with `df.iloc[:, :-1]`: later
# cells append PC1/PC2 and Cluster columns to `df`, so a positional slice picks
# up those extra columns whenever this cell is re-run, and the following cell
# (which assigns exactly four column names) then fails.
scaler = StandardScaler()
df_scaled_standard = scaler.fit_transform(
    df[['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka']]
)

In [None]:
# Wrap the standardised array in a labelled DataFrame and show its summary
# statistics rounded to three decimals.
df_scaled_standard = pd.DataFrame(
    data=df_scaled_standard,
    columns=['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka'],
)
df_scaled_standard.describe().round(decimals=3)

In [None]:
# Rescale the four housing features with MinMaxScaler; this is the matrix that
# PCA and UMAP are fitted on further down.
#
# The features are selected by name instead of with `df.iloc[:, :-1]`: later
# cells append PC1/PC2 and Cluster columns to `df`, so a positional slice picks
# up those extra columns whenever this cell is re-run, and the following cell
# (which assigns exactly four column names) then fails.
scaler = MinMaxScaler()
df_scaled_minmax = scaler.fit_transform(
    df[['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka']]
)

In [None]:
# Wrap the min-max scaled array in a labelled DataFrame and show its summary
# statistics rounded to three decimals.
df_scaled_minmax = pd.DataFrame(
    data=df_scaled_minmax,
    columns=['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka'],
)
df_scaled_minmax.describe().round(decimals=3)

## PCA

In [None]:
# Fit a two-component PCA on the min-max scaled features (`fit` returns the
# fitted estimator itself, so construction and fitting can be chained).
pca = PCA(n_components=2).fit(df_scaled_minmax)

In [None]:
# Share of total variance captured by each of the two principal components.
explained_variance = pd.DataFrame(
    {'Explained Variance': pca.explained_variance_ratio_},
    index=['PC1', 'PC2'],
)

explained_variance

In [None]:
# Loadings table: one row per input feature, one column per principal
# component (`pca.components_` is components x features, hence the transpose).
#
# Row labels come from `df_scaled_minmax`, the frame PCA was fitted on, rather
# than from `df.columns[:-1]`: `df` gains PC1/PC2/Cluster columns in later
# cells, after which that slice no longer has one entry per fitted feature and
# re-running this cell fails.
loadings = pd.DataFrame(
    pca.components_.T,
    index=df_scaled_minmax.columns,
    columns=['PC1', 'PC2'],
)

loadings

In [None]:
# Grouped bar chart of every feature's loading on PC1 and PC2.
#
# The axes are created explicitly and passed to pandas through `ax=`. Without
# it DataFrame.plot opens a figure of its own, so a preceding
# `plt.figure(figsize=...)` is rendered as an empty figure and the requested
# size is never applied to the chart.
fig, ax = plt.subplots(figsize=(8, 6))
loadings.plot(kind='bar', color=['skyblue', 'red'], ax=ax)
ax.set_title('Feature Loadings for the Principal Component')
ax.set_xlabel('Features')
ax.set_ylabel('Loading')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
plt.show()

In [None]:
# Project the scaled features onto the two fitted principal components.
transform_pca = pca.transform(X=df_scaled_minmax)
transform_pca

In [None]:
# Interactive scatter of the 2D PCA projection; hovering a point shows the
# original (unscaled) listing attributes taken from `df`.
fig = px.scatter(
    x=transform_pca[:, 0],
    y=transform_pca[:, 1],
    labels={'x': 'Principal Component 1', 'y': 'Principal Component 2'},
    title='2D PCA Projection MinMax',
    hover_data={
        'Bedrooms': df['Bedrooms'],
        'Bathrooms': df['Bathrooms'],
        'Area': df['Floor_area'],
        'Price': df['Price_in_taka'],
    },
    # No `color_continuous_scale` here: nothing is mapped to `color`, so a
    # colour scale would have no effect on this figure.
)

fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')), mode='markers')

fig.update_layout(
    xaxis_title="Principal Component 1",
    yaxis_title="Principal Component 2",
    # White plot area, matching the other Plotly figures in this notebook
    # (the previous red background obscured the markers and grid).
    plot_bgcolor='white',
    xaxis_showgrid=True,
    yaxis_showgrid=True,
    title_font_size=20
)

fig.show()

In [None]:
# Name the two projected coordinates so they can be joined back onto `df`.
df_pca = pd.DataFrame(data=transform_pca, columns=['PC1', 'PC2'])

df_pca.head()

In [None]:
# Attach the principal-component coordinates to the listings table.
# Any PC columns left over from a previous execution are dropped first, so
# re-running this cell replaces them instead of appending duplicate PC1/PC2
# columns (`errors='ignore'` makes the drop a no-op on the first run).
df = pd.concat(
    [df.drop(columns=df_pca.columns, errors='ignore'), df_pca],
    axis=1,
)

df.head()

## K-Means

In [None]:
# Elbow method: fit K-Means on the PCA projection for k = 1..9 and record the
# inertia of each fit, then plot inertia against k.
k_values = range(1, 10)
inertia_values = []
for k in k_values:
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=100000000,
        random_state=42,
    ).fit(transform_pca)
    inertia_values.append(kmeans.inertia_)

elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(k_values, inertia_values, marker='o')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Raw inertia values behind the elbow plot, one per k from 1 to 9.
inertia_values

In [None]:
# Final K-Means model on the PCA projection with four clusters; `clusters`
# holds the cluster label assigned to each row.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    algorithm='lloyd',
    max_iter=100000000,
    random_state=42,
)
clusters = kmeans.fit_predict(transform_pca)

In [None]:
# Inertia of the fitted four-cluster model.
inertia = kmeans.inertia_
inertia

In [None]:
# Silhouette score of the K-Means labelling, computed in the PCA space.
score = silhouette_score(X=transform_pca, labels=clusters)
print(f'Silhouette Score: {score}')

In [None]:
# Cluster centres of the fitted model, expressed in PCA coordinates
# (the model was fitted on `transform_pca`).
centroids = kmeans.cluster_centers_
centroids

In [None]:
# Interactive scatter of the PCA projection coloured by K-Means label, with
# the fitted cluster centres overlaid as red crosses.
fig = px.scatter(
    x=transform_pca[:, 0],
    y=transform_pca[:, 1],
    color=clusters,
    color_continuous_scale='Viridis',
    title='K-means Clustering on 2D PCA Projection',
    labels={'x': 'Principal Component 1', 'y': 'Principal Component 2'},
    hover_data={
        'Bedrooms': df['Bedrooms'],
        'Bathrooms': df['Bathrooms'],
        'Area': df['Floor_area'],
        'Price': df['Price_in_taka'],
    },
)

# Style the data points before the centroid trace is added, so the centroid
# markers keep their own size and outline.
fig.update_traces(
    mode='markers',
    marker={'size': 8, 'line': {'width': 1, 'color': 'DarkSlateGrey'}},
)

fig.add_trace(
    go.Scatter(
        x=centroids[:, 0],
        y=centroids[:, 1],
        mode='markers',
        marker={'color': 'red', 'size': 15, 'symbol': 'x'},
        name='Centroids',
        showlegend=False,
    )
)

fig.update_layout(
    title_font_size=20,
    plot_bgcolor='white',
    xaxis_title="Principal Component 1",
    xaxis_showgrid=True,
    yaxis_title="Principal Component 2",
    yaxis_showgrid=True,
)

fig.show()

In [None]:
# Label every listing with its K-Means cluster, then aggregate per cluster.
df['Cluster'] = clusters

grouped_by_cluster = df.groupby('Cluster')

# NOTE: from here on `centroids` is the table of per-cluster column means, not
# the array of K-Means centres it held in the cells above.
centroids = grouped_by_cluster.mean()

# Mean and standard deviation of every column, per cluster.
cluster_summary = grouped_by_cluster.agg(['mean', 'std'])

In [None]:
# First four columns of the per-cluster means, ordered from the lowest to the
# highest mean price.
centroids.iloc[:, :4].sort_values(by='Price_in_taka', ascending=True)

In [None]:
# Per-cluster mean/std table without its last six (column, statistic) pairs.
# NOTE(review): presumably this hides the PC1/PC2 columns and the trailing
# non-feature column of `df` - confirm against the actual column order.
cluster_summary.iloc[:, :-6]

In [None]:
# Preview `df`, which now carries the PC1/PC2 and Cluster columns.
df.head()

In [None]:
# Bar charts of the per-cluster mean (bar height) and standard deviation
# (error bar) of each housing feature, read from `cluster_summary`.
#
# Tick labels are derived from `cluster_summary.index`, so they always match
# the clusters that actually exist, and they are stored in `cluster_names` so
# the K-Means label array `clusters` is not overwritten by a list of strings
# (which would break a re-run of the cell doing `df['Cluster'] = clusters`).
cluster_names = [f'Cluster {label}' for label in cluster_summary.index]

# (column in cluster_summary, panel title, bar colour), in row-major panel order.
panels = [
    ('Bedrooms', 'Bedrooms', 'skyblue'),
    ('Bathrooms', 'Bathrooms', 'salmon'),
    ('Floor_area', 'Floor Area', 'lightgreen'),
    ('Price_in_taka', 'Price in Taka', 'orange'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for ax, (feature, title, colour) in zip(axes.flat, panels):
    ax.bar(
        cluster_names,
        cluster_summary[(feature, 'mean')].tolist(),
        yerr=cluster_summary[(feature, 'std')].tolist(),
        color=colour,
        capsize=5,
    )
    ax.set_title(title)
    ax.set_ylabel('Mean ± Std')
    ax.grid(True)

plt.tight_layout()
plt.show()

##############################################################################################

In [None]:
# Grid search over the UMAP neighbourhood size and the number of K-Means
# clusters, keeping the combination with the highest silhouette score
# (the score is computed in the UMAP embedding space).
n_neighbors_values = [5, 10, 20, 50, 100, 200, 500, 1000]

n_clusters_list = [2, 3, 4, 5, 6, 7, 8, 9, 10]

best_score = -1
best_n_neighbors = None
best_n_clusters = None
best_kmeans = None

for n_neighbors in n_neighbors_values:
    # The embedding depends only on n_neighbors (the seed is fixed), so it is
    # fitted once per n_neighbors instead of once per (n_neighbors, n_clusters)
    # pair - this removes eight redundant UMAP fits for every neighbourhood size.
    reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, random_state=123)
    umap_results = reducer.fit_transform(df_scaled_minmax)

    for n_clusters in n_clusters_list:
        kmeans = KMeans(n_clusters=n_clusters, random_state=123)
        cluster_labels = kmeans.fit_predict(umap_results)

        score = silhouette_score(umap_results, cluster_labels)

        if score > best_score:
            best_score = score
            best_n_neighbors = n_neighbors
            best_n_clusters = n_clusters
            # Keep the winning model as well; previously `best_kmeans` was
            # declared but never assigned and therefore always stayed None.
            best_kmeans = kmeans

print(f'Best n_neighbors: {best_n_neighbors}, Best n_clusters: {best_n_clusters}, Best Silhouette Score: {best_score}')

## UMAP

In [None]:
# Final 2D UMAP embedding of the min-max scaled features.
# NOTE(review): n_neighbors=1000 is presumably the value selected by the grid
# search above - confirm against its printed result.
reducer = umap.UMAP(
    n_neighbors=1000,
    n_components=2,
    metric='euclidean',
    random_state=123,
)
umap_result = reducer.fit_transform(df_scaled_minmax)

In [None]:
# UMAP coordinates side by side with the original (unscaled) listing
# attributes from `df`, for hover text and per-cluster summaries.
umap_df = pd.DataFrame(umap_result, columns=['UMAP 1', 'UMAP 2']).assign(
    Bedrooms=df['Bedrooms'],
    Bathrooms=df['Bathrooms'],
    Floor_area=df['Floor_area'],
    Price_in_taka=df['Price_in_taka'],
)

In [None]:
# Display the UMAP coordinates together with the copied listing attributes.
umap_df

In [None]:
# Interactive scatter of the UMAP embedding; hover shows the listing attributes.
fig = px.scatter(
    data_frame=umap_df,
    x='UMAP 1',
    y='UMAP 2',
    hover_data=['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka'],
    title='UMAP of Housing Data',
)
fig.show()

In [None]:
# K-Means with six clusters on the UMAP embedding; `labels_` of the fitted
# model is the per-row cluster assignment.
kmeans = KMeans(random_state=123, n_clusters=6)
cluster_labels = kmeans.fit(umap_result).labels_

In [None]:
# Interactive scatter of the UMAP embedding coloured by K-Means label; hover
# shows the original listing attributes from `df`.
fig = px.scatter(
    x=umap_result[:, 0],
    y=umap_result[:, 1],
    color=cluster_labels,
    color_continuous_scale='Viridis',
    title='K-means Clustering on 2D UMAP Projection',
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
    hover_data={
        'Bedrooms': df['Bedrooms'],
        'Bathrooms': df['Bathrooms'],
        'Area': df['Floor_area'],
        'Price': df['Price_in_taka'],
    },
)

fig.update_traces(
    mode='markers',
    marker={'size': 8, 'line': {'width': 1, 'color': 'DarkSlateGrey'}},
)

fig.update_layout(
    title_font_size=20,
    plot_bgcolor='white',
    xaxis_title="UMAP 1",
    xaxis_showgrid=True,
    yaxis_title="UMAP 2",
    yaxis_showgrid=True,
)

fig.show()

In [None]:
# Store the UMAP-space cluster label on every row, then compute the mean and
# standard deviation of each column per cluster.
umap_df['Cluster'] = cluster_labels

cluster_summary = umap_df.groupby(by='Cluster').agg(func=['mean', 'std'])

In [None]:
# Per-cluster mean of every `umap_df` column (the two UMAP coordinates and the
# four housing features).
centroids = umap_df.groupby('Cluster').mean()

In [None]:
# Display `umap_df`, now including the Cluster column.
umap_df

In [None]:
# Columns 2-5 of the per-cluster means (the four housing features; the UMAP
# coordinates in columns 0-1 are skipped), ordered from the lowest to the
# highest mean price.
centroids.iloc[:, 2:6].sort_values(by='Price_in_taka', ascending=True)

In [None]:
# Bar charts of the per-cluster mean (bar height) and standard deviation
# (error bar) of each housing feature for the UMAP-based clustering, read from
# `cluster_summary`.
#
# Tick labels are derived from `cluster_summary.index` instead of a hard-coded
# list of six names, so the chart stays correct if the number of clusters
# changes, and they are stored in `cluster_names` so the earlier label array
# `clusters` is not overwritten by a list of strings.
cluster_names = [f'Cluster {label}' for label in cluster_summary.index]

# (column in cluster_summary, panel title, bar colour), in row-major panel order.
panels = [
    ('Bedrooms', 'Bedrooms', 'skyblue'),
    ('Bathrooms', 'Bathrooms', 'salmon'),
    ('Floor_area', 'Floor Area', 'lightgreen'),
    ('Price_in_taka', 'Price in Taka', 'orange'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 9))

for ax, (feature, title, colour) in zip(axes.flat, panels):
    ax.bar(
        cluster_names,
        cluster_summary[(feature, 'mean')].tolist(),
        yerr=cluster_summary[(feature, 'std')].tolist(),
        color=colour,
        capsize=5,
    )
    ax.set_title(title)
    ax.set_ylabel('Mean ± Std')
    ax.grid(True)

plt.tight_layout()
plt.show()