In [None]:
import pandas as pd

import umap

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv('cleaned_data.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Linear (Pearson) association between floor area and price.
correlation = df['Floor_area'].corr(df['Price_in_taka'], method='pearson')

# Report the coefficient
print(f"Pearson correlation coefficient between Floor_area and Price_in_taka: {correlation}")

In [None]:
# Floor area vs. price, with the fitted regression line drawn in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='Floor_area',
    y='Price_in_taka',
    scatter_kws={'s': 50},
    line_kws={'color': 'red'},
    ax=ax,
)

# Axis labels and title
ax.set_xlabel('Floor Area (sq ft)')
ax.set_ylabel('Price in Taka')
ax.set_title('Scatter Plot with Regression Line: Floor Area vs Price in Taka')

plt.show()

In [None]:
# Standardize the feature columns, excluding the 'City' column.
# Columns are excluded by name rather than by position: 'PC1', 'PC2' and 'Cluster'
# are appended to `df` further down in this notebook, and a positional
# `iloc[:, :-1]` re-run after those cells would drop only the last appended column
# and scale 'City' and the PCA scores as if they were features.
# NOTE(review): assumes 'City' is the only non-feature column in the raw file — confirm.
NON_FEATURE_COLUMNS = ['City', 'PC1', 'PC2', 'Cluster']

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore'))

In [None]:
# Shape of the standardized array produced by the scaler.
df_scaled.shape

PCA

In [None]:
# Fit a 2-component PCA on the standardized features (fit() returns the estimator).
pca = PCA(n_components=2).fit(df_scaled)

In [None]:
# Access the components' loadings (the fitted principal axes) as a raw array
loadings = pca.components_
loadings

In [None]:
# Create a labelled DataFrame of loadings: one row per feature, one column per component.
# The table is built from `pca.components_` directly (not from the current value of
# `loadings`) so re-running this cell does not transpose and reindex an already
# converted DataFrame into NaNs.
# Row labels come from the column names the scaler recorded when it was fitted, so
# they always match the columns that were actually scaled; on scikit-learn versions
# without `feature_names_in_` this falls back to the original `df.columns[:-1]`.
feature_names = getattr(scaler, 'feature_names_in_', df.columns[:-1])
pc_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
loadings = pd.DataFrame(pca.components_.T, index=feature_names, columns=pc_labels)

loadings

The loadings (eigenvectors) associated with each principal component will show you how much each original feature contributes to the component.

The larger the absolute value of the loading, the more the feature contributes to that principal component.

The loading value for each feature can be positive or negative, indicating the direction of the contribution.

The magnitude of the loading (the absolute value) indicates the strength of the contribution.

Features with larger absolute loadings have a greater impact on the corresponding principal component.

PC1:

Floor_area (0.534366) and Bathrooms (0.531761) have the highest loadings, meaning they contribute the most to the variance captured by the first principal component.

Price_in_taka (0.491189) also contributes significantly but slightly less than the first two.

Bedrooms (0.436367) has the lowest loading among the four features, indicating it has the least influence on this principal component, though it still contributes significantly.

PC2:

Bedrooms has a strong positive loading on PC2, meaning it contributes significantly to this component.

Price_in_taka has a strong negative loading, indicating an inverse relationship with PC2.

Floor_area also has a negative loading but is less significant compared to Price_in_taka.

Bathrooms has a smaller positive loading, suggesting it has a minor influence on PC2.


In [None]:
# Plotting the loadings as grouped bars (one group per feature, one bar per component).
# DataFrame.plot opens its own figure unless it is handed an Axes, so the chart is
# drawn on an explicitly created Axes; a bare plt.figure(figsize=...) beforehand
# would only leave an empty extra figure and its size would not apply to the chart.
fig, ax = plt.subplots(figsize=(8, 6))
loadings.plot(kind='bar', color=['skyblue', 'red'], ax=ax)
ax.set_title('Feature Loadings for the Principal Component')
ax.set_xlabel('Features')
ax.set_ylabel('Loading')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
plt.show()

In [None]:
# Fraction of the total variance captured by each fitted component.
explained_variance = pca.explained_variance_ratio_
explained_variance

The explained_variance will show you how much of the total variance is explained by each principal component.

Components with higher explained variance are more important in capturing the overall variability in the data.

PC1 (67.57%): The first principal component explains 67.57% of the total variance in your data. 

This means that PC1 is capturing the majority of the information in your dataset.

PC2 (18.35%): The second principal component explains an additional 18.35% of the variance.

Combined with PC1, these two components capture about 85.92% of the total variance (information), which is quite substantial.

In [None]:
# Present the variance ratios as a labelled single-column table.
pc_index = [f'PC{i+1}' for i in range(2)]
explained_variance = pd.DataFrame(
    explained_variance,
    index=pc_index,
    columns=['Explained Variance'],
)

explained_variance

In [None]:
# Project the standardized observations onto the fitted principal components.
transform_pca = pca.transform(df_scaled)
transform_pca

In [None]:
# Scatter of the observations in the space of the first two principal components.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(transform_pca[:, 0], transform_pca[:, 1], c='skyblue', edgecolor='k', s=50)
ax.set_title('2D PCA Projection of the Dataset')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)
plt.show()

Most of the data points are clustered on the left side of the plot, particularly around the origin, indicating that for these data points both principal components do not deviate significantly from the mean.

There are a few outliers, particularly in the top-right and middle-right areas of the plot, indicating that these points are quite different from the rest of the data in terms of the principal components.

In [None]:
# Wrap the projected coordinates in a DataFrame, one named column per component.
df_pca = pd.DataFrame({
    'PC1': transform_pca[:, 0],
    'PC2': transform_pca[:, 1],
})

# Preview the result
df_pca.head()

In [None]:
# Concatenate the PCA components with the original DataFrame.
# Any PC columns already present are dropped first, so re-running this cell
# replaces them instead of appending a second, duplicate pair of columns.
df = pd.concat([df.drop(columns=df_pca.columns, errors='ignore'), df_pca], axis=1)

# Display the merged DataFrame
df.head()

In [None]:
# Apply K-means clustering (3 clusters) to the 2-D PCA scores.
# n_init is pinned explicitly: its default changed between scikit-learn releases
# (10 restarts historically, 'auto' in newer versions), so leaving it implicit can
# change the fitted clusters — and their numbering — depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(transform_pca)

In [None]:
# PCA projection again, this time coloured by the K-means cluster assignment.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(
    transform_pca[:, 0],
    transform_pca[:, 1],
    c=clusters,
    cmap='viridis',
    edgecolor='k',
    s=50,
)
ax.set_title('K-means Clustering on 2D PCA Projection')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.grid(True)
plt.show()

In [None]:
# Cluster label assigned to each observation by K-means.
clusters

In [None]:
# Attach the K-means assignment to each row. The labels are read from the fitted
# estimator so this cell does not depend on whatever `clusters` is currently bound to.
df['Cluster'] = kmeans.labels_

# Aggregate numeric/bool columns only: recent pandas versions raise on mean/std of
# non-numeric columns instead of silently dropping them.
# NOTE(review): if 'City' is stored as text it is excluded from the summaries — confirm
# that this matches the positional slices applied to these tables in later cells.
grouped = df.select_dtypes(include=['number', 'bool']).groupby('Cluster')

# Calculate the centroids of the clusters (per-cluster column means)
centroids = grouped.mean()

# Summarize the clusters by mean and standard deviation of every column
cluster_summary = grouped.agg(['mean', 'std'])

In [None]:
# Per-cluster means of the first four columns.
# NOTE(review): presumably the four original features — verify against df's column order.
centroids.iloc[:, :4]

Cluster 0: budget

Cluster 1: luxury

Cluster 2: mid-range

In [None]:
# Show the summary without its last six (column, statistic) pairs.
# NOTE(review): presumably this hides the non-feature columns — verify that the slice
# still matches the column layout of cluster_summary.
cluster_summary.iloc[:, :-6]

In [None]:
# Per-cluster mean number of bedrooms as a plain list (ordered by cluster id).
cluster_summary[('Bedrooms', 'mean')].tolist()

In [None]:
# Per-cluster mean and standard deviation of the bedroom count, as plain lists.
bedrooms_mean = cluster_summary[('Bedrooms', 'mean')].tolist()
bedrooms_std = cluster_summary[('Bedrooms', 'std')].tolist()

bedrooms_std

In [None]:
# Preview the frame again after the columns added in the cells above.
df.head()

In [None]:
# NOTE(review): matplotlib is already imported in the first cell and numpy is not used
# in this cell; both imports are kept so nothing that relies on them is broken.
import matplotlib.pyplot as plt
import numpy as np

# X-axis names, derived from the cluster ids actually present in the summary.
# They are stored under their own name so the `clusters` label array produced by
# K-means is not overwritten (earlier cells that use it can still be re-run).
cluster_names = [f'Cluster {cluster_id}' for cluster_id in cluster_summary.index]

# Per-cluster mean and standard deviation of each feature, as plain lists
bedrooms_mean = cluster_summary[('Bedrooms', 'mean')].tolist()
bedrooms_std = cluster_summary[('Bedrooms', 'std')].tolist()

bathrooms_mean = cluster_summary[('Bathrooms', 'mean')].tolist()
bathrooms_std = cluster_summary[('Bathrooms', 'std')].tolist()

floor_area_mean = cluster_summary[('Floor_area', 'mean')].tolist()
floor_area_std = cluster_summary[('Floor_area', 'std')].tolist()

price_mean = cluster_summary[('Price_in_taka', 'mean')].tolist()
price_std = cluster_summary[('Price_in_taka', 'std')].tolist()

# One panel per feature: (title, means, standard deviations, bar colour)
panels = [
    ('Bedrooms', bedrooms_mean, bedrooms_std, 'skyblue'),
    ('Bathrooms', bathrooms_mean, bathrooms_std, 'salmon'),
    ('Floor Area', floor_area_mean, floor_area_std, 'lightgreen'),
    ('Price in Taka', price_mean, price_std, 'orange'),
]

# Plotting the means with standard deviations as error bars.
# ax.flat walks the 2x2 grid row by row, matching the order of `panels`.
fig, ax = plt.subplots(2, 2, figsize=(10, 6))

for panel_ax, (title, means, stds, color) in zip(ax.flat, panels):
    panel_ax.bar(cluster_names, means, yerr=stds, color=color, capsize=5)
    panel_ax.set_title(title)
    panel_ax.set_ylabel('Mean ± Std')
    panel_ax.grid(True)

plt.tight_layout()
plt.show()


Bedrooms: Cluster 1 has the highest mean with a large standard deviation, indicating a wide range of bedroom counts. Clusters 0 and 2 have smaller means and narrower ranges.

Bathrooms: Similar to bedrooms, Cluster 1 also has the highest mean and variability in the number of bathrooms.

Floor Area: Cluster 1 properties are significantly larger, with the highest mean floor area and considerable variability. Cluster 0 has the smallest floor areas.

Price in Taka: Cluster 1 properties are the most expensive, with a very high mean price and large variation. Cluster 0 has the lowest prices.

##############################################################################################

UMAP

In [None]:
# UMAP configuration for a 2-D embedding of the standardized features.
umap_params = dict(
    n_components=2,      # two output dimensions, for visualization purposes
    random_state=123,    # fixed seed controlling the randomness of the algorithm
    n_neighbors=700,     # number of neighbouring points considered for each sample
    min_dist=0.9,        # minimum distance between points in the embedding
    spread=0.9,          # controls how far apart embedded points can be
    metric='euclidean',  # distance computation; e.g. 'euclidean', 'minkowski', 'cosine', 'jaccard', 'manhattan', 'correlation'
)

reducer = umap.UMAP(**umap_params)
umap_results = reducer.fit_transform(df_scaled)

In [None]:
# Create a DataFrame with the UMAP results
umap_df = pd.DataFrame(data=umap_results, columns=['UMAP 1', 'UMAP 2'])

# Carry over the original attributes used for colouring and hover text
for column in ['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka', 'City']:
    umap_df[column] = df[column]

In [None]:
# Interactive 2-D scatter of the UMAP embedding, coloured by city,
# with the original attributes shown on hover.
fig = px.scatter(
    umap_df,
    x='UMAP 1',
    y='UMAP 2',
    color='City',
    title='UMAP of Housing Data by City',
    hover_data=['Bedrooms', 'Bathrooms', 'Floor_area', 'Price_in_taka'],
)
fig.show()