## Clustering with exact locations

In [16]:
import numpy as np
import pandas as pd
from tabulate import tabulate
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from fuzzywuzzy import fuzz
import re

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from kmodes.kprototypes import KPrototypes

In [2]:
# Load the listings table used for clustering. low_memory=False makes pandas
# parse the file in a single pass instead of in chunks, which avoids
# mixed-dtype guesses on wide CSVs.
to_cluster_df = pd.read_csv(
    "../Dataset/New_Data/11_dist_cal_completed.csv",
    low_memory=False,
)

In [3]:
# Preview the loaded table; Jupyter renders the last expression of a cell.
to_cluster_df

Unnamed: 0,ad_id,UID,street,city,heading,desc_,posted_dat,price,price_type,price_land_pp,...,dist_bus_s,dist_post_,dist_gover,dist_polic,dist_natio,dist_other,x_sld99,y_sld99,dist_road,dist_water
0,230585,75373,Buthgamuwe Road,Rajagiriya,13 Perch Land - Rajagiriya - Buthgamuwe Road,13.4 Perch Land - Immediate Sale <br />\nPrice...,1/2/2018,1600000,Per Perch,1600000,...,2.964581,1.686405,0.904207,1.870336,1.900797,0.690246,404691.6847,490562.3857,5.946677,218.669877
1,231969,85338,Siriwardena Road,Dehiwala,Land in Dehiwela close to Hill Street,Split Level 6.69 perches of land facing Siriwa...,1/13/2018,3000000,Per Perch,3000000,...,1.934521,1.218802,0.793445,0.374929,8.006813,0.311494,400665.6562,483291.9704,119.256539,724.454534
2,231889,84432,Watareka,Homagama,Land for sale in Homagama,à·„à·à¶¸à·à¶œà¶¸ à·€à¶§à¶»à·à¶š à·„à¶ºà·’à¶...,1/12/2018,285000,Per Perch,285000,...,8.847441,2.060439,1.397882,1.949055,5.777922,0.549640,423417.8759,483980.4234,135.197494,89.933025
3,234384,87065,Robert Gunawardana Mw,Malabe,10 P Residential Land Plots for Sale on Robert...,10 P highly residential land plots for sale in...,2/1/2018,1000000,Per Perch,1000000,...,4.537480,0.822716,0.790496,3.508479,4.167544,0.482050,410169.3009,489696.9550,250.725410,1262.706775
4,233356,91020,Old Kottawa Road,Mirihana,Land for Sale in Mirihana,Bare land for sale in Mirihana facing Old Kott...,1/24/2018,3250000,Per Perch,3250000,...,1.454383,1.612721,1.188002,0.287015,3.881640,0.690426,403916.3015,485971.0967,5.275334,410.043706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9975,5446309,195467,Mel road,Katubedda,කටුබැද්දෙන් පාර අයිනේ වටිනා ඉඩමක්,අවස්ථාව මග හැරගන්න එපා...<br />\nමේ ඔබට ලැබෙන ...,3/11/2023,1000000,Per Perch,1000000,...,2.648356,2.708873,1.260597,1.655175,7.669409,0.508840,403385.6823,478044.1298,658.682153,248.041042
9976,5446532,139136,,Homagama,27 perches bare land for sale in Homagama for ...,1.5km to Homagama town <br />\n4km to Athurugi...,3/11/2023,1325000,Per Perch,1325000,...,1.570572,2.931833,1.385384,2.075257,5.197747,0.894467,414950.7171,483992.1322,8.267908,1435.686814
9977,5446655,125887,Rathnarama mawatha,Malabe,59 perches bare land for sale in Malabe for Rs...,59 perches land<br />\nTwo storrey building<br...,3/11/2023,1550000,Per Perch,1550000,...,2.534638,2.320759,2.300348,3.476300,5.484368,0.110846,411398.0385,488088.3903,752.870005,673.707551
9978,5446833,120689,Malabe,Malabe,Commercial Land For Sale,"Address: Kaduwela Main Road (New Kandy Road), ...",3/12/2023,23250000,Per Perch,23250000,...,4.388711,0.617471,0.596578,3.453980,4.023999,0.253730,410051.2233,489363.5630,4.868584,1265.663064


In [18]:
# Distance features kept for clustering.
distance_columns = [
    'dist_Hospi',
    'dist_rail_',
    'dist_bus_s',
    'dist_gover',
    'dist_polic',
    'dist_natio',
    'dist_other',
    'dist_road',
    'dist_water',
]

# Final feature set: the city label, the per-perch land price, then the distances.
selected_columns = ['main_city', 'price_land_pp', *distance_columns]
to_cluster_filtered_df = to_cluster_df[selected_columns]

In [19]:
# Discard every row that has a missing value in any of the selected columns
# (axis=0 / how='any' are the pandas defaults, spelled out for the reader).
to_cluster_filtered_df = to_cluster_filtered_df.dropna(axis=0, how='any')

# Show the cleaned frame.
to_cluster_filtered_df

Unnamed: 0,main_city,price_land_pp,dist_Hospi,dist_rail_,dist_bus_s,dist_gover,dist_polic,dist_natio,dist_other,dist_road,dist_water
0,rajagiriya,1600000,1.276662,2.961920,2.964581,0.904207,1.870336,1.900797,0.690246,5.946677,218.669877
1,dehiwala,3000000,1.623912,1.437849,1.934521,0.793445,0.374929,8.006813,0.311494,119.256539,724.454534
2,homagama,285000,1.953865,1.570756,8.847441,1.397882,1.949055,5.777922,0.549640,135.197494,89.933025
3,malabe,1000000,2.198801,6.721650,4.537480,0.790496,3.508479,4.167544,0.482050,250.725410,1262.706775
4,nugegoda,3250000,1.554399,0.934212,1.454383,1.188002,0.287015,3.881640,0.690426,5.275334,410.043706
...,...,...,...,...,...,...,...,...,...,...,...
9975,moratuwa,1000000,1.800155,2.993784,2.648356,1.260597,1.655175,7.669409,0.508840,658.682153,248.041042
9976,homagama,1325000,1.629373,1.686171,1.570572,1.385384,2.075257,5.197747,0.894467,8.267908,1435.686814
9977,malabe,1550000,2.565991,5.350379,2.534638,2.300348,3.476300,5.484368,0.110846,752.870005,673.707551
9978,malabe,23250000,2.396529,6.371488,4.388711,0.596578,3.453980,4.023999,0.253730,4.868584,1265.663064


In [23]:
# Print the dtype and non-null count of each selected column, to confirm no
# missing values remain after dropna().
to_cluster_filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9980 entries, 0 to 9979
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   main_city      9980 non-null   object 
 1   price_land_pp  9980 non-null   int64  
 2   dist_Hospi     9980 non-null   float64
 3   dist_rail_     9980 non-null   float64
 4   dist_bus_s     9980 non-null   float64
 5   dist_gover     9980 non-null   float64
 6   dist_polic     9980 non-null   float64
 7   dist_natio     9980 non-null   float64
 8   dist_other     9980 non-null   float64
 9   dist_road      9980 non-null   float64
 10  dist_water     9980 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 857.8+ KB


In [35]:
# NOTE(review): this elbow search is disabled (fully commented out). The saved
# output of this cell is a KeyboardInterrupt, so the run appears to have been
# stopped by hand. It requests n_init=500 for each of 18 values of K, which is
# presumably why it was too slow -- try a much smaller n_init before re-enabling.
# NOTE(review): two things to fix if this is revived:
#   * `plt.show` on the last line is missing its call parentheses;
#   * `plt.plot(cost)` plots against the list index (0..17), not K (2..19).
# #Choosing optimal K value
# cost = []
# X = to_cluster_filtered_df
# for num_clusters in list(range(2,20)):
#     kproto = KPrototypes(n_clusters=num_clusters, init='Huang', random_state=42,n_jobs=-2,max_iter=100,n_init=500) 
#     kproto.fit_predict(X, categorical=[0])
#     cost.append(kproto.cost_)

# plt.plot(cost)
# plt.xlabel('K')
# plt.ylabel('cost')
# plt.show

KeyboardInterrupt: 

In [34]:
# NOTE(review): this pipeline-based elbow experiment is disabled (fully
# commented out). Its saved output ends in
#   NotImplementedError: No categorical data selected, effectively doing k-means.
# Known problems to address before re-enabling:
#   * `model.fit(to_cluster_filtered_df)` never tells KPrototypes which columns
#     are categorical. With a scikit-learn Pipeline that would be passed as a
#     step parameter (e.g. `kproto__categorical=[...]`) -- TODO confirm. Because
#     the ColumnTransformer lists 'num' before 'cat', the encoded city column
#     presumably ends up last in the transformed array, not at position 0.
#   * `LabelEncoder` is used but not imported, either in this cell or in the
#     notebook's import cell.
#   * `DataFrame.iteritems` (else-branch of `transform`) was removed in
#     pandas 2.0; `items` is the replacement.
#   * `MultiColumnLabelEncoder.transform` re-fits a new LabelEncoder on every
#     call, so the integer codes are not stable between fit and predict data.
# import pandas as pd
# from kmodes.kprototypes import KPrototypes
# import matplotlib.pyplot as plt
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# # Custom LabelEncoder for handling categorical variables in ColumnTransformer
# class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
#     def __init__(self, columns=None):
#         self.columns = columns

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         output = X.copy()
#         if self.columns is not None:
#             for col in self.columns:
#                 output[col] = LabelEncoder().fit_transform(output[col])
#         else:
#             for colname, col in output.iteritems():
#                 output[colname] = LabelEncoder().fit_transform(col)
#         return output

# # Assuming 'to_cluster_filtered_df' is your DataFrame
# # Specify the categorical column and numerical columns
# categorical_columns = ['main_city']
# numerical_columns = ['price_land_pp', 'dist_Hospi', 'dist_rail_', 'dist_bus_s', 'dist_gover', 'dist_polic', 'dist_natio', 'dist_other', 'dist_road', 'dist_water']

# # Preprocessing pipeline for numerical and categorical columns
# numeric_transformer = Pipeline(steps=[
#     ('scaler', StandardScaler())
# ])

# categorical_transformer = Pipeline(steps=[
#     ('label_encoder', MultiColumnLabelEncoder(columns=categorical_columns))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numerical_columns),
#         ('cat', categorical_transformer, categorical_columns)
#     ])

# # Combine preprocessing with k-prototype clustering
# def get_kprototypes_model(n_clusters):
#     return Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('kproto', KPrototypes(n_clusters=n_clusters, init='Cao', verbose=2))
#     ])

# # Define the range of clusters to try
# clusters_range = range(1, 11)  # Adjust the range as needed

# # Fit k-prototype clustering for each number of clusters and store cost
# costs = []
# for n_clusters in clusters_range:
#     model = get_kprototypes_model(n_clusters)
#     model.fit(to_cluster_filtered_df)
#     costs.append(model.named_steps['kproto'].cost_)

# # Plot the elbow curve
# plt.plot(clusters_range, costs, marker='o')
# plt.title('Elbow Method for Optimal k')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Cost')
# plt.show()


Initialization method and algorithm are deterministic. Setting n_init to 1.


NotImplementedError: No categorical data selected, effectively doing k-means. Present a list of categorical columns, or use scikit-learn's KMeans instead.

In [36]:
# Continuous features: every selected column except the categorical 'main_city'.
numerical_col = [
    'price_land_pp',
    'dist_Hospi',
    'dist_rail_',
    'dist_bus_s',
    'dist_gover',
    'dist_polic',
    'dist_natio',
    'dist_other',
    'dist_road',
    'dist_water',
]

# Standardise each numerical feature (zero mean, unit variance) so that columns
# on very different scales contribute comparably to the clustering distance.
# The scaled values overwrite the raw ones in to_cluster_filtered_df.
standard_scaler = StandardScaler()
scaled_numerical = standard_scaler.fit_transform(to_cluster_filtered_df[numerical_col])
to_cluster_filtered_df[numerical_col] = scaled_numerical

In [37]:
# K-Prototypes is given a NumPy array, so build one from an explicit feature
# list. Selecting the columns by name (instead of taking every column of the
# frame) keeps this cell safe to re-run: once the clustering cell has added a
# 'clusters' column, a plain `.values` would carry the previous labels into the
# array as an extra feature.
feature_columns = ['main_city'] + numerical_col
df_to_array = to_cluster_filtered_df[feature_columns].to_numpy()

# Positions of the numerical features, derived from the column names so they
# cannot drift out of sync with the feature list.
numerical_col_index = [feature_columns.index(col) for col in numerical_col]
# The array has object dtype because of the string column; make sure the
# numerical entries are plain floats.
df_to_array[:, numerical_col_index] = df_to_array[:, numerical_col_index].astype(float)

# Position of the categorical feature (passed to KPrototypes as `categorical=`).
categorical_index = [feature_columns.index('main_city')]

In [45]:
# Fit K-Prototypes with Huang initialisation on the prepared array.
# KPrototypes is already imported in the notebook's import cell, so it is not
# re-imported here. The cluster count (6) is hard-coded.
KPro = KPrototypes(n_clusters=6, init='Huang', random_state=1234, n_jobs=-1)

# `fit` alone populates `labels_` and `cost_`. `fit_predict` would run an extra
# prediction pass over the data whose return value was being discarded.
KPro.fit(df_to_array, categorical=categorical_index)
print(KPro.cost_)

# Store each row's cluster label in a new 'clusters' column.
to_cluster_filtered_df['clusters'] = KPro.labels_

53272.940817034716


In [46]:
# Preview the frame with the scaled features and the new 'clusters' column.
to_cluster_filtered_df

Unnamed: 0,main_city,price_land_pp,dist_Hospi,dist_rail_,dist_bus_s,dist_gover,dist_polic,dist_natio,dist_other,dist_road,dist_water,clusters
0,rajagiriya,-0.123890,-0.350081,-0.029598,-0.148467,-0.356297,0.078612,-1.386663,0.604416,-0.645188,-0.917079,0
1,dehiwala,-0.084061,-0.092774,-0.621891,-0.527265,-0.470881,-0.800643,1.163655,-0.549905,-0.434092,0.049464,1
2,homagama,-0.161300,0.151717,-0.570239,2.014917,0.154411,0.124896,0.232707,0.175894,-0.404394,-1.163092,5
3,malabe,-0.140959,0.333211,1.431530,0.429957,-0.473932,1.041790,-0.439904,-0.030100,-0.189167,1.078052,3
4,nugegoda,-0.076949,-0.144282,-0.817617,-0.703832,-0.062711,-0.852333,-0.559318,0.604966,-0.646439,-0.551368,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9975,moratuwa,-0.140959,0.037820,-0.017214,-0.264757,0.012389,-0.047897,1.022731,0.051548,0.570854,-0.860951,1
9976,homagama,-0.131714,-0.088727,-0.525386,-0.661104,0.141482,0.199099,-0.009616,1.226821,-0.640864,1.408613,2
9977,malabe,-0.125313,0.605294,0.898619,-0.306576,1.088015,1.022869,0.110098,-1.161417,0.746326,-0.047512,3
9978,malabe,0.492030,0.479725,1.295448,0.375248,-0.674541,1.009746,-0.499859,-0.725952,-0.647196,1.083701,3


In [47]:
# Number of rows assigned to each cluster, largest cluster first.
cluster_labels = to_cluster_filtered_df['clusters']
cluster_counts = cluster_labels.value_counts()

# Print the per-cluster sizes.
print(cluster_counts)

clusters
1    3422
0    2777
2    1320
3    1260
5     623
4     578
Name: count, dtype: int64
