In [2]:
import numpy as np
import pandas as pd
from kmodes.kmodes import KModes

Q: How does k-protypes know which of my features are numerical and which are categorical?

A: You tell it which column indices are categorical using the categorical argument. All others are assumed numerical. E.g., clusters = KPrototypes().fit_predict(X, categorical=[1, 2])

In [7]:
df = pd.read_csv('../../data/df_final_stationery.csv')
# df = pd.read_csv('../../final_dfs/df_final_without_labor.csv')

In [3]:
df.shape

(401, 179)

In [8]:
list(df.columns)

['cluster',
 'kreis',
 'ags5',
 'ags2',
 'supermarkets_population',
 'supermarkets_average_distance',
 'public_transport_availability',
 'average_distance_bus_stop',
 'average_distance_train_station',
 'average_distance_public_transport',
 'Unnamed:_0',
 'number_of_students',
 'number_of_hospitals',
 'number_of_hospital_beds',
 'number_of_hospital_beds_adj',
 'hospital_patiants',
 'households_of_1_person',
 'households_of_2_person',
 'households_of_3_person',
 'households_of_4_person',
 'households_of_5_person_or_more',
 'household_with_kids',
 'household_with_kids_under_3',
 'household_with_kids_over_3_under_6',
 'household_with_kids_over_6_under_10',
 'household_with_kids_over_10_under_15',
 'household_with_kids_over_15_under_18',
 'household_with_double_income_no_kids',
 'car_density',
 'no_of_paths_per_person_and_day',
 'kilometers_per_person_and_day',
 '_percentage_out_of_home',
 'share_of_journeys_on_foot',
 'share_of_journeys_on_bike',
 'proportion_of_motorised_vehicle_passenger

In [9]:
cat_cols = ['labor_market_region', 'growing_/_shrinking_circles',
            'labor_market_type', 'grw_funding_framework',
            'settlement_structure_type_of_labor_market_region',
            'room_type_location', 'district_settlement_structure',
            'type_of_settlement_structure', 'urban_/_rural',
            'metropolitan_region', 'metropolitan_area',
            'east_west', 'border_proximity',
            'support_area_status', 'eligible_area']

In [12]:
df[cat_cols]

Unnamed: 0,labor_market_region,growing_/_shrinking_circles,labor_market_type,grw_funding_framework,settlement_structure_type_of_labor_market_region,room_type_location,district_settlement_structure,type_of_settlement_structure,urban_/_rural,metropolitan_region,metropolitan_area,east_west,border_proximity,support_area_status,eligible_area
0,4,5,5,1,3,2,4,3,2,99,99,1,1,C,1
1,6,4,2,4,1,2,1,2,1,99,99,1,0,C/D,1
2,5,4,5,4,1,1,1,2,1,5,99,1,0,C/D,1
3,6,4,5,6,1,2,3,2,2,5,99,1,0,D,1
4,2,4,5,1,3,4,4,3,2,5,99,1,0,C,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,256,1,10,2,3,3,4,2,2,99,99,2,0,C*,1
397,244,2,10,2,1,3,3,2,2,99,99,2,0,C*,1
398,257,1,11,2,3,3,4,2,2,99,99,2,2,C*,1
399,243,2,11,2,1,3,2,2,1,99,99,2,0,C*,1


In [6]:
df1 = df.drop(columns=['kreis', 'ags5', 'ags2', 'Unnamed:_0', 'eligible_area'])
# df1 = df.drop(columns=['cluster', 'kreis', 'ags5', 'ags2', 'Unnamed:_0', 'support_area_status'])

In [7]:
km = KModes(n_clusters=3, init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(df1, categorical=cat_cols)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 90, cost: 61554.0
Run 1, iteration: 2/100, moves: 33, cost: 61460.0
Run 1, iteration: 3/100, moves: 9, cost: 61442.0
Run 1, iteration: 4/100, moves: 1, cost: 61440.0
Run 1, iteration: 5/100, moves: 1, cost: 61438.0
Run 1, iteration: 6/100, moves: 0, cost: 61438.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 79, cost: 61778.0
Run 2, iteration: 2/100, moves: 63, cost: 61626.0
Run 2, iteration: 3/100, moves: 47, cost: 61525.0
Run 2, iteration: 4/100, moves: 15, cost: 61493.0
Run 2, iteration: 5/100, moves: 1, cost: 61493.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 62, cost: 61334.0
Run 3, iteration: 2/100, moves: 9, cost: 61325.0
Run 3, iteration: 3/100, moves: 0, cost: 61325.0
Init: initializing centroids
Init: initializing clusters
Starting i

In [8]:
#km.cluster_centroids_

In [9]:
#km.labels_

In [10]:
df['cluster'] = km.labels_

In [11]:
df_plot = df[['kreis', 'ags5', 'cluster']]

In [12]:
df_plot.to_csv('kmodes3.csv')

In [40]:
df['cluster'].value_counts()

2    190
0    114
1     97
Name: cluster, dtype: int64

In [38]:
pd.DataFrame(df['cluster'].value_counts())

Unnamed: 0,cluster
2,190
0,114
1,97


In [18]:
cat_cols.index

<function list.index(value, start=0, stop=9223372036854775807, /)>

In [24]:
options = list(df.columns)

In [26]:
[options.index(value) for value in cat_cols]

[164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178]