In [5]:
import pandas as pd;
import numpy as np;
from sklearn.metrics import silhouette_samples as ss
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import itertools 
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN

In [6]:
# Load dataset (replace with your actual data loading method)
survey_df = pd.read_csv("../../data/final_survey_table.csv", low_memory=False)

In [7]:
survey_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7799 entries, 0 to 7798
Data columns (total 16 columns):
 #   Column                                                                                                                             Non-Null Count  Dtype 
---  ------                                                                                                                             --------------  ----- 
 0   What is your age (# years)?                                                                                                        7799 non-null   object
 1   What is your gender?                                                                                                               7799 non-null   object
 2   In which country do you currently reside?                                                                                          7799 non-null   object
 3   What is the highest level of formal education that you have attained or plan to attain within

In [8]:
# Select only the desired columns
eda_features = ["In which country do you currently reside?", "What is your age (# years)?"]

# Extract the relevant data
eda_df = survey_df[eda_features]

In [9]:
# Create column transformer 

cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", cat_transformer, eda_features)
    ]
)


In [10]:
# Transform the features
X = preprocessor.fit_transform(eda_df)
X

<7799x70 sparse matrix of type '<class 'numpy.float64'>'
	with 15598 stored elements in Compressed Sparse Row format>

In [11]:
# Prepare the target variable for later evaluation
y = survey_df['income_category']
y

0       T20
1       B40
2       M40
3       T20
4       B40
       ... 
7794    B40
7795    M40
7796    M40
7797    T20
7798    T20
Name: income_category, Length: 7799, dtype: object

In [12]:
## Grid search to find the best parameters for DBSCAN
epsilons = np.linspace(0.01, 1, num=15)
epsilons

array([0.01      , 0.08071429, 0.15142857, 0.22214286, 0.29285714,
       0.36357143, 0.43428571, 0.505     , 0.57571429, 0.64642857,
       0.71714286, 0.78785714, 0.85857143, 0.92928571, 1.        ])

In [13]:
min_samples = np.arange(2, 20, step = 3)
min_samples

array([ 2,  5,  8, 11, 14, 17])

In [14]:
combinations = list(itertools.product(epsilons, min_samples))
combinations

[(0.01, 2),
 (0.01, 5),
 (0.01, 8),
 (0.01, 11),
 (0.01, 14),
 (0.01, 17),
 (0.08071428571428571, 2),
 (0.08071428571428571, 5),
 (0.08071428571428571, 8),
 (0.08071428571428571, 11),
 (0.08071428571428571, 14),
 (0.08071428571428571, 17),
 (0.15142857142857144, 2),
 (0.15142857142857144, 5),
 (0.15142857142857144, 8),
 (0.15142857142857144, 11),
 (0.15142857142857144, 14),
 (0.15142857142857144, 17),
 (0.22214285714285714, 2),
 (0.22214285714285714, 5),
 (0.22214285714285714, 8),
 (0.22214285714285714, 11),
 (0.22214285714285714, 14),
 (0.22214285714285714, 17),
 (0.29285714285714287, 2),
 (0.29285714285714287, 5),
 (0.29285714285714287, 8),
 (0.29285714285714287, 11),
 (0.29285714285714287, 14),
 (0.29285714285714287, 17),
 (0.3635714285714286, 2),
 (0.3635714285714286, 5),
 (0.3635714285714286, 8),
 (0.3635714285714286, 11),
 (0.3635714285714286, 14),
 (0.3635714285714286, 17),
 (0.4342857142857143, 2),
 (0.4342857142857143, 5),
 (0.4342857142857143, 8),
 (0.4342857142857143, 11),
 

In [15]:
N = len(combinations)
N

90

In [16]:
def get_scores_and_labels(combinations, X): 
    scores = []
    all_labels_list = []
    
    for i, (eps, num_samples) in enumerate(combinations):
        dbscan_cluster_model = DBSCAN(eps=eps, min_samples=num_samples).fit(X)
        labels = dbscan_cluster_model.labels_
        labels_set = set(labels)
        num_clusters = len(labels_set)
        if -1 in labels_set: 
            num_clusters = -1
            
        if (num_clusters < 2) or (num_clusters > 50):
            scores.append(-10)
            all_labels_list.append('bad')
            c = (eps, num_samples)
            print(f"Combination {c} on iteration {i+1} of {N} has {num_clusters} clusters. Moving on")
            continue
        
        scores.append(ss(X, labels))
        all_labels_list.append(labels)
        print(f"Index: {i}, Score {scores[-1]}, labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")
        
    best_index = np.argmax(scores)
    best_parameters = combinations[best_index]
    best_labels = all_labels_list[best_index]
    best_score = scores[best_index]
    
    return {
        "best_epsilon": best_parameters[0],
        "best_min_samples": best_parameters[1],
        "best_labels": best_labels,
        "best_score": best_score
    }
    
best_dict = get_scores_and_labels(combinations, X)

Combination (0.01, 2) on iteration 1 of 90 has -1 clusters. Moving on
Combination (0.01, 5) on iteration 2 of 90 has -1 clusters. Moving on
Combination (0.01, 8) on iteration 3 of 90 has -1 clusters. Moving on
Combination (0.01, 11) on iteration 4 of 90 has -1 clusters. Moving on
Combination (0.01, 14) on iteration 5 of 90 has -1 clusters. Moving on
Combination (0.01, 17) on iteration 6 of 90 has -1 clusters. Moving on
Combination (0.08071428571428571, 2) on iteration 7 of 90 has -1 clusters. Moving on
Combination (0.08071428571428571, 5) on iteration 8 of 90 has -1 clusters. Moving on
Combination (0.08071428571428571, 8) on iteration 9 of 90 has -1 clusters. Moving on
Combination (0.08071428571428571, 11) on iteration 10 of 90 has -1 clusters. Moving on
Combination (0.08071428571428571, 14) on iteration 11 of 90 has -1 clusters. Moving on
Combination (0.08071428571428571, 17) on iteration 12 of 90 has -1 clusters. Moving on
Combination (0.15142857142857144, 2) on iteration 13 of 90 ha

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances

# Scale the data
X_scaled = StandardScaler().fit_transform(X.toarray())

# Optionally, reduce dimensionality
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_scaled)

# Analyze distance percentiles for better eps range
distances = euclidean_distances(X_reduced)
distances = distances[np.triu_indices_from(distances, k=1)]
print(np.percentile(distances, [5, 25, 50, 75, 95]))

# Adjust parameter grid
epsilons = np.linspace(0.05, 1.5, num=20)  # Adjusted range based on distance percentiles
min_samples = np.arange(2, 20, step=3)
combinations = list(itertools.product(epsilons, min_samples))

# Rerun DBSCAN
best_dict = get_scores_and_labels(combinations, X_reduced)
print("Best DBSCAN Parameters:")
print(best_dict)


[0.40644973 1.24981823 2.06428413 2.92668506 3.99734543]
Combination (0.05, 2) on iteration 1 of 90 has -1 clusters. Moving on
Combination (0.05, 5) on iteration 2 of 90 has -1 clusters. Moving on
Combination (0.05, 8) on iteration 3 of 90 has -1 clusters. Moving on
Combination (0.05, 11) on iteration 4 of 90 has -1 clusters. Moving on
Combination (0.05, 14) on iteration 5 of 90 has -1 clusters. Moving on
Combination (0.05, 17) on iteration 6 of 90 has -1 clusters. Moving on
Combination (0.12631578947368421, 2) on iteration 7 of 90 has -1 clusters. Moving on
Combination (0.12631578947368421, 5) on iteration 8 of 90 has -1 clusters. Moving on
Combination (0.12631578947368421, 8) on iteration 9 of 90 has -1 clusters. Moving on
Combination (0.12631578947368421, 11) on iteration 10 of 90 has -1 clusters. Moving on
Combination (0.12631578947368421, 14) on iteration 11 of 90 has -1 clusters. Moving on
Combination (0.12631578947368421, 17) on iteration 12 of 90 has -1 clusters. Moving on
Comb

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (120,) + inhomogeneous part.

In [173]:
best_dict

{'best_epsilon': 0.05,
 'best_min_samples': 2,
 'best_labels': 'bad',
 'best_score': -10}

In [174]:
# Print results
print("Best DBSCAN Parameters:")
print(f"Epsilon: {best_dict['best_epsilon']}")
print(f"Min Samples: {best_dict['best_min_samples']}")
print(f"Best Silhouette Score: {best_dict['best_score']}")

Best DBSCAN Parameters:
Epsilon: 0.05
Min Samples: 2
Best Silhouette Score: -10


In [175]:
survey_df['cluster'] = best_dict['best_labels']

survey_df['cluster'].value_counts()

cluster
bad    7799
Name: count, dtype: int64