In [None]:
import numpy as np
import pandas as pd

# Location of the housing CSV inside the handson-ml2 GitHub repository.
url = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv'
# Read the CSV straight from the URL (needs network access).
df = pd.read_csv(filepath_or_buffer=url)

# Display the first five rows as the cell output.
df.head(n=5)

In [None]:
# Individual coordinate columns, kept as Series.
lat, longg = df['latitude'], df['longitude']
# Two-column frame holding (latitude, longitude) for every row.
lat_long = df.loc[:, ['latitude', 'longitude']]
# Same coordinates as a plain NumPy array; this is the clustering input.
X = lat_long.to_numpy()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score as ss
import itertools


# Search grid for DBSCAN: 15 evenly spaced eps values in [0.01, 1] ...
epsilons = np.linspace(0.01, 1, 15)
# ... crossed with min_samples values 2, 5, 8, 11, 14, 17.
min_samples = np.arange(2, 20, 3)
# Every (eps, min_samples) pair, materialised so it can be indexed later.
combinations = [*itertools.product(epsilons, min_samples)]
# Total number of parameter pairs in the grid.
N = len(combinations)


In [None]:
def get_scores_and_labels(combinations, X, min_clusters=2, max_clusters=50):
  """Grid-search DBSCAN parameters and pick the pair with the best silhouette score.

  Parameters
  ----------
  combinations : sequence of (eps, min_samples) pairs
      Parameter pairs to try. Must be indexable and non-empty.
  X : array-like
      Data passed unchanged to ``DBSCAN.fit`` and to the silhouette score.
  min_clusters, max_clusters : int, optional
      A fit is only scored when its cluster count (noise excluded) lies in
      ``[min_clusters, max_clusters]``. Defaults reproduce the original
      hard-coded bounds of 2 and 50.

  Returns
  -------
  dict
      Keys ``'best_epsilon'``, ``'best_min_samples'``, ``'best_labels'`` and
      ``'best_score'``. If every combination is rejected, ``'best_labels'``
      is the string ``'bad'`` and ``'best_score'`` is ``-10``; callers should
      check for that sentinel before using the labels.

  Raises
  ------
  ValueError
      If ``combinations`` is empty.
  """
  # Use the length of the argument itself rather than a notebook-level
  # global, so the progress message is correct for any grid passed in.
  total = len(combinations)
  if total == 0:
    raise ValueError("combinations must contain at least one (eps, min_samples) pair")

  scores = []
  all_labels_list = []

  for i, (eps, num_samples) in enumerate(combinations):
    labels = DBSCAN(eps=eps, min_samples=num_samples).fit(X).labels_
    labels_set = set(labels)
    # The label -1 is excluded from the cluster count (DBSCAN's noise label).
    num_clusters = len(labels_set) - (1 if -1 in labels_set else 0)

    if (num_clusters < min_clusters) or (num_clusters > max_clusters):
      # Keep both lists aligned with `combinations` by storing placeholders.
      scores.append(-10)
      all_labels_list.append('bad')
      c = (eps, num_samples)
      print(f"Combination {c} on iteration {i+1} of {total} has {num_clusters} clusters. Moving on")
      continue

    # NOTE(review): rows labelled -1 are passed to the silhouette score like
    # any other label, i.e. noise is scored as if it were a cluster - confirm
    # this is intended.
    scores.append(ss(X, labels))
    all_labels_list.append(labels)
    print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")

  # argmax returns the first maximum, so ties go to the earliest combination.
  best_index = int(np.argmax(scores))
  best_parameters = combinations[best_index]

  return {'best_epsilon': best_parameters[0],
          'best_min_samples': best_parameters[1],
          'best_labels': all_labels_list[best_index],
          'best_score': scores[best_index]}

# Run the DBSCAN grid search over every (eps, min_samples) pair on X.
best_dict = get_scores_and_labels(combinations=combinations, X=X)

In [None]:
print(best_dict)

# get_scores_and_labels stores the string 'bad' as the labels of a rejected
# combination. If every combination was rejected, that string is what comes
# back as 'best_labels', and assigning it would silently fill the whole
# column with 'bad' instead of cluster ids - fail loudly instead.
if isinstance(best_dict['best_labels'], str):
  raise ValueError(
      "No DBSCAN parameter combination produced a usable clustering; "
      "adjust the eps/min_samples grid before labelling df."
  )
df['cluster'] = best_dict['best_labels']

In [None]:
# Preview the first five rows, now including the 'cluster' column.
df.head(n=5)


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


# Continuous columns of X_train_comb used as clustering input.
continuous_features = X_train_comb[
    ['Tenure', 'NumOfProducts', 'CreditScore', 'Age', 'Balance', 'EstimatedSalary', 'PointsEarned']
]

# Standardise the features before computing distances.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(continuous_features)

# One silhouette score per candidate cluster count, plus the running best.
silhouette_scores = []
best_score, best_clusters = -1, 0

for n_clusters in range(2, 10):
    # fit() returns the estimator itself, so fit and assignment can be chained.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(scaled_features)
    cluster_labels = kmeans.labels_

    silhouette_avg = silhouette_score(scaled_features, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Strict '>' means the smallest cluster count wins on ties.
    if silhouette_avg > best_score:
        best_score, best_clusters = silhouette_avg, n_clusters

print("Best number of clusters:", best_clusters)

# Silhouette score as a function of the number of clusters.
plt.plot(range(2, 10), silhouette_scores, marker='o')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score for Different Numbers of Clusters")
plt.show()

# NOTE: This does not look promising, but we will go with 2 (maybe try 3-4 later) since it works with our binary classification problem

In [None]:
from sklearn.mixture import BayesianGaussianMixture
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming 'continuous_features' contains the continuous features from your dataframe

# Scale the continuous features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(continuous_features)

# The data shape does not change between iterations, so read it once.
n_samples, n_features = scaled_features.shape

# Define the range of number of clusters to consider
n_clusters_range = range(2, 10)

# Initialize variables to store silhouette scores and BIC values
silhouette_scores = []
bic_values = []

# Iterate over different numbers of clusters
for n_clusters in n_clusters_range:
    # Fit the Bayesian Gaussian Mixture model (default covariance_type='full')
    bgm = BayesianGaussianMixture(n_components=n_clusters, random_state=42)
    bgm.fit(scaled_features)

    # Get cluster labels
    cluster_labels = bgm.predict(scaled_features)

    # The silhouette score is only defined for at least two distinct labels.
    # The fitted mixture may end up predicting fewer labels than
    # n_components; record NaN in that case instead of raising.
    if len(np.unique(cluster_labels)) < 2:
        silhouette_avg = np.nan
    else:
        silhouette_avg = silhouette_score(scaled_features, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Calculate BIC value manually.
    # score() returns the *per-sample average* log-likelihood, so scale it by
    # n_samples to obtain the total log-likelihood that BIC is defined on.
    log_likelihood = bgm.score(scaled_features) * n_samples
    # Free parameters of a full-covariance Gaussian mixture:
    n_parameters = (
        n_clusters * n_features                             # component means
        + n_clusters * n_features * (n_features + 1) // 2   # symmetric covariance matrices
        + (n_clusters - 1)                                  # mixing weights
    )
    # NOTE(review): BIC is a heuristic here - the variational model has priors
    # that can effectively switch components off, so the nominal parameter
    # count is an upper bound.
    bic = -2 * log_likelihood + n_parameters * np.log(n_samples)
    bic_values.append(bic)

# Plotting the silhouette scores
import matplotlib.pyplot as plt

plt.plot(n_clusters_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters')
plt.show()

# Plotting the BIC values
plt.plot(n_clusters_range, bic_values, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC Value')
plt.title('BIC Value vs. Number of Clusters')
plt.show()

In [None]:
# Scratch list of candidate ratio features on X_train_comb.
# The number at the end of each line was pasted next to the expression in the
# original cell (which made the cell a SyntaxError); it is kept as a comment.
# NOTE(review): presumably a score/correlation recorded for that feature - confirm.
# NOTE(review): several column names repeat ('tes10', 'test3', 'test5'), so when
# the cell runs top to bottom later assignments overwrite earlier ones.
# NOTE(review): denominators such as GenderBinary / IsActiveMember presumably
# contain zeros, which would give inf/NaN ratios - verify before using.
X_train_comb['test10'] = X_train_comb.CardTypeOrd / X_train_comb.NumOfProducts  # 0.070721
X_train_comb['test11'] = X_train_comb.SurnameOrd / X_train_comb.IsActiveMember  # 0.013738
X_train_comb['test5'] = X_train_comb.EstimatedSalary / X_train_comb.SurnameOrd  # 0.231549
X_train_comb['tes10'] = X_train_comb.SatisfactionScore / X_train_comb.NumOfProducts  # 0.072405
X_train_comb['tes10'] = X_train_comb.Tenure / X_train_comb.NumOfProducts  # 0.066692
X_train_comb['test3'] = X_train_comb.Age / X_train_comb.GenderBinary  # 0.298647
X_train_comb['tes10'] = X_train_comb.Age / X_train_comb.NumOfProducts  # 0.280828 (don't use)
X_train_comb['test3'] = X_train_comb.Balance / X_train_comb.GenderBinary  # 0.130356
X_train_comb['test9'] = X_train_comb.Balance / X_train_comb.CreditScore  # 0.124030
X_train_comb['test5'] = X_train_comb.GenderBinary / X_train_comb.SurnameOrd  # 0.214261
X_train_comb['tes10'] = X_train_comb.GenderBinary / X_train_comb.NumOfProducts  # 0.137544
X_train_comb['tes11'] = X_train_comb.GenderBinary / X_train_comb.IsActiveMember  # 0.110281








