In [39]:
import pandas as pd
from utils import expand_dataset
import numpy as np

In [45]:
# Load the business table from CSV; the path is relative to the notebook's working directory.
df_business = pd.read_csv('../dataset/df_business_final.csv')

In [46]:
# Print the (rows, columns) shape, then show the first five rows as the cell output.
print(df_business.shape)
df_business.head(5)

150346


Unnamed: 0,AcceptsInsurance,AgesAllowed,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,ByAppointmentOnly,CoatCheck,Corkage,DietaryRestrictions,DogsAllowed,...,no_music,review_count,romantic,stars,street,touristy,trendy,upscale,valet,video
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,22.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,80.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,13.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def filter_by_city(city, df):
    """Return the rows of ``df`` whose ``city`` value contains ``city``.

    The match is delegated to ``Series.str.contains`` with its default
    settings, so ``city`` is interpreted as a pattern and matched
    case-sensitively. Rows whose ``city`` entry is missing never match.
    """
    # na=False maps missing entries to False, so the mask is purely boolean.
    city_matches = df['city'].str.contains(city, na=False)
    return df[city_matches]


# Keep only the rows whose 'city' value contains 'Tucson'.
df_business_filtered = filter_by_city('Tucson', df_business)

# Preview the first five filtered rows.
df_business_filtered.head(5)

# Utils (duplicated from user_task)


In [None]:
from itertools import combinations

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def score_plot_and_get_best(data_frame, algorithm, number_of_tests, random_state=None, min_samples=10):
    """Sweep one clustering hyper-parameter and return the labels of the best run.

    When ``algorithm == "kmeans"`` the swept parameter is ``n_clusters``; for any
    other value DBSCAN is used and the swept parameter is ``eps``. Every run is
    scored with the silhouette, Calinski-Harabasz and Davies-Bouldin indices.
    The score table is printed and, for K-Means only, the inertia curve is
    plotted on the current matplotlib figure.

    The winning parameter value is the one chosen by most of the three metrics
    (highest silhouette, highest Calinski-Harabasz, lowest Davies-Bouldin).
    If all three disagree, the silhouette choice wins.

    Parameters
    ----------
    data_frame
        Feature matrix passed unchanged to ``fit_predict`` and to the metrics.
    algorithm : str
        ``"kmeans"`` selects K-Means; anything else selects DBSCAN.
    number_of_tests : sequence
        Values of the swept parameter to try.
    random_state : optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour; pass an int for reproducible runs.
    min_samples : int, default 10
        Forwarded to ``DBSCAN``.

    Returns
    -------
    The label array produced by the run with the winning parameter value.

    Raises
    ------
    ValueError
        If ``number_of_tests`` is empty, or if no run produced a labelling
        that the metrics can score.
    """
    is_kmeans = algorithm == "kmeans"
    parameter_to_detect = "n_clusters" if is_kmeans else "eps"

    labels = {}
    score_rows = []
    inertias_for_kmeans = []
    not_scoreable = float("nan")

    for i in number_of_tests:
        if is_kmeans:
            model = KMeans(n_clusters=i, random_state=random_state)
        else:
            model = DBSCAN(eps=i, min_samples=min_samples)

        i_labels = model.fit_predict(data_frame)

        if is_kmeans:
            inertias_for_kmeans.append(model.inertia_)

        # Keep the labels of every run so the winner can be returned later.
        labels[i] = i_labels

        # The three metrics raise ValueError unless the number of distinct
        # labels lies in [2, n_samples - 1]. DBSCAN can violate that (for
        # example a single label for every sample), so such runs are recorded
        # as NaN instead of aborting the whole sweep. The DBSCAN noise label
        # (-1) is counted as a label, exactly as the metrics themselves do.
        distinct_labels = len(set(i_labels))
        if 2 <= distinct_labels <= len(i_labels) - 1:
            silhouette = silhouette_score(data_frame, i_labels)
            calinski_harabasz = calinski_harabasz_score(data_frame, i_labels)
            davies_bouldin = davies_bouldin_score(data_frame, i_labels)
        else:
            silhouette = calinski_harabasz = davies_bouldin = not_scoreable

        score_rows.append({
            parameter_to_detect: i,
            "silhouette_score": silhouette,
            "calinski_harabasz_score": calinski_harabasz,
            "davies_bouldin_score": davies_bouldin,
        })

    if not score_rows:
        raise ValueError("number_of_tests must contain at least one value")

    # Index the score table by the swept parameter for easier reading/plotting.
    df_scores = pd.DataFrame(score_rows).set_index(parameter_to_detect)

    print(df_scores)

    if is_kmeans:
        # Plot inertias
        plt.plot(number_of_tests, inertias_for_kmeans, 'bx-')
        plt.title('Inertias')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')

    # Only runs that could actually be scored take part in the vote.
    scoreable = df_scores.dropna()
    if scoreable.empty:
        raise ValueError(
            "no tested value of " + parameter_to_detect
            + " produced between 2 and n_samples - 1 distinct labels"
        )

    best_scores_list = [
        scoreable["silhouette_score"].idxmax(),
        scoreable["calinski_harabasz_score"].idxmax(),
        scoreable["davies_bouldin_score"].idxmin(),
    ]

    # max() over the list returns the first entry with the highest vote count,
    # so ties resolve deterministically in favour of the silhouette choice.
    best_parameter = max(best_scores_list, key=best_scores_list.count)

    print("Best " + parameter_to_detect + ": ", best_parameter)

    return labels[best_parameter]


def add_new_column(algorithm, df_new_column, initial_data_frame):
    """Return a copy of ``initial_data_frame`` with the cluster labels appended.

    Parameters
    ----------
    algorithm : str
        ``'kmeans'`` names the new column ``'cluster'``; any other value names
        it ``'eps'``.
    df_new_column : array-like or pandas.Series
        One label per row of ``initial_data_frame``, in row order.
    initial_data_frame : pandas.DataFrame
        Frame the labels belong to. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``initial_data_frame`` plus the new label column, same index.

    Raises
    ------
    ValueError
        If an index-less ``df_new_column`` does not have exactly one entry per
        row of ``initial_data_frame``.
    """
    column_name = 'cluster' if algorithm == 'kmeans' else 'eps'

    if isinstance(df_new_column, pd.Series):
        # A Series already carries its own index; keep label-based alignment.
        new_column = pd.DataFrame({column_name: df_new_column})
    else:
        # Plain arrays/lists are positional. concat(axis=1) aligns on index, so
        # the labels must carry the frame's own index; otherwise a filtered
        # frame (non-contiguous index) gets misaligned rows and NaN padding.
        new_column = pd.DataFrame(
            {column_name: df_new_column},
            index=initial_data_frame.index,
        )

    return pd.concat([initial_data_frame, new_column], axis=1)


def plot_clusters(algorithm, data_frame, num_row, num_col, fig_width, fig_height):
    """Scatter-plot pairs of feature columns, coloured by cluster label.

    One subplot is drawn per pair of columns (every 2-combination of the
    columns other than the label column), filling a ``num_row`` x ``num_col``
    grid row by row. If there are more pairs than grid cells, only the first
    ``num_row * num_col`` pairs are drawn; if there are fewer, the remaining
    axes stay empty.

    Parameters
    ----------
    algorithm : str
        ``'kmeans'`` reads labels from the ``'cluster'`` column; any other
        value reads them from the ``'eps'`` column.
    data_frame : pandas.DataFrame
        Feature columns plus the label column.
    num_row, num_col : int
        Shape of the subplot grid.
    fig_width, fig_height : float
        Passed to matplotlib as ``figsize``.
    """
    c_column = 'cluster' if algorithm == 'kmeans' else 'eps'

    # Lazy iterator: pairs beyond the grid capacity are never materialised.
    feature_pairs = combinations(data_frame.drop(columns=c_column).columns, 2)

    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    figure, axes = plt.subplots(
        num_row, num_col, figsize=(fig_width, fig_height), squeeze=False
    )

    # axes.flat walks the grid row by row. zip stops as soon as either the grid
    # or the pairs run out, so the grid can never be indexed out of bounds.
    for ax, (first_feature, second_feature) in zip(axes.flat, feature_pairs):
        ax.scatter(
            data_frame[first_feature],
            data_frame[second_feature],
            c=data_frame[c_column],
            cmap='rainbow'
        )
        ax.set_title(f"{first_feature} and {second_feature}")

# K-Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [None]:
# Work on a copy so the K-Means steps operate on their own frame rather than on df_business_filtered.
df_business_filtered_kmeans = df_business_filtered.copy()

In [None]:
# Try n_clusters = 2, 3 and 4 and keep the label array of the winning run.
# NOTE(review): the frame is handed to KMeans unchanged, so every column must be
# numeric. filter_by_city treats 'city' as text — confirm it is dropped or encoded first.
df_cluster_kmeans = score_plot_and_get_best(df_business_filtered_kmeans, "kmeans", np.arange(2, 5))

In [None]:
# Append the chosen K-Means labels to the filtered frame as a 'cluster' column.
df_business_merged_filtered_kmeans = add_new_column('kmeans', df_cluster_kmeans, df_business_filtered_kmeans)

In [None]:
# Scatter-plot feature pairs coloured by cluster on a 2 x 5 subplot grid with figsize (30, 15).
# NOTE(review): a 2 x 5 grid has 10 cells; confirm the frame has few enough feature columns
# for the number of column pairs to fit.
plot_clusters('kmeans', df_business_merged_filtered_kmeans, 2, 5, 30, 15)