In [None]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

%matplotlib inline

In [None]:
# Location of the filtered Amazon dataset, relative to this notebook.
DATA_PATH = '../Data/amazon_filtered_0513.csv'

# The first CSV column is used as the row index rather than as a feature.
df = pd.read_csv(DATA_PATH, index_col=0)
df.head(2)

In [None]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Keep only the numeric columns: the scalers and KMeans used below are fitted
# directly on `df`. Selecting (instead of dropping with inplace=True) leaves the
# loaded frame object untouched and gives the same result however often the
# cell is re-run.
columns_not_numeric = df.select_dtypes(exclude='number').columns
df = df.select_dtypes(include='number')
df.head(2)

In [None]:
# Re-inspect the frame now that the non-numeric columns have been removed.
df.info()

In [None]:
def _print_plots_kde(dataframe, name_plot, plot_cols=4):
    """Draw one kernel-density-estimate panel per column of ``dataframe``.

    Panels are laid out on a grid ``plot_cols`` wide, in column order, and each
    panel is titled with the label of the column it shows. Grid cells left over
    after the last column are hidden.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are plotted with ``Series.plot(kind='kde')``.
    name_plot : str
        Name for the figure. Currently unused: the ``savefig`` call that
        consumes it is commented out at the end of the function.
    plot_cols : int, default 4
        Number of panels per grid row.

    Raises
    ------
    ValueError
        If ``dataframe`` has no columns (there is nothing to lay out).
    """
    plots_quantity = len(dataframe.columns)
    if plots_quantity == 0:
        raise ValueError('dataframe has no columns to plot')

    plot_rows = int(np.ceil(plots_quantity / plot_cols))
    plot_size_y = plot_rows * 5
    plot_size_x = plot_cols * 10

    # squeeze=False keeps `ax` two-dimensional even for a single-row grid, so
    # the panels can always be flattened and walked in reading order.
    fig, ax = plt.subplots(plot_rows, plot_cols,
                           figsize=(plot_size_x, plot_size_y), squeeze=False)
    panels = ax.flatten()

    for position, panel in enumerate(panels):
        if position < plots_quantity:
            # Title comes from the frame being plotted (not a notebook global)
            # and from the same position as the data, so label and curve match.
            dataframe.iloc[:, position].plot(
                kind='kde', ax=panel, title=str(dataframe.columns[position]))
        else:
            # Unused trailing cell of the grid.
            panel.set_visible(False)

    # plt.savefig(name_plot, bbox_inches='tight')

In [None]:
# Fit each scikit-learn scaler on the numeric frame and keep its transformed
# output; the unpacking order matches the order of the scalers listed below.
transform_normalized, transform_maxmin, transform_standard, transform_robust = (
    scaler.fit_transform(df)
    for scaler in (
        preprocessing.Normalizer(),
        preprocessing.MinMaxScaler(),
        preprocessing.StandardScaler(),
        preprocessing.RobustScaler(),
    )
)


In [None]:
# KDE panels for the unscaled numeric features, as a baseline for the scaled versions below.
_print_plots_kde(df, 'original')

In [None]:
# KDE panels for the Normalizer output. The array is named `transform_normalized`
# (the previous `transform_normal` was never defined and raised NameError);
# columns=df.columns keeps the feature labels on the re-wrapped array.
_print_plots_kde(pd.DataFrame(transform_normalized, columns=df.columns), 'normalized')

In [None]:
# KDE panels for the MinMaxScaler output; columns=df.columns keeps the feature
# labels that are lost when the scaler returns a bare array.
_print_plots_kde(pd.DataFrame(transform_maxmin, columns=df.columns), 'min-max')

In [None]:
# KDE panels for the StandardScaler output; columns=df.columns keeps the feature
# labels that are lost when the scaler returns a bare array.
_print_plots_kde(pd.DataFrame(transform_standard, columns=df.columns), 'standard')

In [None]:
# KDE panels for the RobustScaler output; columns=df.columns keeps the feature
# labels that are lost when the scaler returns a bare array.
_print_plots_kde(pd.DataFrame(transform_robust, columns=df.columns), 'robust')

In [None]:
# Scaled feature matrices paired with the label shown as the panel title in the
# elbow plots. Labels use the same wording as the KDE cells above.
transforms = [
    (transform_normalized, 'normalized'),
    (transform_standard, 'standard'),
    (transform_maxmin, 'min-max'),
    (transform_robust, 'robust'),
]

In [None]:
# Elbow curves: for every scaled version of the data, fit KMeans for each
# candidate cluster count and plot the resulting score, one panel per scaling.
iteractions = range(1, 15)  # candidate n_clusters values: 1 .. 14

plots_quantity = len(transforms)
plot_cols = 2

plot_rows = int(np.ceil(plots_quantity / plot_cols))
plot_size_y = plot_rows * 5
plot_size_x = plot_cols * 10

# squeeze=False keeps `ax` two-dimensional even if only one row of panels is needed.
fig, ax = plt.subplots(plot_rows, plot_cols,
                       figsize=(plot_size_x, plot_size_y), squeeze=False)
panels = ax.flatten()

# Panels are filled in the order of the `transforms` list.
for panel, (data, label) in zip(panels, transforms):
    # Fixed random_state: KMeans initialisation is random, so without a seed
    # the curves change on every run.
    score = [
        KMeans(n_clusters=k, random_state=42).fit(data).score(data)
        for k in iteractions
    ]

    panel.plot(iteractions, score)
    panel.set_title(label)
    panel.set_xlabel('n_clusters')
    # KMeans.score is the opposite of the k-means objective (scikit-learn docs),
    # so values are negative and rise towards 0 as clusters are added.
    panel.set_ylabel('KMeans score')

# Hide grid cells that have no transform to show.
for panel in panels[plots_quantity:]:
    panel.set_visible(False)

In [None]:
# Final model: 8 clusters fitted on the standard-scaled features. random_state
# is fixed because KMeans initialisation is random; without it the cluster
# numbering and centres inspected in the following cells differ between runs.
clusters = KMeans(n_clusters=8, random_state=42).fit(transform_standard)

In [None]:
# One row per cluster centre, one column per feature of `df`. The centres come
# from the model fitted on `transform_standard`, so the values are in the
# StandardScaler-transformed space, not in the original units.
df_clusters = pd.DataFrame(clusters.cluster_centers_, columns = df.columns)
df_clusters

In [None]:
# Summary statistics of every feature across the cluster centres.
df_clusters.describe()

In [None]:
# Feature values of the first cluster centre (row position 0).
df_clusters.iloc[0]

In [None]:
# Radar chart of the cluster centres: one spoke per feature, one closed
# polygon per cluster.
categories = df_clusters.columns
spoke_count = len(categories)

# Evenly spaced spoke angles around the circle; the first angle is repeated at
# the end so that each polygon returns to its starting spoke.
angles = [n / float(spoke_count) * 2 * np.pi for n in range(spoke_count)]
angles = angles + angles[:1]

fig, ax = plt.subplots(
    nrows=1,
    ncols=1,
    figsize=(8, 8),
    subplot_kw={'polar': True},
)

# One tick per feature (the duplicated closing angle is excluded).
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, color='grey', size=12)

for cluster in df_clusters.index:
    row = df_clusters.iloc[cluster].values.tolist()
    values = row + row[:1]  # repeat the first value to close the circular graph
    ax.plot(
        angles,
        values,
        linewidth=1,
        linestyle='solid',
        label='cluster_{}'.format(cluster),
    )

ax.legend()
plt.show()