In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [None]:
from datacademy.modules import Module07

# Alternative (disabled): hosted development API endpoint. Uncomment to use it instead of the local server.
#module = Module07(server_address='https://devdatacademyapi.azurewebsites.net')
# Checker client used by the module.check(...) cells further down; it talks to a server on localhost.
module = Module07(server_address='localhost')

In [None]:
# Load the wholesale customers data set from a Parquet file (path is relative to the notebook).
df = pd.read_parquet("data/Wholesale_customers_data.parquet")
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

<hr>

## A. Data Understanding
First we will get acquainted with the data, for which you have to follow the steps outlined in `Easy-LMS`. In between steps we allow you to validate the shape of your data frame, which enables you to check whether you executed the previous steps correctly. To do this, simply pass the `list(df.shape)` into the checker function, for which the code will be supplied.

In [None]:
#TODO: Investigate the first rows of the data frame using .head()
# Shows the first rows of the data frame as the cell output.
df.head()

In [None]:
#TODO: Analyse the numerical values of the data frame using .describe()
# Shows summary statistics of the numerical columns as the cell output.
df.describe()

In [None]:
#TODO: Analyse the different columns of the data frame using .info()
# Prints an overview of the columns (names, non-null counts and dtypes).
df.info()

In [None]:
#TODO: Generate a pairplot using the Seaborn library.
# 'Channel' and 'Region' are left out; only the remaining columns are plotted against each other.
is_spending_column = ~df.columns.isin(['Channel', 'Region'])
spending_df = df.loc[:, is_spending_column]

sns.set(style="ticks")
sns.pairplot(spending_df)
plt.show()

<hr>

## B. Data Preparation
Now that we have an understanding of our data, we can continue with preparing it. The steps to do this are outlined in `Easy-LMS`, so follow these accordingly. In between you can check your data frame shapes in a similar manner as before, to validate whether you executed the steps correctly.

In [None]:
#TODO: Print the box plots of all columns except the 'Channel' and 'Region' column.
# Keep every column except 'Channel' and 'Region'.
keep_column = ~df.columns.isin(['Channel', 'Region'])
df_values = df.loc[:, keep_column]

# Create the 10x5 inch figure explicitly and draw the box plots on its axes.
fig, ax = plt.subplots(figsize=(10, 5))
boxplot = df_values.boxplot(column=list(df_values.columns), ax=ax)
plt.show()

In [None]:
#TODO: Develop a function that removes outliers based on a dictionary of cut-off points.
def remove_outliers(df:pd.DataFrame, cut_off:dict) -> pd.DataFrame:
    """Drop every row that exceeds the cut-off point of at least one listed column.

    Args:
        df (pd.DataFrame): Data frame to filter; it is not modified.
        cut_off (dict): Maps a column name to the largest value that is kept for that column.

    Returns:
        pd.DataFrame: The rows of df whose values are at or below every cut-off point.
    """
    filtered = df
    for column, upper_bound in cut_off.items():
        # Values equal to the cut-off point are kept (<=).
        within_bound = filtered[column] <= upper_bound
        filtered = filtered.loc[within_bound]
    return filtered

In [None]:
#TODO: Set cut-off points and remove outliers.
# Upper cut-off point per column: remove_outliers() keeps only the rows whose
# value is at or below the cut-off point in every listed column.
# Replace each `...` placeholder with the cut-off value you chose.
cut_off_dict = {
    'Fresh': ..., 
    'Milk': ..., 
    'Grocery': ..., 
    'Frozen': ..., 
    'Detergents_Paper': ..., 
    'Delicassen': ...
    }
# Note: df is overwritten with the filtered data frame.
df = remove_outliers(df=df, cut_off=cut_off_dict)

#### B1 - Validate Outlier Removal
Execute the checker function below to evaluate if you executed the outlier removal steps correctly. The function will send the `maximum` values of all columns, as this will indicate whether the correct number of outliers has been removed.

In [None]:
#TODO: Fill in the maximal values for all requested columns and validate them using the checker function.
# Replace each `...` placeholder with the maximum of that column after outlier removal.
column_max_values = {
    'Fresh': ..., 
    'Milk': ..., 
    'Grocery': ..., 
    'Frozen': ..., 
    'Detergents_Paper': ..., 
    'Delicassen': ...
    }

# Send the dictionary to the checker under exercise id "E1_B1".
module.check("E1_B1", column_max_values)

In [None]:
#TODO: Again print the box plots of all columns except the 'Channel' and 'Region' column.
# Select every column except 'Channel' and 'Region' from the current df.
non_categorical = ~df.columns.isin(['Channel', 'Region'])
df_values = df.loc[:, non_categorical]

# Same 10x5 inch figure as before, created through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
boxplot = df_values.boxplot(column=list(df_values.columns), ax=ax)
plt.show()

In [None]:
#TODO: Apply the MinMax Scaler to our data frame.
# Replace the `...` placeholders below.
# scaler: also passed to plot_clustering() later, which calls scaler.inverse_transform().
scaler = ...
# scaled_data: presumably the raw output of the scaler - confirm against the Easy-LMS steps.
scaled_data = ...
# scaled_df: used by the validation cell below and as the input for KMeans later on.
scaled_df = ...
scaled_df.head()

#### B2 - Validate Scaled Data
Execute the checker function below to evaluate if you executed the scaling operation correctly. The function will send the `minimum`, `median` and `maximum` values of all columns, as this will indicate whether the values are correctly scaled.

In [None]:
#TODO: Run the code below to validate the statistics of the scaled data.
# Minimum, median and maximum along axis 0 of the scaled data, rounded before
# being sent to the checker (min/max to 1 decimal, median to 2 decimals).
scaled_df_stats={
    "min": [round(x, 1) for x in np.min(scaled_df, axis=0)],
    "median": [round(x, 2) for x in np.median(scaled_df, axis=0)],
    "max": [round(x, 1) for x in np.max(scaled_df, axis=0)]
}

# Send the statistics to the checker under exercise id "E1_B2".
module.check("E1_B2", scaled_df_stats)

<hr>

## C. Modeling and Evaluation
Enough of the data preprocessing, it is time to develop and train a model! We will use the library `Scikit-Learn` to do so, more specifically the `KMeans()` algorithm. Please make sure that during initialisation (if possible) you set:
* `random_state` = 0, so that the end result is reproducible.

In [None]:
#TODO: Apply KMeans with 2 clusters to our scaled data frame.
# Replace the `...` placeholders below.
# km: read later through km.cluster_centers_ and passed to plot_clustering(), which reads its labels_.
km = ...
# y_predicted: echoed as the output of this cell.
y_predicted = ...
y_predicted

In [None]:
#TODO: Print the cluster centers using the code below.
# Last expression of the cell, so the centers of the fitted model are rendered as the cell output.
km.cluster_centers_

#### C1 - Validate Cluster Centers
Execute the checker function below to evaluate if you created the clusters correctly. By checking the `cluster centers` we can identify whether the clusters are similar.

In [None]:
#TODO: Validate whether the cluster centers conform to our answers.
# Turn every center into a plain list so a list of lists is sent to the checker.
cluster_centers = [list(center) for center in km.cluster_centers_]
module.check("E1_C1", cluster_centers)

In [None]:
#TODO: Use code below to plot the cluster centers on a scatter plot.
def plot_clustering(
    df:pd.DataFrame,
    kmeans:KMeans,
    scaler:MinMaxScaler,
    x_column:str,
    y_column:str):
    """
    Scatter two columns of the data coloured by K-Means cluster, with the centroids drawn on top.

    Args:
        df (pd.DataFrame): The unscaled data used for clustering.
        kmeans (KMeans): The (trained) KMeans object; its labels_ and cluster_centers_ are read.
        scaler (MinMaxScaler): The used MinMaxScaler object; used to map the centroids back to unscaled values.
        x_column (str): Column name to plot along the X-axis.
        y_column (str): Column name to plot along the y-axis.
    """
    # Work on a copy so the caller's data frame does not get the extra column.
    labelled = df.copy(deep=True)
    labelled['clusters'] = kmeans.labels_

    column_names = list(labelled.columns)
    # The cluster centers are passed through the scaler's inverse transform before plotting.
    centroids = scaler.inverse_transform(kmeans.cluster_centers_)

    # Data points, coloured by their cluster assignment.
    sns.scatterplot(data=labelled, x=x_column, y=y_column, hue='clusters')

    # Centroids: the column position in the data frame selects the matching centroid coordinate.
    x_position = column_names.index(x_column)
    y_position = column_names.index(y_column)
    sns.scatterplot(
        x=centroids[:, x_position],
        y=centroids[:, y_position],
        color='red',
        s=300,
        marker='X')

    plt.show()

In [None]:
#TODO: Use the plot_clustering() function to analyse the clusters on different intersections of the data.
# (x, y) column pairs to inspect, plotted in this order.
column_pairs = [
    ("Fresh", "Milk"),
    ("Fresh", "Grocery"),
    ("Frozen", "Grocery"),
    ("Milk", "Grocery"),
]
for x_name, y_name in column_pairs:
    plot_clustering(df=df, kmeans=km, scaler=scaler, x_column=x_name, y_column=y_name)

In [None]:
#TODO: Apply KMeans on different amounts of clusters (1 to 10) and save the inertia_ attribute.
# Replace the `...` placeholders below.
# K: the numbers of clusters to try.
K = ...
# WCV: collects one within-cluster variation value per entry of K.
WCV = []

for k in K:
    kmeans = ...
    fitted_kmeans = ...
    # The attribute of a fitted KMeans object is spelled `inertia_`.
    inertia = ...
    WCV.append(inertia)

In [None]:
#TODO: Fill in the plot function to plot the value for K on the x-axis and the WCV on the y-axis.
plt.plot(..., ...)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Within Cluster Variation (WCV)')
# Render the figure explicitly, like the other plotting cells, instead of
# echoing the Text object returned by plt.ylabel() as the cell output.
plt.show()

#### C2 - Validate Optimal Amount of Clusters
Execute the checker function below to evaluate if you decided on the correct value for the optimal number of clusters before continuing.

In [None]:
#TODO: Fill in the optimal amount of clusters and evaluate it using the checker function.
# Replace `...` with the number of clusters you chose; it is also used as n_clusters for the final KMeans model.
optimal_amount_of_clusters = ...

# Send the chosen value to the checker under exercise id "E1_C2".
module.check("E1_C2", optimal_amount_of_clusters)

In [None]:
#TODO: Use the optimal amount of clusters and get the predicted clusters.
# random_state=0 makes the cluster assignment reproducible between runs, as
# this section asks for; without it the labels (and every plot and statistic
# derived from them) can change on each execution.
km = KMeans(n_clusters=optimal_amount_of_clusters, random_state=0)
# Fit the model on the scaled data and get the cluster label of every row.
y_predicted = km.fit_predict(scaled_df)

In [None]:
#TODO: For every column print a box plot, distinguishing between different clusters.
# Copy of the unscaled data with the predicted cluster label of every row added as 'cluster'.
clustered_df = df.copy(deep=True)
clustered_df['cluster'] = y_predicted

# Replace the `...` placeholders with the columns to plot, the grouping column,
# the subplot layout and the figure size.
boxplot = clustered_df.boxplot(
    column=[...], 
    by=..., 
    layout=..., 
    figsize=...
    )

In [None]:
#TODO: Use the code below to print the cluster statistics.
# Columns that are averaged within every cluster.
averaged_columns = [
    'Channel',
    'Region',
    'Fresh',
    'Milk',
    'Grocery',
    'Frozen',
    'Detergents_Paper',
    'Delicassen',
]
# Same aggregation for every column: a one-element list holding 'mean'.
mean_per_column = {column: ['mean'] for column in averaged_columns}
cluster_statistics = clustered_df.groupby(['cluster']).agg(mean_per_column)

cluster_statistics

In [None]:
#TODO: Run the code below to visualize your clusters in 3D plot(s).
def plot_3d(df:pd.DataFrame, x:str, y:str, z:str, colors=('r', 'g', 'b', 'y'), limits=(20000, 30000, 30000)):
    """
    Create a 3D scatter plot for the K-Means clustering results.

    Only rows whose x, y and z values are all strictly below the matching
    entry of `limits` are drawn, which keeps extreme values from stretching
    the axes. All visible points are drawn with a single scatter call.

    Args:
        df (pd.DataFrame): DataFrame containing the (unscaled) data and a
            'cluster' column holding the cluster label of every row.
        x (str): Name of the column to plot along the X-axis
        y (str): Name of the column to plot along the Y-axis
        z (str): Name of the column to plot along the Z-axis
        colors (sequence): Non-empty sequence of matplotlib colors. Cluster
            label i is drawn in colors[i % len(colors)], so colors are reused
            when there are more clusters than colors.
        limits (tuple): Exclusive upper bounds for the x, y and z values of
            the plotted rows.
    """
    x_limit, y_limit, z_limit = limits

    # Vectorised version of the row-by-row filter.
    in_view = (df[x] < x_limit) & (df[y] < y_limit) & (df[z] < z_limit)
    visible = df.loc[in_view]

    # int() guards against labels stored as floats; the modulo prevents an
    # IndexError when there are more clusters than colors.
    point_colors = [colors[int(label) % len(colors)] for label in visible['cluster']]

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    if not visible.empty:
        ax.scatter(visible[x], visible[y], visible[z], c=point_colors)

    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.set_zlabel(z)

    plt.show()

In [None]:
# Switch matplotlib to the interactive widget backend for the 3D plot below.
%matplotlib widget

In [None]:
# Plot the clustered data along the 'Fresh', 'Milk' and 'Grocery' columns.
plot_3d(df=clustered_df, x='Fresh', y='Milk', z='Grocery')