### ------ Import module ------

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import matplotlib.pylab as plt

### ------ Define functions ------

In [3]:
def run_kmeans(n_clusters_f, init_f, df_f):
    # Complete this function
    # This function should at least take a dataframe as an argument. I have suggested additional arguments you may
    # want to provide, but these can be changed as you need to fit your solution.
    # The output of this function should be the input data frame will the model object KMeans and a data summary. The
    # function will need to add an additional column to the input dataframe called 'predict_cluster_kmeans'
    # that contains the cluster labels assigned by the algorithm.
    k_means_model_f = KMeans(n_clusters=n_clusters_f, init=init_f)
    k_means_model_f.fit(df_f)
    df_f['predict_cluster_kmeans'] = k_means_model_f.labels_

    # summarize cluster attributes
    k_means_model_f_summary = df_f.groupby(
        'predict_cluster_kmeans').agg(attribute_summary_method_dict)
    return k_means_model_f, k_means_model_f_summary

In [4]:
def get_hour(timestamp):
    return int(timestamp.split()[1].split(':')[0])

In [None]:
# --- set parameters
n_clusters = 3
init_point_selection_method = 'k-means++'

In [47]:
attribute_summary_method_dict = {'burger': np.mean, 'fries': np.mean, 'salad': np.mean, 'shake': np.mean, 'hour': np.mean, 'store_1': sum,
                                 'store_4': sum, 'store_6': sum, 'store_3': sum, 'store_9': sum, 'store_2': sum, 'store_8': sum, 'store_5': sum, 'store_7': sum}

In [50]:
test_df = df[['burger', 'fries', 'salad', 'shake', 'hour', 'store_1', 'store_2', 'store_3', 'store_4',
       'store_5', 'store_6', 'store_7', 'store_8', 'store_9']]

model = KMeans(3)
model.fit(test_df)

KMeans(n_clusters=3)

In [51]:
test_df['predict_cluster_kmeans'] = model.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predict_cluster_kmeans'] = model.labels_


In [52]:
test_df.groupby('predict_cluster_kmeans').agg(attribute_summary_method_dict)

Unnamed: 0_level_0,burger,fries,salad,shake,hour,store_1,store_4,store_6,store_3,store_9,store_2,store_8,store_5,store_7
predict_cluster_kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.345463,2.234386,1.888923,0.118537,12.222068,4521.0,971.0,917.0,4577.0,993.0,958.0,4660.0,4491.0,968.0
1,3.185515,3.385679,0.200057,2.831993,18.741487,928.0,6496.0,2306.0,977.0,6554.0,2188.0,1162.0,867.0,6509.0
2,1.733254,1.789796,0.056542,0.323736,0.187528,626.0,594.0,7985.0,609.0,639.0,7978.0,651.0,448.0,579.0


In [34]:
df.columns

Index(['ticket_id', 'burger', 'fries', 'salad', 'shake', 'location',
       'order_timestamp', 'hour', 'store_1', 'store_2', 'store_3', 'store_4',
       'store_5', 'store_6', 'store_7', 'store_8', 'store_9'],
      dtype='object')

### ------ Import data ------

In [5]:
df_transactions = pd.read_csv('transactions_n100000.csv')
df_transactions.head()

Unnamed: 0,ticket_id,order_timestamp,location,item_name,item_count,lat,long
0,0x209277,2019/3/2 19:59,8,shake,3,41.894202,-87.620965
1,0x209277,2019/3/2 19:59,8,burger,4,41.894202,-87.620965
2,0x209277,2019/3/2 19:59,8,fries,4,41.894202,-87.620965
3,0x30b900,2019/3/26 18:06,1,shake,3,41.880844,-87.630524
4,0x30b900,2019/3/26 18:06,1,burger,3,41.880844,-87.630524


In [6]:
df_transactions.shape

(185452, 7)

### ------ Engineer features -----

In [25]:
# --- convert from long to wide
df = df_transactions.pivot(
    index='ticket_id', columns='item_name', values='item_count').fillna(0)
df_transactions.reset_index(inplace=True)
df_transactions.drop(columns='index', inplace=True)

In [26]:
# --- add back date and location
df = df.merge(df_transactions[['ticket_id', 'location', 'order_timestamp']
                              ].drop_duplicates(), how='left', on='ticket_id')

In [27]:
# --- extract hour of day from datetime
df['hour'] = df['order_timestamp'].apply(get_hour)
#df['hour'] = df['order_timestamp'].apply(lambda x: x.hour)

In [28]:
# --- convert categorical store variables to dummies
# use sklearn.preprocessing.OneHotEncoder() to create a class object called encoded_data
encoded_data = OneHotEncoder(handle_unknown='ignore')

In [29]:
### WHAT SHOULD I FIT HERE? 

# call the method used to fit data for a OneHotEncorder object.
# Note: you will have to reshape data from a column of the data frame.
# useful functions may be DataFrame methods .to_list(), .reshape(), and .shape()
encoded_data.fit(X=np.array(df['location'].tolist()).reshape(df.shape[0], 1))

OneHotEncoder(handle_unknown='ignore')

In [30]:
# fixed split to regex because not all are 'x0_'
col_map_store_binary = dict(zip(list(encoded_data.get_feature_names()), [
    'store_' + re.split('x\d_', x)[1] for x in encoded_data.get_feature_names()]))

`for x in encoded_data.get_feature_names():`

    try:

        x.split('x0_')[1]

    except:

        #print(x.split('x0_'))

In [31]:
# fix transform data
df_store_binary = pd.DataFrame(
    encoded_data.fit_transform(df[['location']]).toarray())
# df_store_binary = pd.DataFrame(encoded_data.transform(
#    X=np.array(df['location'].tolist()).reshape(df.shape[0], 1)))
#df_store_binary.head()
df_store_binary.columns = encoded_data.get_feature_names()
df_store_binary.rename(columns=col_map_store_binary, inplace=True)

In [32]:
df = pd.concat([df, df_store_binary], axis=1)

### ------ RUN CLUSTERING -----

In [14]:
# --- set parameters
n_clusters = 3
init_point_selection_method = 'k-means++'

In [15]:
# --- select data
# specify list of attributes on which to base clusters
cols_for_clustering = ['hour', 'location']

# use reindex because loc list is deprecated
df_cluster = df.reindex(columns=cols_for_clustering)
#df_cluster = df.loc[:, cols_for_clustering]

#df_cluster.head()

In [16]:
# --- split to test and train
df_cluster_train, df_cluster_test, _, _, = train_test_split(
    df_cluster, [1]*df_cluster.shape[0], test_size=0.33)   # ignoring y values for unsupervised

In [17]:
# --- fit model
attribute_summary_method_dict = {'burger': np.mean, 'fries': np.mean, 'salad': np.mean, 'shake': np.mean, 'hour': np.mean, 'store_1': sum,
                                 'store_4': sum, 'store_6': sum, 'store_3': sum, 'store_9': sum, 'store_2': sum, 'store_8': sum, 'store_5': sum, 'store_7': sum}
col_output_order = ['burger', 'fries', 'salad', 'shake', 'hour', 'store_1', 'store_2', 'store_3', 'store_4',
                    'store_5', 'store_6', 'store_7', 'store_8', 'store_9']  # specify order of output columns for easy of readability

In [18]:
# training data
train_model, train_model_summary = run_kmeans(
    n_clusters, init_point_selection_method, df_cluster_train.reindex())

SpecificationError: nested renamer is not supported

In [None]:
# testing data
test_model, test_model_summary = run_kmeans(
    n_clusters, init_point_selection_method, df_cluster_test.reindex())
# all data
model, model_summary = run_kmeans(
    n_clusters, init_point_selection_method, df_cluster)

In [None]:
# --- run for various number of clusters
# add the code to run the clustering algorithm for various numbers of clusters
ks = range(1, 16)
inertias = []

for k in ks:
    model = KMeans(n_clusters=k, n_init=10)
    model.fit(df_cluster)
    inertias.append(model.inertia_)

In [None]:
# --- draw elbow plot
# create an elbow plot for your numbers of clusters in previous step
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# --- output tagged data for examination ----
store_col_names = ['store_1', 'store_2', 'store_3', 'store_4',
                   'store_5', 'store_6', 'store_7', 'store_8', 'store_9']
df_cluster['store'] = None
for t_col in store_col_names:
    df_cluster.loc[df_cluster[t_col] == 1, 'store'] = t_col.split('_')[1]

df_cluster.to_csv('clustering_output.csv')

In [None]:
# assign cluster mode to location
t_df = df_cluster.groupby('store')['predict_cluster_kmeans'].apply(
    lambda x: x.mode()).reset_index()[['store', 'predict_cluster_kmeans']]
df_transactions[['location', 'lat', 'long']].drop_duplicates().merge(
    t_df, how='left', left_on='location', right_on='store').to_csv('store_locations.csv')

In [None]:
df_cluster.isnull().sum()

In [None]:
X=np.array(df['location'].tolist()).reshape(df.shape[0], 1)

In [None]:
np.array(df['location'].tolist()).reshape(df.shape[0], 1)

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df_cluster.isnull().sum()

In [None]:
df_cluster.head()