In [5]:
# Library import
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import *

import pickle

In [14]:
# Load data a separate into training and test set 
df = pd.read_csv('DRV_customer_clustering.csv')

# Selection of the good features
selected_features = ['recency',
                     'frequency',
                     'monetary_value',
                     'avg_review_score',
                     'customer_city_size']

df = df[selected_features]

data = df.sample(frac=.95, random_state=789)
data_unseen = df.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen data for predictions: ' + str(data_unseen.shape))

Data for Modeling: (2691, 5)
Unseen data for predictions: (142, 5)


In [16]:
# Preprocessing of features
scaler = StandardScaler()

X_scaled = scaler.fit_transform(df)

X_scaled

# Best model fitting
best_model = KMeans(n_clusters=5,
                    max_iter=1000,
                    n_init=10,
                    random_state=0)

best_model.fit(X_scaled)

KMeans(max_iter=1000, n_clusters=5, random_state=0)

In [17]:
# Save model as a pickle file

with open('challenge_model', 'wb') as file:
    pickle.dump(best_model, file)

with open('challenge_scaler', 'wb') as file:
    pickle.dump(scaler, file)

In [18]:
# Open the saved model and scaler
with open('pickle_model', 'rb') as file:
    model = pickle.load(file)

with open('pickle_scaler', 'rb') as file:
    scaler = pickle.load(file)

In [19]:
# Predict on test set 

data_unseen_scaled = scaler.transform(data_unseen)

data_unseen['cluster'] = model.predict(data_unseen_scaled)

In [20]:
data_unseen.head()

Unnamed: 0,recency,frequency,monetary_value,avg_review_score,customer_city_size,cluster
0,163,2,194.57,5.0,11253503.0,2
1,57,4,71.88,4.0,11253503.0,2
2,472,2,99.45,5.0,31309.0,0
3,385,2,514.49,1.0,11253503.0,3
4,277,2,423.86,4.5,463731.0,0


In [21]:
data_unseen['cluster'].unique()

array([2, 0, 3, 1, 4], dtype=int32)