# Machine Learning

### 2. Can you group the customers by purchasing behavior? (clustering)

## Import Library

In [None]:
# Import Library
import pandas as pd
import os

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Notebook-wide pandas settings:
#  - silence the chained-assignment (SettingWithCopy) warning
#  - render floats with two decimal places in displayed frames
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.float_format', '{:.2f}'.format)

## Prepare Data

In [None]:
data_dir = "clean_data"

# Read every cleaned table from "<data_dir>/<name>.csv"; each DataFrame is bound
# to a variable named after its file, in the same order the files are read.
OCNR, SNR, L, P, PS = (
    pd.read_csv(f"{data_dir}/{table}.csv")
    for table in ("OCNR", "SNR", "L", "P", "PS")
)

In [None]:
# Inner-join L to OCNR on the order key (L_ORDERKEY == O_ORDERKEY).
# Rows without a match on the other side are dropped.
LOCNR = L.merge(
    OCNR,
    how='inner',
    left_on='L_ORDERKEY',
    right_on='O_ORDERKEY',
)

In [None]:
# Inner-join the L/OCNR frame to P on the part key (L_PARTKEY == P_PARTKEY),
# which brings the P_* columns onto each row.
LOCNRP = LOCNR.merge(
    P,
    how='inner',
    left_on='L_PARTKEY',
    right_on='P_PARTKEY',
)

In [None]:
# Constant column of ones, so that summing it within a group yields the group's row count.
LOCNRP = LOCNRP.assign(DUMMY_COUNT=1)

In [None]:
# Sanity check on the merged frame: prints its columns, non-null counts and dtypes.
LOCNRP.info()

In [None]:
# Build one feature row per customer (O_CUSTKEY).
#
# LOCNRP comes from joining L to OCNR on the order key, so every O_* value is
# repeated on each L row of the same order. Aggregating O_TOTALPRICE over those
# rows would count an order's total once per line item, and a plain row count
# would count line items rather than orders. The features are therefore built
# in two passes and then joined.
# NOTE(review): assumes O_TOTALPRICE is constant within an O_ORDERKEY (an
# order-level value) -- confirm against the OCNR preparation step.

# Pass 1 -- row-level features, aggregated over every row of the customer.
line_item_features = LOCNRP.groupby('O_CUSTKEY', as_index=False).agg(
    SUM_QUANTITY=("L_QUANTITY", 'sum'),
    MEAN_QUANTITY=("L_QUANTITY", 'mean'),
    MEAN_EXTENDEDPRICE=("L_EXTENDEDPRICE", 'mean'),
    MEAN_TAX=("L_TAX", 'mean'),
    MEAN_RETAILPRICE=("P_RETAILPRICE", 'mean'),
    MEAN_LEADDAY=("LEADDAY", 'mean'),
    MAX_LEADDAY=("LEADDAY", 'max'),
    MIN_LEADDAY=("LEADDAY", 'min'),
)

# Pass 2 -- order-level features, aggregated over one row per distinct order.
order_features = (
    LOCNRP.drop_duplicates(subset='O_ORDERKEY')
    .groupby('O_CUSTKEY', as_index=False)
    .agg(
        SUM_TOTAL_SALE=("O_TOTALPRICE", 'sum'),
        MEAN_TOTAL_SALE=("O_TOTALPRICE", 'mean'),
        TOTAL_ORDERS=("O_ORDERKEY", 'count'),
    )
)

# Both passes group the same rows by the same key, so the join is one-to-one;
# `validate` raises if that ever stops being true. Column order matches the
# original single-pass aggregation.
AGG_df2 = line_item_features.merge(
    order_features,
    on='O_CUSTKEY',
    how='inner',
    validate='one_to_one',
)

In [None]:
# Preview the first rows of the per-customer feature table.
AGG_df2.head()

## K-Means Clustering

### Elbow Method

In [None]:
# Elbow method: fit K-means for k = 1..10 and plot the inertia (within-cluster
# sum of squared distances) against k; the "elbow" of the curve suggests k.

# Cluster on behavioural features only. O_CUSTKEY is an identifier, and CLUSTER
# (present if the training cell has already been run) is a previous result --
# neither should influence the distance computation.
elbow_features = AGG_df2.drop(columns=['O_CUSTKEY', 'CLUSTER'], errors='ignore')

# K-means uses Euclidean distance, so without scaling the columns with the
# largest magnitude would dominate. Standardise each column to zero mean and
# unit variance; constant columns get a divisor of 1 to avoid division by zero.
elbow_std = elbow_features.std(ddof=0).replace(0, 1)
elbow_scaled = (elbow_features - elbow_features.mean()) / elbow_std

k_values = range(1, 11)
cs = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(elbow_scaled)
    cs.append(kmeans.inertia_)

plt.plot(k_values, cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()

### Train Model

In [None]:
# Number of customer segments to fit.
# NOTE(review): the value 4 is presumably read off the elbow plot -- confirm it
# against an elbow curve computed on the same standardised features used here.
N_CLUSTERS = 4

# Cluster on behavioural features only: exclude the customer identifier and any
# CLUSTER column left by a previous run of this cell, so the cell gives the
# same result every time it is executed.
feature_cols = [col for col in AGG_df2.columns if col not in ('O_CUSTKEY', 'CLUSTER')]
features = AGG_df2[feature_cols]

# Standardise to zero mean / unit variance so that no single large-valued
# column dominates the Euclidean distance; constant columns are divided by 1.
features_scaled = (features - features.mean()) / features.std(ddof=0).replace(0, 1)

# n_init is given explicitly because its default differs between scikit-learn
# versions; 10 matches the setting used for the elbow search.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=1)
kmeans.fit(features_scaled)

# Attach the assigned segment (0 .. N_CLUSTERS-1) to each customer row.
AGG_df2['CLUSTER'] = kmeans.labels_

In [None]:
# Preview the first 10 customers together with their assigned CLUSTER label.
AGG_df2.head(10)

### Result

In [None]:
# Scatter of customers in the (total quantity, total sales) plane, coloured by cluster.
x, y, Cluster = (
    AGG_df2[col] for col in ('SUM_QUANTITY', 'SUM_TOTAL_SALE', 'CLUSTER')
)

fig, ax = plt.subplots(figsize=(10, 10))
scatter = ax.scatter(x, y, c=Cluster, s=50)
ax.set(xlabel='Total Quantities', ylabel='Total Sales')

# Colour scale that maps point colours back to cluster labels.
fig.colorbar(scatter, ax=ax)

# Optional export of the figure (disabled):
#fig.savefig('ClusterSales.png', bbox_inches='tight')
#fig.show()

In [None]:
# Scatter of customers in the (total orders, mean lead days) plane, coloured by cluster.
x, y, Cluster = (
    AGG_df2[col] for col in ('TOTAL_ORDERS', 'MEAN_LEADDAY', 'CLUSTER')
)

fig, ax = plt.subplots(figsize=(10, 10))
scatter = ax.scatter(x, y, c=Cluster, s=50)
ax.set(xlabel='Total Orders', ylabel='Average Lead Days')

# Colour scale that maps point colours back to cluster labels.
fig.colorbar(scatter, ax=ax)

# Optional export of the figure (disabled):
#fig.savefig('ClusterOrders.png', bbox_inches='tight')
#fig.show()

## Export Data

In [None]:
# Folder that receives the clustered customer table.
data_dir = "output"

# Create the folder if needed. exist_ok=True makes this a single call that
# succeeds whether or not the folder already exists, instead of a separate
# existence check followed by a create.
os.makedirs(data_dir, exist_ok=True)

# Write the per-customer features plus CLUSTER label to CSV.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (pandas default); pass index=False if downstream readers do not expect it.
AGG_df2.to_csv(f"{data_dir}/Lab1-2.csv")