# Feature Engineering

In [1]:
import os
import sys
import pandas as pd 
import datetime 
import math 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import matplotlib.mlab as mlab

%load_ext autoreload
%autoreload 2
%matplotlib inline 

# Setting up directories.
# The project root is resolved only once per kernel session: this cell ends with
# os.chdir(ROOT_DIR), so recomputing the root from os.getcwd() on a re-run would
# climb one directory higher every time the cell is executed.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

SRC_DIR = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "01_raw")

# Make the project root and src directory importable, without stacking
# duplicate sys.path entries when the cell is re-run.
for _path in (ROOT_DIR, SRC_DIR):
    if _path not in sys.path:
        sys.path.append(_path)

# All later relative paths are resolved against the project root.
os.chdir(ROOT_DIR)

ImportError: Unable to import required dependencies:
numpy: 

IMPORTANT: PLEASE READ THIS FOR ADVICE ON HOW TO SOLVE THIS ISSUE!

Importing the numpy c-extensions failed.
- Try uninstalling and reinstalling numpy.
- If you have already done that, then:
  1. Check that you expected to use Python3.6 from "C:\Users\malik\AppData\Local\Continuum\miniconda3\envs\python4da\python.exe",
     and that you have no directories in your PATH or PYTHONPATH that can
     interfere with the Python and numpy version "1.18.2" you're trying to use.
  2. If (1) looks fine, you can open a new issue at
     https://github.com/numpy/numpy/issues.  Please include details on:
     - how you installed Python
     - how you installed numpy
     - your operating system
     - whether or not you have multiple versions of Python installed
     - if you built from source, your compiler versions and ideally a build log

- If you're working with a numpy git repository, try `git clean -xdf`
  (removes all files not under version control) and rebuild numpy.

Note: this error has many possible causes, so please don't comment on
an existing issue about this - open a new one instead.

Original error was: No module named 'numpy.core._multiarray_umath'


### Load Data

In [None]:
# Read the raw "Online Retail" workbook from the raw-data directory.
raw_workbook_path = os.path.join(RAW_DATA_DIR, "Online Retail.xlsx")
cust_seg_df_raw = pd.read_excel(raw_workbook_path)

In [None]:
# Work on an independent (deep) copy so the raw frame stays untouched.
cust_seg_df = cust_seg_df_raw.copy(deep=True)

### Separate data for UK

About 91% of orders come from the UK, which makes it by far the largest segment by country, so the analysis focuses on UK customers only.

In [None]:
# Keep only the rows whose Country is exactly 'United Kingdom'.
is_uk_order = cust_seg_df['Country'].eq('United Kingdom')
cust_seg_df = cust_seg_df.loc[is_uk_order]

### Attribute for total amount

In [None]:
# Line-item amount = quantity * unit price.
# `assign` returns a new frame instead of writing a column into the filtered
# slice produced by the country filter, which avoids SettingWithCopyWarning.
cust_seg_df = cust_seg_df.assign(
    Amount=cust_seg_df['Quantity'] * cust_seg_df['UnitPrice']
)

### Remove missing customerID records

In [None]:
# Rows without a CustomerID cannot be attributed to a customer; drop them.
cust_seg_df = cust_seg_df.dropna(subset=['CustomerID'])

### Eliminate return (negative) transactions

In [None]:
# Keep rows with a non-negative Amount (zero-amount rows are retained).
has_non_negative_amount = cust_seg_df['Amount'].ge(0)
cust_seg_df = cust_seg_df.loc[has_non_negative_amount]

In [None]:
# Preview the first two rows of the data after the filters above
cust_seg_df.head(2)

### Create Recency Feature Variable - Days since last purchase

Reference Date: one day after the last transaction <br>
Recency Variable: Number of days before the reference date when a customer last made a purchase.

##### Reference Date:

In [None]:
# Reference date = most recent invoice timestamp in the filtered data + 1 day.
one_day = datetime.timedelta(days=1)
last_date = cust_seg_df['InvoiceDate'].max()
reference_date = last_date + one_day
reference_date

##### Recency Period - by transaction:

In [None]:
# Whole days between each invoice and the reference date.
# `.dt.days` replaces `.astype('timedelta64[D]')`: pandas 2.x only supports
# second-or-finer timedelta resolutions in astype and raises on 'D'.
# `assign` builds a new frame, so nothing is written into the filtered slice
# produced by the earlier row filters (avoids SettingWithCopyWarning).
cust_seg_df = cust_seg_df.assign(
    RecencyPeriod=(reference_date - cust_seg_df['InvoiceDate']).dt.days
)

### Customer History
##### Recency Period - by customer 
Because recency is customer-based (not order/transaction-based), it is aggregated per customer.

In [None]:
# The smallest RecencyPeriod per customer corresponds to their latest purchase.
customer_history = cust_seg_df.groupby('CustomerID', as_index=False)['RecencyPeriod'].min()
print(customer_history.head())

# Summary statistics of the per-customer RecencyPeriod
customer_history['RecencyPeriod'].describe()

### Examine the distribution of customers' Recency Period

In [None]:
# Histogram of days since last purchase, one observation per customer.
fig, ax = plt.subplots()
ax.hist(customer_history['RecencyPeriod'], bins=100)

ax.set_title("Recency Period Distribution", size=15)
ax.set_xlabel("Recency Period - In days")
ax.set_ylabel("Frequency")

plt.show()

### Create Monetary Feature Variable

In [None]:
# Total amount spent per customer.
customer_monetary_val = cust_seg_df.groupby('CustomerID', as_index=False)['Amount'].sum()

# Merge on an explicit key. Without `on=`, pandas joins on every shared column,
# so re-running this cell (when customer_history already has 'Amount') would
# join on CustomerID *and* Amount and duplicate customers. Dropping any stale
# 'Amount' column first makes the cell safe to re-run.
customer_history = (
    customer_history
    .drop(columns='Amount', errors='ignore')
    .merge(customer_monetary_val, on='CustomerID', how='outer')
)

# Add a small offset of 0.001 so a later log transform never sees an exact zero.
customer_history['Amount'] = customer_history['Amount'] + 0.001

In [None]:
# Preview customer_history after the monetary ('Amount') column was merged in
customer_history.head()

In [None]:
# Histogram of total amount per customer; the x-axis is clipped to [0, 10000].
fig, ax = plt.subplots()
ax.hist(customer_history['Amount'], bins=2000)

ax.set_title("Monetary Value Distribution", size=15)
ax.set_xlabel("Amount")
ax.set_ylabel("Frequency")
ax.set_xlim(0, 10000)

plt.show()

### Creating Frequency Feature Variable

In [None]:
# Frequency = number of rows with a non-null Amount per customer.
# NOTE(review): this counts rows of cust_seg_df, not distinct invoices - confirm
# that this is the intended definition of purchase frequency.
customer_freq = (
    cust_seg_df
    .groupby('CustomerID', as_index=False)['Amount']
    .count()
    .rename(columns={'Amount': 'Frequency'})
)

# Merge on an explicit key and drop any stale 'Frequency' column first, so
# re-running the cell neither joins on extra shared columns nor duplicates rows.
customer_history = (
    customer_history
    .drop(columns='Frequency', errors='ignore')
    .merge(customer_freq, on='CustomerID', how='outer')
)

In [None]:
# Preview customer_history after the 'Frequency' column was merged in
customer_history.head()

In [None]:
# Histogram of purchase frequency per customer; the x-axis is clipped to [0, 1000].
plt.hist(customer_history['Frequency'], bins=1000)

plt.title("Frequency Distribution", size=15)
# The x-axis shows the Frequency feature (it was mislabelled "Amount");
# the y-axis is the number of customers falling in each bin.
plt.xlabel("Frequency (purchase records per customer)")
plt.ylabel("Number of customers")
plt.xlim(0, 1000)

plt.show()

# Data Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
import math

In [None]:
# Natural-log transform of each RFM column, stored next to the original
# under a "<name>_log" column.
log_column_sources = (
    ('Recency_log', 'RecencyPeriod'),
    ('Frequency_log', 'Frequency'),
    ('Amount_log', 'Amount'),
)
for log_column, source_column in log_column_sources:
    customer_history[log_column] = customer_history[source_column].map(math.log)

In [None]:
# Preview customer_history with the new *_log columns
customer_history.head()

### Feature Scaling

In [None]:
# Features that will be scaled (this column order is also the column order
# of the scaled array).
features_list = ['Amount_log', 'Recency_log', 'Frequency_log']
customer_history_to_scaled = customer_history[features_list]

# Standardise the selected features.
# The result of the transform is assigned only to customer_history_scaled;
# nothing is written back into customer_history, whose *_log columns therefore
# keep their unscaled values.
scaler = StandardScaler()
customer_history_scaled = scaler.fit_transform(customer_history_to_scaled)

In [None]:
# Preview customer_history. The scaling cell stores its result in
# customer_history_scaled, so the columns shown here are not the scaled values.
customer_history.head()

### Visualizing Recency vs Monetary Value (log-transformed)

In [None]:
# Scatter of log recency against log monetary value, one point per customer.
# Title and axis labels added so the figure can be read on its own.
plt.scatter(customer_history['Recency_log'], customer_history['Amount_log'], alpha=0.5)

plt.title("Recency vs Monetary Value (log scale)", size=15)
plt.xlabel("log(Recency Period)")
plt.ylabel("log(Amount)")

plt.show()

### Visualizing Monetary Value distribution (log-transformed)


In [None]:
# Histogram of the log-transformed monetary value per customer.
plt.hist(customer_history['Amount_log'], bins=1000)

# The plotted column is Amount_log, so title and axis say so explicitly
# (the axis was previously labelled with the untransformed "Amount").
plt.title("Monetary Value Distribution (log scale)", size=15)
plt.xlabel("log(Amount)")
plt.ylabel("Number of customers")

plt.show()

### Visualizing Monetary Value, Frequency and Recency (log-transformed)


In [None]:
# Axes3D is not referenced below; presumably it is imported for its side effect
# of making the '3d' projection available - TODO confirm it is still needed.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the three log-transformed RFM features, one point per customer.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

xs, ys, zs = (
    customer_history[column]
    for column in ('Recency_log', 'Frequency_log', 'Amount_log')
)
ax.scatter(xs, ys, zs, s=5)

ax.set(xlabel='Recency', ylabel='Frequency', zlabel='Monetary')

plt.show()

It seems that customers with low recency (a recent last purchase) and high frequency tend to spend more.


# Modeling - Clustering

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

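Reference - scikit-learn example "Selecting the number of clusters with silhouette analysis on KMeans clustering": https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html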

In [None]:
# Silhouette analysis for several candidate cluster counts.
# For each k: fit KMeans on the scaled features, store centres / labels /
# average silhouette score in `cluster_centers[k]`, and draw a silhouette plot
# (left) next to a 2-D scatter of the clustered data (right).
X = customer_history_scaled

cluster_centers = dict()

range_n_clusters = [3, 4, 5, 6]

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9. The colormap does not depend on k,
# so it is looked up once.
cmap = plt.get_cmap("Spectral")

# Columns of X shown in the right-hand scatter. X follows the column order of
# features_list, so 0 is Amount_log and 2 is Frequency_log.
feature1 = 0
feature2 = 2

for n_clusters in range_n_clusters:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Leave a 10-row gap between the silhouette bands of consecutive clusters.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # n_init is pinned to 10 (the historical default) because newer
    # scikit-learn releases changed the default, which would otherwise make
    # the results depend on the installed version.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    centers = clusterer.cluster_centers_

    silhouette_avg = silhouette_score(X, cluster_labels)
    cluster_centers[n_clusters] = {
        'cluster_center': centers,
        'silhouette_score': silhouette_avg,
        'labels': cluster_labels,
    }

    # Left panel: sorted per-sample silhouette values, one band per cluster.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cmap(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical line at the average silhouette score of all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: samples coloured by cluster, with numbered cluster centres.
    colors = cmap(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, feature1], X[:, feature2], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    ax2.scatter(centers[:, feature1], centers[:, feature2], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')
    for i, c in enumerate(centers):
        ax2.scatter(c[feature1], c[feature2], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature i.e. monetary value")
    ax2.set_ylabel("Feature space for the 2nd feature i.e. frequency")
    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

    plt.show()

### Cluster Analysis
Transforming the cluster centre values back to the original (unscaled, non-log) units for further analysis.

In [None]:
# Print the cluster centres in original units for every candidate cluster count.
range_n_clusters = [3, 4, 5, 6]
features_list = ['Amount_log', 'Recency_log', 'Frequency_log']

# The centres are un-scaled and then exponentiated, so they are no longer log
# values. Label the columns without the '_log' suffix instead of reusing the
# log-feature names.
original_scale_columns = [name[:-len('_log')] for name in features_list]

for k in range_n_clusters:
    print("for {} number of clusters".format(k))
    # Undo the standard scaling (back to log space), then undo the log transform.
    centers_log_space = scaler.inverse_transform(cluster_centers[k]['cluster_center'])
    print(pd.DataFrame(np.exp(centers_log_space), columns=original_scale_columns))
    print("Silhouette score for cluster {} is {}". format(k, cluster_centers[k]['silhouette_score']))
    print()

### Clusters Labels Assignment

In [None]:
# Attach the 5-cluster and 3-cluster assignments to customer_history as
# 'num_cluster5_labels' and 'num_cluster3_labels' (in that order).
for k in (5, 3):
    labels = cluster_centers[k]['labels']
    customer_history['num_cluster{}_labels'.format(k)] = labels

### Visualization

In [None]:
# Distribution of Amount per customer for each of the 3-cluster labels.
# The previous hue="smoker" argument was removed: no cell in this notebook adds
# a "smoker" column to customer_history, so the call could not resolve it.
sns.boxenplot(x="num_cluster3_labels", y="Amount",
              data=customer_history, palette="Set3")

In [None]:
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode()

# Box plot of one customer_history field for each of the 5 clusters.
# Display names are 1-based while the cluster labels are 0-based:
# 'Cluster 1' corresponds to label 0, and so on.
x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4', 'Cluster 5']
cutoff_quantile = 100
field_to_plot = 'RecencyPeriod'

# Collect the values of each cluster, keeping everything up to and including
# the cutoff percentile. The comparison is inclusive (<=): with a strict "<"
# and cutoff_quantile = 100, the maximum of every cluster was silently dropped.
y_data = []
for cluster_id in range(len(x_data)):
    in_cluster = customer_history['num_cluster5_labels'] == cluster_id
    values = customer_history.loc[in_cluster, field_to_plot].values
    y_data.append(values[values <= np.percentile(values, cutoff_quantile)])

# One fill colour per box; zip() below stops at the shortest sequence, so the
# extra sixth colour is simply unused.
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']

traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints=False,
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(
            size=2,
        ),
        line=dict(width=1),
    )
    for xd, yd, cls in zip(x_data, y_data, colors)
]

layout = go.Layout(
    title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        dtick=50,
        gridcolor='black',
        gridwidth=0.1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=False
)

fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)