In [1]:
# Basic machine learning models (Supervised and Unsupervised learning)
# """
# 1. Load the dataset
# 2. Explore the dataset using charts or some statistical models
# 3. Preprocess the data (Are there any null values? Are all columns relevant? Is there an outcome column? Are the columns independent?)
# 4. Apply preprocess values (Transformation of the columns after preprocessing decisions)
# 5. Split training and test data
# 6. Select the model (What is the category of the problem? Classification or regression? labeled or non-labeled data?)
# 7. Fit the model with training data (what is the accuracy score?)
# 8. Validate the model with test data (what is the accuracy score?)
# 9. Optimize the model if the accuracy score is not satisfactory (Select parameters that can improve the accuracy and apply to model)
# 10. Run the model with new test data
# 11. Save the model
# 12. Deploy the model (Desktop or web app or mobile app or IOT device)
# """

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Step1 : Importing packages
import pandas as pd
import numpy as np
from sklearn import linear_model

In [4]:
# Step2 : Load the data set
# The CSV is addressed by a relative path, so it is resolved against the
# notebook's current working directory.
retail=pd.read_csv('Retail_Case_Study_Data.csv')
# Preview the first rows to confirm the columns were parsed as expected.
retail.head()

Unnamed: 0,Cust Id,Months Since Last Buy,Spend Category,Spend Numeric,Mens Merchandise,Womens Merchandise,Area,New Customer,Purchase Channel,Visited Website,Sale Made
0,1,4,3) $200 - $350,243.46,0,1,Urban,0,Web,1,0
1,2,5,1) $0 - $100,79.79,0,1,Urban,1,Phone,0,0
2,3,9,1) $0 - $100,29.99,0,1,Surburban,1,Web,0,0
3,4,5,1) $0 - $100,29.99,1,0,Surburban,0,Web,0,0
4,5,7,3) $200 - $350,340.69,0,1,Surburban,0,Phone,1,1


In [5]:
# Step 3: Preprocessing
# Count the missing entries in each column.
retail.isna().sum()

Cust Id                  0
Months Since Last Buy    0
Spend Category           0
Spend Numeric            0
Mens Merchandise         0
Womens Merchandise       0
Area                     4
New Customer             0
Purchase Channel         0
Visited Website          0
Sale Made                0
dtype: int64

In [6]:
# Frequency of each Area category; missing entries are left out of the counts.
retail['Area'].value_counts(dropna=True)

Surburban    786
Urban        691
Rural        266
Name: Area, dtype: int64

In [7]:
# Impute the missing Area entries with 'Surburban', the most frequent category
# in the value counts above. Passing a {column: value} mapping restricts the
# fill to Area, so a gap in any other column can never be silently overwritten
# with an area label.
retail=retail.fillna({'Area':'Surburban'})

In [8]:
# Re-check the per-column missing-value counts now that Area has been filled.
retail.isna().sum()

Cust Id                  0
Months Since Last Buy    0
Spend Category           0
Spend Numeric            0
Mens Merchandise         0
Womens Merchandise       0
Area                     0
New Customer             0
Purchase Channel         0
Visited Website          0
Sale Made                0
dtype: int64

In [9]:
# Per-column variance, to compare the spread of the numeric features
# (this is plain variance, not a PCA decomposition).
# numeric_only=True skips the text columns explicitly; recent pandas versions
# raise a TypeError on object columns when the flag is omitted.
retail.var(numeric_only=True)

Cust Id                  337637.232447
Months Since Last Buy        10.105906
Spend Numeric             73531.246094
Mens Merchandise              0.244474
Womens Merchandise            0.247633
New Customer                  0.250083
Visited Website               0.206825
Sale Made                     0.170623
dtype: float64

In [10]:
# Show the dtype of every column; 'object' marks the text columns that are
# either encoded as integers or dropped in the cells below.
retail.dtypes

Cust Id                    int64
Months Since Last Buy      int64
Spend Category            object
Spend Numeric            float64
Mens Merchandise           int64
Womens Merchandise         int64
Area                      object
New Customer               int64
Purchase Channel          object
Visited Website            int64
Sale Made                  int64
dtype: object

In [11]:
# List the column labels for reference in the preprocessing steps below.
retail.columns

Index(['Cust Id', 'Months Since Last Buy', 'Spend Category', 'Spend Numeric',
       'Mens Merchandise', 'Womens Merchandise', 'Area', 'New Customer',
       'Purchase Channel', 'Visited Website', 'Sale Made'],
      dtype='object')

In [12]:
# Encode Area as integer codes: Rural -> 0, Surburban -> 1, Urban -> 2.
# The explicit astype pins the column to int64 instead of relying on replace()
# to downcast the object column implicitly (that implicit downcast is
# deprecated in recent pandas), and it raises if a label missing from the
# mapping is left behind as text.
retail['Area']=retail['Area'].replace({'Rural':0,'Surburban':1,'Urban':2}).astype('int64')
retail.head()

Unnamed: 0,Cust Id,Months Since Last Buy,Spend Category,Spend Numeric,Mens Merchandise,Womens Merchandise,Area,New Customer,Purchase Channel,Visited Website,Sale Made
0,1,4,3) $200 - $350,243.46,0,1,2,0,Web,1,0
1,2,5,1) $0 - $100,79.79,0,1,2,1,Phone,0,0
2,3,9,1) $0 - $100,29.99,0,1,1,1,Web,0,0
3,4,5,1) $0 - $100,29.99,1,0,1,0,Web,0,0
4,5,7,3) $200 - $350,340.69,0,1,1,0,Phone,1,1


In [13]:
# Encode Purchase Channel as integer codes: Phone -> 0, Web -> 1, Multichannel -> 2.
# The explicit astype pins the column to int64 instead of relying on replace()
# to downcast the object column implicitly (that implicit downcast is
# deprecated in recent pandas), and it raises if a label missing from the
# mapping is left behind as text.
retail['Purchase Channel']=retail['Purchase Channel'].replace({'Phone':0,'Web':1,'Multichannel':2}).astype('int64')
retail.head()

Unnamed: 0,Cust Id,Months Since Last Buy,Spend Category,Spend Numeric,Mens Merchandise,Womens Merchandise,Area,New Customer,Purchase Channel,Visited Website,Sale Made
0,1,4,3) $200 - $350,243.46,0,1,2,0,1,1,0
1,2,5,1) $0 - $100,79.79,0,1,2,1,0,0,0
2,3,9,1) $0 - $100,29.99,0,1,1,1,1,0,0
3,4,5,1) $0 - $100,29.99,1,0,1,0,1,0,0
4,5,7,3) $200 - $350,340.69,0,1,1,0,0,1,1


In [14]:
# Remove the customer identifier and the 'Spend Category' text column from the
# table that is fed to the model.
retail = retail.drop(columns=['Cust Id', 'Spend Category'])
retail.head()

Unnamed: 0,Months Since Last Buy,Spend Numeric,Mens Merchandise,Womens Merchandise,Area,New Customer,Purchase Channel,Visited Website,Sale Made
0,4,243.46,0,1,2,0,1,1,0
1,5,79.79,0,1,2,1,0,0,0
2,9,29.99,0,1,1,1,1,0,0
3,5,29.99,1,0,1,0,1,0,0
4,7,340.69,0,1,1,0,0,1,1


In [15]:
#Step4 : Create train and test partition of retail data
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
retail_train, retail_test = train_test_split(
    retail,
    test_size=0.20,
    random_state=5,
)

In [17]:
# Preview the training partition; the index labels are the original row
# labels, in the shuffled order produced by the split.
retail_train.head()

Unnamed: 0,Months Since Last Buy,Spend Numeric,Mens Merchandise,Womens Merchandise,Area,New Customer,Purchase Channel,Visited Website,Sale Made
487,3,665.58,1,1,2,1,0,0,0
1105,2,49.49,0,1,2,1,1,1,0
1596,5,29.99,1,0,2,1,1,0,0
7,5,326.32,0,1,1,0,1,0,0
1197,3,265.05,1,0,2,0,0,0,0


In [18]:
# Number of rows in the held-out test partition.
retail_test.shape[0]

350

In [19]:
# Step5 : Create the model
# Logistic regression classifier, constructed with scikit-learn's default settings
retail_model=linear_model.LogisticRegression()

In [20]:
# Step 6: Fit the model to the training data.
# X is the feature matrix: every column except the last one.
# Y is the target vector: the last column, 'Sale Made'.
X = retail_train.iloc[:, :-1]
Y = retail_train.iloc[:, -1]
retail_model.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
# Split the held-out partition into features and label the same way as the
# training partition: every column but the last is a feature, and the last
# column ('Sale Made') is the label. Selecting relative to the end avoids a
# hard-coded column count that would silently go wrong if a feature column is
# added or dropped upstream.
test_X=retail_test[retail_test.columns[:-1]]
test_Y=retail_test[retail_test.columns[-1]]

In [30]:
# Validate on the held-out data: predict 'Sale Made' for every test row.
# test_X is reused instead of re-slicing retail_test by a hard-coded column
# range, so the features predicted on here are exactly the ones used for the
# test-set metrics.
all_predictions=retail_model.predict(test_X)
all_predictions

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [31]:
# Inspect the first row (by position) of the test partition, label included.
retail_test.iloc[0]

Months Since Last Buy      2.00
Spend Numeric            100.77
Mens Merchandise           0.00
Womens Merchandise         1.00
Area                       2.00
New Customer               1.00
Purchase Channel           0.00
Visited Website            0.00
Sale Made                  0.00
Name: 1098, dtype: float64

In [32]:
# Take the test row at position 15 and strip its last entry (the 'Sale Made'
# label), leaving a single unlabeled observation.
new_test = retail_test.iloc[15].iloc[:-1]
new_test

Months Since Last Buy      1.00
Spend Numeric            716.38
Mens Merchandise           1.00
Womens Merchandise         1.00
Area                       0.00
New Customer               1.00
Purchase Channel           2.00
Visited Website            1.00
Name: 166, dtype: float64

In [33]:
# Predict for the single observation; the row is reshaped into a one-sample
# 2-D array before being handed to the model.
predictions = retail_model.predict(new_test.values.reshape(1, -1))
predictions

array([0], dtype=int64)

In [34]:
# Accuracy score on the training data (the same X and Y the model was fitted on)
retail_model.score(X,Y)

0.7695060844667144

In [35]:
# Accuracy score on the testing data.
# Features and label are selected relative to the last column, matching how the
# training data was split, instead of by the hard-coded positions 0:8 and 8.
retail_model.score(retail_test[retail_test.columns[:-1]],retail_test[retail_test.columns[-1]])

0.7828571428571428

In [36]:
# Same test-set accuracy, computed from the test_X / test_Y variables defined earlier
retail_model.score(test_X,test_Y)

0.7828571428571428

In [42]:
# Confusion matrix of the actual vs. predicted 'Sale Made' labels on the test
# partition (rows: actual class, columns: predicted class).
from sklearn.metrics import confusion_matrix
y_true, y_pred = test_Y, all_predictions
confusion_matrix(y_true, y_pred)

array([[266,  13],
       [ 63,   8]], dtype=int64)

In [44]:
# Persist the fitted model to 'retail.joblib' in the working directory.
from joblib import dump, load
dump(retail_model, filename='retail.joblib')

['retail.joblib']

In [45]:
# Reuse: reload the model from the file written by the dump() call above.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load files that come from a trusted source.
clf=load('retail.joblib')

In [48]:
# Predict on the test features with the reloaded model; the result should match
# the predictions made by the in-memory model earlier.
# Note: this rebinds y_pred, which was previously set to all_predictions.
y_pred=clf.predict(test_X)
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,