# Minor Project

## Customer Segmentation Using Unsupervised Machine Learning Techniques

##### Data Cleaning


In [1]:
#Importing all the required libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Importing Online Retail Dataset
Retail_data = pd.read_csv('online_retail_dataset.csv', encoding = 'unicode_escape')
Retail_data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 08:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 08:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 08:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 08:26,3.39,17850.0,United Kingdom


In [3]:
#Checking the shape i.e number of columns and rows in the dataset
Retail_data.shape

(541910, 8)

In [4]:
#Customer Distribution by country
country_cus_data = Retail_data[['Country', 'Customer ID']].drop_duplicates()
country_cus_data.groupby(['Country'])['Customer ID'].aggregate('count').reset_index().sort_values('Customer ID', ascending = False)

Unnamed: 0,Country,Customer ID
36,United Kingdom,3950
14,Germany,95
13,France,87
31,Spain,31
3,Belgium,25
33,Switzerland,21
27,Portugal,19
19,Italy,15
12,Finland,12
1,Austria,11


In [5]:
#Keeping Only United Kingdom Data
Retail_data = Retail_data.query("Country == 'United Kingdom'").reset_index(drop = True)

In [6]:
#Checking for missing values in the dataset
Retail_data.isnull().sum(axis = 0)

Invoice             0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
Price               0
Customer ID    133600
Country             0
dtype: int64

In [7]:
#Removing the missing values from Customer ID column, can ignore missing values in description coiumn
Retail_data = Retail_data[pd.notnull(Retail_data['Customer ID'])]

In [8]:
#Validating if there are any negative values in Quantity Column
Retail_data.Quantity.min()

-80995

In [9]:
#Validating if there are any negative values in Price Column
Retail_data.Price.min()

0.0

In [10]:
#Filtering out recording with negative Values
Retail_data = Retail_data[(Retail_data['Quantity'] > 0)]

In [11]:
#Converting the string date field to datetime
Retail_data['InvoiceDate'] = pd.to_datetime(Retail_data['InvoiceDate'])

In [12]:
#Added new column depicting total amount
Retail_data['TotalAmount'] = Retail_data['Quantity'] * Retail_data['Price']

In [13]:
#Checking if the shape i.e. number of columns and rows in the dataset after data is cleaned
Retail_data.shape

(354345, 9)

In [14]:
Retail_data.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalAmount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-01-12 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-01-12 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-01-12 08:26:00,3.39,17850.0,United Kingdom,20.34
