In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

dt=pd.read_csv(r'C:\Users\DELL\Desktop\Ecommerce - UK Retailer.csv',encoding = 'ISO-8859-1')

import warnings
# current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

import missingno as msno # missing data visualization module for Python
import pandas_profiling

%matplotlib inline
color = sns.color_palette()

import gc
import datetime

In [None]:
# Raise the pandas display limits so long and wide frames are not truncated
# when a cell prints them.
display_limits = {
    'display.max_rows': 10000,
    'display.max_columns': 100,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [None]:
# Preview the first rows of the data right after loading; the column labels
# are still the ones that came from the CSV.
dt.head()

In [None]:
# Rename the columns to snake_case. Only the column labels are changed: the
# row index is deliberately left as it is, and the frame is rebound instead of
# being mutated in place so the cell can be re-run safely.
dt = dt.rename(columns={'InvoiceNo': 'invoice_num',
                        'StockCode' : 'stock_code',
                        'Description' : 'description',
                        'Quantity' : 'quantity',
                        'InvoiceDate' : 'invoice_date',
                        'UnitPrice' : 'unit_price',
                        'CustomerID' : 'cust_id',
                        'Country' : 'country'})

In [None]:
# Preview the data again to check the renamed column labels.
dt.head()

In [None]:
# Print the per-column dtype and non-null count of the loaded frame.
dt.info()

In [None]:
# Count the missing values in each column, listed from most to fewest.
dt.isnull().sum().sort_values(ascending=False)

In [None]:
# Show the first rows that have a missing value in at least one column.
dt[dt.isnull().any(axis=1)].head()

In [None]:
# Convert invoice_date from text to pandas timestamps. The text is expected in
# month/day/year hour:minute layout, as spelled out by the format string.
invoice_date_format = '%m/%d/%Y %H:%M'
dt['invoice_date'] = pd.to_datetime(dt['invoice_date'], format=invoice_date_format)

In [None]:
# Normalise product descriptions to lower case.
dt['description'] = dt['description'].str.lower()

In [None]:
# Preview the data after the invoice_date and description conversions.
dt.head()

In [None]:
# dt_n is the working copy of the data: every row that has a missing value in
# any column is removed.
dt_n = dt.dropna(axis=0, how='any')

In [None]:
# Count missing values per column in dt_n, sorted from most to fewest; every
# count is expected to be zero because dt_n was built with dropna().
dt_n.isnull().sum().sort_values(ascending=False)

In [None]:
# Per-column dtype and non-null count of the frame without incomplete rows.
dt_n.info()

In [None]:
# Store customer ids as 64-bit integers.
# NOTE(review): the source dtype is presumably float64 (ids next to missing
# values are read as floats) - confirm with dt.info().
# assign() builds a new frame instead of writing into the result of dropna(),
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
dt_n = dt_n.assign(cust_id=dt_n['cust_id'].astype('int64'))

In [None]:
# Preview the rows after the cust_id conversion.
dt_n.head()

In [None]:
# Re-check the dtypes; cust_id should now be reported as int64.
dt_n.info()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
dt_n.describe().round(2)

In [None]:
# Keep only rows whose quantity is strictly positive; rows with a zero or
# negative quantity are removed.
has_positive_quantity = dt_n['quantity'] > 0
dt_n = dt_n.loc[has_positive_quantity]

In [None]:
# Summary statistics again, now that rows with non-positive quantity are gone.
dt_n.describe().round(2)

In [None]:
# Add amount_spent = quantity * unit_price for every invoice line.
# assign() returns a new frame, so nothing is written into the boolean-filtered
# frame (which would raise pandas' SettingWithCopyWarning).
dt_n = dt_n.assign(amount_spent=dt_n['quantity'] * dt_n['unit_price'])

In [None]:
# Put the columns in reading order: invoice identifiers first, then the item,
# the amounts, and finally the customer and country.
column_order = [
    'invoice_num',
    'invoice_date',
    'stock_code',
    'description',
    'quantity',
    'unit_price',
    'amount_spent',
    'cust_id',
    'country',
]
dt_n = dt_n[column_order]

In [None]:
# Derive calendar features from invoice_date with the vectorised .dt accessor
# (no per-row Python lambda) and place them at column positions 2-5.
invoice_ts = dt_n['invoice_date']
calendar_features = {
    # year and month packed into one integer, e.g. 100*2010 + 12 = 201012
    'year_month': 100 * invoice_ts.dt.year + invoice_ts.dt.month,
    'month': invoice_ts.dt.month,
    # +1 to make Monday=1.....until Sunday=7
    'day': invoice_ts.dt.dayofweek + 1,
    'hour': invoice_ts.dt.hour,
}
# Remove columns left over from a previous run of this cell first:
# DataFrame.insert raises ValueError when the column already exists.
dt_n = dt_n.drop(columns=list(calendar_features), errors='ignore')
for position, (feature_name, feature_values) in enumerate(calendar_features.items(), start=2):
    dt_n.insert(loc=position, column=feature_name, value=feature_values)

In [None]:
# Preview the frame with the new year_month, month, day and hour columns.
dt_n.head()

In [None]:
# (number of rows, number of columns) of the prepared frame.
dt_n.shape

In [None]:
# Per-column dtype and non-null count of the prepared frame.
dt_n.info()

# 1) a) Boxplot - All Numerical Values

In [None]:
# One box per numeric column, all drawn on a shared axis.
boxplot_columns = ['year_month', 'month', 'day', 'hour',
                   'quantity', 'unit_price', 'amount_spent']
dt_n[boxplot_columns].plot(kind='box')

# b)Histogram – All Numeric Variables

In [None]:
# Histogram of the purchased quantity, split into 20 bins.
quantity_frame = dt_n[['quantity']]
quantity_frame.plot(kind='hist', bins=20, title='Purchases Quantity Distribution')

# c) Distribution Plot – All Numeric Variables

In [None]:
# Distribution of the purchased quantity, restricted to order lines with a
# quantity below 50. The legend, axis label and title name the variable that
# is actually plotted (quantity).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the cell still runs on the seaborn version this notebook uses.
fig, ax = plt.subplots(figsize=(10,8))
small_quantities = dt_n.quantity[dt_n.quantity < 50]
sns.distplot(small_quantities, label='Quantity', ax=ax)
ax.legend()

ax.set_xlabel('Quantity')
ax.set_ylabel('Normalized Distribution')
ax.set_title('Quantity Distribution')
plt.show()

# d)Aggregation for all numerical Columns

In [None]:
# Total quantity sold per stock code, best sellers first. The steps are
# chained so no frame is modified in place.
df_quant_sold = (
    dt_n[['stock_code', 'quantity']]
    .groupby('stock_code')
    .sum()
    .reset_index()
    .sort_values(by=['quantity'], ascending=False)
)
df_quant_sold.head()

# e)Unique Values across all columns

In [None]:
# Number of distinct values in every column, printed under a heading line.
uniqueValues = dt_n.nunique()
print('Count of unique values in each column :', uniqueValues, sep='\n')

# f)Duplicate values across all columns

In [None]:
# Flag rows that repeat an earlier row in every column (the first occurrence
# is not flagged), then report how many there are and show a few of them
# instead of printing one True/False value per row.
duplicate_mask = dt_n.duplicated(subset=None, keep='first')
print('Number of duplicated rows :', duplicate_mask.sum())
dt_n[duplicate_mask].head()

# g)Correlation – Heatmap - All Numeric Variables

In [None]:
# Pairwise correlation of the three monetary/volume columns, shown as a
# colour-graded table with two decimals.
corr = dt_n[['quantity','unit_price','amount_spent']].corr()
# A format string is used for the two-decimal display because
# Styler.set_precision no longer exists in pandas 2.x; Styler.format with a
# format string works on old and new pandas alike.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

# h)Regression Plot - All Numeric Variables

In [None]:
# Left panel: quantity regressed on unit_price. Right panel: the residuals of
# that same regression.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
shared_args = {'data': dt_n, 'x': 'unit_price', 'y': 'quantity'}
sns.regplot(ax=ax0, **shared_args)
sns.residplot(ax=ax1, **shared_args)
plt.show()

# i)Bar Plot – Every Categorical Variable vs every Numerical Variable

In [None]:
# unit price vs country: one horizontal bar per country, drawn with seaborn's
# default aggregation of unit_price.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=dt_n, x='unit_price', y='country', palette='ocean', ax=ax)
plt.show()

In [None]:
# amount spent vs country: one horizontal bar per country, drawn with
# seaborn's default aggregation of amount_spent.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=dt_n, x='amount_spent', y='country', palette='ocean', ax=ax)
plt.show()

In [None]:
# quantity vs country: one horizontal bar per country, drawn with seaborn's
# default aggregation of quantity.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=dt_n, x='quantity', y='country', palette='ocean', ax=ax)
plt.show()

In [None]:
# customers vs country
# cust_id is an identifier, so averaging it per country carries no meaning.
# The bars show the number of distinct customers in each country instead,
# largest first.
customers_per_country = (
    dt_n.groupby('country')['cust_id']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)
plt.figure(figsize=(12,10))
sns.barplot(x = 'cust_id',y ='country',data = customers_per_country,palette='ocean')
plt.xlabel('Number of unique customers')
plt.show()

# j)Pair plot - All Numeric Variables

In [None]:
# Scatter-plot matrix of the three monetary/volume columns.
pairplot_columns = ['quantity', 'unit_price', 'amount_spent']
sns.pairplot(data=dt_n, vars=pairplot_columns)

# k)Line chart to show the trend of data - All Numeric/Date Variables

In [None]:
# Trend over time: the calendar month is on the x-axis and the monthly mean of
# each numeric variable on the y-axis, one panel per variable.
trend_columns = ['quantity', 'unit_price', 'amount_spent']
monthly_means = dt_n.groupby('year_month')[trend_columns].mean().reset_index()
# year_month is an integer built as 100*year + month; plotting it as text keeps
# consecutive months evenly spaced (as numbers, December -> January jumps by 89).
monthly_means['year_month'] = monthly_means['year_month'].astype(str)

fig, axes = plt.subplots(len(trend_columns), 1, figsize=(12, 10), sharex=True)
for ax, column in zip(axes, trend_columns):
    ax.plot(monthly_means['year_month'], monthly_means[column], marker='o')
    ax.set_ylabel('mean ' + column)
axes[-1].set_xlabel('year_month')
plt.setp(axes[-1].get_xticklabels(), rotation=45)
fig.suptitle('Monthly mean of quantity, unit_price and amount_spent')
plt.show()

# l) Skewness – All Numeric Variables

In [None]:
# Skewness of every numeric column, ignoring missing values.
# numeric_only=True is required because dt_n also holds text and datetime
# columns; without it, pandas 2.x raises TypeError instead of skipping them.
dt_n.skew(axis = 0, skipna = True, numeric_only = True)