In [10]:
import pandas as pd # for data science
import numpy as np  # linear algebra library
import matplotlib.pyplot as plt # plotting library
# import stats functions
from scipy import stats
# normal continuous random variable
from scipy.stats import norm 

# read and clean
retail_data = pd.read_csv('https://archive.ics.uci.edu/static/public/352/data.csv') 
df = pd.DataFrame(retail_data)

# Convert 'InvoiceDate' to datetime format
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Add a 'TotalSales' column: Quantity * UnitPrice
df['TotalSales'] = df['Quantity'] * df['UnitPrice']

# Extract only United Kingdom retail data for analysis
uk_data = df[df['Country'] == 'United Kingdom']
uk_data.tail()


# Sales Analysis
# Total Sales
total_sales = uk_data['TotalSales'].sum()
print(f"Total Sales in UK: £{total_sales:.2f}")

# Average Sales per Transaction by invoiceNo
average_sales = uk_data.groupby('InvoiceNo')['TotalSales'].mean().mean()
print(f"Average Sales per Transaction: £{average_sales:.2f}")

# determine the top 10 Products by Sales
top_products = uk_data.groupby('Description')['TotalSales'].sum().sort_values(ascending=False).head(10)
print("Top 10 Products by Total Sales:")
print(top_products)

# Time-Series Analysis
# Total sales by Date (day) ignoring time 
sales_by_date = uk_data.groupby(uk_data['InvoiceDate'].dt.date)['TotalSales'].sum()



Total Sales in UK: £8187806.36
Average Sales per Transaction: £18.77
Top 10 Products by Total Sales:
Description
DOTCOM POSTAGE                        206245.48
REGENCY CAKESTAND 3 TIER              134405.94
WHITE HANGING HEART T-LIGHT HOLDER     93953.07
PARTY BUNTING                          92501.73
JUMBO BAG RED RETROSPOT                84516.44
PAPER CHAIN KIT 50'S CHRISTMAS         61888.19
ASSORTED COLOUR BIRD ORNAMENT          54662.15
CHILLI LIGHTS                          52986.86
PICNIC BASKET WICKER 60 PIECES         39619.50
BLACK RECORD COVER FRAME               39387.00
Name: TotalSales, dtype: float64
