In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import HoverTool

# Preprocessing

In [3]:
df = pd.read_csv('./Online Retail.csv')
df.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom


In [4]:
df.isnull().sum()

InvoiceNo         0
StockCode         0
Description    1454
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df['TotalAmount'] = df['UnitPrice'] * df['Quantity']

## Sales Overview

In [None]:
fig = px.line(df, x='InvoiceDate', y='TotalAmount', title='Total Sales Amount Over Time')
fig.update_xaxes(title='Month')
fig.update_yaxes(title='Total Sales Amount')
#fig.show()

## Product analysis

In [8]:
top_products = df.groupby('StockCode')['Quantity'].sum().nlargest(10).reset_index()
fig = px.bar(top_products, x='Quantity', y='StockCode', orientation='h', title='Top Selling Products')
fig.update_xaxes(title='Quantity')
fig.update_yaxes(title='Product')
fig.show()

## Customer Analysis

In [12]:
customer_purchases = df.groupby('CustomerID')['TotalAmount'].sum().reset_index()
customer_purchases['CustomerID'] = customer_purchases['CustomerID'].astype(str)  # Convert to string
customer_purchases = customer_purchases.nlargest(10, 'TotalAmount')  # Select top 10 customers
hover = HoverTool(tooltips=[('Customer ID', '@CustomerID'), ('Total Purchase Amount', '@TotalAmount')])

p = figure(x_range=customer_purchases['CustomerID'], plot_height=400, plot_width=800, title='Customer Purchase History',
           x_axis_label='Customer ID', y_axis_label='Total Purchase Amount', tools=[hover])
p.vbar(x='CustomerID', top='TotalAmount', source=customer_purchases, width=0.5, color='purple')
p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)



## Market Basket Analysis

In [13]:
from mlxtend.frequent_patterns import apriori, association_rules

basket_df = df.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo')
itemsets = basket_df.applymap(lambda x: 1 if x >= 1 else 0)
frequent_itemsets = apriori(itemsets, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)



DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X = df[['Quantity', 'UnitPrice']]
y = df['TotalAmount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 187.9541672308356


In [22]:
output_notebook()