In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Exploratory Data Analysis

In [None]:
# Load the cleaned dataset from disk. The file is decoded as Latin-1, and
# low_memory=False turns off pandas' chunked parsing of the CSV.
cleaned_data = pd.read_csv(
    'CleanedData.csv',
    encoding='latin1',
    low_memory=False,
)

# Preview the first five rows.
cleaned_data.head()

In [None]:
# Descriptive statistics of the dataset (pandas defaults), shown as the cell output.
summary_statistics = cleaned_data.describe()
summary_statistics

In [None]:
# Print a structural summary of the frame: columns, non-null counts, dtypes and memory usage.
cleaned_data.info()

## Customer Analysis

In [None]:
# Count the distinct customer IDs present in the dataset.
unique_customer_count = cleaned_data['CustomerID'].nunique()
print("Number of customers:", unique_customer_count)

### Transaction Frequency

In [None]:
# Transaction frequency: number of distinct invoices placed by each customer.
transaction_frequency = cleaned_data.groupby('CustomerID')['InvoiceNo'].nunique()

# Segment edges for pd.cut (right-inclusive): (0, 1], (1, 5], (5, 10], (10, inf).
bins = [0, 1, 5, 10, np.inf]
labels = ['1', '2-5', '6-10', '11+']

# Attach the segment to every row of the dataset.
# transaction_frequency is indexed by CustomerID while cleaned_data is indexed
# by row label, so assigning the pd.cut result directly would align the two on
# unrelated indexes. Looking up each row's CustomerID first gives every row its
# own customer's invoice count, which is then binned.
invoices_for_row = cleaned_data['CustomerID'].map(transaction_frequency)
cleaned_data['TransactionFrequency'] = pd.cut(invoices_for_row, bins=bins, labels=labels)

# Count customers (one entry per CustomerID) in each segment. value_counts()
# sorts by count, so reindex back to bin order to keep every wedge paired with
# its own label and colour.
customer_segments = pd.cut(transaction_frequency, bins=bins, labels=labels)
segment_counts = customer_segments.value_counts().reindex(labels)

# Plotting a pie chart; labels come from segment_counts itself so they cannot
# drift out of step with the values.
plt.figure(figsize=(8, 8))
plt.pie(
    segment_counts,
    labels=segment_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['lightcoral', 'peachpuff', 'lightskyblue', 'lightgreen'],
)
plt.title('Customer Segmentation by Transaction Frequency')
plt.show()

### Top Customers

In [None]:
# Total spending per customer: sum of TotalPrice over all of a customer's rows.
# (The original cell computed this and the ranking twice; once is enough.)
total_spending = cleaned_data.groupby('CustomerID')['TotalPrice'].sum()

# The ten highest-spending customers, largest first; shown as the cell output.
top_customers = total_spending.sort_values(ascending=False).head(10)
top_customers

### Customer Retention

In [None]:
# Count distinct invoices per customer directly in this cell. This avoids
# depending on the 'TransactionFrequency' column (and on comparing its string
# labels), and on the `transaction_frequency` name, which a later cell rebinds
# to a per-country series.
invoices_per_customer = cleaned_data.groupby('CustomerID')['InvoiceNo'].nunique()
repeat_customer_ids = invoices_per_customer[invoices_per_customer > 1].index

# Identify repeat customers: all rows belonging to customers with more than one invoice.
repeat_customers = cleaned_data[cleaned_data['CustomerID'].isin(repeat_customer_ids)]

# Calculate retention rate: repeat customers as a fraction of all customers.
# nunique() ignores missing CustomerIDs (matching the customer count printed
# earlier); the guard avoids a ZeroDivisionError on an empty dataset.
total_customers = cleaned_data['CustomerID'].nunique()
retention_rate = (
    repeat_customers['CustomerID'].nunique() / total_customers
    if total_customers
    else 0.0
)

In [None]:
# Churn is taken as the complement of the retention rate.
churn_rate = 1 - retention_rate

# Express the retention rate as a percentage of all customers.
percent_repeat_customers = retention_rate * 100

# Report the share of repeat customers, rounded to two decimals.
print(f"Percentage of Repeat Customers: {percent_repeat_customers:.2f}%")

## Product Analysis

In [None]:
# Count the distinct stock codes present in the dataset.
unique_product_count = cleaned_data['StockCode'].nunique()
print("Number of products:", unique_product_count)

### Top Products

#### Top Products By Quantity Sold

In [None]:
# Overall number of units sold across the whole dataset.
total_quantity = cleaned_data['Quantity'].sum()
print(f"Overall Total Quantity Sold: {total_quantity}")

# Units sold per product description, then the ten largest.
quantity_by_product = cleaned_data.groupby('Description')['Quantity'].sum()
top_products_quantity = quantity_by_product.sort_values(ascending=False).head(10)

# Bar chart of the top ten products by units sold.
fig, ax = plt.subplots(figsize=(12, 6))
top_products_quantity.plot(kind='bar', color='lightcoral', ax=ax)
ax.set_title('Top 10 Products by Quantity Sold')
ax.set_xlabel('Product Description')
ax.set_ylabel('Quantity Sold')
plt.show()

#### Top Products By Revenue Generated

In [None]:
# Overall revenue across the whole dataset.
total_revenue = cleaned_data['TotalPrice'].sum()
print(f"Overall Total Revenue: {total_revenue:.2f}")

# Revenue per product description, then the ten largest.
revenue_by_product = cleaned_data.groupby('Description')['TotalPrice'].sum()
top_products_revenue = revenue_by_product.sort_values(ascending=False).head(10)

# Bar chart of the top ten products by revenue.
fig, ax = plt.subplots(figsize=(12, 6))
top_products_revenue.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Top 10 Products by Total Revenue')
ax.set_xlabel('Product Description')
ax.set_ylabel('Total Revenue')
plt.show()

## Time Analysis

In [None]:
# Calendar month names in order; position i labels month number i + 1.
month_names = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Monthly sales trends: total quantity sold per month.
monthly_sales = cleaned_data.groupby('Month')['Quantity'].sum()

# NOTE(review): assumes 'Month' holds month numbers 1-12 (the original cell
# labelled the groups positionally as January..December) — confirm against the
# cleaning step. Fail loudly instead of plotting mislabelled bars.
assert monthly_sales.index.isin(list(range(1, 13))).all(), \
    "'Month' is expected to contain month numbers 1-12"

# Reindex over all twelve months before naming them: a month with no rows then
# shows as 0 instead of causing a length-mismatch error or shifting the labels.
monthly_sales = monthly_sales.reindex(range(1, 13), fill_value=0)
monthly_sales.index = month_names

# Plotting the bar graph
plt.figure(figsize=(10, 6))
monthly_sales.plot(kind='bar', color='lightcoral')
plt.title('Monthly Sales Trends')
plt.xlabel('Month')
plt.ylabel('Total Quantity Sold')
plt.show()

In [None]:
# Tick labels in weekday order; position i labels weekday number i.
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Daily sales trends: total quantity sold per day of the week.
daily_sales = cleaned_data.groupby('Day')['Quantity'].sum()

# NOTE(review): assumes 'Day' holds weekday numbers 0 (Mon) to 6 (Sun), as the
# positional tick labels imply — confirm against the cleaning step. Fail loudly
# instead of plotting mislabelled bars.
assert daily_sales.index.isin(list(range(7))).all(), \
    "'Day' is expected to contain weekday numbers 0-6"

# Reindex over all seven weekdays. Without this, a weekday with no sales is
# simply absent from the groupby result, the remaining bars close the gap, and
# every label after the missing day ends up under the wrong bar.
daily_sales = daily_sales.reindex(range(7), fill_value=0)

# Bar chart for daily sales trends
plt.figure(figsize=(10, 6))
daily_sales.plot(kind='bar', color='lightskyblue')
plt.title('Daily Sales Trends')
plt.xlabel('Day of the Week')
plt.ylabel('Quantity Sold')

# Set xticks with specified labels
plt.xticks(range(7), day_labels, rotation=0)

plt.show()


In [None]:
# Total quantity sold in each hour of the day.
hourly_sales = cleaned_data.groupby('Hour')['Quantity'].sum()

# Bar chart of the hourly totals.
fig, ax = plt.subplots(figsize=(10, 6))
hourly_sales.plot(kind='bar', color='lightgreen', ax=ax)
ax.set_title('Hourly Sales Trends')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Quantity Sold')
plt.show()


## Country Analysis

In [None]:
# Total sales (sum of TotalPrice) for each country.
country_sales = cleaned_data['TotalPrice'].groupby(cleaned_data['Country']).sum()

# Rank countries from highest to lowest total sales.
sorted_country_sales = country_sales.sort_values(ascending=False)

# Keep and show the five countries with the highest total sales.
top_5_country_sales = sorted_country_sales.iloc[:5]
print(top_5_country_sales)

In [None]:
# Keep only rows whose country is not the United Kingdom.
not_uk = cleaned_data['Country'].ne('United Kingdom')
countries_excluding_uk = cleaned_data.loc[not_uk]

# Total sales per remaining country.
country_sales_excluding_uk = countries_excluding_uk.groupby('Country')['TotalPrice'].sum()

# Bar chart of total sales per country, largest first, without the United Kingdom.
fig, ax = plt.subplots(figsize=(12, 6))
ranked_sales = country_sales_excluding_uk.sort_values(ascending=False)
ranked_sales.plot(kind='bar', color='lightcoral', ax=ax)
ax.set_title('Country-wise Sales (Excluding United Kingdom)')
ax.set_xlabel('Country')
ax.set_ylabel('Total Sales')
plt.show()


In [None]:
# Number of distinct customers in each country.
customer_count_per_country = cleaned_data['CustomerID'].groupby(cleaned_data['Country']).nunique()

# Rank countries from most to fewest distinct customers.
sorted_customer_count = customer_count_per_country.sort_values(ascending=False)

# Keep and show the five countries with the most distinct customers.
top_5_customer_count = sorted_customer_count.iloc[:5]
print(top_5_customer_count)


In [None]:
# Number of distinct customers per country, excluding the United Kingdom.
# CustomerID is an identifier, so it must be counted (nunique), not summed:
# adding ID values together produces a number with no meaning.
country_CID_excluding_uk = countries_excluding_uk.groupby('Country')['CustomerID'].nunique()

# Bar chart of customer counts per country, largest first. Title and axis
# label describe customers; they previously repeated the sales chart's text.
plt.figure(figsize=(12, 6))
country_CID_excluding_uk.sort_values(ascending=False).plot(kind='bar', color='lightskyblue')
plt.title('Country-wise Unique Customers (Excluding United Kingdom)')
plt.xlabel('Country')
plt.ylabel('Number of Unique Customers')
plt.show()

In [None]:
# Average transaction value per country.
# Each row of the dataset is one invoice line, while the notebook counts
# transactions by distinct InvoiceNo. Averaging TotalPrice over rows would give
# the average line value, so first total each invoice, then average the
# invoice totals within each country.
invoice_totals = cleaned_data.groupby(['Country', 'InvoiceNo'])['TotalPrice'].sum()
avg_transaction_value = invoice_totals.groupby(level='Country').mean()

# Sort the average transaction values
sorted_avg_transaction_value = avg_transaction_value.sort_values(ascending=False)

# Display the top 5 countries by the average transaction value
top_5_avg_transaction_value = sorted_avg_transaction_value.head(5)
print(top_5_avg_transaction_value)


In [None]:
# Average transaction value per country.
# Each row of the dataset is one invoice line, while the notebook counts
# transactions by distinct InvoiceNo. Averaging TotalPrice over rows would give
# the average line value, so first total each invoice, then average the
# invoice totals within each country.
invoice_totals = cleaned_data.groupby(['Country', 'InvoiceNo'])['TotalPrice'].sum()
avg_transaction_value = invoice_totals.groupby(level='Country').mean()

# Bar chart for average transaction value per country, largest first
plt.figure(figsize=(12, 6))
avg_transaction_value.sort_values(ascending=False).plot(kind='bar', color='lightgreen')
plt.title('Average Transaction Value per Country')
plt.xlabel('Country')
plt.ylabel('Average Transaction Value')
plt.show()

In [None]:
# Number of distinct invoices recorded for each country.
# NOTE: this rebinds `transaction_frequency`, which the customer-analysis cell
# earlier in the notebook uses for the per-customer invoice counts.
transaction_frequency = cleaned_data['InvoiceNo'].groupby(cleaned_data['Country']).nunique()

# Rank countries from most to fewest invoices.
sorted_transaction_frequency = transaction_frequency.sort_values(ascending=False)

# Keep and show the five countries with the most invoices.
top_5_transaction_frequency = sorted_transaction_frequency.iloc[:5]
print(top_5_transaction_frequency)

In [None]:
# Number of distinct invoices per country, excluding the United Kingdom.
# This chart accompanies the per-country transaction-frequency table; the
# original cell was a copy of the sales chart and plotted TotalPrice again.
country_transactions_excluding_uk = countries_excluding_uk.groupby('Country')['InvoiceNo'].nunique()

# Bar chart of transaction counts per country, largest first
plt.figure(figsize=(12, 6))
country_transactions_excluding_uk.sort_values(ascending=False).plot(kind='bar', color='peachpuff')
plt.title('Country-wise Transaction Frequency (Excluding United Kingdom)')
plt.xlabel('Country')
plt.ylabel('Number of Transactions')
plt.show()
