In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

# Read the product listings into a DataFrame.
# Point the path below at wherever your copy of the CSV lives.
df = pd.read_csv('amazon_uk_products.csv')

# ---------------------------------------------------------------------------
# Part 1: Understanding Product Categories
# Business question: which product categories are listed most often on
# Amazon UK, and how do their listing frequencies compare?
# ---------------------------------------------------------------------------

# Number of listings per category; value_counts() orders the result from the
# most frequent category to the least frequent one.
category_counts = df['product_category'].value_counts()

# The five most frequently listed categories.
top_5_categories = category_counts.iloc[:5]
print("Top 5 Most Listed Product Categories:\n", top_5_categories)

# Bar chart of the ten most frequently listed categories.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
category_counts.iloc[:10].plot.bar(ax=bar_ax, color='skyblue')
bar_ax.set_title('Top 10 Most Listed Product Categories on Amazon UK')
bar_ax.set_xlabel('Product Categories')
bar_ax.set_ylabel('Frequency')
# Slant the category names and right-align them under their bars.
plt.setp(bar_ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

# Pie chart restricted to the top five categories, so each slice is a share
# of the top-5 total rather than of the whole dataset.
top_5_categories.plot.pie(autopct='%1.1f%%', startangle=90, figsize=(8, 8))
plt.title('Proportion of Top 5 Product Categories')
plt.ylabel('')  # blank out the y-axis label on the pie
plt.show()

# Part 2: Delving into Product Pricing
# Business Question: How are products priced on Amazon UK, and are there specific price points or ranges that are more common?

# Work on the non-missing prices only. pandas reductions (mean, median, var,
# quantile, ...) already skip NaN, so the statistics below are unchanged, but
# Matplotlib's boxplot does not: a single NaN turns the quartiles into NaN and
# the box is drawn empty. Dropping NaN once keeps the numbers and the plots
# consistent with each other.
prices = df['price'].dropna()

# Measures of Centrality: mean, median, and mode for price
mean_price = prices.mean()
median_price = prices.median()
# mode() returns every value tied for most frequent (sorted); take the first.
# It is empty when there are no non-missing prices, so fall back to NaN
# instead of failing on the [0] lookup.
price_modes = prices.mode()
mode_price = price_modes.iloc[0] if not price_modes.empty else float('nan')

print(f"Mean Price: {mean_price}")
print(f"Median Price: {median_price}")
print(f"Mode Price: {mode_price}")

# Measures of Dispersion: variance, standard deviation, range, and interquartile range for price
price_variance = prices.var()
price_std_dev = prices.std()
price_range = prices.max() - prices.min()
price_iqr = prices.quantile(0.75) - prices.quantile(0.25)

print(f"Price Variance: {price_variance}")
print(f"Price Standard Deviation: {price_std_dev}")
print(f"Price Range: {price_range}")
print(f"Price IQR: {price_iqr}")

# Histogram of product prices
plt.figure(figsize=(10, 6))
plt.hist(prices, bins=50, color='lightgreen', edgecolor='black')
plt.title('Distribution of Product Prices on Amazon UK')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Box plot to show spread and outliers (horizontal, filled box)
plt.figure(figsize=(10, 6))
plt.boxplot(prices, vert=False, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title('Price Distribution with Box Plot')
plt.xlabel('Price')
plt.show()

# ---------------------------------------------------------------------------
# Part 3: Unpacking Product Ratings
# Business question: how do customers rate products on Amazon UK, and do the
# ratings show any patterns or tendencies?
# ---------------------------------------------------------------------------

rating_col = df['rating']

# Central tendency of the ratings.
mean_rating = rating_col.mean()
median_rating = rating_col.median()
mode_rating = rating_col.mode()[0]  # first of the most frequent values

print(f"Mean Rating: {mean_rating}")
print(f"Median Rating: {median_rating}")
print(f"Mode Rating: {mode_rating}")

# Spread of the ratings: variance, standard deviation and interquartile range.
rating_variance = rating_col.var()
rating_std_dev = rating_col.std()
rating_iqr = rating_col.quantile(0.75) - rating_col.quantile(0.25)

print(f"Rating Variance: {rating_variance}")
print(f"Rating Standard Deviation: {rating_std_dev}")
print(f"Rating IQR: {rating_iqr}")

# Shape of the distribution. The scipy functions are given the ratings with
# missing values removed; the cleaned series is built once and shared.
ratings_present = rating_col.dropna()
rating_skewness = skew(ratings_present)
rating_kurtosis = kurtosis(ratings_present)

print(f"Rating Skewness: {rating_skewness}")
print(f"Rating Kurtosis: {rating_kurtosis}")

# Histogram of the ratings, split into 20 bins.
rating_fig, rating_ax = plt.subplots(figsize=(10, 6))
rating_ax.hist(rating_col, bins=20, color='lightcoral', edgecolor='black')
rating_ax.set_title('Distribution of Product Ratings on Amazon UK')
rating_ax.set_xlabel('Rating')
rating_ax.set_ylabel('Frequency')
plt.show()
