In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, skew, kurtosis, pearsonr
import numpy as np
import scipy.stats as stats

# Location of the product listing CSV (replace with the correct path to your dataset)
DATA_PATH = 'amazon_uk_products.csv'

# Read the whole file into the DataFrame that every later step works from
df = pd.read_csv(DATA_PATH)

# Part 1: Analyzing Best-Seller Trends Across Product Categories
# Objective: Understand the relationship between product categories and their best-seller status.

# Contingency table: one row per product category, one column per isBestSeller value
best_seller_crosstab = pd.crosstab(df['product_category'], df['isBestSeller'])

# Share of each category's rows that fall in the table's second column
# (presumably the "is a best-seller" column -- verify the column order/labels).
category_totals = best_seller_crosstab.sum(axis=1)
second_column_counts = best_seller_crosstab.iloc[:, 1]
best_seller_proportion = second_column_counts / category_totals

# Highest proportion first
best_seller_proportion_sorted = best_seller_proportion.sort_values(ascending=False)
print("\nProportion of Best-Sellers per Category (Sorted):\n", best_seller_proportion_sorted)

# Chi-square test of independence on the category x best-seller contingency table;
# only the statistic and the p-value are needed from the result.
chi2, p_value = chi2_contingency(best_seller_crosstab)[:2]
print(f"\nChi-square test p-value: {p_value}")

# Report the outcome at the 5% significance level
significance_message = (
    "There is a significant relationship between product category and best-seller status."
    if p_value < 0.05
    else "There is no significant relationship between product category and best-seller status."
)
print(significance_message)

# Cramér's V: sqrt(chi2 / (n * min(rows - 1, columns - 1)))
n_rows, n_cols = best_seller_crosstab.shape
n = best_seller_crosstab.sum().sum()  # total number of observations in the table
min_dim = min(n_rows - 1, n_cols - 1)
cramers_v = np.sqrt(chi2 / (n * min_dim))
print(f"Cramér's V: {cramers_v}")

# Stacked bar chart of the within-category split between isBestSeller values.
# Each row of the crosstab is divided by its row total so every bar sums to 1.
row_shares = best_seller_crosstab.div(best_seller_crosstab.sum(axis=1), axis=0)

# DataFrame.plot opens its own figure when it is not handed an Axes, so the size
# is passed to plot() directly; a separate plt.figure(figsize=...) call beforehand
# would only leave an empty figure behind and its size would not reach this chart.
ax = row_shares.plot(
    kind='bar',
    stacked=True,
    color=['skyblue', 'lightcoral'],
    figsize=(12, 7),
)
ax.set_title('Best-Seller Status Across Product Categories')
ax.set_xlabel('Product Categories')
ax.set_ylabel('Proportion')
plt.xticks(rotation=90)
plt.show()

# Part 2: Exploring Product Prices and Ratings Across Categories and Brands
# Objective: Investigate how different product categories influence product prices.

# Drop price outliers with the 1.5 * IQR rule (Tukey fences)
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep rows whose price lies within the fences (both ends inclusive)
df_no_outliers = df[df['price'].between(lower_fence, upper_fence)]

# Violin plot of price per category, limited to the 20 categories with the most rows
# (value_counts() is sorted by count, so the first 20 index entries are the top 20)
top_20_categories = df_no_outliers['product_category'].value_counts().index[:20]
in_top_20 = df_no_outliers['product_category'].isin(top_20_categories)

plt.figure(figsize=(14, 7))
sns.violinplot(
    data=df_no_outliers[in_top_20],
    x='product_category',
    y='price',
    palette='muted',
)
plt.title('Price Distribution Across Product Categories (Top 20)')
plt.xticks(rotation=90)
plt.show()

# Median price per category, over every category (no top-N restriction);
# the data is still the price-outlier-free frame.
prices_by_category = df_no_outliers['price'].groupby(df_no_outliers['product_category'])
median_price_category = prices_by_category.median()

# Label of the category whose median price is largest
highest_median_price_category = median_price_category.idxmax()
print(f"Category with the highest median price: {highest_median_price_category}")

# Mean price for the ten categories with the most rows, tallest bar first
top_10_categories = df_no_outliers['product_category'].value_counts().index[:10]
mean_price_by_category = df_no_outliers.groupby('product_category')['price'].mean()
avg_price_per_category = mean_price_by_category.loc[top_10_categories]

plt.figure(figsize=(12, 6))
avg_price_descending = avg_price_per_category.sort_values(ascending=False)
avg_price_descending.plot(kind='bar', color='lightgreen')
plt.xlabel('Product Categories')
plt.ylabel('Average Price')
plt.title('Average Price Across Top 10 Product Categories')
plt.xticks(rotation=45)
plt.show()

# Box plot of ratings per category, restricted to the top-10 categories selected above
in_top_10 = df_no_outliers['product_category'].isin(top_10_categories)

plt.figure(figsize=(14, 7))
sns.boxplot(
    data=df_no_outliers[in_top_10],
    x='product_category',
    y='rating',
    palette='muted',
)
plt.title('Distribution of Product Ratings Across Categories (Top 10)')
plt.xticks(rotation=90)
plt.show()

# Median rating per category, over every category (no top-N restriction);
# computed on the frame that already has price outliers removed.
ratings_by_category = df_no_outliers['rating'].groupby(df_no_outliers['product_category'])
median_rating_category = ratings_by_category.median()

# Label of the category whose median rating is largest
highest_median_rating_category = median_rating_category.idxmax()
print(f"Category with the highest median rating: {highest_median_rating_category}")

# Part 3: Investigating the Interplay Between Product Prices and Ratings
# Objective: Analyze how product ratings correlate with product prices.

# Pearson correlation coefficient between price and ratings.
# Missing values propagate through pearsonr and turn the coefficient into NaN,
# so only rows where both price and rating are present are used.
price_rating_pairs = df_no_outliers[['price', 'rating']].dropna()
correlation, _ = pearsonr(price_rating_pairs['price'], price_rating_pairs['rating'])
print(f"Correlation coefficient between price and ratings: {correlation}")

# Scatter plot of rating against price, one point per row of the outlier-free data
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_no_outliers, x='price', y='rating', color='purple')
plt.xlabel('Price')
plt.ylabel('Rating')
plt.title('Relationship Between Product Price and Rating')
plt.show()

# Correlation heatmap for numerical variables.
# Restrict to numeric and boolean columns before correlating: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on them.
numeric_columns = df_no_outliers.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_columns.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap of Numerical Variables')
plt.show()

# Normal QQ plot of the outlier-free prices: sample quantiles against the
# theoretical quantiles of a normal distribution, drawn through pyplot.
plt.figure(figsize=(8, 6))
stats.probplot(
    df_no_outliers['price'],
    dist="norm",
    plot=plt,
)
# Set the title after probplot so it replaces the one probplot puts on the plot
plt.title('QQ Plot of Product Prices')
plt.show()
