In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the Amazon UK dataset; edit this path if the CSV lives elsewhere.
DATASET_PATH = 'amz_uk_price_prediction_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

In [None]:
# `shape` is an attribute holding (n_rows, n_columns), not a method
df.shape

In [None]:
# Tukey's rule: a price counts as an outlier when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Lower and upper fences for acceptable prices
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose price sits inside the fences (both ends inclusive)
price_within_fences = df['price'].between(lower_bound, upper_bound)
df_no_outliers = df[price_within_fences]

# Rows / columns remaining after the filter
df_no_outliers.shape

In [1]:
#Part 1:
# Contingency table: one row per category, one column per isBestSeller value
crosstab_best_seller_category = pd.crosstab(
    index=df_no_outliers['category'],
    columns=df_no_outliers['isBestSeller'],
)
# Independent copy, so derived columns can be added without touching the raw counts
crosstab_best_seller_category_prop = crosstab_best_seller_category.copy()

In [None]:
# Share of best-sellers per category: best-seller count over all listings
best_seller_counts = crosstab_best_seller_category[True]
regular_counts = crosstab_best_seller_category[False]
crosstab_best_seller_category_prop['best_seller_proportion'] = (
    best_seller_counts / (best_seller_counts + regular_counts)
)

# Order the categories from highest to lowest best-seller share
sorted_categories_by_best_seller_proportion = crosstab_best_seller_category_prop.sort_values(
    by='best_seller_proportion',
    ascending=False,
)

# Show only the proportion column of the ranked table
sorted_categories_by_best_seller_proportion[['best_seller_proportion']]


The categories with the highest proportions of best-sellers are:

- Smart Home Security & Lighting: 6.41%
- Wind Instruments: 5.93%
- Grocery: 5.81%
- Health & Personal Care: 5.7%
- Material Handling Products: 4.81%

On the other end of the spectrum, there are several categories where no products have the best-seller status.

In [2]:
#2.

from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

# Chi-square test of independence between category and best-seller status.
# The degrees of freedom and expected counts are named explicitly instead of
# being assigned to `_`, which IPython uses for the last cell output.
chi2, p, dof, expected_counts = chi2_contingency(crosstab_best_seller_category)

# Cramér's V: strength of the association, from 0 (none) to 1 (perfect)
cramers_v = association(crosstab_best_seller_category, method="cramer")

# Only the last expression of a cell is rendered, so show all results together
chi2, p, cramers_v


Given the extremely low $p$-value (essentially zero), we can reject the null hypothesis of independence. This indicates that there is a statistically significant association between the product category and its best-seller status.

The value of Cramér's V is approximately $0.12$.

Cramér's V values can be interpreted as follows:
- $0$: No association
- $0.1$: Weak association
- $0.2$: Moderate association
- $0.3$: Relatively strong association
- $1$: Perfect association

Given our result, the strength of association between product category and best-seller status is weak.


In [3]:
#3.
# Ten categories with the highest best-seller proportions
top_categories = sorted_categories_by_best_seller_proportion.iloc[:10, :]

# Stack only the two count columns. `best_seller_proportion` is a ratio on a
# different scale, so stacking it on top of the counts would be meaningless.
ax = top_categories[[False, True]].plot(kind='bar', stacked=True, figsize=(12, 6))
ax.set_title('Best-Seller vs. Non-Best-Seller Counts (Top 10 Categories by Best-Seller Proportion)')
ax.set_xlabel('Product Category')
ax.set_ylabel('Number of Products')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
#part 2:
#1.
# seaborn (sns) and matplotlib (plt) are imported once in the first cell.

# Restrict the plot to the 20 categories with the most products
top_20_categories = df_no_outliers['category'].value_counts().nlargest(20).index
filtered_data = df_no_outliers[df_no_outliers['category'].isin(top_20_categories)]

# Violin plot of the price distribution within each of those categories
plt.figure(figsize=(14, 10))
sns.violinplot(x='category', y='price', data=filtered_data)
plt.title('Distribution of Product Prices Across Top 20 Categories')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()  # keep the rotated tick labels inside the figure
plt.show()

# Median price per category over ALL categories (not only the plotted 20),
# computed once and reused for both the label and the value.
median_price_by_category = df_no_outliers.groupby('category')['price'].median()
category_highest_median = median_price_by_category.idxmax()
highest_median_price = median_price_by_category.max()

category_highest_median, highest_median_price


In [None]:
#2.

# Mean price of every category
average_price_by_category = df_no_outliers.groupby('category')['price'].mean()

# Ten categories with the most products
top_10_categories = df_no_outliers['category'].value_counts().nlargest(10).index
filtered_avg_price_category = average_price_by_category.loc[top_10_categories]

# Mean prices of those ten categories, most expensive first.
# Derived from the Series above, so no separate filtered frame is required.
average_prices = filtered_avg_price_category.sort_values(ascending=False)

# Bar chart of the average price per top-10 category
plt.figure(figsize=(12, 6))
average_prices.plot(kind='bar', color='coral')
plt.title('Average Product Price for Top 10 Categories')
plt.ylabel('Average Price (£)')
plt.xlabel('Product Category')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Category with the highest mean price over ALL categories (not only the top 10)
category_highest_avg_price = average_price_by_category.idxmax()
highest_avg_price_category = average_price_by_category.max()

category_highest_avg_price, highest_avg_price_category


In [None]:
#3.

# Limit the plot to the ten most populated categories to keep it readable
top_10_categories = df_no_outliers['category'].value_counts().nlargest(10).index
in_top_10 = df_no_outliers['category'].isin(top_10_categories)
filtered_data = df_no_outliers[in_top_10]

# Box plot of star ratings, one box per category
fig, ax = plt.subplots(figsize=(14, 10))
sns.boxplot(data=filtered_data, x='category', y='stars', ax=ax)
ax.set_title('Distribution of Product Ratings Across Categories')
ax.set_ylabel('Product Rating (Stars)')
ax.set_xlabel('Product Category')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Median rating per category (all categories), then pick the top one
median_rating_by_category = df_no_outliers.groupby('category')['stars'].median()
category_highest_median_rating = median_rating_by_category.idxmax()
highest_median_rating = median_rating_by_category.max()

category_highest_median_rating, highest_median_rating

In [None]:
#Part 3: 1. 

# Pearson correlation (linear relationship) between price and number of reviews
pearson_price_reviews = df_no_outliers['price'].corr(df_no_outliers['reviews'])

# Spearman correlation (rank-based, monotonic relationship) for the same columns
spearman_price_reviews = df_no_outliers['price'].corr(df_no_outliers['reviews'], method='spearman')

# Name kept for any later cell that reads it. Despite the name, it is the
# Spearman coefficient of price vs. number of reviews, not vs. star rating.
correlation_price_rating = spearman_price_reviews

# Only the last expression of a cell is rendered, so show both coefficients
pearson_price_reviews, spearman_price_reviews

This value is close to zero, indicating a very weak negative correlation between product price and the number of reviews.

Practically, this suggests that the price of a product and the number of reviews it has are largely independent of each other, with only a slight negative trend.

In [None]:
#2.
# Scatter plot of number of reviews (x-axis) against product price (y-axis)
plt.figure(figsize=(12, 6))
sns.scatterplot(x=df_no_outliers['reviews'], y=df_no_outliers['price'], alpha=0.5)
# The x-axis is the review count, so the title and labels say so explicitly
plt.title('Relationship Between Number of Reviews and Price')
plt.xlabel('Number of Reviews')
plt.ylabel('Price (£)')
plt.tight_layout()
plt.show()


Here's the scatter plot showcasing the relationship between the number of reviews and product price:

- The majority of products have fewer reviews and are clustered towards the left side of the plot.
- There doesn't seem to be a strong trend indicating that products with higher prices receive more or fewer reviews. The data points are dispersed without a clear pattern.
- Some products with a higher number of reviews are in the lower to mid-price range, suggesting that these products might be popular or widely purchased.

In [None]:
# Correlation heatmap
# Selecting several columns needs a list (double brackets); a bare tuple of
# names is treated as a single column key and raises KeyError.
correlation_matrix = df_no_outliers[['price', 'reviews', 'stars']].corr()
plt.figure(figsize=(8, 6))
# Fix the colour scale to [-1, 1] so 0 maps to the neutral centre of the palette
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Heatmap")
plt.show()

**Correlation Heatmap**:
   - The heatmap provides a visual representation of the correlation between numerical variables.
   
   If we talk about continuous variables...
   - As we discussed earlier, the correlation between `price` and `reviews` is very weak. The correlation between `price` and `stars` (product rating) is also weak and negative, as indicated by the slightly blue shade.
   - The correlation between `reviews` and `stars` is positive but still weak, suggesting that products with more reviews don't necessarily have higher or lower ratings.

In [None]:
# Normal Q-Q plot of the outlier-filtered prices
prices = df_no_outliers['price']

plt.figure(figsize=(8, 6))
# probplot draws the ordered prices against normal theoretical quantiles onto pyplot
stats.probplot(x=prices, dist="norm", plot=plt)
plt.title("QQ Plot for Product Prices")
plt.show()

**QQ Plot for Product Prices**:
   - A QQ (quantile-quantile) plot is used to compare the distribution of a dataset with a theoretical normal distribution.
   - If the data points in the QQ plot closely follow the straight line (which represents the normal distribution), then the dataset is approximately normally distributed.
   - In this case, the QQ plot indicates that product prices are not normally distributed.