In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Amazon UK product dataset.
# The file location can be overridden through the AMZ_UK_DATASET environment variable so
# the notebook is not tied to a single machine; when the variable is unset, the original
# path is used unchanged.
DATA_PATH = os.environ.get(
    'AMZ_UK_DATASET',
    '/Users/benoi/Documents/IronHack/Week_5/amz_uk_price_prediction_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first ten rows of the dataset
df.iloc[:10]

In [None]:
# Show the loaded DataFrame as the cell output
df

In [None]:
# Part 1: Analyzing Best-Seller Trends Across Product Categories
# Contingency table: one row per product category, one column per isBestSeller value

crosstab_result = pd.crosstab(index=df['category'], columns=df['isBestSeller'])
crosstab_result

In [None]:
# Are there categories where being a best-seller is more prevalent?
# isBestSeller is boolean, so its mean within a category is the share of best-sellers
cat_proportions = (
    df.groupby('category')['isBestSeller']
    .mean()
    .reset_index()
)
cat_proportions_sorted = cat_proportions.sort_values('isBestSeller', ascending=False)
cat_proportions_sorted

In [None]:
# Statistical tests:
# Chi-square test of independence between best-seller status and product category,
# run on the category x isBestSeller contingency table.

from scipy.stats import chi2_contingency
chi2_statistic, chi2_p_value, _, _ = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value
# The reported p-value of 0.0 is below any usual significance level, so the null hypothesis of
# independence is rejected: best-seller status is significantly associated with product category.

In [None]:
# Cramér's V quantifies how strong the association between best-seller status and category is
from scipy.stats.contingency import association
cramers_v = association(crosstab_result, method="cramer")
cramers_v
# The Cramér's V value of 0.12 suggests a weak association between the two variables

In [None]:
# Stacked bar chart of the isBestSeller counts for every product category
crosstab_result.plot.bar(stacked=True)

In [None]:
#Part 2: Exploring Product Prices and Ratings Across Categories and Brands
#Remove outliers in product prices

#creating Tukeys test function

def tukeys_test_outliers(data, k=1.5):
    """Return the values of ``data`` that lie outside Tukey's fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and ``Q3``
    are the 25th and 75th percentiles of ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to screen.
    k : float, default 1.5
        Fence multiplier. The default reproduces the classic Tukey rule;
        a larger value flags only more extreme points.

    Returns
    -------
    pandas.Series
        The subset of ``data`` strictly below the lower fence or strictly above
        the upper fence, keeping the original index labels.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1

    # Fences beyond which a value counts as an outlier
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    is_outlier = (data < lower_bound) | (data > upper_bound)
    return data[is_outlier]

# Run Tukey's test on the price column and print the flagged values
outliers = tukeys_test_outliers(data=df['price'])
print(outliers)


In [None]:
# Remove the rows whose price was flagged as an outlier
index_drop = outliers.index
df_wo_outliers = df.drop(index=index_drop)
df_wo_outliers

In [None]:
# Keep only the 20 most frequent categories for better visualization.
# Work from df_wo_outliers (price outliers already dropped) rather than the raw df,
# so that extreme prices do not distort the price plots built from this subset.
cat_count = df_wo_outliers['category'].value_counts()
top_20_cat = cat_count.nlargest(20).index
df_top_20 = df_wo_outliers[df_wo_outliers['category'].isin(top_20_cat)]


In [None]:
# Violin plot of the price distribution across the product categories in df_top_20
ax = sns.violinplot(
    data=df_top_20,
    x='category',
    y='price',
    hue='category',
    palette="coolwarm",
    legend=False,
)
# Rotate the category names so the many tick labels do not overlap
ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Keep only the 5 most frequent categories for better visualization.
# Work from df_wo_outliers (price outliers already dropped) rather than the raw df,
# so that extreme prices do not distort the price plots built from this subset.
cat_count = df_wo_outliers['category'].value_counts()
top_5_cat = cat_count.nlargest(5).index
df_top_5 = df_wo_outliers[df_wo_outliers['category'].isin(top_5_cat)]
sns.violinplot(data=df_top_5, x='category', y='price', hue='category', palette="coolwarm", legend=False)

In [None]:
# Bar chart comparing the average product price of the top 5 categories (by count)
sns.barplot(
    data=df_top_5,
    x='category',
    y='price',
    hue='category',
    palette="coolwarm",
    legend=False,
)

In [None]:
# Which product category commands the highest average price? (all categories, no top-N filter)
price_by_category = df.groupby('category')['price']
cat_avg_price = price_by_category.mean()
sort_cat = cat_avg_price.sort_values(ascending=False)
sort_cat
# Laptops have the highest average price
# second ranking is 3d printers

In [None]:
# Distribution of product ratings per category, as side-by-side box plots
# for the 10 most frequent categories.
top_10_cat = df['category'].value_counts().nlargest(10).index
df_top_10 = df[df['category'].isin(top_10_cat)]
ax = sns.boxplot(
    data=df_top_10,
    x='category',
    y='stars',  # the rating column; the question is about ratings, not price
    hue='category',
    palette="coolwarm",
    legend=False,
)
# Rotate the category names so the tick labels do not overlap
ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Which category tends to receive the highest median rating from customers?
# Ratings live in the 'stars' column, so that is the column ranked and displayed here.
cat_med_rating = df.groupby('category')['stars'].median()
sort_cat_rating = cat_med_rating.sort_values(ascending=False)

# Median price per category, kept so the existing names remain available
cat_med_price = df.groupby('category')['price'].median()
sort_cat_2 = cat_med_price.sort_values(ascending=False)
# Laptops have the highest median price
# second ranking is desktop PCs

sort_cat_rating

In [None]:
#Part 3: Investigating the Interplay Between Product Prices and Ratings

In [None]:
# Correlation coefficient between price and stars (Pearson, stated explicitly)
correlation_p = df['price'].corr(df['stars'], method='pearson')
correlation_p
# result suggest a weak negative correlation

In [None]:
# Spearman (rank) correlation between price and stars
correlation_s = df['price'].corr(
    df['stars'],
    method='spearman',
)
correlation_s
# Spearman result also suggests a weak negative correlation

In [None]:
# Scatter plot of product rating (x) against price (y) to look for patterns
plt.scatter(df['stars'], df['price'])
plt.xlabel('stars')
plt.ylabel('price')
# plt.show must be called; a bare `plt.show` only references the function
plt.show()

In [None]:
# Correlation heatmap across all numerical variables
# Select the numerical columns of df
df_numerical = df.select_dtypes("number")

# Compute the correlation matrix once and reuse it for both the mask and the plot
correlation_matrix = df_numerical.corr()

plt.figure(figsize=(10, 10))

# Boolean mask that is True on the upper triangle (diagonal included); the heatmap
# hides masked cells, leaving only the lower triangle visible
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

# Plot the triangular correlation heatmap with values rounded to two decimals
dataplot = sns.heatmap(correlation_matrix.round(2), cmap="YlGnBu", annot=True, mask=mask)

In [None]:
# QQ plot of price against a normal distribution to check for normality
import statsmodels.api as sm
qq_fig = sm.qqplot(df['price'], line='s')