In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Plot defaults shared by every figure in the notebook.
plt.style.use("default")
sns.set_palette("husl")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Load the raw product table once; later cells derive their own subsets from `df`.
df = pd.read_csv("amz_uk_price_prediction_dataset.csv")

# Part 1 only needs the category and the best-seller flag, with incomplete rows removed.
df_part1 = df.loc[:, ["category", "isBestSeller"]].dropna()

In [None]:
#                                                                             PART 1

In [None]:
#                                                                     1. **Crosstab Analysis**:

In [None]:
# Keep only rows where both the category and the best-seller flag are present.
df_part1 = df.loc[:, ["category", "isBestSeller"]].dropna()

# Absolute counts per (category, isBestSeller) pair ...
ct_counts = pd.crosstab(
    index=df_part1["category"],
    columns=df_part1["isBestSeller"],
)
# ... and the same table normalised per row, so each category sums to 1.
ct_props = pd.crosstab(
    index=df_part1["category"],
    columns=df_part1["isBestSeller"],
    normalize="index",
)

# Quick look at the column labels, the overall class balance and the first rows.
print(ct_props.columns.tolist())
print(df_part1["isBestSeller"].value_counts())
print(ct_props.head())

# Share of best-sellers within each category, highest first.
best_seller_rate = ct_props.loc[:, True].sort_values(ascending=False)
print("\nTop 10 by % best-sellers:")
print(best_seller_rate.head(10))

In [None]:
# Rebuild the count and row-proportion tables from the Part 1 subset.
ct_counts = pd.crosstab(
    index=df_part1["category"],
    columns=df_part1["isBestSeller"],
)
ct_props = pd.crosstab(
    index=df_part1["category"],
    columns=df_part1["isBestSeller"],
    normalize="index",
)

# Rank categories by the proportion of their products flagged as best-sellers.
best_seller_rate = ct_props.loc[:, True].sort_values(ascending=False)

print("Top 10 categories by best sellers:")
print(best_seller_rate.head(10))

In [None]:
#                                                                  2. **Statistical Tests**:

In [None]:
# Chi-square test of independence on the category x isBestSeller count table.
chi2, p, dof, expected = chi2_contingency(ct_counts)

# Report the test statistic, its degrees of freedom and the p-value.
for label, value in (
    ("Chi-square statistic:", chi2),
    ("Degrees of freedom:", dof),
    ("p-value:", p),
):
    print(label, value)

In [None]:
# Cramér's V: effect size for the chi-square test above.
#   V = sqrt(chi2 / (n * (min(n_rows, n_cols) - 1)))
# `n` is the total number of observations in the contingency table.
n = ct_counts.values.sum()
min_dim = min(ct_counts.shape) - 1

# With a single row or a single column there is no association to measure;
# report NaN rather than dividing by zero.
cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan

# The label was previously mis-encoded ("Cram√©r"); print the proper accented name.
print("Cramér's V:", cramers_v)

In [None]:
#                                                                       3. **Visualizations**:

In [None]:
# Stacked bars: one bar per category, split into non-best-seller / best-seller shares.
ax = ct_props.plot.bar(stacked=True, figsize=(14, 6))
ax.set_title("Best-Sellers by Product Category")
ax.set_xlabel("Product Category")
ax.set_ylabel("Proportion within Category")
# Relabel the legend entries (they follow the column order of ct_props).
ax.legend(title="isBestSeller", labels=["No", "Yes"])
plt.tight_layout()
plt.show()

In [None]:
#                                                                               PART 2

In [None]:
#                                                    0. **Preliminary Step: Remove outliers in product prices.**

In [None]:
# Tukey's rule: anything further than 1.5 * IQR outside the quartiles is an outlier.
price = df['price']
Q1 = price.quantile(0.25)
Q3 = price.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
print(f"Remove prices < {lower_bound:.2f} ou > {upper_bound:.2f}")

# Keep rows whose price lies inside the fences (both ends inclusive) and only
# the three columns used by the rest of the analysis.
within_fences = price.between(lower_bound, upper_bound)
df_clean = (
    df.loc[within_fences, ['price', 'category', 'stars']]
    .dropna(subset=['price'])
)
print("without outliers:", df_clean.shape)

In [None]:
#                                                                                   1. **Violin Plots**:

In [None]:
# The 20 categories with the most products after outlier removal.
top20_cats = df_clean['category'].value_counts().index[:20]
in_top20 = df_clean['category'].isin(top20_cats)

# Price distribution per category, drawn as violins.
fig, ax = plt.subplots(figsize=(14, 8))
sns.violinplot(data=df_clean.loc[in_top20], x='category', y='price', ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_title('Price by Top 20 categories')
ax.set_ylabel('Price')
plt.tight_layout()
plt.show()

In [None]:
#                                                                                        2. **Bar Charts**

In [None]:
# The 10 categories with the most products after outlier removal.
top10_cats = df_clean['category'].value_counts().index[:10]
in_top10 = df_clean['category'].isin(top10_cats)

# Mean price per category, most expensive first.
avg_pricetop10 = (
    df_clean.loc[in_top10]
    .groupby('category')['price']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
avg_pricetop10.plot(kind='bar', ax=ax)
ax.set_title('mean by top 10 categories')
ax.set_ylabel('mean price')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("top 3 by mean price:\n", avg_pricetop10.head(3))


In [None]:
#                                                                                        3. **Box Plots**

In [None]:
# Rating distribution for the ten most populated categories
# (top10_cats is computed in the bar-chart cell).
in_top10 = df_clean['category'].isin(top10_cats)

fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df_clean.loc[in_top10], x='category', y='stars', ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_title('Ratings by top 10 categories')
ax.set_ylabel('Stars (Rating)')
plt.tight_layout()
plt.show()

In [None]:
#                                                                                             PART 3

In [None]:
#                                                                                1. **Correlation Coefficients**

In [None]:
from scipy import stats

# Pearson's r needs *paired* observations. Dropping NaNs from each column
# separately can misalign the pairs (or make the two inputs different lengths),
# so drop incomplete rows from both columns together.
price_stars = df_clean[['price', 'stars']].dropna()

# pearsonr returns the coefficient and the two-sided p-value in one call.
pearson_r, p_value = stats.pearsonr(price_stars['price'], price_stars['stars'])
corrprice_stars = pearson_r

print(f"correlation(price vs stars): {corrprice_stars:.3f}")
# Use general/scientific notation: with one fixed decimal every small p-value prints as 0.0.
print(f"p-value: {p_value:.3g}")
print(f"significative (p < 0.05)? {'yes' if p_value < 0.05 else 'no'}")

In [None]:
#                                                                                     2. **Visualizations**

In [None]:
# Scatter of price against rating, with a least-squares trend line on top.
plt.figure(figsize=(10, 6))
plt.scatter(df_clean['stars'], df_clean['price'], alpha=0.5, s=1)
plt.xlabel('Stars (Rating)')
plt.ylabel('Price')
plt.title(f'Relationship between Rating and Price (n={len(df_clean):,})')
plt.grid(True, alpha=0.3)

# Fit only on rows where both values are present; a NaN would break np.polyfit.
fit_data = df_clean[['stars', 'price']].dropna()
z = np.polyfit(fit_data['stars'], fit_data['price'], 1)
# Named `trend_line` (not `p`) so the chi-square p-value from Part 1 is not overwritten.
trend_line = np.poly1d(z)

# A straight line is fully determined by its two end points, so there is no
# need to draw it through every observation.
trend_x = np.array([fit_data['stars'].min(), fit_data['stars'].max()])
plt.plot(trend_x, trend_line(trend_x), "r--", alpha=0.8, linewidth=2,
         label=f'r={df_clean["price"].corr(df_clean["stars"]):.3f}')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Interpretation: there is no clear linear relationship between rating and price. The nearly flat red trend line confirms the weak correlation. Ratings sit at low/medium values, while prices are spread widely.

In [None]:
# Pairwise correlations between every numeric column of the cleaned frame.
num_cols = df_clean.select_dtypes(include=[np.number]).columns
print("Numerical variables:", num_cols.tolist())
corr_matrix = df_clean.loc[:, num_cols].corr()

# Diverging palette centred on zero so the sign of each correlation is obvious.
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'label': 'Correlation'},
)
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Variables')
plt.tight_layout()
plt.show()


In [None]:
# Interpretation: price and stars are only weakly correlated.

In [None]:
import statsmodels.api as sm  # not referenced in this cell

# Two panels: QQ plot against a normal distribution (left) and a
# histogram with the fitted normal density drawn on top (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
qq_ax, hist_ax = axes
price = df_clean['price']

# Left panel: sample quantiles against theoretical normal quantiles.
stats.probplot(price.dropna(), dist="norm", plot=qq_ax)
qq_ax.set_title('QQ Plot: Price (no outliers)')

# Right panel: density-scaled histogram of the prices.
price.hist(bins=50, density=True, alpha=0.7, ax=hist_ax, color='skyblue')

# Normal curve using the sample mean and standard deviation, over the observed range.
mu = price.mean()
sigma = price.std()
x = np.linspace(price.min(), price.max(), 100)
hist_ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=3, label='Theoretical normal')
hist_ax.set_title('Price Distribution vs Normal')
hist_ax.set_xlabel('Price')
hist_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Interpretation: the histogram confirms a right skew (many cheap products, few very expensive ones). Prices are right-skewed and not normally distributed.