In [1]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.io as pio
import vizro.plotly.express as px

In [2]:
conn = sqlite3.connect("../../data/product_data.db")
df_main = pd.read_sql("SELECT * FROM product_main_data", conn)
df_variants = pd.read_sql("SELECT * FROM product_variants_data", conn)
df_reviews = pd.read_sql("SELECT * FROM product_review_data", conn)

# cleaning and preparation
# Replace "N/A" to NaN 
df_main.replace("N/A", np.nan, inplace= True)
df_variants.replace("N/A", np.nan, inplace= True)
# Drop rows where there is NaN in category or Name
df_main.dropna(subset = ["category", "master_name"], inplace=True)
df_variants.dropna(subset = ["category", "master_name"], inplace = True)


# Ensure numerical categories are int
columns_to_convert = [
    "True to size (size_ag)", "Too Big (size_ag)", "Small (size_ag)", "Too Small (size_ag)", "Not specified (size_ag)",
    "Big (size_ag)", "Not specified (experience)", "Quality (experience)", "Stylish (experience)", "Comfortable (experience)",
    "Verified Buyer", "Verified Reviewer", "Unverified"
]

df_main[columns_to_convert] = df_main[columns_to_convert].astype(float)

# Ensure correct category name and formatting
fix_categories = {"LOAFERS & FLATS" : "WOMEN'S - LOAFERS & FLATS",
                  "BOOTS & BOOTIES" : "WOMEN'S - BOOTS & BOOTIES",
                  "SLIP-ONS & SLIDES" : "WOMEN'S - SLIP-ONS & SLIDES",
                  "MULES" : "WOMEN'S - MULES"}
df_main["category"] = df_main["category"].replace(fix_categories)
df_main["category"] = df_main["category"].str.upper()

# Change review values to int
df_main["rating"] = df_main["rating"].replace(r" star rating", "", regex= True).astype("float")

# Ensure price values are in int and dont have currency symbols
df_variants["price"] = df_variants["price"].replace(r"From |[\$]", "", regex = True).astype("float")

# Remove duplicate rows in df_variants
df_variants.drop_duplicates(subset=["full_product_name"], inplace=True)


In [3]:
df_main.head(5)
pio.templates.default = "vizro_dark"

## EDA as much charts as possible (may need to combine them cause probably better and its too much charts)(after each as a report kinda)

### Number of master products per category

In [12]:
df = df_main.groupby("category")["master_name"].count().reset_index()
df = df.sort_values(by= "master_name", ascending= False)
fig = px.bar(df, y = "category", x = "master_name", color= "master_name", color_continuous_scale="Earth", title = "Product amount per category")
fig.update_layout(
    width=1000, 
    height=500,
    xaxis_title="Product amount",
    yaxis_title="Category",
    xaxis_title_font=dict(family="Arial", size=18, color="white"),
    yaxis_title_font=dict(family="Arial", size=18, color="white"),
    title={
        'x': 0.5,
        "y": 1,
        'xanchor': 'center',
        'font': {'size': 24, 'family': "Arial, sans-serif", 'color': 'white'}
    },
    margin=dict(l=50, r=50, t=50, b=50),
    bargap=0.1,       
    yaxis=dict(
        tickfont=dict(size=10, color = "white")  
    ),
    )
fig.show()

### Number of categories

In [5]:

print(f"Number of unqie categories - {df_main["category"].nunique()}")

Number of unqie categories - 31


# Review_rating distribution (histogram?)

In [6]:
# Review_rating distribution (histogram?)
df = df_main.groupby("rating")["review_amount"].sum().reset_index()
df = df.sort_values(by= "rating", ascending= True)
fig = px.bar(df, x = "review_amount", y = "rating", color= "review_amount", title = "Review - rating distribution")
fig.update_layout(
    width=1400, 
    height=500,
    xaxis_title="Rating",
    yaxis_title="Review amount",
    xaxis_title_font=dict(family="Arial", size=18, color="white"),
    yaxis_title_font=dict(family="Arial", size=18, color="white"),
    showlegend=False,
    title={
        'x': 0.5,
        "y": 1,
        'xanchor': 'center',
        'font': {'size': 24, 'family': "Arial, sans-serif", 'color': 'white'}
    },
    margin=dict(l=50, r=50, t=50, b=50),
    bargap=0.1,       
    xaxis=dict(
        tickmode='linear',
        tick0=1.0,        
        dtick=0.1         
    ),
    yaxis=dict(
        tickfont=dict(size=10, color = "white")  
    ),
    )
fig.show()

In [3]:
df.head(30)

NameError: name 'df' is not defined

In [8]:
# top 50 most reviwed products probably too much
# top something rated products
# three charts for size verified experience
# relationship between reviwes and size dissatisfaction
# Category vs. experience counts (use aggregation) ?
# Correlation with rating: do verified buyers leave higher ratings? most likely worthless

# ask about next two
# correlation matrix for numeric columns:
# review_amount, rating, size/ag flags, experience flags

# Scatter plots:
# review_amount vs rating
# Experience flags vs. size issues

# total variants
# variants per master name
# number of colors
# most common color by category probably useless maybe not nost probably worthless
# price histogram
# price distribution per category
# avg price per category
# Relationship between price and number of variants