In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (the path is resolved relative to the working directory)
dataset_path = 'ecommerce_product_performance.csv'
data = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Explore the structure of the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
print("\nDataset Info:")
data.info()

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Clean the dataset by filling missing values with each column's mean.
# numeric_only=True restricts the means to numeric columns, so a text or date
# column in the CSV no longer makes mean() raise; such columns keep their NaNs.
# NOTE(review): the captured output shows ID/flag-like columns (e.g.
# Category_ID, Stock_Availability) stored as floats; mean-filling those can
# produce fractional values — confirm that is acceptable for this analysis.
data_cleaned = data.fillna(data.mean(numeric_only=True))

print("\nCleaned Dataset Info:")
data_cleaned.info()

# Compute basic statistics of numerical columns
print("\nBasic Statistics of Numerical Columns:")
print(data_cleaned.describe())

# Group-level summary: mean of one numeric column within each category.
# These two names are also read by the plotting sections further down.
# NOTE(review): the output captured with this notebook lists columns such as
# 'Category_ID' and 'Product_Price', not the names configured here — confirm
# which columns are intended before relying on the results.
categorical_column = 'product_category'
numerical_column = 'sales'

required_columns = (categorical_column, numerical_column)
if not all(name in data_cleaned.columns for name in required_columns):
    # At least one configured column is absent: tell the user instead of failing.
    print("\nPlease update 'categorical_column' and 'numerical_column' with valid column names.")
else:
    print(f"\nMean of {numerical_column} grouped by {categorical_column}:")
    group_means = data_cleaned.groupby(categorical_column)[numerical_column].mean()
    print(group_means)

# Visualization 1: Line chart (e.g., trends over time)
# Drawn only when both the 'date' column and the configured numeric column exist.
if 'date' in data_cleaned.columns and numerical_column in data_cleaned.columns:
    # Parse the dates and order the rows chronologically. The frame is rebound
    # rather than mutated in place (no column assignment, no inplace=True), so
    # the end state of `data_cleaned` is the same but no existing object is
    # modified behind the reader's back.
    data_cleaned = (
        data_cleaned
        .assign(date=pd.to_datetime(data_cleaned['date']))
        .sort_values('date')
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(data_cleaned['date'], data_cleaned[numerical_column], marker='o', label='Trend')
    ax.set_title('Sales Trend Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales')
    ax.legend()
    # Pass True explicitly: a bare grid() call *toggles* the current state, so
    # it would switch the grid off under a style that enables it by default.
    ax.grid(True)
    plt.show()

# Visualization 2: bar chart comparing the per-category average of the numeric column.
# Skipped silently when either configured column is missing from the frame.
bar_columns_present = all(
    column in data_cleaned.columns
    for column in (categorical_column, numerical_column)
)
if bar_columns_present:
    category_means = data_cleaned.groupby(categorical_column)[numerical_column].mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    category_means.plot.bar(color='skyblue', ax=ax)
    ax.set_title('Average Sales by Product Category')
    ax.set_xlabel('Product Category')
    ax.set_ylabel('Average Sales')
    # Tilt the category labels so they are less likely to overlap.
    plt.xticks(rotation=45)
    plt.show()

# Visualization 3: histogram with a KDE overlay showing how the numeric column is distributed.
if numerical_column in data_cleaned.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data_cleaned[numerical_column],
        kde=True,
        bins=20,
        color='purple',
        ax=ax,
    )
    ax.set(title='Distribution of Sales', xlabel='Sales', ylabel='Frequency')
    plt.show()

# Visualization 4: Scatter plot (e.g., relationship between two numerical columns)
numerical_column_1 = 'sales'
numerical_column_2 = 'profit'

# Drawn only when both numeric columns exist; colouring by category is optional.
if numerical_column_1 in data_cleaned.columns and numerical_column_2 in data_cleaned.columns:
    # The guard above does not cover the category column, so indexing it
    # unconditionally raised KeyError whenever it was absent. Colour by
    # category only when that column is actually present.
    hue_column = categorical_column if categorical_column in data_cleaned.columns else None

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=data_cleaned,
        x=numerical_column_1,
        y=numerical_column_2,
        hue=hue_column,
        # A palette is only meaningful together with a hue variable.
        palette='viridis' if hue_column is not None else None,
        ax=ax,
    )
    ax.set_title('Relationship Between Sales and Profit')
    ax.set_xlabel('Sales')
    ax.set_ylabel('Profit')

    # Retitle the legend seaborn drew instead of rebuilding it with
    # plt.legend(), which relies on labelled artists being attached to the
    # axes and warns when there are none (e.g. when no hue was used).
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title('Product Category')
    plt.show()


First few rows of the dataset:
   Product_Price  Discount_Rate  Product_Rating  Number_of_Reviews  \
0     199.671415       0.177024        4.411071               62.0   
1     136.173570       0.041467        3.033534              201.0   
2     214.768854       0.276197        2.866881              479.0   
3     302.302986       0.094254        4.473473              252.0   
4     126.584663       0.411845        3.553082              671.0   

   Stock_Availability  Days_to_Deliver  Return_Rate  Category_ID  
0                 1.0              9.0     0.185116          5.0  
1                 1.0              3.0     0.384639         10.0  
2                 1.0             19.0     0.056410          4.0  
3                 1.0             11.0          NaN          7.0  
4                 1.0             14.0     0.672163          6.0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column              