Import Libraries

In [6]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from enum import Enum

Read the Dataset

In [2]:
# Relative path (the same one the RetailSalesAnalysis cell further down reads)
# instead of the Colab-only absolute '/content/...' location, so the notebook
# also runs outside Colab as long as the CSV sits in the working directory.
df = pd.read_csv('retail_sales_dataset.csv')

Viewing Data

In [3]:
# Plain-text preview of the first five records as a sanity check on the load.
print(df.head(n=5))

   Transaction ID        Date Customer ID  Gender  Age Product Category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21     CUST004    Male   37         Clothing   
4               5  2023-05-06     CUST005    Male   30           Beauty   

   Quantity  Price per Unit  Total Amount  
0         3              50           150  
1         2             500          1000  
2         1              30            30  
3         1             500           500  
4         2              50           100  


In [4]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB
None


In [5]:
# Summary statistics of the numeric columns; the quartiles requested here are
# the ones describe() reports by default, spelled out for the reader.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

       Transaction ID         Age     Quantity  Price per Unit  Total Amount
count     1000.000000  1000.00000  1000.000000     1000.000000   1000.000000
mean       500.500000    41.39200     2.514000      179.890000    456.000000
std        288.819436    13.68143     1.132734      189.681356    559.997632
min          1.000000    18.00000     1.000000       25.000000     25.000000
25%        250.750000    29.00000     1.000000       30.000000     60.000000
50%        500.500000    42.00000     3.000000       50.000000    135.000000
75%        750.250000    53.00000     4.000000      300.000000    900.000000
max       1000.000000    64.00000     4.000000      500.000000   2000.000000


Object-Oriented Analysis

In [7]:
class StatType(Enum):
    """Identifiers for the descriptive statistics requested from the analysis helpers.

    Each member's value is a short lowercase string label. ``get_stat`` has a
    branch for MEAN through KURTOSIS; ``groupby_stat`` maps MEAN, MEDIAN and
    MODE. No branch for COVARIANCE or CORRELATION is visible in either helper.
    """
    # Central tendency
    MEAN = 'mean'
    MEDIAN = 'median'
    MODE = 'mode'
    # Dispersion
    VARIANCE = 'variance'
    SD = 'std'
    RANGE = 'range'
    IQR = 'iqr'
    # Shape of the distribution
    KURTOSIS = 'kurtosis'
    # Two-variable measures
    COVARIANCE = 'covariance'
    CORRELATION = 'correlation'

In [8]:
class RetailSalesAnalysis:
    """Holds the retail sales table plus the column names the analysis uses."""

    def __init__(self, csv_file):
        """Read ``csv_file`` with pandas and normalise the column labels.

        Every column label has its spaces replaced by underscores and is
        lower-cased (e.g. ``"Total Amount"`` becomes ``"total_amount"``).
        The resulting frame is stored on ``self.df``.
        """
        raw = pd.read_csv(csv_file)
        self.df = raw.rename(columns=lambda name: name.replace(' ', '_').lower())

        # Normalised labels of the columns referenced by the analysis helpers.
        self.age_col = 'age'
        self.gender_col = 'gender'
        self.price_col = 'price_per_unit'
        self.amount_col = 'total_amount'

In [9]:
 def get_stat(self, column, stat_type: StatType):
        """Return one descriptive statistic for a single column of ``self.df``.

        NOTE(review): this ``def`` sits in its own cell, outside the
        ``RetailSalesAnalysis`` class body, so as written it is a module-level
        function that takes the analysis object as its first argument unless
        it is attached to the class elsewhere -- confirm the intended usage.

        Parameters
        ----------
        column : label of the column in ``self.df`` to summarise.
        stat_type : ``StatType`` member selecting the statistic.

        Returns
        -------
        The requested statistic. VARIANCE and SD use ``ddof=1``; MODE is the
        first value returned by ``Series.mode()`` or ``numpy.nan`` when that
        result is empty; IQR is the 0.75 quantile minus the 0.25 quantile.
        ``None`` is returned for any ``stat_type`` without a branch below
        (``StatType.COVARIANCE`` and ``StatType.CORRELATION`` included).
        """
        data = self.df[column]

        if stat_type == StatType.MEAN:
            return data.mean()
        if stat_type == StatType.MEDIAN:
            return data.median()
        if stat_type == StatType.MODE:
            # mode() may hold several values (ties) or none at all (empty or
            # all-missing column); take the first, or NaN when there is none,
            # mirroring the guard in groupby_stat instead of raising IndexError.
            modes = data.mode()
            return modes.iat[0] if len(modes) > 0 else np.nan
        if stat_type == StatType.VARIANCE:
            return data.var(ddof=1)
        if stat_type == StatType.SD:
            return data.std(ddof=1)
        if stat_type == StatType.RANGE:
            return data.max() - data.min()
        if stat_type == StatType.IQR:
            # Series.quantile skips missing values like the other pandas
            # reductions in this function; np.percentile would propagate NaN.
            return data.quantile(0.75) - data.quantile(0.25)
        if stat_type == StatType.KURTOSIS:
            return data.kurt()
        # Unhandled statistic: keep the original "no result" contract explicit.
        return None

In [10]:
def groupby_stat(self, by_col, target_col, stat_type: StatType):
        """Compute one statistic of ``target_col`` within each group of ``by_col``.

        NOTE(review): this ``def`` sits in its own cell, outside the
        ``RetailSalesAnalysis`` class body, so as written it is a module-level
        function that takes the analysis object as its first argument unless
        it is attached to the class elsewhere -- confirm the intended usage.

        Parameters
        ----------
        by_col : column label (or labels) of ``self.df`` to group on.
        target_col : column label whose values are summarised per group.
        stat_type : ``StatType`` member; MEAN, MEDIAN, MODE, VARIANCE, SD,
            RANGE, IQR and KURTOSIS are supported.

        Returns
        -------
        The per-group result of the selected aggregation, indexed by group.
        For MODE a group without any mode yields ``numpy.nan``.

        Raises
        ------
        KeyError
            If ``stat_type`` has no per-group aggregation defined here.
        """
        def _first_mode(values):
            # Compute mode() once; an empty result means the group has no mode.
            modes = values.mode()
            return modes.iat[0] if len(modes) > 0 else np.nan

        # Strings name built-in GroupBy reductions; callables run once per group.
        aggregators = {
            StatType.MEAN: 'mean',
            StatType.MEDIAN: 'median',
            StatType.MODE: _first_mode,
            StatType.VARIANCE: 'var',
            StatType.SD: 'std',
            StatType.RANGE: lambda values: values.max() - values.min(),
            StatType.IQR: lambda values: values.quantile(0.75) - values.quantile(0.25),
            StatType.KURTOSIS: lambda values: values.kurt(),
        }
        if stat_type not in aggregators:
            # Same exception type as the original dict lookup, with a message
            # that says what went wrong.
            raise KeyError(f"groupby_stat does not support {stat_type!r}")

        func = aggregators[stat_type]
        grouped = self.df.groupby(by_col)[target_col]
        if isinstance(func, str):
            return grouped.agg(func)
        return grouped.apply(func)

In [11]:
def group_dispersion(self):
        """Summarise the spread of ``self.amount_col`` within each ``self.gender_col`` group.

        NOTE(review): this ``def`` sits in its own cell, outside the
        ``RetailSalesAnalysis`` class body, so as written it is a module-level
        function that takes the analysis object as its first argument unless
        it is attached to the class elsewhere -- confirm the intended usage.

        Returns
        -------
        A DataFrame indexed by group with the columns ``variance``, ``std``,
        ``range`` (max minus min), ``iqr`` (0.75 quantile minus 0.25 quantile)
        and ``kurtosis``.
        """
        grouped = self.df.groupby(self.gender_col)[self.amount_col]
        return grouped.agg([
            ('variance', 'var'),
            ('std', 'std'),
            ('range', lambda x: x.max() - x.min()),
            # Series.quantile skips missing values, in line with var/std above;
            # np.percentile would turn a single NaN into a NaN IQR.
            ('iqr', lambda x: x.quantile(0.75) - x.quantile(0.25)),
            # A callable is used instead of the string 'kurt': string
            # aggregations are looked up as GroupBy methods, and not every
            # pandas release exposes kurt there, whereas Series.kurt always
            # exists on the per-group Series.
            ('kurtosis', lambda x: x.kurt()),
        ])

In [12]:
    def correlation_matrix(self):
        """Return the pairwise correlation table of the four numeric sales columns.

        Uses ``DataFrame.corr()`` with its default settings on ``age``,
        ``quantity``, ``price_per_unit`` and ``total_amount``.

        NOTE(review): this ``def`` sits in its own cell rather than inside the
        ``RetailSalesAnalysis`` class body -- confirm how it is meant to be bound.
        """
        numeric_view = self.df.loc[:, ['age', 'quantity', 'price_per_unit', 'total_amount']]
        return numeric_view.corr()

In [13]:
def covariance_matrix(self):
        """Return the pairwise covariance table of the four numeric sales columns.

        Uses ``DataFrame.cov()`` with its default settings on ``age``,
        ``quantity``, ``price_per_unit`` and ``total_amount``.

        NOTE(review): this ``def`` sits in its own cell rather than inside the
        ``RetailSalesAnalysis`` class body -- confirm how it is meant to be bound.
        """
        numeric_view = self.df.loc[:, ['age', 'quantity', 'price_per_unit', 'total_amount']]
        return numeric_view.cov()

In [14]:
def selected_covariance(self):
        """Return the covariance of three column pairs as a labelled dict.

        Keys are ``'age-amount'``, ``'ppu-amount'`` and ``'age-ppu'``; each value
        is the off-diagonal entry of the 2x2 ``DataFrame.cov()`` table for the
        corresponding pair of ``self.age_col`` / ``self.price_col`` /
        ``self.amount_col`` columns.

        NOTE(review): this ``def`` sits in its own cell rather than inside the
        ``RetailSalesAnalysis`` class body -- confirm how it is meant to be bound.
        """
        pairs = {
            'age-amount': (self.age_col, self.amount_col),
            'ppu-amount': (self.price_col, self.amount_col),
            'age-ppu': (self.age_col, self.price_col),
        }
        return {
            label: self.df[[first, second]].cov().iloc[0, 1]
            for label, (first, second) in pairs.items()
        }

In [15]:
def probability_dist(self, col):
        """Describe the distribution of ``col`` in a column-specific way.

        * ``'quantity'``: returns a Series of relative frequencies (value
          counts divided by their total), sorted by value.
        * ``'total_amount'``: returns ``(mu, sigma, dist)`` where ``mu`` and
          ``sigma`` are the column's mean and ``Series.std()`` and ``dist`` is
          ``scipy.stats.norm(mu, sigma)``.
        * any other column: returns ``None``.

        NOTE(review): this ``def`` sits in its own cell rather than inside the
        ``RetailSalesAnalysis`` class body -- confirm how it is meant to be bound.
        """
        if col == 'quantity':
            counts = self.df[col].value_counts().sort_index()
            return counts / counts.sum()

        if col == 'total_amount':
            series = self.df[col]
            mu = series.mean()
            sigma = series.std()
            return mu, sigma, stats.norm(mu, sigma)

        return None


In [16]:
def central_limit_theorem(self, col, sample_size=30, repeats=1000, random_state=None):
        """Return the means of repeated bootstrap samples drawn from ``self.df[col]``.

        NOTE(review): this ``def`` sits in its own cell, outside the
        ``RetailSalesAnalysis`` class body, so as written it is a module-level
        function that takes the analysis object as its first argument unless
        it is attached to the class elsewhere -- confirm the intended usage.

        Parameters
        ----------
        col : label of the column in ``self.df`` to sample from.
        sample_size : number of values drawn (with replacement) per sample.
        repeats : number of samples to draw.
        random_state : optional seed passed to ``numpy.random.default_rng``.
            With the default ``None`` the sampling is left unseeded, exactly
            as before this parameter existed.

        Returns
        -------
        A list of ``repeats`` sample means.
        """
        # One generator shared by all draws: seeding each draw separately with
        # the same value would return the same sample every time.
        rng = None if random_state is None else np.random.default_rng(random_state)
        values = self.df[col]
        return [
            values.sample(sample_size, replace=True, random_state=rng).mean()
            for _ in range(repeats)
        ]

In [19]:
def confidence_interval(self, col, sample_size, confidence=0.95, random_state=None):
        """Draw one bootstrap sample from ``self.df[col]`` and build a t-interval for its mean.

        NOTE(review): this ``def`` sits in its own cell, outside the
        ``RetailSalesAnalysis`` class body, so as written it is a module-level
        function that takes the analysis object as its first argument unless
        it is attached to the class elsewhere -- confirm the intended usage.

        Parameters
        ----------
        col : label of the column in ``self.df`` to sample from.
        sample_size : number of values drawn with replacement; must be >= 2.
        confidence : two-sided confidence level, strictly between 0 and 1.
        random_state : optional seed passed to ``numpy.random.default_rng``.
            With the default ``None`` the sampling is left unseeded, exactly
            as before this parameter existed.

        Returns
        -------
        ``(mean, lower, upper, std, sample_size, h)`` where ``std`` is the
        sample standard deviation (``ddof=1``) and ``h`` is the half-width
        ``t * std / sqrt(sample_size)`` with ``sample_size - 1`` degrees of
        freedom.

        Raises
        ------
        ValueError
            If ``sample_size`` is below 2 or ``confidence`` is not in (0, 1);
            both cases would otherwise produce NaN bounds silently.
        """
        if sample_size < 2:
            raise ValueError("sample_size must be at least 2 to estimate a standard error")
        if not 0 < confidence < 1:
            raise ValueError("confidence must lie strictly between 0 and 1")

        rng = None if random_state is None else np.random.default_rng(random_state)
        samples = self.df[col].sample(sample_size, replace=True, random_state=rng)
        mean = samples.mean()
        std = samples.std(ddof=1)
        se = std / np.sqrt(sample_size)
        # Upper-tail quantile for a two-sided interval at the given confidence.
        h = stats.t.ppf((1 + confidence) / 2, sample_size - 1) * se
        return mean, mean - h, mean + h, std, sample_size, h


Load and Explore Data

In [18]:
# Build the analysis object (its constructor reads the CSV and rewrites the
# column labels to lowercase_with_underscores), then preview the result.
analysis = RetailSalesAnalysis(csv_file='retail_sales_dataset.csv')
print(analysis.df.head(n=5))

   transaction_id        date customer_id  gender  age product_category  \
0               1  2023-11-24     CUST001    Male   34           Beauty   
1               2  2023-02-27     CUST002  Female   26         Clothing   
2               3  2023-01-13     CUST003    Male   50      Electronics   
3               4  2023-05-21     CUST004    Male   37         Clothing   
4               5  2023-05-06     CUST005    Male   30           Beauty   

   quantity  price_per_unit  total_amount  
0         3              50           150  
1         2             500          1000  
2         1              30            30  
3         1             500           500  
4         2              50           100  


In [20]:
# (rows, columns) of the loaded table.
print("Dataset shape:", (len(analysis.df.index), len(analysis.df.columns)))

Dataset shape: (1000, 9)


In [21]:
# Rich-display the first five records under a plain-text caption.
print("\nFirst 5 rows:")
display(analysis.df.head(n=5))



First 5 rows:


Unnamed: 0,transaction_id,date,customer_id,gender,age,product_category,quantity,price_per_unit,total_amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [22]:
# Rich-display the last five records under a plain-text caption.
print("\nLast 5 rows:")
display(analysis.df.tail(n=5))


Last 5 rows:


Unnamed: 0,transaction_id,date,customer_id,gender,age,product_category,quantity,price_per_unit,total_amount
995,996,2023-05-16,CUST996,Male,62,Clothing,1,50,50
996,997,2023-11-17,CUST997,Male,52,Beauty,3,30,90
997,998,2023-10-29,CUST998,Female,23,Beauty,4,25,100
998,999,2023-12-05,CUST999,Female,36,Electronics,3,50,150
999,1000,2023-04-12,CUST1000,Male,47,Electronics,4,30,120


In [23]:
# Caption and dtype listing emitted by a single print call, one per line.
print("\nColumn names and data types:", analysis.df.dtypes, sep="\n")


Column names and data types:
transaction_id       int64
date                object
customer_id         object
gender              object
age                  int64
product_category    object
quantity             int64
price_per_unit       int64
total_amount         int64
dtype: object


In [24]:
# Numeric summary; the quartiles requested are describe()'s defaults, spelled out.
print("\nSummary statistics (numeric columns):")
display(analysis.df.describe(percentiles=[0.25, 0.5, 0.75]))


Summary statistics (numeric columns):


Unnamed: 0,transaction_id,age,quantity,price_per_unit,total_amount
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,41.392,2.514,179.89,456.0
std,288.819436,13.68143,1.132734,189.681356,559.997632
min,1.0,18.0,1.0,25.0,25.0
25%,250.75,29.0,1.0,30.0,60.0
50%,500.5,42.0,3.0,50.0,135.0
75%,750.25,53.0,4.0,300.0,900.0
max,1000.0,64.0,4.0,500.0,2000.0


In [25]:
# count / unique / top / freq for the object-typed columns only.
print("\nSummary statistics (categorical columns):")
display(analysis.df.describe(include=['object'], exclude=None))


Summary statistics (categorical columns):


Unnamed: 0,date,customer_id,gender,product_category
count,1000,1000,1000,1000
unique,345,1000,2,3
top,2023-05-16,CUST1000,Female,Clothing
freq,11,1,510,351


In [26]:
# isna() is the primary name of isnull(); summing the boolean mask counts the
# missing cells in each column.
print("\nMissing values per column:", analysis.df.isna().sum(), sep="\n")


Missing values per column:
transaction_id      0
date                0
customer_id         0
gender              0
age                 0
product_category    0
quantity            0
price_per_unit      0
total_amount        0
dtype: int64


In [27]:
# Number of distinct values in each column, caption and table in one print.
print("\nUnique values per column:", analysis.df.nunique(), sep="\n")


Unique values per column:
transaction_id      1000
date                 345
customer_id         1000
gender                 2
age                   47
product_category       3
quantity               4
price_per_unit         5
total_amount          18
dtype: int64


In [28]:
# analysis.gender_col is set to 'gender' by the constructor; reuse it here
# instead of repeating the literal.
print("\nGender value counts:")
print(analysis.df[analysis.gender_col].value_counts())


Gender value counts:
gender
Female    510
Male      490
Name: count, dtype: int64


In [29]:
# Transactions per product category, caption and table in one print.
print("\nProduct Category value counts:", analysis.df['product_category'].value_counts(), sep="\n")


Product Category value counts:
product_category
Clothing       351
Electronics    342
Beauty         307
Name: count, dtype: int64
