## Exploratory Data Analysis

### Import Libraries

In [1]:
import sys
import polars as pl
import polars.selectors as pol_sel

### Show Python & Library Versions

In [2]:
l = 8
r = 12

print("Python".rjust(l), ":", sys.version[0:6].ljust(r))
print("Polars".rjust(l), ":", pl.__version__.ljust(r))

  Python : 3.11.4      
  Polars : 1.12.0      


### Load CSV File into Polars DataFrame

In [3]:
df = pl.read_csv("data/data.csv")

df

Order_ID,Customer_ID,Customer_Type,Product,Category,Unit_Price,Quantity,Discount,Total_Price,Region,Order_Date
str,str,str,str,str,f64,i64,f64,f64,str,str
"""ORD1""","""CUS1496""","""B2B""","""Vio Wasser""","""Water""",1.66,53,0.1,79.18,"""Baden-Württemberg""","""2023-08-23"""
"""ORD1""","""CUS1496""","""B2B""","""Evian""","""Water""",1.56,90,0.1,126.36,"""Baden-Württemberg""","""2023-08-23"""
"""ORD1""","""CUS1496""","""B2B""","""Sprite""","""Soft Drinks""",1.17,73,0.05,81.14,"""Baden-Württemberg""","""2023-08-23"""
"""ORD1""","""CUS1496""","""B2B""","""Rauch Multivitamin""","""Juices""",3.22,59,0.1,170.98,"""Baden-Württemberg""","""2023-08-23"""
"""ORD1""","""CUS1496""","""B2B""","""Gerolsteiner""","""Water""",0.87,35,0.1,27.4,"""Baden-Württemberg""","""2023-08-23"""
…,…,…,…,…,…,…,…,…,…,…
"""ORD2999999""","""CUS7080""","""B2C""","""Schwip Schwap""","""Soft Drinks""",1.27,2,0.0,2.54,"""Sachsen""","""2023-07-03"""
"""ORD2999999""","""CUS7080""","""B2C""","""San Pellegrino""","""Water""",0.91,15,0.0,13.65,"""Sachsen""","""2023-07-03"""
"""ORD3000000""","""CUS6551""","""B2B""","""Red Bull""","""Soft Drinks""",2.94,64,0.1,169.34,"""Sachsen-Anhalt""","""2022-04-09"""
"""ORD3000000""","""CUS6551""","""B2B""","""Passion Fruit Juice""","""Juices""",4.04,79,0.1,287.24,"""Sachsen-Anhalt""","""2022-04-09"""


### Display First Few Rows to Understand Structure of Data

In [4]:
print(df.head())

shape: (5, 11)
┌──────────┬────────────┬───────────┬───────────┬───┬──────────┬───────────┬───────────┬───────────┐
│ Order_ID ┆ Customer_I ┆ Customer_ ┆ Product   ┆ … ┆ Discount ┆ Total_Pri ┆ Region    ┆ Order_Dat │
│ ---      ┆ D          ┆ Type      ┆ ---       ┆   ┆ ---      ┆ ce        ┆ ---       ┆ e         │
│ str      ┆ ---        ┆ ---       ┆ str       ┆   ┆ f64      ┆ ---       ┆ str       ┆ ---       │
│          ┆ str        ┆ str       ┆           ┆   ┆          ┆ f64       ┆           ┆ str       │
╞══════════╪════════════╪═══════════╪═══════════╪═══╪══════════╪═══════════╪═══════════╪═══════════╡
│ ORD1     ┆ CUS1496    ┆ B2B       ┆ Vio       ┆ … ┆ 0.1      ┆ 79.18     ┆ Baden-Wür ┆ 2023-08-2 │
│          ┆            ┆           ┆ Wasser    ┆   ┆          ┆           ┆ ttemberg  ┆ 3         │
│ ORD1     ┆ CUS1496    ┆ B2B       ┆ Evian     ┆ … ┆ 0.1      ┆ 126.36    ┆ Baden-Wür ┆ 2023-08-2 │
│          ┆            ┆           ┆           ┆   ┆          ┆           ┆

### Retrieve Basic Information About DataFrame

In [5]:
print(df.shape)
print(df.dtypes)

(8999910, 11)
[String, String, String, String, String, Float64, Int64, Float64, Float64, String, String]


### Display Summary Statistics for All Columns

In [6]:
summary = df.describe()
print(summary)

shape: (9, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ Order_ID  ┆ Customer_ ┆ Customer_ ┆ … ┆ Discount  ┆ Total_Pri ┆ Region    ┆ Order_Da │
│ ---       ┆ ---       ┆ ID        ┆ Type      ┆   ┆ ---       ┆ ce        ┆ ---       ┆ te       │
│ str       ┆ str       ┆ ---       ┆ ---       ┆   ┆ f64       ┆ ---       ┆ str       ┆ ---      │
│           ┆           ┆ str       ┆ str       ┆   ┆           ┆ f64       ┆           ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 8999910   ┆ 8999910   ┆ 8999910   ┆ … ┆ 8.99991e6 ┆ 8.99991e6 ┆ 8999910   ┆ 8999910  │
│ null_coun ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0.0       ┆ 0.0       ┆ 0         ┆ 0        │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ mean      ┆ null      ┆ null      ┆ null      ┆ … ┆ 0.029729  ┆ 130.74365 

### Find Longest Text Length in Each Column

In [7]:
# Create an empty list to store max lengths for each string column
longest_text_lengths = []

# Loop through the columns to check for string columns
string_columns = [col for col in df.columns if df[col].dtype == pl.Utf8]

max_lengths = {}
for col in string_columns:
    max_length = df.select(pl.col(col).str.len_chars().max()).to_numpy()[0, 0]
    max_lengths[col] = max_length

df_max_lengths = pl.DataFrame(max_lengths)

df_max_lengths

Order_ID,Customer_ID,Customer_Type,Product,Category,Region,Order_Date
u32,u32,u32,u32,u32,u32,u32
10,8,3,19,19,22,10


### Retrieve Data Types of All Columns

In [8]:
print("Column data types:\n", df.dtypes)

Column data types:
 [String, String, String, String, String, Float64, Int64, Float64, Float64, String, String]


### Count Unique Values in Each Column

In [9]:
all_columns = [col for col in df.columns]

for col in all_columns:
    unique_counts = df[col].n_unique()
    print(f"Unique values in {col} :".rjust(48), f"{unique_counts}".ljust(6))

                     Unique values in Order_ID : 3000000
                  Unique values in Customer_ID : 10000 
                Unique values in Customer_Type : 2     
                      Unique values in Product : 47    
                     Unique values in Category : 4     
                   Unique values in Unit_Price : 12778 
                     Unique values in Quantity : 100   
                     Unique values in Discount : 4     
                  Unique values in Total_Price : 203842
                       Unique values in Region : 16    
                   Unique values in Order_Date : 1094  


### Check Distribution of Numerical Columns

In [10]:
numerical_cols = [item for item in all_columns if item not in string_columns]
numerical_cols = [item for item in numerical_cols if item not in ['id']]

numerical_cols

for col in numerical_cols:
    distribution = df.select(col).describe()
    print(col)
    print(distribution, '\n\n')

Unit_Price
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ Unit_Price │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 8.99991e6  │
│ null_count ┆ 0.0        │
│ mean       ┆ 5.818037   │
│ std        ┆ 14.700501  │
│ min        ┆ 0.32       │
│ 25%        ┆ 1.05       │
│ 50%        ┆ 1.75       │
│ 75%        ┆ 3.21       │
│ max        ┆ 169.53     │
└────────────┴────────────┘ 


Quantity
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ Quantity  │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 8.99991e6 │
│ null_count ┆ 0.0       │
│ mean       ┆ 23.138134 │
│ std        ┆ 26.893207 │
│ min        ┆ 1.0       │
│ 25%        ┆ 6.0       │
│ 50%        ┆ 11.0      │
│ 75%        ┆ 30.0      │
│ max        ┆ 100.0     │
└────────────┴───────────┘ 


Discount
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ Discount  │
│ ---        ┆ ---       │
│ str        ┆ f6

### Retrieve Unique Values for Specific Columns

In [12]:
cols_to_check = ['Order_Date']

for col in cols_to_check:
    unique_values = df[col].unique().sort().to_list()
    print(f"Column: {col}")
    print(f"Unique values: {unique_values}\n")

Column: Order_Date
Unique values: ['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12', '2021-01-13', '2021-01-14', '2021-01-15', '2021-01-16', '2021-01-17', '2021-01-18', '2021-01-19', '2021-01-20', '2021-01-21', '2021-01-22', '2021-01-23', '2021-01-24', '2021-01-25', '2021-01-26', '2021-01-27', '2021-01-28', '2021-01-29', '2021-01-30', '2021-01-31', '2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04', '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08', '2021-02-09', '2021-02-10', '2021-02-11', '2021-02-12', '2021-02-13', '2021-02-14', '2021-02-15', '2021-02-16', '2021-02-17', '2021-02-18', '2021-02-19', '2021-02-20', '2021-02-21', '2021-02-22', '2021-02-23', '2021-02-24', '2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28', '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05', '2021-03-06', '2021-03-07', '2021-03-08', '2021-03-09', '2021-03-10',