## Exploratory Data Analysis

### Import Libraries

In [1]:
import sys
import polars as pl
import polars.selectors as pol_sel

### Show Python & Library Versions

In [2]:
l = 8
r = 12

print("Python".rjust(l), ":", sys.version[0:6].ljust(r))
print("Polars".rjust(l), ":", pl.__version__.ljust(r))

  Python : 3.11.4      
  Polars : 1.12.0      


### Load CSV File into Polars DataFrame

In [5]:
df = pl.read_csv("data/data.csv")

df

CustomerID,Gender,Age,Geography,Tenure,Contract,MonthlyCharges,TotalCharges,PaymentMethod,IsActiveMember,Churn
i64,str,i64,str,i64,str,f64,f64,str,i64,str
1000001,"""Male""",34,"""France""",14,"""Two-year""",21.58,7933.34,"""Bank transfer""",1,"""No"""
1000002,"""Female""",26,"""Spain""",14,"""Month-to-month""",27.71,5869.34,"""Credit card""",0,"""Yes"""
1000003,"""Male""",50,"""Germany""",57,"""Two-year""",111.12,6321.2,"""Bank transfer""",1,"""No"""
1000004,"""Male""",37,"""Spain""",34,"""Month-to-month""",55.49,7956.44,"""Bank transfer""",0,"""Yes"""
1000005,"""Male""",30,"""Spain""",53,"""Two-year""",62.48,4922.75,"""Direct debit""",1,"""No"""
…,…,…,…,…,…,…,…,…,…,…
1000996,"""Male""",42,"""France""",44,"""Two-year""",32.75,6815.82,"""Bank transfer""",1,"""Yes"""
1000997,"""Male""",26,"""France""",21,"""Two-year""",100.02,5544.36,"""Credit card""",1,"""Yes"""
1000998,"""Female""",21,"""France""",44,"""Two-year""",106.3,2462.34,"""Credit card""",1,"""No"""
1000999,"""Female""",31,"""Italy""",11,"""Month-to-month""",35.07,1771.42,"""Electronic check""",1,"""No"""


### Display First Few Rows to Understand Structure of Data

In [6]:
print(df.head())

shape: (5, 11)
┌────────────┬────────┬─────┬───────────┬───┬──────────────┬───────────────┬───────────────┬───────┐
│ CustomerID ┆ Gender ┆ Age ┆ Geography ┆ … ┆ TotalCharges ┆ PaymentMethod ┆ IsActiveMembe ┆ Churn │
│ ---        ┆ ---    ┆ --- ┆ ---       ┆   ┆ ---          ┆ ---           ┆ r             ┆ ---   │
│ i64        ┆ str    ┆ i64 ┆ str       ┆   ┆ f64          ┆ str           ┆ ---           ┆ str   │
│            ┆        ┆     ┆           ┆   ┆              ┆               ┆ i64           ┆       │
╞════════════╪════════╪═════╪═══════════╪═══╪══════════════╪═══════════════╪═══════════════╪═══════╡
│ 1000001    ┆ Male   ┆ 34  ┆ France    ┆ … ┆ 7933.34      ┆ Bank transfer ┆ 1             ┆ No    │
│ 1000002    ┆ Female ┆ 26  ┆ Spain     ┆ … ┆ 5869.34      ┆ Credit card   ┆ 0             ┆ Yes   │
│ 1000003    ┆ Male   ┆ 50  ┆ Germany   ┆ … ┆ 6321.2       ┆ Bank transfer ┆ 1             ┆ No    │
│ 1000004    ┆ Male   ┆ 37  ┆ Spain     ┆ … ┆ 7956.44      ┆ Bank transfer ┆

### Retrieve Basic Information About DataFrame

In [7]:
print(df.shape)
print(df.dtypes)

(1000, 11)
[Int64, String, Int64, String, Int64, String, Float64, Float64, String, Int64, String]


### Display Summary Statistics for All Columns

In [8]:
summary = df.describe()
print(summary)

shape: (9, 12)
┌────────────┬────────────┬────────┬───────────┬───┬─────────────┬────────────┬────────────┬───────┐
│ statistic  ┆ CustomerID ┆ Gender ┆ Age       ┆ … ┆ TotalCharge ┆ PaymentMet ┆ IsActiveMe ┆ Churn │
│ ---        ┆ ---        ┆ ---    ┆ ---       ┆   ┆ s           ┆ hod        ┆ mber       ┆ ---   │
│ str        ┆ f64        ┆ str    ┆ f64       ┆   ┆ ---         ┆ ---        ┆ ---        ┆ str   │
│            ┆            ┆        ┆           ┆   ┆ f64         ┆ str        ┆ f64        ┆       │
╞════════════╪════════════╪════════╪═══════════╪═══╪═════════════╪════════════╪════════════╪═══════╡
│ count      ┆ 1000.0     ┆ 1000   ┆ 1000.0    ┆ … ┆ 1000.0      ┆ 1000       ┆ 1000.0     ┆ 1000  │
│ null_count ┆ 0.0        ┆ 0      ┆ 0.0       ┆ … ┆ 0.0         ┆ 0          ┆ 0.0        ┆ 0     │
│ mean       ┆ 1000500.5  ┆ null   ┆ 43.799    ┆ … ┆ 4066.22196  ┆ null       ┆ 0.503      ┆ null  │
│ std        ┆ 288.819436 ┆ null   ┆ 15.133857 ┆ … ┆ 2233.265597 ┆ null     

### Find Longest Text Length in Each Column

In [9]:
# Create an empty list to store max lengths for each string column
longest_text_lengths = []

# Loop through the columns to check for string columns
string_columns = [col for col in df.columns if df[col].dtype == pl.Utf8]

max_lengths = {}
for col in string_columns:
    max_length = df.select(pl.col(col).str.len_chars().max()).to_numpy()[0, 0]
    max_lengths[col] = max_length

df_max_lengths = pl.DataFrame(max_lengths)

df_max_lengths

Gender,Geography,Contract,PaymentMethod,Churn
u32,u32,u32,u32,u32
6,7,14,16,3


### Retrieve Data Types of All Columns

In [10]:
print("Column data types:\n", df.dtypes)

Column data types:
 [Int64, String, Int64, String, Int64, String, Float64, Float64, String, Int64, String]


### Count Unique Values in Each Column

In [11]:
all_columns = [col for col in df.columns]

for col in all_columns:
    unique_counts = df[col].n_unique()
    print(f"Unique values in {col} :".rjust(48), f"{unique_counts}".ljust(6))

                   Unique values in CustomerID : 1000  
                       Unique values in Gender : 2     
                          Unique values in Age : 52    
                    Unique values in Geography : 5     
                       Unique values in Tenure : 59    
                     Unique values in Contract : 3     
               Unique values in MonthlyCharges : 952   
                 Unique values in TotalCharges : 999   
                Unique values in PaymentMethod : 4     
               Unique values in IsActiveMember : 2     
                        Unique values in Churn : 2     


### Check Distribution of Numerical Columns

In [12]:
numerical_cols = [item for item in all_columns if item not in string_columns]
numerical_cols = [item for item in numerical_cols if item not in ['id']]

numerical_cols

for col in numerical_cols:
    distribution = df.select(col).describe()
    print(col)
    print(distribution, '\n\n')

CustomerID
shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ CustomerID │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 1000.0     │
│ null_count ┆ 0.0        │
│ mean       ┆ 1000500.5  │
│ std        ┆ 288.819436 │
│ min        ┆ 1.000001e6 │
│ 25%        ┆ 1.000251e6 │
│ 50%        ┆ 1.000501e6 │
│ 75%        ┆ 1.00075e6  │
│ max        ┆ 1.001e6    │
└────────────┴────────────┘ 


Age
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ Age       │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 1000.0    │
│ null_count ┆ 0.0       │
│ mean       ┆ 43.799    │
│ std        ┆ 15.133857 │
│ min        ┆ 18.0      │
│ 25%        ┆ 31.0      │
│ 50%        ┆ 44.0      │
│ 75%        ┆ 57.0      │
│ max        ┆ 69.0      │
└────────────┴───────────┘ 


Tenure
shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ Tenure    │
│ ---        ┆ ---       │
│ str        ┆ f64      

### Retrieve Unique List of Values for Select Features/Columns

In [17]:
cols_to_check = [""
    ]

for col in df.columns:
    unique_values = df[col].unique().sort().to_list()
    print(f"Column: {col} [{len(unique_values)}]")
    print(f"Unique values: {unique_values}\n")

Column: CustomerID [1000]
Unique values: [1000001, 1000002, 1000003, 1000004, 1000005, 1000006, 1000007, 1000008, 1000009, 1000010, 1000011, 1000012, 1000013, 1000014, 1000015, 1000016, 1000017, 1000018, 1000019, 1000020, 1000021, 1000022, 1000023, 1000024, 1000025, 1000026, 1000027, 1000028, 1000029, 1000030, 1000031, 1000032, 1000033, 1000034, 1000035, 1000036, 1000037, 1000038, 1000039, 1000040, 1000041, 1000042, 1000043, 1000044, 1000045, 1000046, 1000047, 1000048, 1000049, 1000050, 1000051, 1000052, 1000053, 1000054, 1000055, 1000056, 1000057, 1000058, 1000059, 1000060, 1000061, 1000062, 1000063, 1000064, 1000065, 1000066, 1000067, 1000068, 1000069, 1000070, 1000071, 1000072, 1000073, 1000074, 1000075, 1000076, 1000077, 1000078, 1000079, 1000080, 1000081, 1000082, 1000083, 1000084, 1000085, 1000086, 1000087, 1000088, 1000089, 1000090, 1000091, 1000092, 1000093, 1000094, 1000095, 1000096, 1000097, 1000098, 1000099, 1000100, 1000101, 1000102, 1000103, 1000104, 1000105, 1000106, 1000