In [14]:
import polars as pl

# Location of the "applprev_2" training table, relative to the notebook's
# working directory.
applprev_2_csv = "Data/csv_files/train/train_applprev_2.csv"

# Read the whole CSV eagerly into memory, then show (rows, columns) as a
# quick sanity check on what was loaded.
train_applprev_2 = pl.read_csv(applprev_2_csv)
train_applprev_2.shape

(14075487, 6)

In [2]:
# Pick the case_id that has the most rows with a non-null credit-card status;
# it is used in the following cells as a worked example for inspecting how
# num_group1 / num_group2 index the records of a single case.
most_frequent_case_id = (
    train_applprev_2.filter(pl.col("credacc_cards_status_52L").is_not_null())
    .group_by("case_id")
    .agg(pl.len().alias("count"))
    # Sort descending directly instead of an ascending sort followed by
    # reverse(). The secondary key on case_id makes the pick reproducible when
    # several cases tie for the highest count (group_by output order is not
    # guaranteed to be stable between runs).
    .sort(["count", "case_id"], descending=[True, False])
    .get_column("case_id")
    [0]
)

print(most_frequent_case_id)

147994


## Exploring Data Indexing with num_group1 and num_group2
### Objective
To understand how num_group1 and num_group2 index historical records associated with each case_id for accurate data flattening and analysis.

### Process
- **Filtered data:** Identified the `case_id` with the most non-null credit card statuses (`credacc_cards_status_52L`) and used it as a worked example of how records are indexed.
- **Indexed data attributes:** Investigated the structure revealed by `num_group1` and `num_group2` to determine their roles in organizing the data hierarchically: `num_group1` indexes the primary records for a `case_id`, and `num_group2` indexes the secondary entries within each `num_group1` record.
### Conclusion
Understanding the indexing mechanism is crucial for data preparation, ensuring accurate feature representation for model training. The exploration provided insights into the dataset's hierarchical structure, aiding in effective data analysis.

In [3]:
# Keep only the rows of the example case selected above.
filtered_df = train_applprev_2.filter(
    pl.col("case_id") == most_frequent_case_id
)

# Order by the two index columns so the nesting of num_group2 inside
# num_group1 can be read straight off the table.
sorted_df = filtered_df.sort("num_group1", "num_group2")

sorted_df.head(30)

case_id,cacccardblochreas_147M,conts_type_509L,credacc_cards_status_52L,num_group1,num_group2
i64,str,str,str,i64,i64
147994,"""a55475b1""","""PRIMARY_MOBILE…",,0,0
147994,"""a55475b1""",,,0,1
147994,"""a55475b1""","""PRIMARY_MOBILE…","""CANCELLED""",1,0
147994,"""a55475b1""","""HOME_PHONE""",,1,1
147994,"""a55475b1""","""PRIMARY_MOBILE…","""ACTIVE""",2,0
147994,"""a55475b1""","""PHONE""",,2,1
147994,"""a55475b1""","""PRIMARY_MOBILE…","""ACTIVE""",3,0
147994,"""a55475b1""",,,3,1
147994,"""a55475b1""","""PRIMARY_MOBILE…","""CANCELLED""",4,0
147994,"""a55475b1""","""PHONE""",,4,1


In [4]:
# Collapse the num_group2 level: one output row per (case_id, num_group1).
# NOTE: the two "first_*" columns actually hold max() of the string column
# within each group, not the first row's value.
df_lazy = (
    train_applprev_2
    .group_by("case_id", "num_group1")
    .agg(
        pl.col("conts_type_509L").unique().count().alias("unique_contact_types"),
        pl.col("cacccardblochreas_147M").max().alias("first_cacccardblochreas_147M"),
        pl.col("credacc_cards_status_52L").max().alias("first_credacc_cards_status_52L"),
    )
)

In [5]:
# Summary statistics for the per-(case_id, num_group1) aggregate. Despite its
# name, `df_lazy` is derived from the eagerly read `train_applprev_2` frame.
df_lazy.describe()

statistic,case_id,num_group1,unique_contact_types,first_cacccardblochreas_147M,first_credacc_cards_status_52L
str,f64,f64,f64,str,str
"""count""",6525978.0,6525978.0,6525978.0,"""6468134""","""316210"""
"""null_count""",0.0,0.0,0.0,"""57844""","""6209768"""
"""mean""",1431900.0,4.194369,2.069206,,
"""std""",787191.344959,4.274768,0.77185,,
"""min""",2.0,0.0,1.0,"""P133_119_56""","""ACTIVE"""
"""25%""",958657.0,1.0,2.0,,
"""50%""",1555975.0,3.0,2.0,,
"""75%""",1853312.0,6.0,3.0,,
"""max""",2703454.0,19.0,6.0,"""a55475b1""","""UNCONFIRMED"""


In [6]:
# Look at the aggregated rows of the same example case.
filtered_df_2 = df_lazy.filter(
    pl.col("case_id") == most_frequent_case_id
)

# One row per num_group1 after aggregation, so a single sort key suffices.
sorted_df_2 = filtered_df_2.sort("num_group1")

sorted_df_2.head(30)

case_id,num_group1,unique_contact_types,first_cacccardblochreas_147M,first_credacc_cards_status_52L
i64,i64,u32,str,str
147994,0,2,"""a55475b1""",
147994,1,2,"""a55475b1""","""CANCELLED"""
147994,2,2,"""a55475b1""","""ACTIVE"""
147994,3,2,"""a55475b1""","""ACTIVE"""
147994,4,2,"""a55475b1""","""CANCELLED"""
147994,5,2,"""a55475b1""","""CANCELLED"""
147994,6,3,"""a55475b1""","""CANCELLED"""
147994,7,2,"""a55475b1""","""CANCELLED"""
147994,8,1,"""a55475b1""","""ACTIVE"""
147994,9,2,"""a55475b1""","""CANCELLED"""


In [24]:
# Minimum number of distinct per-group status values a case must have to be
# reported. The printed messages are built from this constant so the text can
# no longer disagree with the filter (previously: filter on 5, message said 2).
MIN_UNIQUE_STATUSES = 5

# Count the distinct aggregated status values per case.
# NOTE(review): n_unique() appears to count null as its own distinct value, so
# a case with status-less groups gets +1 here — confirm that is intended.
unique_status_counts = df_lazy.group_by("case_id").agg(
    pl.col("first_credacc_cards_status_52L").n_unique().alias("unique_status_count")
)

# Variable name kept as-is in case later cells refer to it; it holds the cases
# with at least MIN_UNIQUE_STATUSES distinct values.
has_at_least_two_unique = unique_status_counts.filter(
    pl.col("unique_status_count") >= MIN_UNIQUE_STATUSES
)

if has_at_least_two_unique.height > 0:
    print(f"At least one case_id has at least {MIN_UNIQUE_STATUSES} unique credacc_cards_status_52L.")
    print(has_at_least_two_unique)
else:
    print(f"No case_id has at least {MIN_UNIQUE_STATUSES} unique credacc_cards_status_52L.")


At least one case_id has at least 2 unique credacc_cards_status_52L.
shape: (2, 2)
┌─────────┬─────────────────────┐
│ case_id ┆ unique_status_count │
│ ---     ┆ ---                 │
│ i64     ┆ u32                 │
╞═════════╪═════════════════════╡
│ 257994  ┆ 5                   │
│ 227317  ┆ 5                   │
└─────────┴─────────────────────┘
