In [1]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the extracted PDF feature table (the path is relative to the working directory).
df_pdf = pl.read_csv(source="datasets/PDF_All_features.csv")

In [3]:
# Drop file_path before de-duplicating: malicious samples are collected from
# several sources, so the same file may have been saved/sent under different
# names. The name is the only thing that differs in that case and carries no
# useful signal, so it is removed to let such rows compare as equal.
df_pdf = df_pdf.drop(["file_path"])

In [4]:
# Remove exact duplicate rows and report how many were dropped.
original_height = df_pdf.height
# maintain_order=True keeps the surviving rows in their original file order.
# Without it polars gives no ordering guarantee for unique(), and the
# positional train_test_split(random_state=42) further down would then pick
# different rows from one run to the next, making every later result
# non-reproducible.
df_pdf = df_pdf.unique(maintain_order=True)
ch1_height = df_pdf.height
print(f"df_pdf Size = {original_height}\nAfter removing duplicates = {ch1_height}\nNumber of Rows dropped = {original_height-ch1_height}")

df_pdf Size = 19296
After removing duplicates = 15353
Number of Rows dropped = 3943


In [5]:
# Candidate features are the Int64/Float64 columns; the target column is
# pulled out on its own and removed from the feature frame.
numeric_cols = df_pdf.select(pl.col(pl.Int64, pl.Float64)).columns
y = df_pdf["label"]
X = df_pdf.select(numeric_cols).drop("label")

# 50/50 hold-out split with a fixed seed.
# NOTE(review): the split is not stratified on the label -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.5
)

In [6]:
print(f"Original Shapes\nTrain: {X_train.shape}, Test: {X_test.shape}")

# A column is considered "empty" when every value in the training split is 0.
# The decision is made on the training split only and then applied to both splits.
empty_cols = []
for col in X_train.columns:
    is_all_zero = (X_train[col] == 0).all()
    if is_all_zero:
        empty_cols.append(col)

print(f"Detected {len(empty_cols)} empty columns: {empty_cols}")

X_train, X_test = X_train.drop(empty_cols), X_test.drop(empty_cols)

print(f"\nNew Shapes\n Train: {X_train.shape}, Test: {X_test.shape}")

Original Shapes
Train: (7676, 40), Test: (7677, 40)
Detected 8 empty columns: ['embedded_file_count', 'average_embedded_file_size', 'xref_count', 'xref_entries', 'submitform_count', 'jbig2decode_count', 'trailer_count', 'startxref_count']

New Shapes
 Train: (7676, 32), Test: (7677, 32)


> [!info] Check `ocr.ipynb` for why `used_ocr=1` rows were not dropped

## Data Preprocessing

Since the dataset is quite small, removing outliers is not an option, so the outliers are Winsorized instead (values above the 99th percentile of the training split are capped at that percentile). Also, based on EDA of the dataset, it is clear that the dataset is quite skewed, especially when studying the distribution of a feature by label. This would negatively impact the training of the model and must therefore be fixed before proceeding, so a log1p transform is applied to the data. The function below performs both steps together.

In [7]:
numeric_cols = X_train.select(pl.col(pl.Int64, pl.Float64)).columns

print(f"Processing {len(numeric_cols)} numeric features...")

# Winsorization caps: the 99th percentile of every numeric feature, computed
# on the training split only.
caps_df = X_train.select([
    pl.col(c).quantile(0.99).alias(c) for c in numeric_cols
])
# Shape of the mapping: {column name: [cap]} -- each value is a one-element list.
caps = caps_df.to_dict(as_series=False)


def apply_polars_preprocessing(df, caps_dict):
    """Cap each feature at its upper bound, then apply log1p.

    The set of columns to transform is taken from ``caps_dict`` itself rather
    than from a notebook-level variable, so the function gives the same result
    regardless of which cells were (re-)run before it is called.

    Args:
        df: polars DataFrame to transform. Columns that have no entry in
            ``caps_dict`` are passed through unchanged.
        caps_dict: mapping ``{column name: [cap]}`` as produced by
            ``DataFrame.to_dict(as_series=False)`` on a one-row frame. Entries
            for columns that are absent from ``df`` are ignored.

    Returns:
        A new DataFrame in which every capped column has been replaced by
        ``log1p(clip(column, upper_bound=cap))``.

    NOTE(review): only an upper bound is applied. log1p gives -inf at -1 and
    NaN below -1, so this assumes the features are non-negative -- confirm.
    """
    ops = [
        pl.col(col).clip(upper_bound=cap_values[0]).log1p().alias(col)
        for col, cap_values in caps_dict.items()
        if col in df.columns
    ]
    return df.with_columns(ops)

Processing 32 numeric features...


In [8]:
print("Applying transformations...")
# The same training-derived caps are used for both splits.
X_train_clean, X_test_clean = (
    apply_polars_preprocessing(split, caps) for split in (X_train, X_test)
)

Applying transformations...


After preprocessing, it is important to drop sparse columns, which will not be of any use. This is preferred over feature-reduction or dimensionality-reduction methods because of the data distribution: the sparsity is not resolved even by the transformation, so these columns must be dropped manually.

In [9]:
print("Identifing useless (sparse) features...")

# Number of exact zeros per feature in the cleaned training split,
# returned as {column name: [count]}.
zero_counts = X_train_clean.select([
    (pl.col(c) == 0).sum().alias(c) for c in numeric_cols
]).to_dict(as_series=False)

total_rows = X_train_clean.height

# A feature is treated as useless when more than 99% of its training rows are zero.
# NOTE(review): a feature whose 99th-percentile cap is 0 has already been clipped
# to at most 0 by the preprocessing step, so it will presumably always land here --
# confirm that discarding such rare-event features is intended.
useless_cols = [
    name
    for name, zeros in zero_counts.items()
    if (zeros[0] / total_rows) > 0.99
]

print(f"Dropping {len(useless_cols)} useless columns: {useless_cols}")

# The drop list comes from the training split and is applied to both splits.
X_train_final, X_test_final = (
    X_train_clean.drop(useless_cols),
    X_test_clean.drop(useless_cols),
)

# Final shape report
print("\nPipeline Complete.")
print(f"Train Shape: {X_train_final.shape}")
print(f"Test Shape:  {X_test_final.shape}")

Identifing useless (sparse) features...
Dropping 4 useless columns: ['encrypted', 'uses_nonstandard_port', 'launch_count', 'richmedia_count']

Pipeline Complete.
Train Shape: (7676, 28)
Test Shape:  (7677, 28)
