In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from joblib import Parallel, delayed
from scipy import sparse
from os.path import join, dirname

In [12]:
df = pd.read_csv("../data/train_data.csv")

In [13]:
# Process numeric features
numeric_features = df.select_dtypes(exclude=["object"]).columns.to_list()
numeric_features.remove('TransactionID')
numeric_features.remove("isFraud")
numeric_df = df[numeric_features].copy()

# Filter columns with too many missing values
missing_ratio = numeric_df.isnull().mean()
numeric_df = numeric_df.loc[:, missing_ratio < 0.5]

# Add missing indicators
missing_indicators = MissingIndicator(features='all')
missing_matrix = missing_indicators.fit_transform(numeric_df)
for i, col in enumerate(numeric_df.columns):
    numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
numeric_df_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    columns=numeric_df.columns
)

# Normalize numeric features
scaler = StandardScaler()
numeric_cols = [col for col in numeric_df_imputed.columns if not col.endswith('_missing')]
numeric_df_imputed[numeric_cols] = scaler.fit_transform(numeric_df_imputed[numeric_cols])

print("Numeric features processed:", numeric_df_imputed.shape)

  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)
  numeric_

Numeric features processed: (590540, 418)


In [14]:
# Process categorical features
categoric_features = df.select_dtypes(include=['object']).columns.to_list()
categoric_df = df[categoric_features].copy()

# Filter columns with too many missing values
missing_ratio = categoric_df.isnull().mean()
categoric_df = categoric_df.loc[:, missing_ratio < 0.5]


# Add missing indicators
missing_indicators = MissingIndicator(features='all')
missing_matrix = missing_indicators.fit_transform(categoric_df)
for i, col in enumerate(categoric_df.columns):
    categoric_df[f'{col}_missing'] = missing_matrix[:, i].astype(int)

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
categoric_df_imputed = pd.DataFrame(
    imputer.fit_transform(categoric_df),
    columns=categoric_df.columns
)

# Merge small categories
for col in categoric_df_imputed.columns:
    if not col.endswith('_missing'):
        value_counts = categoric_df_imputed[col].value_counts(normalize=True)
        small_categories = value_counts[value_counts < 0.05].index
        categoric_df_imputed.loc[categoric_df_imputed[col].isin(small_categories), col] = 'other'

# One-hot encode categorical features
categoric_cols = [col for col in categoric_df_imputed.columns if not col.endswith('_missing')]
categoric_df_encoded = pd.get_dummies(categoric_df_imputed, columns=categoric_cols)

print("Categorical features processed:", categoric_df_encoded.shape)

Categorical features processed: (590540, 47)


In [15]:
# Combine all features
final_df = pd.concat([
    numeric_df_imputed,
    categoric_df_encoded,
    df[["isFraud"]]
], axis=1)

print("Final dataset shape:", final_df.shape)
print("\nColumns in final dataset:", final_df.columns.tolist())

Final dataset shape: (590540, 466)

Columns in final dataset: ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D10', 'D11', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103'

In [18]:
# Save processed dataset
final_df.to_csv('../data/preproc_train_data.csv', index=False)
print('Processed dataset saved successfully.')

Processed dataset saved successfully.
