# Fraud Detection Data Preprocessing Notebook

In [None]:
# Work on a copy so later cells do not operate on `dataset` directly.
# NOTE(review): `dataset` is defined outside this view; whether `.copy()`
# is shallow or deep depends on its type — confirm before mutating contents.
dataset_copy = dataset.copy()

In [None]:
# Build a pandas DataFrame from the 'train' entry of the copied dataset.
# NOTE(review): assumes `dataset_copy['train']` is in a form that
# `pd.DataFrame` accepts as tabular data — confirm against the loader.
df_fraud = pd.DataFrame(dataset_copy['train'])

In [None]:
def data_preprocessing(data):
    """Return a cleaned copy of *data* with text columns encoded as floats.

    The 'isFlaggedFraud' and 'step' columns are removed, and the 'type',
    'nameOrig' and 'nameDest' columns are replaced by a numeric encoding.
    The input frame itself is left untouched.

    NOTE: the encoding is the sum of the UTF-8 bytes of ``str(value)``, so
    values made of the same bytes in a different order map to the same number.
    """
    def _byte_sum(value):
        # Stringify first so non-string cells are encoded the same way.
        return float(sum(str(value).encode('utf-8')))

    # drop() without inplace returns a new frame, leaving `data` as it was.
    cleaned = data.drop(columns=['isFlaggedFraud', 'step'])

    for column in ('type', 'nameOrig', 'nameDest'):
        cleaned[column] = cleaned[column].apply(_byte_sum)

    return cleaned

In [None]:
def build_pipeline():
    """Create the (unfitted) preprocessing pipeline for numeric features.

    Steps run in order: median imputation, a quantile transform to a
    normal output distribution, then robust scaling.
    """
    steps = [
        ('imputer', SimpleImputer(strategy="median")),
        ('quantile_transformer', QuantileTransformer(output_distribution='normal')),
        ('robust_scaler', RobustScaler()),
    ]
    return Pipeline(steps)

In [None]:
def train_test_split_processing(data: pd.DataFrame, target: str, test_size: float, shuffle: bool=True):
    """Undersample, split, and transform a dataset for model training.

    The data is resampled with ``NearMiss(sampling_strategy=0.1)``, split
    into train and test parts, and every feature column is passed through
    the pipeline returned by ``build_pipeline``. The pipeline is fitted on
    the training part only and then applied to the test part.

    Args:
        data: Frame holding the feature columns and the target column.
        target: Name of the target column in ``data``.
        test_size: Forwarded to ``train_test_split``.
        shuffle: Whether to shuffle before splitting. The split is
            stratified on the target only when this is true, because
            ``train_test_split`` raises when ``stratify`` is combined with
            ``shuffle=False``.

    Returns:
        ``(x_train_prepared, x_test_prepared, y_train, y_test)``. The two
        feature frames carry the original feature column names and the
        same row index as the split they were built from, so they stay
        aligned with the matching targets.
    """
    feature_set = data.drop(target, axis=1)
    target_set = data[target]

    # Every non-target column is treated as numeric by the pipeline.
    numerical_columns = feature_set.columns.tolist()

    # NOTE(review): resampling happens before the split, so the test set is
    # undersampled as well and does not reflect the original class balance.
    # Consider splitting first and resampling only the training part.
    nm = NearMiss(sampling_strategy=0.1, n_jobs=-1)
    x_nm, y_nm = nm.fit_resample(feature_set, target_set)

    x_train, x_test, y_train, y_test = train_test_split(
        x_nm,
        y_nm,
        test_size=test_size,
        shuffle=shuffle,
        # Stratification is only supported together with shuffling.
        stratify=y_nm if shuffle else None,
    )

    full_pipeline = ColumnTransformer([
        ("num", build_pipeline(), numerical_columns),
    ])

    # Fit on the training part only so no test statistics leak into scaling.
    x_train_prepared = full_pipeline.fit_transform(x_train)
    x_test_prepared = full_pipeline.transform(x_test)

    # Reuse the split's row index; a fresh RangeIndex would no longer line
    # up with y_train / y_test for index-based operations.
    x_train_prepared = pd.DataFrame(
        x_train_prepared,
        columns=numerical_columns,
        index=getattr(x_train, "index", None),
    )
    x_test_prepared = pd.DataFrame(
        x_test_prepared,
        columns=numerical_columns,
        index=getattr(x_test, "index", None),
    )

    return x_train_prepared, x_test_prepared, y_train, y_test