In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score as ROC 
from sklearn.metrics import recall_score as Recall 
from sklearn.metrics import precision_score as Precision

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split as TTS 

from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression as LogiR
from sklearn.linear_model import LinearRegression as LR

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor as GBR



from collections import Counter

sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

In [2]:
df_raw = pd.read_csv("card_transdata.csv")
df_raw.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [3]:
df_raw.dtypes

distance_from_home                float64
distance_from_last_transaction    float64
ratio_to_median_purchase_price    float64
repeat_retailer                   float64
used_chip                         float64
used_pin_number                   float64
online_order                      float64
fraud                             float64
dtype: object

In [4]:
cols_bin = ["repeat_retailer",	"used_chip",	"used_pin_number",	"online_order"]
for col in cols_bin:
    print(df_raw[col].value_counts())

repeat_retailer
1.0    881536
0.0    118464
Name: count, dtype: int64
used_chip
0.0    649601
1.0    350399
Name: count, dtype: int64
used_pin_number
0.0    899392
1.0    100608
Name: count, dtype: int64
online_order
1.0    650552
0.0    349448
Name: count, dtype: int64


In [5]:
# change the data type of df[col]
df = df_raw.copy()
for col in cols_bin:
    df[col] = df[col].astype(int)


In [6]:
df.dtypes

distance_from_home                float64
distance_from_last_transaction    float64
ratio_to_median_purchase_price    float64
repeat_retailer                     int64
used_chip                           int64
used_pin_number                     int64
online_order                        int64
fraud                             float64
dtype: object

# Feature Engineer - Feature Generation

df should have more added columns than df_raw


In [7]:
cols_numerical_features = ["distance_from_home",	"distance_from_last_transaction",	"ratio_to_median_purchase_price"]
cols_bin = ["repeat_retailer",	"used_chip",	"used_pin_number",	"online_order"]

In [8]:
# Feature Enginner
df['chip_and_pin'] = df['used_chip'] & df['used_pin_number']
df['chip_and_order'] = df['used_chip'] & df['online_order']
df['chip_and_retailer'] = df['used_chip'] & df["repeat_retailer"]
df['order_and_retailer'] = df['online_order'] & df['repeat_retailer']
df['pin_and_retailer'] = df['used_pin_number'] & df['repeat_retailer']
df['pin_and_order'] = df['online_order'] & df['repeat_retailer']

df['num_features_used'] = df[['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']].sum(axis=1).astype(int)
df['any_feature_used'] = df[['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']].any(axis=1).astype(int)


In [9]:
df_raw.mean()

distance_from_home                26.628792
distance_from_last_transaction     5.036519
ratio_to_median_purchase_price     1.824182
repeat_retailer                    0.881536
used_chip                          0.350399
used_pin_number                    0.100608
online_order                       0.650552
fraud                              0.087403
dtype: float64

In [10]:
df["cartesian_product"] = df_raw[cols_bin].apply(lambda x: tuple(x.value_counts().index.tolist()), axis=1)

In [11]:
df["cartesian_product"] .value_counts()

cartesian_product
(1.0, 0.0)    684348
(0.0, 1.0)    271788
(0.0,)         23994
(1.0,)         19870
Name: count, dtype: int64

# Feature Engineer - Scaler


In [12]:
# scaler = StandardScaler()
# df[cols_numerical_features] = scaler.fit_transform(df[cols_numerical_features])
scaler = MinMaxScaler()
df[cols_numerical_features] = scaler.fit_transform(df[cols_numerical_features])

In [13]:
df

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,chip_and_pin,chip_and_order,chip_and_retailer,order_and_retailer,pin_and_retailer,pin_and_order,num_features_used,any_feature_used,cartesian_product
0,0.005443,0.000026,0.007250,1,1,0,0,0.0,0,0,1,0,0,0,2,1,"(1.0, 0.0)"
1,0.001018,0.000015,0.004816,1,0,0,0,0.0,0,0,0,0,0,0,1,1,"(0.0, 1.0)"
2,0.000478,0.000068,0.001581,1,0,0,1,0.0,0,0,0,1,0,1,2,1,"(1.0, 0.0)"
3,0.000211,0.000473,0.001338,1,1,0,1,0.0,0,1,1,1,0,1,3,1,"(1.0, 0.0)"
4,0.004156,0.000048,0.008284,1,1,0,1,0.0,0,1,1,1,0,1,3,1,"(1.0, 0.0)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.000207,0.000009,0.006058,1,1,0,0,0.0,0,0,1,0,0,0,2,1,"(1.0, 0.0)"
999996,0.001869,0.000226,0.010358,1,1,0,0,0.0,0,0,1,0,0,0,2,1,"(1.0, 0.0)"
999997,0.000274,0.000124,0.000798,1,1,0,1,0.0,0,1,1,1,0,1,3,1,"(1.0, 0.0)"
999998,0.000400,0.000020,0.001760,1,0,0,1,0.0,0,0,0,1,0,1,2,1,"(1.0, 0.0)"


In [65]:
cols_numerical_features = ["distance_from_home",	"distance_from_last_transaction",	"ratio_to_median_purchase_price"]
cols_bin = ["repeat_retailer",	"used_chip",	"used_pin_number",	"online_order"]
df.drop(cols_numerical_features+cols_bin+["fraud"], axis=1, inplace=True)

KeyError: "['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip', 'used_pin_number', 'online_order'] not found in axis"

In [66]:
df.drop("fraud", axis=1, inplace=True)

In [82]:
bindf = pd.read_csv('train_bin.csv')
bin_cols = ["distance_from_home_bin",	"distance_from_last_transaction_bin",	"ratio_to_median_purchase_price_bin"]

In [84]:
bindf.isnull().sum().sum()

0

In [86]:
for col in bin_cols:
    df[col] = bindf[col].astype(int)

In [87]:
bindf.dtypes

Unnamed: 0                              int64
distance_from_home                    float64
distance_from_last_transaction        float64
ratio_to_median_purchase_price        float64
repeat_retailer                       float64
used_chip                             float64
used_pin_number                       float64
online_order                          float64
fraud                                 float64
distance_from_home_bin                  int64
distance_from_last_transaction_bin      int64
ratio_to_median_purchase_price_bin      int64
dtype: object

In [88]:
df['home_and_transaction_bin'] = bindf['distance_from_home_bin'] & bindf['distance_from_last_transaction_bin']
df['home_and_median_bin'] = bindf['distance_from_home_bin'] & bindf["ratio_to_median_purchase_price_bin"]
df['transaction_and_median_bin'] = bindf["ratio_to_median_purchase_price_bin"] & bindf["distance_from_last_transaction_bin"]

df['num_features_used_bin'] = bindf[['distance_from_home_bin', "distance_from_last_transaction_bin", "ratio_to_median_purchase_price_bin"]].sum(axis=1).astype(int)
df['any_feature_used_bin'] = bindf[['distance_from_home_bin', "distance_from_last_transaction_bin","ratio_to_median_purchase_price_bin" ]].any(axis=1).astype(int)

In [92]:
df.index

RangeIndex(start=0, stop=1000000, step=1)

In [90]:
df.dtypes

chip_and_pin                           int64
chip_and_order                         int64
chip_and_retailer                      int64
order_and_retailer                     int64
pin_and_retailer                       int64
pin_and_order                          int64
num_features_used                      int64
any_feature_used                       int64
cartesian_product                     object
distance_from_home_bin                 int64
distance_from_last_transaction_bin     int64
ratio_to_median_purchase_price_bin     int64
home_and_transaction_bin               int64
home_and_median_bin                    int64
transaction_and_median_bin             int64
num_features_used_bin                  int64
any_feature_used_bin                   int64
dtype: object

In [91]:
df.to_csv("features.csv")