# IEEE Fraud Detection Dataset EDA - Data Processing and Model Prototyping

In [1]:
# Lib Imports
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression, LinearRegression, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, accuracy_score

ImportError: DLL load failed: %1 não é um aplicativo Win32 válido.

In [None]:
# Let pandas display up to 500 columns before it starts eliding them.
pd.options.display.max_columns = 500

In [None]:
# Load the identity and transaction training tables from the working directory.
idee, transaction = (
    pd.read_csv(csv_name)
    for csv_name in ('train_identity.csv', 'train_transaction.csv')
)

# Outer merging tables with Transactions on the Left

In [None]:
# Outer-join both tables on TransactionID; the transaction columns come first.
merge = pd.merge(transaction, idee, how='outer', on='TransactionID')

In [None]:
# Count how many columns of the merged frame have each dtype.
column_dtypes = merge.dtypes
column_dtypes.value_counts()

# Processing Categorical Data

In [None]:
# Isolate the object-dtype columns. The explicit copy makes `objects`
# independent of `merge`, so the in-place edits applied to it in the
# following cells cannot trigger pandas' chained-assignment warning.
objects = merge.select_dtypes('object').copy()
# Preview only: the categorical conversion is displayed, not assigned, so
# `objects` itself keeps its object dtype.
objects.astype('category')

In [None]:
# Replace missing categorical values with an explicit "Unknown" label.
# The result is re-bound (instead of inplace=True) so a frame that was
# sliced out of `merge` is never mutated in place.
objects = objects.fillna("Unknown")
objects.head()

In [None]:
# One-hot encode: every distinct value of every column becomes its own
# indicator column.
objects = pd.get_dummies(data=objects)
objects.head(n=5)

In [None]:
# Re-attach the row identifier and the label to the encoded frame.
objects['TransactionID'] = merge['TransactionID']
objects['isFraud'] = merge['isFraud']

# Rotate the last column (isFraud) to the front; TransactionID stays last.
cols = objects.columns.to_list()
cols = cols[-1:] + cols[:-1]
objects = objects[cols]

# No dtype cast here: `astype` returns a new frame, so an unassigned call
# would only build and discard a full int64 copy of this wide frame.
# `info()` prints its summary itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
objects.info()

# Categorical Feature Selection with Undersampling and LassoCV

In [None]:
# Resampling categorical data at 50/50

target = ['isFraud']
to_remove = ['isFraud', 'TransactionID']
# Every encoded column except the label and the row identifier is a feature.
features = [x for x in objects.columns.to_list() if x not in to_remove]

X = objects[features]
y = objects[target]

# Fixed seed so the under-sampled subset is identical on every run,
# matching the seeded train/test splits used in this notebook.
rusampler = RandomUnderSampler(random_state=123)
X_rus, y_rus = rusampler.fit_resample(X, y)

In [None]:
# Hold out 20 % of the balanced categorical sample; the seed fixes the split.
lasso_split = train_test_split(X_rus, y_rus, test_size=0.2, random_state=123)
X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = lasso_split
print(X_train_lasso.shape, y_train_lasso.shape)

In [None]:
# Fit a cross-validated Lasso on the balanced categorical features.
# LassoCV expects a 1-D target; `y_train_lasso` was built from a
# single-column selection, so it is flattened explicitly instead of relying
# on scikit-learn's implicit conversion (which emits a DataConversionWarning).
model_lasso = LassoCV(tol=0.01, n_jobs=-1).fit(
    X_train_lasso, y_train_lasso.values.ravel())

# One coefficient per feature, labelled with the feature name.
penalized_features = pd.Series(model_lasso.coef_, index=X_train_lasso.columns)
penalized_features.head()

In [None]:
# Plot the ten smallest and ten largest Lasso coefficients as horizontal bars.
ranked_coefs = penalized_features.sort_values()
imp_coef = pd.concat([ranked_coefs.head(10), ranked_coefs.tail(10)])
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")

In [None]:
# Keep the names of the ten smallest and ten largest coefficients,
# in alphabetical order.
ranked_cat = penalized_features.sort_values()
cat_features = sorted(
    ranked_cat.index[:10].to_list() + ranked_cat.index[-10:].to_list()
)

# Processing Numerical Data

In [None]:
# Isolate the numeric columns as an independent copy so `merge` is never
# written to, then fill each column's missing values with that column's mean.
# The result is re-bound rather than filled in place.
numbers = merge.select_dtypes(include=['float64', 'int64']).copy()
numbers = numbers.fillna(numbers.mean())

In [None]:
 numbers.head()

# Numerical Feature Selection with Undersampling and LassoCV

In [None]:
# Resampling numerical data at 50/50
# Every numeric column except the label and the row identifier is a feature.
features = [col for col in numbers.columns if col not in to_remove]

Xn = numbers[features]
yn = numbers[target]
Xn_rus, yn_rus = rusampler.fit_resample(Xn, yn)

In [None]:
# Hold out 20 % of the balanced numerical sample; the seed fixes the split.
lasson_split = train_test_split(Xn_rus, yn_rus, test_size=0.2, random_state=123)
Xn_train_lasso, Xn_test_lasso, yn_train_lasso, yn_test_lasso = lasson_split
print(Xn_train_lasso.shape, yn_train_lasso.shape)

In [None]:
# Fit a cross-validated Lasso on the balanced numerical features.
# LassoCV expects a 1-D target; `yn_train_lasso` was built from a
# single-column selection, so it is flattened explicitly instead of relying
# on scikit-learn's implicit conversion (which emits a DataConversionWarning).
model_lasson = LassoCV(tol=0.01, n_jobs=-1).fit(
    Xn_train_lasso, yn_train_lasso.values.ravel())

# One coefficient per feature, labelled with the feature name.
penalized_featuresn = pd.Series(model_lasson.coef_, index=Xn_train_lasso.columns)
# Show the ten smallest coefficients.
penalized_featuresn.sort_values().head(10)

In [None]:
# Plot the ten smallest and ten largest numerical Lasso coefficients
# as horizontal bars.
ranked_coefs_n = penalized_featuresn.sort_values()
imp_coefn = pd.concat([ranked_coefs_n.head(10), ranked_coefs_n.tail(10)])
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coefn.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")

In [None]:
# Keep the names of the ten smallest and ten largest coefficients,
# in alphabetical order.
ranked_num = penalized_featuresn.sort_values()
num_features = sorted(
    ranked_num.index[:10].to_list() + ranked_num.index[-10:].to_list()
)

# Merged Study with full dataset

In [None]:
# Join the encoded categorical frame with the imputed numeric frame.
# isFraud is dropped from the numeric side first, so the label that
# `objects` already carries is not duplicated by the join.
merge_filled = objects.merge(
    numbers.drop(columns='isFraud'), how='outer', on='TransactionID')
# Release the two intermediate frames.
del objects, numbers
merge_filled.info()

In [None]:
# Model inputs: every column except the label and the row identifier.
features = [x for x in merge_filled.columns.to_list() if x not in to_remove]
X = merge_filled[features]
y = merge_filled[target]
# info() prints its own summary and returns None; wrapping it in print()
# would add a stray "None" line to the output.
X.info()
print(len(y))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for evaluation; the seed fixes the split.
full_split = train_test_split(X, y, test_size=0.3, random_state=123)
X_train, X_test, y_train, y_test = full_split

In [None]:
# Report the shapes of the four split parts on a single line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# L2 logistic regression; the very large C makes the penalty negligible.
model = LogisticRegression(penalty='l2', C=1e42, max_iter=150, verbose=1, solver='sag', n_jobs=-1)

# Standardise the features before the classifier: the 'sag' solver is only
# documented to converge quickly on features of similar scale. This also
# defines `pipe`, the name the scoring and prediction cells use.
# make_pipeline stores `model` itself (no clone), so `model` is the fitted
# final step afterwards.
pipe = make_pipeline(StandardScaler(), model)
# y_train is a single-column frame; flatten it to the 1-D target expected
# by the classifier.
pipe.fit(X_train, y_train.values.ravel())

In [None]:
# Evaluate `pipe` on the held-out split via its score() method.
# NOTE(review): `pipe` must already exist and be fitted when this cell
# runs — confirm it is created by an earlier cell.
pipe.score(X_test, y_test)

In [None]:
# Predictions of `pipe` for the held-out rows; used by the report below.
# NOTE(review): `pipe` must already exist and be fitted when this cell
# runs — confirm it is created by an earlier cell.
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text summary comparing the held-out labels with the predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

##----------------------------------------------------------------------------------------------------------------------------------------------
## Test dataset

In [None]:
# test set preprocessing
#
# Repeats the training-set steps on the test files: merge, split by dtype,
# fill missing values, one-hot encode, and re-join on TransactionID.
# NOTE(review): the dummy columns are derived from the test files alone, so
# they are not guaranteed to match the columns the model was trained on, and
# TransactionID is kept as a column here. Confirm the frame is aligned to the
# training features before predicting.
# NOTE(review): this re-binds X_test, replacing the hold-out split made by
# train_test_split.

idee = pd.read_csv('test_identity.csv')
transaction = pd.read_csv('test_transaction.csv')
merge = transaction.merge(idee, how='outer', on='TransactionID')
# info() prints its summary itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
merge.info()
print(merge.dtypes.value_counts())

# Independent copies, so the fills below never write into a slice of `merge`.
objects = merge.select_dtypes('object').copy()
numbers = merge.select_dtypes(include=['float64', 'int64']).copy()

# Label missing categoricals as "Unknown", then one-hot encode.
objects = pd.get_dummies(objects.fillna("Unknown"))
objects['TransactionID'] = merge['TransactionID']
# Rotate the last column (TransactionID) to the front.
cols = objects.columns.to_list()
cols = cols[-1:] + cols[:-1]
objects = objects[cols]

# Missing numeric values are replaced by the column means of this test frame.
numbers = numbers.fillna(numbers.mean())
merge_filled_test = objects.merge(numbers, how='outer', on='TransactionID')
del objects
del numbers
X_test = merge_filled_test.copy()
X_test.info()

In [None]:
# Load the sample submission file and preview its first rows.
submission_path = 'sample_submission.csv'
sample_sub = pd.read_csv(submission_path)
sample_sub.head(n=5)