## Data Exploration

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Uploading Libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

In [3]:
# Find the path to the dataset
import os
os.getcwd()

'/Users/tajania/code/AlcaRmsp/the_laundromat'

In [4]:
# create a csv path
csv_path = '/Users/tajania/code/AlcaRmsp/the_laundromat'
csv_path

'/Users/tajania/code/AlcaRmsp/the_laundromat'

In [5]:
# Uploading dataset into a dataframe 
df=pd.read_csv(os.path.join(csv_path, 'raw_data/data.csv'))
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [6]:
limit = len(X)

def plotStrip(x, y, hue, figsize = (14, 9)):
    
    fig = plt.figure(figsize = figsize)
    colours = plt.cm.tab10(np.linspace(0, 1, 9))
    with sns.axes_style('ticks'):
        ax = sns.stripplot(x, y, \
             hue = hue, jitter = 0.4, marker = '.', \
             size = 4, palette = colours)
        ax.set_xlabel('')
        ax.set_xticklabels(['genuine', 'fraudulent'], size = 16)
        for axis in ['top','bottom','left','right']:
            ax.spines[axis].set_linewidth(2)

        handles, labels = ax.get_legend_handles_labels()
        plt.legend(handles, ['Transfer', 'Cash out'], bbox_to_anchor=(1, 1), \
               loc=2, borderaxespad=0, fontsize = 16);
    return ax

NameError: name 'X' is not defined

In [None]:
X_plt=df.drop(df['isFraud'])
ax = plotStrip(df['isFraud'][:limit], df['step'][:limit], df['type'][:limit])
ax.set_ylabel('time [hour]', size = 16)
ax.set_title('Striped vs. homogenous fingerprints of genuine and fraudulent \
transactions over time', size = 20);

In [None]:
# change names of the columns to be uniform
df = df.rename(columns={'oldbalanceOrg':'oldBalanceOrig', 'newbalanceOrig':'newBalanceOrig', \
                        'oldbalanceDest':'oldBalanceDest', 'newbalanceDest':'newBalanceDest'})
df.head()

In [None]:
# Get a description of the dataset
df.describe()

In [None]:
# Correlation matrice
df.corr()

In [None]:
# summarize class distribution
from collections import Counter
from matplotlib import pyplot
counter = Counter(df['isFraud'])
print(counter)

In [None]:
#import seaborn as sns
# Heatmap of pairwise correlations
#correlation_matrix = df.corr()
#column_names = correlation_matrix.columns
#sns.heatmap(correlation_matrix, xticklabels=column_names, yticklabels=column_names,cmap= "bwr");

In [None]:
import seaborn as sns
# Heatmap
corr = df.corr()
sns.heatmap(corr,
xticklabels=corr.columns,
yticklabels=corr.columns,
cmap= 'YlGnBu')
corr_df = corr.unstack().reset_index() # Unstack correlation matrix 
corr_df.columns = ['feature_1','feature_2', 'correlation'] # rename columns
corr_df.sort_values(by='correlation',ascending=False, inplace=True) # sort by correlation
corr_df = corr_df[corr_df['feature_1'] != corr_df['feature_2']] # Remove self correlation
corr_df.head()

In [None]:
# Get the shape of dataset
df.shape

In [None]:
# Get the type of dataset
df.dtypes

In [None]:
# check nil values in the dataset
df.isnull().values.any()

### Fraudulent transactions exploration

The dataset contains two columns called "isFraud" and "isFlaggedFraud". The column "isFlaggedFraud" traces fraudulent transactions following rule based  algorithm but are yet to be reviewed and investigated by an investogator. The column "isFraud" has been reviewd by the investigator

In [None]:
# Count the number and percentage of faudulent transaction in the dataset
number_fraud=(df.isFraud==1).value_counts()
percentage_fraud=number_fraud/len(df)
print(number_fraud)
print(percentage_fraud)

Conclusion: 8213 transactions out of 6362620 are fraudulent 

In [None]:
# Overview of transactions which are actual fraud per type of transaction
ax = df.groupby(['type', 'isFraud']).size().plot(kind='bar')
ax.set_title("# of transaction which are the actual fraud per transaction type")
ax.set_xlabel("(Type, isFraud)")
ax.set_ylabel("Count of transaction")
for p in ax.patches:
    ax.annotate(str(format(int(p.get_height()), ',d')), (p.get_x(), p.get_height()*1.01))

In [None]:
# Count the number of rows/data per type of operation
df['type'].value_counts()

In [None]:
# Count number of fraudulent transaction when type is PAYMENT
df[df['type']=='PAYMENT'][df.isFraud==1].count()

In [None]:
# Count number of fraudulent transaction when type is TRANSFER
df[df['type']=='TRANSFER'][df.isFraud==1].count()

In [None]:
# Count number of fraudulent transaction when type is CASH_IN
df[df['type']=='CASH_IN'][df.isFraud==1].count()

In [None]:
# Count number of fraudulent transaction when type is CASH_OUT
df[df['type']=='CASH_OUT'][df.isFraud==1].count()

In [None]:
# Count number of fraudulent transaction when type is DEBIT
df[df['type']=='DEBIT'][df.isFraud==1].count()

Conclusion: Fraudulent transactions happens during TRANSFER and CASH_OUT operations


### Flagged as fraud vs actual fraud

Check if the "isFlaggedFraud" is related to other columns or variables and try to explain the relationship if there is one.


See if all transactions that are flagged as fraud are actual fraud

In [None]:
# Now that we have the type of operations where fraud happens, the next step will be to check how 
# many operations have been flagged as fraudulent
dfFlagged = df[df['type']=='TRANSFER'][df.isFlaggedFraud==1]
dfFlagged

In [None]:
# Get the lenght of fraudulent transaction flagged
len(dfFlagged)

Only 16 operations out of 4097 fraudulant transactions have been flagged as fraud during the TRANSFER process


In [None]:
#check the rule based algorithm efficacy
df_NonFlagged = df[df['type']=='TRANSFER'][df.isFraud==1][df.isFlaggedFraud==0][df.amount>=200000]
len(df_NonFlagged )

In [None]:
dfFlagged.describe()

In [None]:
df_NonFlagged.describe()

Checking the rule based system, we believe that there are other rules, not just the min threshold of 200,000, used to flag potential fraudulent transactions

The flagging Fraud column is a very poor indication of the actual fraud given that the "isFlaggedFraud" column only picked 16 transactions when in reality we have over 8000 fraudulent transactions

In [None]:
# Drop isFlaggedFraud column
df=df.drop(['isFlaggedFraud'], axis=1)
df

### Account balance analysis

In [None]:
# check relationship between amount, isFraud, oldBalanceDest and newBalanceDest

df_fraudBalanceDest = df[(df['isFraud'] == 1) & 
                         (df['oldBalanceDest'] == df['newBalanceDest']) & 
                         (df['amount'] != 0)]
print(len(df_fraudBalanceDest)/len(df[df['isFraud']==1]))
print(len(df_fraudBalanceDest)/len(df[df['isFraud']==0]))

This shows that 49.5% of the fraudulent transactions have the characteristics of old balance of destination account and new balance of the same account equal while the amount is different than zero vs 0.6% for non fraudulent transactions

The same conclusion for the origination account 

In [None]:
df_fraudBalanceOrig = df[(df['isFraud'] == 1) & 
                         (df['oldBalanceOrig'] == df['newBalanceOrig']) & 
                         (df['amount'] != 0)]
print(len(df_fraudBalanceOrig)/len(df[df['isFraud']==1]))
print(len(df_fraudBalanceOrig)/len(df[df['isFraud']==0]))

In [None]:
# Differentiating the fraudulent account that meets the condition above with non fraudulent accounts that have new and old balance either equal or nil
df.loc[(df.oldBalanceDest == 0) & (df.newBalanceDest == 0) & (df.amount != 0), \
      ['oldBalanceDest', 'newBalanceDest']] = - 1
df

In [None]:
# Add a new feature to account for the error in the balance for both origination and destination accounts
df['errorBalanceOrig']=df['newBalanceOrig'] + df['amount'] - df['oldBalanceOrig']
df['errorBalanceDest']=df['newBalanceDest'] + df['amount'] - df['oldBalanceDest']
df

In [None]:
from sklearn.preprocessing import LabelEncoder
# create a label encoder object
le = LabelEncoder()

# apply the label encoder to non-numeric columns
df['nameOrig'] = le.fit_transform(df['nameOrig'])
df['nameDest'] = le.fit_transform(df['nameDest'])
df.head()

In [None]:
#Proportion of fraudulent transaction where errorBalance of destination account is higher than 0
print(len(df[(df['isFraud'] == 1) & 
                         (df['errorBalanceDest']>0)])/len(df[df['isFraud']==1]))

#Proportion of non fraudulent transaction where errorBalance of destination account is higher than 0
print(len(df[(df['isFraud'] == 0) & 
                         (df['errorBalanceDest']>0)])/len(df[df['isFraud']==0]))

Most of the transactions where errorbalanceDest > 0 has high chance of being fraud
Valid transactions are most likely to have errorbalanceOrig > 0

## Recursive Feature Elimination (RFE) for Feature Selection

### Feature Scaling

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Check unique values for streets (2)
print(f"The unique values for 'type' are {df.type.unique()}")

# Instantiate the OneHotEncoder
ohe_binary = OneHotEncoder(sparse = False, drop="if_binary") 

# Fit encoder
ohe_binary.fit(df[['type']]) 

# Display the detected categories
print(f"The categories detected by the OneHotEncoder are {ohe_binary.categories_}")

In [None]:
# Display the generated names
print(f"The column names for the encoded values are {ohe_binary.get_feature_names_out()}")

# Transform the current "Street" column
df[ohe_binary.get_feature_names_out()] = ohe_binary.transform(df[['type']])

# Drop the column "Street" which has been encoded
df.drop(columns = ["type"], inplace = True)

# Show the dataset
df.head(3)

In [None]:
# Heatmap of pairwise correlations
correlation_matrix = df.corr()
column_names = correlation_matrix.columns
sns.heatmap(correlation_matrix, xticklabels=column_names, yticklabels=column_names,cmap= "bwr");

In [None]:
# pearson's correlation feature selection for numeric input and numeric output
#from sklearn.datasets import make_regression
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import f_regression
# generate dataset
#X=df_new.drop(['isFraud'], axis=1)
#y = df_new['isFraud']
# define dataset
#X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# define feature selection
#fs = SelectKBest(score_func=f_regression, k='all')
# apply feature selection
#X_selected = fs.fit_transform(X, y)
#print(X_selected.shape)

In [None]:
# explore the number of selected features for RFE
#from numpy import mean
#from numpy import std
#from sklearn.datasets import make_classification
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import RepeatedStratifiedKFold
#from sklearn.feature_selection import RFE
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.pipeline import Pipeline
#from matplotlib import pyplot
 
# get a list of models to evaluate
#def get_models():
    #models = dict()
    #for i in range(1, 4):
     #   rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=i)
        #model = DecisionTreeClassifier()
        #models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    #return models
#get_models()

In [None]:
# evaluate a give model using cross-validation


In [None]:
# evaluate the models and store results
#results, names = list(), list()
#for name, model in models.items():
    #scores = evaluate_model(model, X, y)
    #results.append(scores)
    #names.append(name)
#print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
# plot model performance for comparison
#pyplot.boxplot(results, labels=names, showmeans=True)
#pyplot.show()

In [None]:
# summarize all features
# define RFE
#rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=3)
# fit RFE
#rfe.fit(X, y)
#for i in range(X.shape[1]):
    #print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

## Train Test Dataset

In [None]:
# Define X and y
X=df.drop(['isFraud'], axis=1)
y = df['isFraud']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Balance the y test using the SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC

X_test_resampled, y_test_resampled = SMOTE(sampling_strategy=0.6).fit_resample(X_test, y_test)

In [None]:
y_test.shape

In [None]:
y_test_resampled.shape

## Balance the train Dataset using the SMOTE

❗️ Warning about the oversampling method ❗️
Train-test split your dataset before oversampling
Oversample only in the train set
→
The model needs to learn about the minority class.
Evaluate in the test set without oversampling
→
We want the model to be evaluated in real conditions

In [None]:
#Check if data is balanced before SMOTE
import matplotlib.pyplot as plt
labels = 'Not Fraud','Fraud'
values=df['isFraud'].value_counts()
fig, ax = plt.subplots()
ax.pie(values, labels=labels)

In [None]:
import imblearn
print(imblearn.__version__)

In [None]:
X=df.drop(['isFraud'], axis=1)
y = df['isFraud']

In [None]:
# summarize class distribution
import collections
counter = collections.Counter(y)
print(counter)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC

X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)


In [None]:
X_resampled.head()

In [None]:
X_resampled.shape

In [None]:
y_resampled.shape

In [None]:
pd.DataFrame(y_resampled)

In [None]:
X_resampled['isFraud'] = y_resampled

In [None]:
X_resampled

In [None]:
#Check if data is balanced after SMOTE balancing
import matplotlib.pyplot as plt
labels = 'Not Fraud','Fraud'
values=X_resampled['isFraud'].value_counts()
fig, ax = plt.subplots()
ax.pie(values, labels=labels)

In [None]:
X_resampled.head()

In [None]:
df.info()

In [None]:
from imblearn.over_sampling import ADASYN
X_resampled_ADASYN, y_resampled_ADASYN = ADASYN().fit_resample(X_train, y_train)

In [None]:
X_resampled_ADASYN['isFraud'] = y_resampled_ADASYN

In [None]:
#Check if data is balanced after ADASYN balancing
import matplotlib.pyplot as plt
labels = 'Not Fraud','Fraud'
values=X_resampled_ADASYN['isFraud'].value_counts()
fig, ax = plt.subplots()
ax.pie(values, labels=labels)

## XGBClassifier

In [None]:
df_fraud=X_resampled[X_resampled.isFraud==1].iloc[:100000]
df_fraud.sample(100000)

In [None]:
df_nonfraud=X_resampled[X_resampled.isFraud==0].iloc[:100000]
df_nonfraud.sample(100000)

In [None]:
df_sample=pd.concat([df_fraud,df_nonfraud], axis=0)
df_sample.shape

In [None]:
import seaborn as sns
# Heatmap
corr = df_sample.corr()
sns.heatmap(corr,
xticklabels=corr.columns,
yticklabels=corr.columns,
cmap= 'YlGnBu')
corr_df = corr.unstack().reset_index() # Unstack correlation matrix 
corr_df.columns = ['feature_1','feature_2', 'correlation'] # rename columns
corr_df.sort_values(by='correlation',ascending=False, inplace=True) # sort by correlation
corr_df = corr_df[corr_df['feature_1'] != corr_df['feature_2']] # Remove self correlation
corr_df.head()

In [None]:
X_train_modeling=df_sample.drop('isFraud',axis=1)
y_train_modeling=df_sample['isFraud']

In [None]:
# Baseline Model
# Extreme Gradient Boosting algorithm
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
model = XGBClassifier()
model.fit(X_train_modeling, y_train_modeling)

In [None]:
X_test.head()

In [None]:
y_test.shape

In [None]:
X_train_modeling.shape

In [None]:
y_test_resampled.value_counts()/len(y_test_resampled)

In [None]:
y_test_resampled.shape

In [None]:
X_test_resampled.shape

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_test_resampled)
baseline_predictions = [round(value) for value in y_pred]

# Compute MAE
mae_baseline = mean_absolute_error(y_test_resampled, baseline_predictions)
# Compute accuracy
accuracy = accuracy_score(y_test_resampled, baseline_predictions)
recall = recall_score(y_test_resampled, y_pred)
precision
print("Recall:", recall)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("mae_baseline: %.2f%%" % (mae_baseline * 100.0))

In [None]:
print("Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))
print("Accuracy on testing set: {:.3f}".format(model.score(X_test, y_test)))

In [None]:
# Grid search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score


# Instantiate model
model_1 = ElasticNet()

# Hyperparameter Grid
grid = {
    'alpha': [0.01, 0.1, 1], 
    'l1_ratio': [0.2, 0.5, 0.8]
}

# Instantiate Grid Search
search = GridSearchCV(
    model_1,
    grid, 
    scoring = 'r2',
    cv = 5,
    n_jobs=-1 # parallelize computation
) 

# Fit data to Grid Search
search.fit(X_train_modeling.iloc[:10], y_train_modeling.iloc[:10])

In [None]:
# Best score
search.best_score_

# Best Params
search.best_params_

# Best estimator
search.best_estimator_

In [None]:
# Instantiate and train model
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=2)
tree_clf.fit(X_train_modeling, y_train_modeling)

In [None]:
from sklearn.metrics import recall_score

y_pred = tree_clf.predict(X_test_resampled)
baseline_predictions = [round(value) for value in y_pred]

# Compute accuracy
accuracy = accuracy_score(y_test_resampled, baseline_predictions)
recall_2 = recall_score(y_test_resampled, y_pred)
print("Recall:", recall_2)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))


In [None]:
#from sklearn.svm import SVC
#svc = SVC(kernel='linear', C=10)

# equivalent but with SGD solver
#from sklearn.linear_model import SGDClassifier
#svc_bis = SGDClassifier(loss='hinge', penalty='l2', alpha=1/10)

In [None]:
#model selection?
#Recall?
#List of classification models/how to do model tuning to the models (hyperparameter to look at)/
#XGBoost
#each model => feature importance/permutation=>model tuning (can be before feature importance, grid search, random search)=>feature importance

## Gridsearch

In [None]:
X_train_modeling.shape

In [None]:
y_train_modeling.shape

In [None]:
cv_params = {'max_depth': [1,2,3,4,5,6], 'min_child_weight': [1,2,3,4]}    # parameters to be tries in the grid search
fix_params = {'learning_rate': 0.2, 'n_estimators': 100, 'objective': 'binary:logistic'}   #other parameters, fixed for the moment 
csv = GridSearchCV(XGBClassifier(**fix_params), cv_params, scoring = 'f1', cv = 2)

In [None]:
csv.fit(X_train_modeling, y_train_modeling)

In [None]:
csv.best_params_

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()


In [None]:
#Train the model on the Training Data = X_resampled and y_resampled
model.fit(X_train_modeling, y_train_modeling)

In [None]:
y_pred = model.predict(X_test_resampled)

In [None]:

# Score the model on the Test data
from sklearn.metrics import recall_score
recall50 = recall_score(y_test_resampled, y_pred)
print("Recall:", recall50)