## Data

In [1]:
%load_ext autoreload
%autoreload 2
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/Users/claudiajovellar/code/AlcaRmsp/the_laundromat/raw_data/data.csv')

df = df.rename(columns={'oldbalanceOrg':'oldBalanceOrig', 'newbalanceOrig':'newBalanceOrig', \
                        'oldbalanceDest':'oldBalanceDest', 'newbalanceDest':'newBalanceDest'})

In [2]:
df = df.drop(['isFlaggedFraud'], axis = 1)

In [3]:
df['nameOrig_C_M'] = df['nameOrig'].str[0]
df['nameDest_C_M'] = df['nameDest'].str[0]
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldBalanceOrig,newBalanceOrig,nameDest,oldBalanceDest,newBalanceDest,isFraud,nameOrig_C_M,nameDest_C_M
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,C,M
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,C,M
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,C,C
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,C,C
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,C,M


In [4]:
df1 = pd.get_dummies(df["nameOrig_C_M"])
df2 = pd.concat((df, df1), axis=1)
df2 = df2.drop(["nameOrig_C_M"], axis=1)
df3 = df2.rename(columns={"C": "nameOrig_encoded"})

In [5]:
df4 = pd.get_dummies(df["nameDest_C_M"])
df5 = pd.concat((df3, df4), axis=1)
df5 = df5.drop(["nameDest_C_M"], axis=1)
df5 = df5.drop(["M"], axis=1)
df6 = df5.rename(columns={"C": "nameDest_encoded"})

In [6]:
df6['errorBalanceOrig']=df6['newBalanceOrig'] + df6['amount'] - df6['oldBalanceOrig']
df6['errorBalanceDest']=df6['newBalanceDest'] + df6['amount'] - df6['oldBalanceDest']

In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Instantiate the Ordinal Encoder
ordinal_encoder = OrdinalEncoder()

# Fit it
ordinal_encoder.fit(df6[["type"]])

# Display the learned categories
display(ordinal_encoder.categories_)

# Transforming categories into ordered numbers
df6["encoded_type"] = ordinal_encoder.transform(df6[["type"]])

# Showing the transformed classes
df6.head()

[array(['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER'],
       dtype=object)]

Unnamed: 0,step,type,amount,nameOrig,oldBalanceOrig,newBalanceOrig,nameDest,oldBalanceDest,newBalanceDest,isFraud,nameOrig_encoded,nameDest_encoded,errorBalanceOrig,errorBalanceDest,encoded_type
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,1,0,0.0,9839.64,3.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,1,0,0.0,1864.28,3.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,1,1,0.0,181.0,4.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,1,1,0.0,-21001.0,1.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,1,0,0.0,11668.14,3.0


In [8]:
df_new=df6.drop(['type','nameOrig','nameDest'], axis=1)
df_new.head()

Unnamed: 0,step,amount,oldBalanceOrig,newBalanceOrig,oldBalanceDest,newBalanceDest,isFraud,nameOrig_encoded,nameDest_encoded,errorBalanceOrig,errorBalanceDest,encoded_type
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,1,0,0.0,9839.64,3.0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,1,0,0.0,1864.28,3.0
2,1,181.0,181.0,0.0,0.0,0.0,1,1,1,0.0,181.0,4.0
3,1,181.0,181.0,0.0,21182.0,0.0,1,1,1,0.0,-21001.0,1.0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,1,0,0.0,11668.14,3.0


## Baseline model

In [12]:
# Import the model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Define X and y
X = df_new.drop(["isFraud"], axis = 1)
y = df_new["isFraud"]

In [13]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Instanciate the model (💡 in Sklearn often called "estimator")
model = LogisticRegression()

model.fit(X_train,y_train)

In [15]:
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import recall_score
recall_score(y_test, y_pred)

0.4148148148148148

### Feature Selection

In [None]:
from sklearn.inspection import permutation_importance

log_model = LogisticRegression().fit(X_train, y_train) # Fit model,

permutation_score = permutation_importance(log_model, X_train, y_train, n_repeats=10, scoring = 'recall') # Perform Permutation

importance_df = pd.DataFrame(np.vstack((X_train.columns, permutation_score.importances_mean)).T) # Unstack results

importance_df.columns=['feature','score_decrease']

importance_df.sort_values(by='score_decrease', ascending = False) # Order by importance