# Data Preprocessing:
Main Contributor: Cheng

In [3]:
import seaborn as sns
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the transaction log into a DataFrame. The path is relative, so the CSV
# must be in the notebook's working directory.
df = pd.read_csv("PS_20174392719_1491204439457_log.csv")

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Number of rows (transactions) loaded.
len(df)

6362620

In [6]:
# Distinct values per column, to separate low-cardinality categoricals (e.g. `type`)
# from near-unique identifiers (e.g. `nameOrig`).
df.nunique()

step                  743
type                    5
amount            5316900
nameOrig          6353307
oldbalanceOrg     1845844
newbalanceOrig    2682586
nameDest          2722362
oldbalanceDest    3614697
newbalanceDest    3555499
isFraud                 2
isFlaggedFraud          2
dtype: int64

In [7]:
# Count missing values per column.
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [8]:
df.info()
# Machine-learning models need numeric inputs, so the object-dtype columns
# (type, nameOrig, nameDest) will have to be encoded or otherwise transformed.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [9]:
# List the transaction categories before encoding them.
df['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [10]:
# Encode the categorical `type` column as integer codes so it can be used as a
# numeric model input.
label_encoder = LabelEncoder()

# The codes follow the sorted order of the category names, which the preview
# below is consistent with (CASH_OUT -> 1, PAYMENT -> 3, TRANSFER -> 4).
# NOTE(review): integer codes imply an ordering between transaction types;
# confirm the downstream model tolerates that, otherwise one-hot encode instead.
df['type'] = label_encoder.fit_transform(df['type'])
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,3,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,3,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [11]:
# Sanity check: the encoded column still holds one value per row.
len(df['type'])

6362620

In [14]:
# Recompute `amount` as the drop in the originator's balance for each transaction.
# The column is `newbalanceOrig`; the earlier spelling `newbalanceOrg` does not
# exist in the frame and raised a KeyError.
# NOTE(review): this overwrites the `amount` column that ships with the dataset;
# confirm that is intended, or store the result under a new column name.
df['amount'] = df['oldbalanceOrg'] - df['newbalanceOrig']

# Per-account totals as Series indexed by account id (one entry per account).
sent_by_account = df.groupby('nameOrig')['amount'].sum()
received_by_account = df.groupby('nameDest')['amount'].sum()

# Lookup tables in the same shape as before: columns ['nameOrig', 'total_sent']
# and ['nameOrig', 'total_received']. The received totals are keyed under
# 'nameOrig' because they are looked up by the originating account below.
total_sent = sent_by_account.rename('total_sent').reset_index()
total_received = (
    received_by_account
    .rename('total_received')
    .rename_axis('nameOrig')
    .reset_index()
)

# Attach both totals with index-aligned lookups instead of two left merges.
# The result matches the merges (same row order; NaN where the originator never
# appears as a destination), but the full 6.3M-row frame is not copied twice and
# re-running the cell overwrites the columns instead of adding `_x` / `_y` copies.
# NOTE(review): `total_received` is the amount received by the *originating*
# account. If the total received by each row's destination was intended, use
# df.groupby('nameDest')['amount'].transform('sum') instead.
df['total_sent'] = df['nameOrig'].map(sent_by_account)
df['total_received'] = df['nameOrig'].map(received_by_account)

KeyboardInterrupt: 

In [12]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,1.71415,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,1.350117,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,1.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,1.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,3.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,4.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


Code from another project that can be adapted to this dataset:

# The data contains many outliers; they are cleaned for the non-categorical columns.
def identify_outliers(column):
    """Flag the values of ``column`` that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    column : pandas.Series
        Numeric series to inspect.

    Returns
    -------
    pandas.Series
        Boolean series aligned with ``column``; True where the value is below
        ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    first_quartile, third_quartile = column.quantile(0.25), column.quantile(0.75)
    spread = third_quartile - first_quartile
    below_fence = column < first_quartile - 1.5 * spread
    above_fence = column > third_quartile + 1.5 * spread
    return below_fence | above_fence

# Create an empty DataFrame to store outlier information
outlier_info = pd.DataFrame(columns=df.columns)

# Iterate over each column and identify outliers
for column in df.columns:
    outliers = identify_outliers(df[column])
    outlier_info[column] = outliers

def remove_outliers(df, columns=None):
    """Return a copy of ``df`` without rows that are 1.5 * IQR outliers.

    Columns are processed one after another and the frame is filtered after
    each one, so the quartiles of later columns are computed on the rows that
    survived the earlier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str, optional
        Columns to clean. Defaults to every numeric column, as before. Pass an
        explicit list to leave out label or flag columns: in a 0/1 column whose
        quartiles coincide the IQR is 0, so every minority value would be
        dropped as an outlier.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` that keeps the original index labels.
    """
    df_cleaned = df.copy()

    if columns is None:
        columns = df_cleaned.select_dtypes(include='number').columns

    for col in columns:
        q1 = df_cleaned[col].quantile(0.25)
        q3 = df_cleaned[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Both comparisons are False for NaN, so missing values are kept.
        outliers = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
        df_cleaned = df_cleaned[~outliers]

    return df_cleaned

# Compute the correlation of every numeric column with the target label.
# The target in this dataset is `isFraud`; `SalePrice` was left over from the
# project this snippet was copied from and is not a column of `df`.
# Only numeric columns are correlated, since the account-id strings cannot be.
correlation_with_target = df.select_dtypes(include='number').corrwith(df['isFraud'])

# Rank columns by the strength of the correlation, ignoring its sign.
df_cor = correlation_with_target.abs().sort_values(ascending=False)
# NOTE(review): the 0.6 cut-off is carried over from the other project; confirm
# it is a sensible threshold for this dataset.
df_cor[df_cor>0.6]

# These columns have a significantly higher correlation with the target, so
# outlier cleaning should focus on them.
df_cor_highlight =df_cor[df_cor>0.6]



# Exploratory Data Analysis
Main Contributor: Cheng

In [13]:
# Split the transactions by the fraud label.
fraud_trans = df.loc[df['isFraud'].eq(1)]
valid_trans = df.loc[df['isFraud'].eq(0)]

# Report how many rows fall into each group.
print(
    'Number of fraud transactions according to type are below:\n',
    fraud_trans.shape[0],
    '\n',
)
print(
    'Number of valid transactions according to type are below:\n',
    valid_trans.shape[0],
)

Number of fraud transactions according to type are below:
 8213 

Number of valid transactions according to type are below:
 6354407


# Feature Engineering:

# Model Selection:

# Model Training:

# Model Evaluation: