# Q1. Performing Data cleaning including missing values, outliers and multi-collinearity.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Resume/Fraud.csv')

In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
df.shape

(6362620, 11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [7]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [8]:
print(df.isnull().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


### Feature Engineering

In [9]:
col_list = []

for col in df.columns:
    if(df[col].dtype == "object"):
        col_list.append(col)
print(col_list)

['type', 'nameOrig', 'nameDest']


In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder = LabelEncoder()

In [12]:
for i in col_list:
    df[i] = label_encoder.fit_transform(df[i])

In [13]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,3,9839.64,757869,170136.0,160296.36,1662094,0.0,0.0,0,0
1,1,3,1864.28,2188998,21249.0,19384.72,1733924,0.0,0.0,0,0
2,1,4,181.0,1002156,181.0,0.0,439685,0.0,0.0,1,0
3,1,1,181.0,5828262,181.0,0.0,391696,21182.0,0.0,1,0
4,1,3,11668.14,3445981,41554.0,29885.86,828919,0.0,0.0,0,0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            int64  
 2   amount          float64
 3   nameOrig        int64  
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        int64  
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(6)
memory usage: 534.0 MB


In [15]:
from sklearn.impute import SimpleImputer

In [16]:
imputer = SimpleImputer(strategy='median')
imputer.fit(df[['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']])
df[['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']] = imputer.transform(df[['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']])

In [17]:
df['nameOrig'] = df['nameOrig'].fillna('Missing')
df['nameDest'] = df['nameDest'].fillna('Missing')

In [18]:
from scipy import stats
z = np.abs(stats.zscore(df))

print(df[z > 3])

          step  type      amount  nameOrig  oldbalanceOrg  newbalanceOrig  \
0          NaN   NaN         NaN       NaN            NaN             NaN   
1          NaN   NaN         NaN       NaN            NaN             NaN   
2          NaN   NaN         NaN       NaN            NaN             NaN   
3          NaN   NaN         NaN       NaN            NaN             NaN   
4          NaN   NaN         NaN       NaN            NaN             NaN   
...        ...   ...         ...       ...            ...             ...   
6362615  743.0   NaN         NaN       NaN            NaN             NaN   
6362616  743.0   NaN  6311409.28       NaN            NaN             NaN   
6362617  743.0   NaN  6311409.28       NaN            NaN             NaN   
6362618  743.0   NaN         NaN       NaN            NaN             NaN   
6362619  743.0   NaN         NaN       NaN            NaN             NaN   

         nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud 

In [19]:
from sklearn.preprocessing import RobustScaler
rob_scaler = RobustScaler()
df[['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']] = rob_scaler.fit_transform(df[['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest']])

In [20]:
corr = df.corr()
print(corr)

                    step      type    amount  nameOrig  oldbalanceOrg  \
step            1.000000  0.006635  0.022373 -0.000146      -0.010058   
type            0.006635  1.000000  0.088419  0.000213      -0.339760   
amount          0.022373  0.088419  1.000000  0.000124      -0.002762   
nameOrig       -0.000146  0.000213  0.000124  1.000000      -0.000648   
oldbalanceOrg  -0.010058 -0.339760 -0.002762 -0.000648       1.000000   
newbalanceOrig -0.010299 -0.352758 -0.007861 -0.000675       0.998803   
nameDest        0.003767  0.584275 -0.169150 -0.000256      -0.162688   
oldbalanceDest  0.027665 -0.104679  0.294137  0.000245       0.066243   
newbalanceDest  0.025888 -0.059364  0.459304  0.000255       0.042029   
isFraud         0.031578  0.020833  0.076688 -0.000464       0.010154   
isFlaggedFraud  0.003277  0.002685  0.012295  0.000667       0.003835   

                newbalanceOrig  nameDest  oldbalanceDest  newbalanceDest  \
step                 -0.010299  0.003767       

In [21]:
df.drop('newbalanceOrig', axis=1, inplace=True)

# Q2. Describe your fraud detection model in elaboration.

### The Fraud detection model follows these key steps:

#### 1. Data Cleaning :-

For preparation of data modeling the Missing Values, outliers and highly correlated features are handled

#### 2. Feature Engineering :-

a. Categorical variables are encoded into dummy variables using LabelEncoder from sklearn.preprocessing.

b. Continuous variables are standardized by removing the mean and scaling to unit variance. This puts all features on the same scale.

#### 3. Train-Test Split :-

The data is split into a training set (70%) and a test set (30%) using sklearn's train_test_split.

#### 4. Model Training :-

A XGBClassifier is trained on the training data. It trains the model in an additive and sequential manner. XGBClassifier minimizes loss when adding new trees, using loss functions.

#### 5. Prediction :-

The trained model is used to generate predictions on the held-out test data.
We now make predictions on the test data based on the train data.

#### 6. Evaluation :-

We now calculate the metrics like Accuracy, Precision, Recall, which tells us how well the model is performing on new unseen data.

# Q3. How did you select variables to be included in the model?

### There are a few key steps I followed for selecting which variables to include in the fraud detection model:

#### 1. Dataset Knowledge:-

I first looked at the meaning and context of each variable based on prior knowledge from data dictionary. This gave me an idea of which variables are likely to be relevant predictors.

For example, the 'amount' of a transaction is likely an important indicator. Large transaction amounts may be more suspicious.

#### 2. Checking of Missing Value:-
I checked for missing values in each variable. Those with a high percentage of missing values may not be good candidates.

#### 3. Correlation Analysis:-
I calculated the correlation between continuous variables. Highly correlated variables ie. r > 0.9, were dropped to avoid multicollinearity issues.



# Q4. Demonstrate the performance of the model by using best set of tools.

### Train-Test Split

In [22]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Model Training

In [24]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, y_train)

### Prediction

In [25]:
y_pred = model.predict(X_test)

### Evaluation

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [28]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy Score:", accuracy)
print("Precision Score:", precision)
print("Recall Score:", recall)


Accuracy Score: 0.9997218127123733
Precision Score: 0.977432296890672
Recall Score: 0.8004106776180698


# Q5. What are the key factors that predict fraudulent customer?

### The key factors that predict fraudulent customer are as follows:

#### Transaction Amount:
Very high transaction amounts are more suspicious, as fraudsters often try to maximize their profit from each stolen account.

#### Change in Account Balance:
Sudden large changes in account balances, either positive or negative, could indicate fraud activity.

####Transaction Type:
Type of Transaction is also a main main factor, where type 'transfer' can be the major source.

#### New Counterparties:
nameOrig/nameDest relationships shows that Transactions with new person that customer doesn't normally interact with are riskier.

# Q6. Do these factors make sense? If yes, How? If not, How not?

### Yes, the key factors predicted by the model to indicate fraud transactions generally make sense:

#### Transaction Amount:
This makes sense as a key indicator because fraudsters will often attempt large transfers to maximize their gain, whereas legitimate transactions are more likely to be small routine amounts. So unusually large amounts raise suspicion.

#### Account Balance Changes:
Sudden spikes or drops in an account balance from a customer's normal patterns could imply fraudulent activity, so this is a sensible factor.

#### Transfer Destination:
The model identifying certain high-risk recipient accounts makes sense, as fraudsters may reuse accounts under their control.

#### Transaction Type:
Cash withdrawals or transfers have higher fraud risk than routine payments. So weighting the transaction type is reasonable.

#### New Counterparties:
Transacting with new unknown origin indicates risk, so tracking relationships makes sense.

# Q7. What kind of prevention should be adopted while company update its infrastructure?


### Here are some fraud prevention measures a company should consider when updating its infrastructure:

1. Implement multi-factor authentication for user accounts and transactions. This adds an extra layer of security beyond just a password.


2. Use AI/ML models to score transactions in real-time for fraud risk. The model can flag high risk transactions for further review.


3. Install security updates and patches promptly to fix known vulnerabilities. Keep all software up-to-date.


4. Mask or tokenize payment card and account numbers. Don't store full account details.


5. Require strong passwords and implement password rotation policies. Enforce password complexity rules.


6. Implement principles of zero trust architecture with strict access controls.


7. Conduct regular security audits and risk assessments, especially after major updates.


8. Provide security awareness training to employees to create a human firewall.


9. Maintain backups and disaster recovery systems in case of breaches.

# Q8. Assuming these actions have been implemented, how would you determine if they work

### There are a few ways to evaluate the effectiveness of the fraud prevention measures after implementing them:

#### 1. Monitoring the fraud rates
    . Calculate fraud rates monthly/quarterly before and after implementation

#### Penetration testing
    . Hire ethical hackers or a security firm to try to breach defenses
    . Gauge how well they are able to penetrate the system
    . Identify any weak points or gaps for improvement

#### Security audits
    . Both internal and external security audits
    . Auditors will try to bypass controls and access data

#### Customer feedback
    . Monitor complaints related to fraud post-implementation
    . Decline in complaints implies improvement
