In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline

# Define dataset path
# Path belongs to the standard-library pathlib module. pyparsing is a parsing
# library and does not document a Path export, so importing it from there
# depends on an implementation detail of whichever pyparsing is installed.
from pathlib import Path
import seaborn as sns


# The data directory is resolved against the current working directory, so the
# notebook has to be started from the folder that contains "data/".
Data = Path().resolve() / "data"
file_path = Data / "FraudDetectionDataset.csv"

# Load dataset
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,T1,4174,1292.76,ATM Withdrawal,16.0,Tablet,San Francisco,0,119,13,Debit Card,0
1,T2,4507,1554.58,ATM Withdrawal,13.0,Mobile,New York,4,79,3,Credit Card,0
2,T3,1860,2395.02,ATM Withdrawal,,Mobile,,3,115,9,,0
3,T4,2294,100.1,Bill Payment,15.0,Desktop,Chicago,4,3,4,UPI,0
4,T5,2130,1490.5,POS Payment,19.0,Mobile,San Francisco,2,57,7,Credit Card,0


In [2]:
# Print every column's dtype and non-null count to see which columns have gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51000 entries, 0 to 50999
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Transaction_ID                    51000 non-null  object 
 1   User_ID                           51000 non-null  int64  
 2   Transaction_Amount                48480 non-null  float64
 3   Transaction_Type                  51000 non-null  object 
 4   Time_of_Transaction               48448 non-null  float64
 5   Device_Used                       48527 non-null  object 
 6   Location                          48453 non-null  object 
 7   Previous_Fraudulent_Transactions  51000 non-null  int64  
 8   Account_Age                       51000 non-null  int64  
 9   Number_of_Transactions_Last_24H   51000 non-null  int64  
 10  Payment_Method                    48531 non-null  object 
 11  Fraudulent                        51000 non-null  int64  
dtypes: f

### Missing Values Ratio: Identify and remove features with a high percentage of missing values.

In [3]:
# === Missing Values Ratio ===

# Count the nulls in every column once, then express them as a fraction of all rows
null_counts = df.isnull().sum()
missing_ratio = null_counts / len(df)

# Side-by-side table of raw counts and percentages, worst columns first
missing_summary = (
    pd.DataFrame({
        'Missing_Count': null_counts,
        'Missing_Ratio(%)': (missing_ratio * 100).round(2),
    })
    .sort_values(by='Missing_Ratio(%)', ascending=False)
)

# Display
print("=== Missing Values Summary ===")
print(missing_summary)


=== Missing Values Summary ===
                                  Missing_Count  Missing_Ratio(%)
Time_of_Transaction                        2552              5.00
Location                                   2547              4.99
Transaction_Amount                         2520              4.94
Device_Used                                2473              4.85
Payment_Method                             2469              4.84
Transaction_Type                              0              0.00
Transaction_ID                                0              0.00
User_ID                                       0              0.00
Previous_Fraudulent_Transactions              0              0.00
Account_Age                                   0              0.00
Number_of_Transactions_Last_24H               0              0.00
Fraudulent                                    0              0.00


### Handling Missing Values

| Step | Action                       | Why it Matters for Fraud Analysis                        |
| ---- | ---------------------------- | -------------------------------------------------------- |
| 1    | Compute missing ratio        | Helps decide what’s salvageable                          |
| 2    | Drop columns >30% missing    | Removes unreliable features                              |
| 3    | Drop rows with too many NaNs | Prevents model bias from incomplete samples              |
| 4    | Impute missing values        | Keeps remaining rows usable for supervised learning      |
| 5    | Sanity check                 | Confirms clean dataset ready for next preprocessing step |


In [4]:
# === 1. Compute Missing Ratio for Each Feature ===
# Percentage (0-100) of null entries per column.
missing_ratio = df.isnull().mean() * 100

# === 2. Define Threshold ===
# Drop columns if more than 30% values are missing
threshold_col = 30.0

# Keep every column at or below the threshold, so that only columns with
# strictly more than threshold_col% missing are dropped (as the message says).
# .copy() gives an independent frame that is safe to assign into below.
df_reduced = df.loc[:, missing_ratio <= threshold_col].copy()

print(f"Columns retained after dropping those with > {threshold_col}% missing values:")
print(df_reduced.columns.tolist())

# === 3. Optionally drop rows with many missing values (e.g., >50%) ===
threshold_row = 0.5  # means drop rows missing more than half their data
# Compare each row's missing fraction directly with the threshold. Deriving an
# integer `thresh` via int(...) truncates and lets rows above the threshold
# through whenever the column count is odd.
row_missing_fraction = df_reduced.isnull().mean(axis=1)
df_reduced = df_reduced.loc[~(row_missing_fraction > threshold_row)].copy()

# === 4. Impute remaining missing values (optional but recommended before ML) ===
# NOTE(review): the fill statistics come from the full frame; if a train/test
# split follows, consider fitting them on the training part only.
# Numeric columns → fill with median ('number' covers every numeric width)
num_cols = df_reduced.select_dtypes(include='number').columns
df_reduced[num_cols] = df_reduced[num_cols].fillna(df_reduced[num_cols].median())

# Categorical columns → fill with mode
cat_cols = df_reduced.select_dtypes(include=['object']).columns
if len(cat_cols) > 0:
    cat_modes = df_reduced[cat_cols].mode()
    # mode() yields no rows when there is nothing to count; skip instead of
    # failing on .iloc[0].
    if not cat_modes.empty:
        df_reduced[cat_cols] = df_reduced[cat_cols].fillna(cat_modes.iloc[0])

# === 5. Final sanity check ===
print("\nMissing values after cleanup:")
print(df_reduced.isnull().sum())


Columns retained after dropping those with > 30.0% missing values:
['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type', 'Time_of_Transaction', 'Device_Used', 'Location', 'Previous_Fraudulent_Transactions', 'Account_Age', 'Number_of_Transactions_Last_24H', 'Payment_Method', 'Fraudulent']

Missing values after cleanup:
Transaction_ID                      0
User_ID                             0
Transaction_Amount                  0
Transaction_Type                    0
Time_of_Transaction                 0
Device_Used                         0
Location                            0
Previous_Fraudulent_Transactions    0
Account_Age                         0
Number_of_Transactions_Last_24H     0
Payment_Method                      0
Fraudulent                          0
dtype: int64
