In [2]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
df = pd.read_csv('../data/data.csv')  # or adjust path if needed
df.head()


Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,0,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,0,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,0,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,0,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,1,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1


In [7]:
print("Initial shape:", df.shape)

Initial shape: (50000, 21)


In [9]:
from sklearn.model_selection import train_test_split

y = df['Fraud_Label']
X = df.drop(columns=['Fraud_Label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (40000, 20)
Test shape: (10000, 20)


In [10]:
#Handle missing values
print("Missing values:\n", df.isnull().sum())
df = df.dropna()  

Missing values:
 Transaction_ID                  0
User_ID                         0
Transaction_Amount              0
Transaction_Type                0
Timestamp                       0
Account_Balance                 0
Device_Type                     0
Location                        0
Merchant_Category               0
IP_Address_Flag                 0
Previous_Fraudulent_Activity    0
Daily_Transaction_Count         0
Avg_Transaction_Amount_7d       0
Failed_Transaction_Count_7d     0
Card_Type                       0
Card_Age                        0
Transaction_Distance            0
Authentication_Method           0
Risk_Score                      0
Is_Weekend                      0
Fraud_Label                     0
dtype: int64


In [11]:
#  Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Extract new time features
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
df['Month'] = df['Timestamp'].dt.month

In [12]:
# Drop unnecessary columns
drop_cols = ['Transaction_ID', 'User_ID', 'Timestamp']
df.drop(columns=drop_cols, inplace=True)

In [13]:
categorical_cols = df.select_dtypes(include='object').columns.tolist()

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


In [14]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('Fraud_Label')

In [15]:
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [16]:
df.head()

Unnamed: 0,Transaction_Amount,Transaction_Type,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label,Hour,DayOfWeek,Month
0,-0.604147,1.338189,1.49231,-1.229118,0.698578,1.413571,-0.229898,-0.330362,-0.120121,1.289836,0.704581,-1.349176,-0.797272,-1.120662,-1.338987,1.208753,-0.654093,0,19,0,8
1,-0.995285,-0.450619,0.88425,-0.004848,-0.006629,-1.41309,-0.229898,-0.330362,1.365176,1.580752,1.411665,0.440324,0.956729,-0.205135,1.34224,-1.409646,-0.654093,1,4,2,6
2,-0.713888,0.443785,-1.693492,1.219422,-0.711836,0.706906,-0.229898,-0.330362,1.612725,-1.451837,1.411665,1.335075,1.536564,-0.409067,-1.338987,1.176089,-0.654093,1,15,1,6
3,1.569711,-1.345022,0.92187,1.219422,-0.006629,-1.41309,-0.229898,-0.330362,0.127429,-0.514864,1.411665,1.335075,-0.637818,-0.823374,-0.445245,1.014502,-0.654093,1,0,3,12
4,-0.69038,1.338189,1.46246,-0.004848,-0.711836,-0.706425,-0.229898,3.026979,1.612725,0.519293,1.411665,0.440324,0.289918,-1.062541,1.34224,-0.415801,1.528836,1,23,5,11


In [17]:
df.to_csv('../data/processed_data.csv', index=False)  # Save the processed data