In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sys import path
import os
path.append(os.path.abspath(os.path.join('..')))
from util.util import *

## Feature Details
* *Transaction ID:* A unique identifier for each transaction.
* *Customer ID:* A unique identifier for each customer.
* *Transaction Amount:* The total amount of money exchanged in the transaction.
* *Transaction Date:* The date and time when the transaction took place.
* *Payment Method:* The method used to complete the transaction (e.g., credit card, PayPal, etc.).
* *Product Category:* The category of the product involved in the transaction.
* *Quantity:* The number of products involved in the transaction.
* *Customer Age:* The age of the customer making the transaction.
* *Customer Location:* The geographical location of the customer.
* *Device Used:* The type of device used to make the transaction (e.g., mobile, desktop).
* *IP Address:* The IP address of the device used for the transaction.
* *Shipping Address:* The address where the product was shipped.
* *Billing Address:* The address associated with the payment method.
* *Is Fraudulent:* A binary indicator of whether the transaction is fraudulent (1 for fraudulent, 0 for legitimate).
* *Account Age Days:* The age of the customer's account in days at the time of the transaction.
* *Transaction Hour:* The hour of the day when the transaction occurred.

In [3]:
# Resolve the project root (the parent of the notebook's directory) exactly once.
# A bare `os.chdir('..')` climbs one more level every time this cell is re-run,
# which breaks both this load and any later `os.getcwd()`-based path. Caching the
# root in a variable makes the cell idempotent within a kernel session.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('..')
os.chdir(PROJECT_ROOT)

# NOTE: `path` rebinds the `sys.path` alias imported in the first cell
# (`from sys import path`). The name is kept so any later cell that reads it
# keeps working; re-run the import cell before calling `path.append(...)` again.
path = os.path.join(PROJECT_ROOT, 'data', 'transactions.csv')
# path = os.path.join(PROJECT_ROOT, 'data', 'transactions_2.csv')

# The CSV is read as latin1, matching the original load.
df = pd.read_csv(path, encoding='latin1')

## Preprocessing Steps:
- Check for missing values
- Check for duplicates
- Create new columns and drop the ones not needed for the analysis
- Convert columns to appropriate data types
- Check for outliers
- Save the processed data


In [4]:
# Preprocessing of the raw transactions DataFrame:
#   1. report size, null percentage and duplicate count,
#   2. derive the 'Shipping Billing Same' flag,
#   3. drop identifier / free-text columns,
#   4. run the project's outlier helper,
#   5. persist the result and show a summary.
print('DataFrame size:', df.shape)
print('----------------------------------------')

# Checking for null values in the DataFrame
null_counts = df.isnull().sum()
print('Nulls Percentage:\n', round((null_counts / len(df) * 100), 2))
# Only state that the data is clean when the check actually confirms it, so the
# message stays truthful if a different input file is loaded.
if null_counts.sum() == 0:
    print('As we can see, there are no null values in the DataFrame, so no need any imputation.')
else:
    print('There are null values in the DataFrame; they must be imputed or removed.')
print('----------------------------------------')

# Checking for duplicates in the DataFrame
n_duplicates = df.duplicated().sum()
print('Duplicates:', n_duplicates)
if n_duplicates == 0:
    print('There are no duplicates in the DataFrame.')
else:
    print('There are duplicated rows in the DataFrame; they should be reviewed.')
print('----------------------------------------')

# Create a new column flagging whether the shipping and billing addresses match.
# A vectorized comparison yields the same 0/1 values as a row-wise apply, without
# calling a Python lambda once per row.
df['Shipping Billing Same'] = (df['Shipping Address'] == df['Billing Address']).astype('int64')

# Drop columns that are not used in the analysis.
df = df.drop(columns=['Transaction ID', 'Customer ID', 'Transaction Date', 'Customer Location',
                      'IP Address', 'Shipping Address', 'Billing Address'])

# Checking outliers in the DataFrame (helper comes from util.util; `implace` is
# the keyword spelling used by the original call).
# NOTE(review): the recorded output reports "outliers" for the binary columns
# 'Is Fraudulent' and 'Shipping Billing Same'. If implace=True removes the flagged
# rows, this would discard every fraudulent transaction -- confirm against the
# implementation of get_outliers and exclude binary columns if so.
get_outliers(df, implace=True)
print('----------------------------------------')

# Save df in a new csv file
df.to_csv(os.path.join(os.getcwd(), 'data', 'transactions_processed.csv'), index=False)

# Final summary of the transformed DataFrame
print("Showing the first rows of the transformed DataFrame:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line.
df.info()
print('----------------------------------------')
df.head()

DataFrame size: (1472952, 16)
----------------------------------------
Nulls Percentage:
 Transaction ID        0.0
Customer ID           0.0
Transaction Amount    0.0
Transaction Date      0.0
Payment Method        0.0
Product Category      0.0
Quantity              0.0
Customer Age          0.0
Customer Location     0.0
Device Used           0.0
IP Address            0.0
Shipping Address      0.0
Billing Address       0.0
Is Fraudulent         0.0
Account Age Days      0.0
Transaction Hour      0.0
dtype: float64
As we can see, there are no null values in the DataFrame, so no need any imputation.
----------------------------------------
Duplicates: 0
There are no duplicates in the DataFrame.
----------------------------------------
Outliers in Transaction Amount: 79180
Outliers in Quantity: 0
Outliers in Customer Age: 12977
Outliers in Is Fraudulent: 54944
Outliers in Account Age Days: 0
Outliers in Transaction Hour: 0
Outliers in Shipping Billing Same: 132807
-----------------------

Unnamed: 0,Transaction Amount,Payment Method,Product Category,Quantity,Customer Age,Device Used,Is Fraudulent,Account Age Days,Transaction Hour,Shipping Billing Same
0,58.09,bank transfer,electronics,1,17,tablet,0,30,5,1
1,389.96,debit card,electronics,2,40,desktop,0,72,8,1
2,134.19,PayPal,home & garden,2,22,tablet,0,63,3,1
3,226.17,bank transfer,clothing,5,31,desktop,0,124,20,1
4,121.53,bank transfer,clothing,2,51,tablet,0,158,5,1
