In [83]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path

project_root = Path.cwd().parent 
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/fraud-detection-week-5-and-6
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
import src.data.loader as loader
import src.data.preprocessor as preprocessor
from tabulate import tabulate

In [85]:
# Build the project's DataLoader and read the raw fraud dataset into a frame.
# load_csv() is called without arguments, so the file location is whatever the
# loader uses by default (presumably configured inside src.data.loader).
data_loader = loader.DataLoader()
raw_fraud_df = data_loader.load_csv()

Loaded ../data/raw/fraud_data.csv to dataframe!


In [86]:
# Structural overview of the raw frame: column names, non-null counts, dtypes.
print("Basic Data Info:")
raw_fraud_df.info()

# Descriptive statistics for every column (include="all" covers non-numeric
# columns as well), rendered as a grid table through tabulate.
print("\nStatistical Info:")
print(
    tabulate(
        raw_fraud_df.describe(include="all"),
        headers="keys",
        tablefmt="grid",
    )
)

Basic Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB

Statistical Info:
+--------+-----------+---------------------+---------------------+------------------+---------------+----------+-----------+--------+--------------+------------------+----------------+
|      

### Initial Fraud Data Analysis

##### General
1. No null values were found in the data, but further investigation is needed to check the validity of the values.

##### Numeric Values
1. Purchase Value
    - Has a mean value of 36.9 and a median (50%) value of 35. Because the mean is above the median, this suggests the feature is slightly right-skewed.
2. Age
    - Has a mean value of 33.1 and a median value of 33. Because the mean and median are nearly equal, this suggests the feature is approximately symmetric.

##### Categorical Values 
1. Unique Values 
    - Source has 3 unique values, with the most frequent value being SEO
    - Browser has 5 unique values, with the most frequent value being Chrome
    - Sex has 2 unique values, with the most frequent value being M


In [99]:
# Step 1: Clean Data
# Hand the raw frame to the project's DataPreProcessor and fetch the cleaned
# result.
# NOTE(review): confirm whether get_cleaned_data() works on a copy or mutates
# raw_fraud_df in place; the earlier raw-data summary relies on the former.
data_preprocessor = preprocessor.DataPreProcessor(raw_fraud_df)
clean_fraud_data = data_preprocessor.get_cleaned_data()

No duplicated rows found.
Converted purchase_time to datetime.
Converted signup_time to datetime.
No generic null values found.
0 transactions with purchase value <= 0 found
0 transactions where sign up date > purchase date values found
0 transactions with age values below 0 or above 100 found
Data preprocessing complete!


In [92]:
# Structural overview of the cleaned frame, to confirm the dtype changes made
# during preprocessing.
print("Basic Cleaned Data Info:")
clean_fraud_data.info()

# Preview the first five cleaned rows as a grid table.
print(tabulate(clean_fraud_data.head(5), headers="keys", tablefmt="grid"))

Basic Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  object        
 6   browser         151112 non-null  object        
 7   sex             151112 non-null  object        
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  float64       
 10  class           151112 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(4), object(4)
memory usage: 12.7+ MB
+----+-----------+---------------------+---------------------+------------------+------------