# **Task 1 - Data Analysis and Preprocessing**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import logging


In [2]:
# Make the project's helper modules (../scripts) importable from this notebook.
# os.path.abspath resolves the relative path against the current working
# directory, so the notebook is expected to be run from its own folder.
scripts_path = os.path.abspath(os.path.join('..', 'scripts'))

# Guard the append so that re-running this cell does not stack duplicate
# entries onto sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
print("✅ 'scripts' directory added to the Python path successfully!")

✅ 'scripts' directory added to the Python path successfully!


In [3]:
# Widen pandas' display limits so wide/long frames are shown in full in cell
# outputs (up to 200 columns and 200 rows).
# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Deliberately no logging call here: the project logger is only created in a
# later cell. Calling the module-level logging.info() at this point would drop
# the message (the root logger defaults to WARNING) while implicitly installing
# a root handler that echoes every later log record a second time.
print("=======================================")
print("✅ Display options configured successfully!")

✅ Display options configured successfully!


In [4]:
# Set up the notebook-wide logger.
# LoggerSetup lives in the project's scripts/ folder, so this import only works
# after that folder has been appended to sys.path.
from logger import LoggerSetup

# Log file that receives the records for the analysis/preprocessing steps.
log_file_path = '../logs/data_analysis&preprocessing.log'

# Build the setup helper first, then ask it for the logger instance that the
# remaining cells share.
logger_setup = LoggerSetup(log_file=log_file_path)
logger = logger_setup.get_logger()

print(
    "============================================================================",
    "🔧 Logging system initialized for data analysis and preprocessing.",
    sep="\n",
)

2025-02-05 20:26:48,021 - logger - INFO - Logger configured successfully.
INFO:logger:Logger configured successfully.


🔧 Logging system initialized for data analysis and preprocessing.


In [5]:
# Import the DatasetLoader class for loading and preprocessing datasets
from data_loader import DatasetLoader 

In [6]:
# Load the fraud dataset through the project's DatasetLoader.
# The loader is handed the shared notebook logger so it can report on the load.
fraud_data_path = '../data/Fraud_Data.csv'
data_loader = DatasetLoader(filepath=fraud_data_path, logger=logger)

# Announce the load. The original line was a bare parenthesised string, which
# is a no-op expression, so this message was never actually shown.
print("🔄 Starting to load the dataset...")

fraud_df = data_loader.load_dataset()

# Show a banner followed by the first 10 rows; the trailing expression is
# rendered by the notebook as the cell's rich output.
print("================================================================================")
print("               📋 Here are the first 10 rows of the dataset:")
print("================================================================================")
fraud_df.head(10)


2025-02-05 20:26:48,342 - logger - INFO - Dataset loaded successfully.
INFO:logger:Dataset loaded successfully.


               📋 Here are the first 10 rows of the dataset:


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
5,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0
6,50116,2015-08-01 22:40:52,2015-08-27 03:37:57,11,IWKVZHJOCLPUR,Ads,Chrome,F,19,3987484000.0,0
7,360585,2015-04-06 07:35:45,2015-05-25 17:21:14,27,HPUCUYLMJBYFW,Ads,Opera,M,34,1692459000.0,0
8,159045,2015-04-21 23:38:34,2015-06-02 14:01:54,30,ILXYDOZIHOOHT,SEO,IE,F,43,3719094000.0,0
9,182338,2015-01-25 17:49:49,2015-03-23 23:05:42,62,NRFFPPHZYFUVC,Ads,IE,M,31,341674700.0,0


In [7]:
# Parse the two timestamp columns into pandas datetime values.
# errors='coerce' turns any entry that cannot be parsed into NaT instead of
# raising, so bad values surface later as missing values.
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col], errors='coerce')

# Record the step through the notebook's configured `logger`. The module-level
# logging.info() targets the root logger, whose default WARNING level discards
# INFO messages, so the step would never reach the log file.
logger.info("🔄 Converted 'signup_time' and 'purchase_time' to datetime format.")
print("🔄 Converted 'signup_time' and 'purchase_time' to datetime format.")

# Verify the data types after conversion
print("=======================================")
print("Data Types After Conversion:")
print(fraud_df.dtypes)

🔄 Converted 'signup_time' and 'purchase_time' to datetime format.
Data Types After Conversion:
user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id                 object
source                    object
browser                   object
sex                       object
age                        int64
ip_address               float64
class                      int64
dtype: object


In [8]:
# Print a banner, then the frame's structural summary as reported by
# DataFrame.info(), then a closing confirmation.
print(
    "           📊 DataFrame Overview:",
    "==========================================",
    sep="\n",
)
fraud_df.info()

print(
    "=======================================",
    "✅ Information displayed successfully.",
    sep="\n",
)

           📊 DataFrame Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  object        
 6   browser         151112 non-null  object        
 7   sex             151112 non-null  object        
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  float64       
 10  class           151112 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(4), object(4)
memory usage: 12.7+ MB
✅ Information displayed successfully.


In [9]:
# Count the missing entries per column and report them.
missing_values = fraud_df.isnull().sum()
total_missing = missing_values.sum()

# Per-column breakdown under a banner.
print(
    "        🔍 Missing Values in Each Column:",
    "==============================================",
    sep="\n",
)
print(missing_values)

# Overall verdict: warn with the grand total if anything is missing,
# otherwise confirm that the dataset is complete.
if total_missing != 0:
    print(f"⚠️ There are {total_missing} missing values in the dataset.")
else:
    print(
        "===============================================",
        "✅ No missing values found in the dataset.",
        sep="\n",
    )

        🔍 Missing Values in Each Column:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64
✅ No missing values found in the dataset.


In [10]:
# Count fully duplicated rows and remove them when present.
duplicate_count = fraud_df.duplicated().sum()

# Report the count under a banner.
print(
    "🔍 Checking for Duplicate Rows:",
    "=======================================",
    f"📋 Number of duplicate rows: {duplicate_count}",
    sep="\n",
)

# Nothing to do when the count is zero; otherwise drop the duplicates from
# fraud_df itself (in place) and confirm.
if duplicate_count == 0:
    print("✅ No duplicate rows found in the dataset.")
else:
    print("⚠️ Duplicate rows found! Dropping duplicates...")
    fraud_df.drop_duplicates(inplace=True)
    print("✅ Duplicates dropped successfully.")

🔍 Checking for Duplicate Rows:
📋 Number of duplicate rows: 0
✅ No duplicate rows found in the dataset.


In [11]:
# Descriptive statistics restricted to the numeric columns of the frame.
summary_statistics = fraud_df.describe(include='number')

# Banner first; the bare trailing expression lets the notebook render the
# resulting table as the cell's rich output.
print(
    "          📊 Summary Statistics of Numerical Features:",
    "=================================================================\n",
    sep="\n",
)
summary_statistics

          📊 Summary Statistics of Numerical Features:



Unnamed: 0,user_id,purchase_value,age,ip_address,class
count,151112.0,151112.0,151112.0,151112.0,151112.0
mean,200171.04097,36.935372,33.140704,2152145000.0,0.093646
std,115369.285024,18.322762,8.617733,1248497000.0,0.291336
min,2.0,9.0,18.0,52093.5,0.0
25%,100642.5,22.0,27.0,1085934000.0,0.0
50%,199958.0,35.0,33.0,2154770000.0,0.0
75%,300054.0,49.0,39.0,3243258000.0,0.0
max,400000.0,154.0,76.0,4294850000.0,1.0


In [12]:
# Descriptive statistics restricted to the object-dtype columns of the frame.
object_summary_statistics = fraud_df.describe(include='object')

# Banner first; the bare trailing expression lets the notebook render the
# resulting table as the cell's rich output.
print(
    "     📋 Statistical Summary of Object Data Features:",
    "============================================================",
    sep="\n",
)
object_summary_statistics

     📋 Statistical Summary of Object Data Features:


Unnamed: 0,device_id,source,browser,sex
count,151112,151112,151112,151112
unique,137956,3,5,2
top,CQTUVBYIWWWBC,SEO,Chrome,M
freq,20,60615,61432,88293
