In [1]:
import pandas as pd

In [2]:
import logging

# Configure the root logger for the notebook: INFO and above, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
import sys

# Put ../scripts on the module search path so that modules located there can
# be imported by later cells. The path is relative, so it resolves against the
# kernel's current working directory.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [4]:
# Read the two input CSV files (fraud records and the IP-address-to-country
# table) from ../data into DataFrames.
logging.info("Loading datasets...")
fraud_data = pd.read_csv(filepath_or_buffer='../data/Fraud_Data.csv')
ip_to_country = pd.read_csv(filepath_or_buffer='../data/IpAddress_to_Country.csv')

2025-02-07 19:45:15,088 - INFO - Loading datasets...


In [5]:
from data_cleaning import handle_missing_values,remove_duplicates_and_convert_types, univariate_analysis,bivariate_analysis, process_fraud_data,merge_geolocation, feature_engineering

In [6]:
# Cleaning step 1: run the missing-value handler (it logs the per-column
# missing-value counts, as seen in the recorded output), then preview the
# first five rows of the result.
fraud_data = handle_missing_values(fraud_data)
print(fraud_data.head(n=5))

2025-02-07 19:45:16,748 - INFO - Missing values before cleaning:
2025-02-07 19:45:17,005 - INFO - user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64


   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50              44   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0  
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0  


In [7]:
# Cleaning step 2: per its log message, this helper removes duplicates and
# converts the time columns to datetime. Preview the first five rows after it.
fraud_data = remove_duplicates_and_convert_types(fraud_data)
print(fraud_data.head(n=5))

2025-02-07 19:45:17,082 - INFO - Removing duplicates and converting time columns to datetime...


   user_id         signup_time       purchase_time  purchase_value  \
0    22058 2015-02-24 22:55:49 2015-04-18 02:47:11              34   
1   333320 2015-06-07 20:39:50 2015-06-08 01:38:54              16   
2     1359 2015-01-01 18:52:44 2015-01-01 18:52:45              15   
3   150084 2015-04-28 21:13:25 2015-05-04 13:54:50              44   
4   221365 2015-07-21 07:09:52 2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0  
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0  


In [8]:
# Convert the IP addresses to integers. Per the recorded log output and the
# dtypes printed in the next cell, the returned frame carries a new int64
# `ip_int` column alongside the original float64 `ip_address`.
# Requires fraud_data from the loading/cleaning cells above.
fraud_data = process_fraud_data(fraud_data)

2025-02-07 19:45:17,429 - INFO - Converting IP addresses to integers...
2025-02-07 19:45:17,684 - INFO - Processed IP addresses. Sample data:
0     732758368
1     350311387
2    2621473820
3    3840542443
4     415583117
Name: ip_int, dtype: int64


In [9]:
# Sanity check after process_fraud_data: confirm the helper returned a
# DataFrame and, if so, list each column's dtype.
print(type(fraud_data))  # Expected: <class 'pandas.core.frame.DataFrame'>

if isinstance(fraud_data, pd.DataFrame):
    print(fraud_data.dtypes)  # Per-column dtypes (e.g. datetime64 time columns, int64 ip_int)


<class 'pandas.core.frame.DataFrame'>
user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id                 object
source                    object
browser                   object
sex                       object
age                        int64
ip_address               float64
class                      int64
ip_int                     int64
dtype: object


In [10]:
# NOTE(review): this repeats the type check already printed by the preceding
# cell and adds no new information; candidate for removal.
print(type(fraud_data))  # Expected output: <class 'pandas.core.frame.DataFrame'>

<class 'pandas.core.frame.DataFrame'>


In [11]:
# Feature engineering: per its log messages, the helper derives new features,
# normalizes the continuous ones and one-hot encodes the categoricals.
# Preview the first five rows of the widened frame.
fraud_data = feature_engineering(fraud_data)
print(fraud_data.head(n=5))

2025-02-07 19:45:17,732 - INFO - Performing feature engineering...
2025-02-07 19:45:17,885 - INFO - Normalizing continuous features...
2025-02-07 19:45:17,900 - INFO - Performing one-hot encoding...


   user_id         signup_time       purchase_time  purchase_value  \
0    22058 2015-02-24 22:55:49 2015-04-18 02:47:11              34   
1   333320 2015-06-07 20:39:50 2015-06-08 01:38:54              16   
2     1359 2015-01-01 18:52:44 2015-01-01 18:52:45              15   
3   150084 2015-04-28 21:13:25 2015-05-04 13:54:50              44   
4   221365 2015-07-21 07:09:52 2015-09-09 18:40:53              39   

       device_id  age    ip_address  class      ip_int  purchase_hour  ...  \
0  QVPSPJUOCKZAR   39  7.327584e+08      0   732758368              2  ...   
1  EOGFQPIZPYXFZ   53  3.503114e+08      0   350311387              1  ...   
2  YSSKYOSJHPPLJ   53  2.621474e+09      1  2621473820             18  ...   
3  ATGTXKYKUDUQN   41  3.840542e+09      0  3840542443             13  ...   
4  NAUITBZFJKHWW   45  4.155831e+08      0   415583117             18  ...   

   purchase_value_scaled  age_scaled  transaction_count_scaled  source_Direct  \
0              -0.160204    0

In [12]:
# Join the fraud records with the IP-to-country table (presumably to attach a
# country to each record — confirm in data_cleaning.merge_geolocation).
# NOTE(review): in the recorded run this call FAILED with
#   ValueError: You are trying to merge on object and int64 columns.
# so merged_df is never created and a Restart & Run All stops here.
# The merge keys inside merge_geolocation have mismatched dtypes (one side is
# object, the other int64). Which columns are used as keys is not visible from
# this notebook; the likely fix is to cast both key columns to the same
# integer dtype before merging — TODO confirm against the helper's source.
merged_df = merge_geolocation(fraud_data, ip_to_country)

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [None]:
# Run the univariate analysis helper on the fraud data (presumably per-column
# distribution summaries/plots — confirm in data_cleaning).
# NOTE(review): this cell has no execution count in the saved notebook, i.e.
# it was never run. Also, fraud_data has already been through
# feature_engineering at this point: its printed preview no longer shows the
# raw source/browser/sex columns and contains scaled and one-hot columns
# instead. If the analysis expects the raw columns it should run before that
# step — TODO confirm.
univariate_analysis(fraud_data)

In [None]:
# Run the bivariate analysis helper on the fraud data (presumably feature
# relationships, e.g. against the `class` label — confirm in data_cleaning).
# NOTE(review): never executed in the saved notebook (no execution count), and
# it receives the post-feature-engineering frame, whose preview no longer
# shows the raw source/browser/sex columns — TODO confirm that this is the
# intended input.
bivariate_analysis(fraud_data)

In [None]:

# Persist the processed frame as CSV (without the index column) under ../data,
# logging before and after the write.
# NOTE(review): this writes fraud_data, not the merged_df assigned by the
# geolocation merge cell — confirm which frame is meant to be saved.
logging.info("Saving the cleaned data to 'Cleaned_Fraud_Data.csv'...")
fraud_data.to_csv(
    path_or_buf='../data/Cleaned_Fraud_Data.csv',
    index=False,
)

logging.info("Data cleaning and preprocessing completed successfully.")