In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [17]:
# Function to clean df_credit
def clean_credit(df):
    """Return a cleaned copy of the credit-card transactions frame.

    Steps, in order:
      1. Drop rows whose ``Class`` label is missing. A mean-imputed label
         would be a fraction that the later ``int64`` cast truncates, which
         silently invents a label.
      2. Fill missing values in every other numeric column with that
         column's mean.
      3. Drop exact duplicate rows (first occurrence kept, original index
         labels preserved).
      4. Cast ``Time``, ``V1``..``V28`` and ``Amount`` to ``float64`` and
         ``Class`` to ``int64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns ``Time``, ``V1``..``V28``,
        ``Amount`` and ``Class``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    label_col = 'Class'
    float_cols = ['Time', *(f'V{i}' for i in range(1, 29)), 'Amount']

    # dropna() returns a new frame, so nothing below touches the caller's data.
    cleaned = df.dropna(subset=[label_col])

    # Means are computed on the non-label columns only; fillna() with a
    # Series fills each column with the value stored under its own name.
    feature_means = cleaned.drop(columns=label_col).mean(numeric_only=True)
    cleaned = cleaned.fillna(feature_means)

    # Remove duplicates
    cleaned = cleaned.drop_duplicates()

    # Correct data types
    dtypes = dict.fromkeys(float_cols, 'float64')
    dtypes[label_col] = 'int64'
    return cleaned.astype(dtypes)

In [18]:
# Function to clean df_fraud
def clean_fraud(df):
    """Return a cleaned copy of the e-commerce fraud frame.

    Steps, in order:
      1. Parse ``signup_time`` and ``purchase_time`` with ``pd.to_datetime``.
      2. Fill missing ``ip_address`` values with the column mean.
      3. Drop exact duplicate rows (first occurrence kept, original index
         labels preserved).
      4. Cast ``user_id``, ``purchase_value``, ``age`` and ``class`` to
         ``int64``, ``ip_address`` to ``float64`` and ``device_id``,
         ``source``, ``browser`` and ``sex`` to ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If an expected column is absent.
    ValueError
        If ``user_id``, ``purchase_value``, ``age`` or ``class`` contain
        missing values; only ``ip_address`` is imputed here, and NaN cannot
        be cast to ``int64``.
    """
    # assign() builds a new frame, so the caller's columns keep their raw
    # values instead of being overwritten in place.
    # NOTE(review): the mean of ip_address is numerically well-defined but is
    # not a meaningful address; kept because the original imputed this way.
    cleaned = df.assign(
        signup_time=pd.to_datetime(df['signup_time']),
        purchase_time=pd.to_datetime(df['purchase_time']),
        ip_address=df['ip_address'].fillna(df['ip_address'].mean()),
    )

    # Remove duplicates
    cleaned = cleaned.drop_duplicates()

    # Correct data types
    return cleaned.astype({
        'user_id': 'int64', 'purchase_value': 'int64', 'device_id': 'object',
        'source': 'object', 'browser': 'object', 'sex': 'object', 'age': 'int64',
        'ip_address': 'float64', 'class': 'int64'
    })

In [19]:
# Function to clean df_ip
def clean_ip(df):
    """Return a cleaned copy of the IP-range-to-country lookup frame.

    Steps, in order:
      1. Fill missing ``lower_bound_ip_address`` values with the column mean.
      2. Drop exact duplicate rows (first occurrence kept, original index
         labels preserved).
      3. Cast ``lower_bound_ip_address`` to ``float64``,
         ``upper_bound_ip_address`` to ``int64`` and ``country`` to
         ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If an expected column is absent.
    ValueError
        If ``upper_bound_ip_address`` contains missing values; it is not
        imputed here, and NaN cannot be cast to ``int64``.
    """
    lower = df['lower_bound_ip_address']

    # assign() builds a new frame, so the caller's data is not overwritten.
    # NOTE(review): a mean-filled lower bound may exceed its row's upper
    # bound; kept because the original imputed this way.
    cleaned = df.assign(lower_bound_ip_address=lower.fillna(lower.mean()))

    # Remove duplicates
    cleaned = cleaned.drop_duplicates()

    # Correct data types
    return cleaned.astype({
        'lower_bound_ip_address': 'float64', 'upper_bound_ip_address': 'int64', 'country': 'object'
    })

In [20]:
# Read the three raw CSV files, in order: credit-card transactions,
# e-commerce fraud records, and the IP-range-to-country lookup.
df_credit, df_fraud, df_Ip = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/creditcard.csv',
        '../Data/Fraud_Data.csv',
        '../Data/IpAddress_to_Country.csv',
    )
)

### Clean the data frames

In [21]:
# Run each raw frame through its matching cleaner; the calls are evaluated
# left to right, i.e. credit, then fraud, then ip.
df_credit_cleaned, df_fraud_cleaned, df_ip_cleaned = (
    clean_credit(df_credit),
    clean_fraud(df_fraud),
    clean_ip(df_Ip),
)

In [22]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after each report.
df_credit_cleaned.info()
df_fraud_cleaned.info()
df_ip_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283726 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283726 non-null  float64
 1   V1      283726 non-null  float64
 2   V2      283726 non-null  float64
 3   V3      283726 non-null  float64
 4   V4      283726 non-null  float64
 5   V5      283726 non-null  float64
 6   V6      283726 non-null  float64
 7   V7      283726 non-null  float64
 8   V8      283726 non-null  float64
 9   V9      283726 non-null  float64
 10  V10     283726 non-null  float64
 11  V11     283726 non-null  float64
 12  V12     283726 non-null  float64
 13  V13     283726 non-null  float64
 14  V14     283726 non-null  float64
 15  V15     283726 non-null  float64
 16  V16     283726 non-null  float64
 17  V17     283726 non-null  float64
 18  V18     283726 non-null  float64
 19  V19     283726 non-null  float64
 20  V20     283726 non-null  float64
 21  V21     283726 

In [23]:
# Write each cleaned frame to its CSV file, leaving out the row index.
for cleaned_frame, out_path in (
    (df_credit_cleaned, '../Data/credit_card_clean.csv'),
    (df_fraud_cleaned, '../Data/fraud_Data_clean.csv'),
    (df_ip_cleaned, '../Data/ipAddress_clean.csv'),
):
    cleaned_frame.to_csv(out_path, index=False)