In [15]:
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Data-cleaning cell: load the transaction log, report missing values and
# duplicates, coerce column dtypes, print summary statistics, standardize the
# monetary columns, write the result to 'cleaned_transactions.csv', and show
# the first rows of the cleaned frame.

# Load the dataset.
# The CSV location defaults to the original local path but can be overridden
# with the TRANSACTIONS_CSV environment variable, so the cell can run on a
# machine where the file lives elsewhere without editing the code.
file_path = os.environ.get(
    "TRANSACTIONS_CSV",
    "/Users/dylanfunk/my_project/PS_20174392719_1491204439457_log.csv",
)
df = pd.read_csv(file_path)

# Step 1: Check for Missing Values (per-column count of nulls)
print("Missing Values in the Dataset:")
print(df.isnull().sum())

# Step 2: Remove Duplicate Entries (report the count first, then drop them)
print("Number of Duplicate Rows:", df.duplicated().sum())
df = df.drop_duplicates()

# Step 3: Convert Data Types to Appropriate Types
# The dtypes are printed *before* coercion so the raw types stay visible.
print("\nData Types of Columns:")
print(df.dtypes)

# Monetary columns: coerced to float here and standardized in Step 5.
numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
integer_columns = ['step', 'isFraud', 'isFlaggedFraud']

# One astype call with a column -> dtype mapping replaces eight per-column
# assignments; columns not listed in the mapping keep their current dtype.
column_dtypes = {column: int for column in integer_columns}
column_dtypes.update({column: float for column in numeric_columns})
df = df.astype(column_dtypes)

# Step 4: Check for Outliers (Summary Statistics)
# describe() is computed on the unscaled values, before Step 5 rewrites them.
print("\nSummary Statistics:")
print(df.describe())

# Step 5: Normalize/Scale Numeric Features
# The scaler is fitted on every row and the scaled values overwrite the
# originals in `df`.
# NOTE(review): if a train/test split happens in a later cell, fitting on the
# full frame lets test-set statistics influence the scaling - confirm intent.
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Save the cleaned data to a new CSV file (relative to the working directory)
df.to_csv('cleaned_transactions.csv', index=False)

# Step 6: Re-check Data
print("\nCleaned Dataset:")
print(df.head())



Missing Values in the Dataset:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
Number of Duplicate Rows: 0

Data Types of Columns:
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

Summary Statistics:
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.