In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Paths
# Directory locations are relative to the working directory the notebook runs in.
RAW_DIR, CLEAN_DIR = '../data/raw', '../data/cleaned'
# Create the output directory up front; exist_ok=True keeps re-runs from failing
# when it is already there.
os.makedirs(CLEAN_DIR, exist_ok=True)

In [3]:
# Load the two raw CSV tables, parsing their date columns as datetimes.
_users_csv = os.path.join(RAW_DIR, 'users.csv')
_transactions_csv = os.path.join(RAW_DIR, 'transactions.csv')
users = pd.read_csv(_users_csv, parse_dates=['signup_date'])
transactions = pd.read_csv(_transactions_csv, parse_dates=['transaction_date'])

In [5]:
# Preview
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each summary.
users.info()
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002 entries, 0 to 2001
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   user_id         2002 non-null   int64         
 1   signup_date     2002 non-null   datetime64[ns]
 2   age             2001 non-null   float64       
 3   region          2001 non-null   object        
 4   signup_channel  2002 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 78.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   transaction_id         6000 non-null   int64         
 1   user_id                5999 non-null   float64       
 2   transaction_date       6000 non-null   datetime64[ns]
 3   amount                 5999 non-null 

In [12]:
# Data Cleaning
# One pipeline, three steps:
#   1. drop exact duplicate rows
#   2. coerce `amount` to numeric (values that cannot be parsed become NaN)
#   3. drop rows missing user_id, transaction_date, or amount
transactions = (
    transactions
    .drop_duplicates()
    .assign(amount=lambda df: pd.to_numeric(df['amount'], errors='coerce'))
    .dropna(subset=['user_id', 'transaction_date', 'amount'])
)

In [18]:
# Checking missing values
# Print the per-column count of missing entries, users first, then transactions.
for _frame in (users, transactions):
    print(_frame.isna().sum())

user_id           0
signup_date       0
age               1
region            1
signup_channel    0
dtype: int64
transaction_id           0
user_id                  0
transaction_date         0
amount                   0
transaction_type         0
used_instant_transfer    0
dtype: int64


In [20]:
# Keep only rows with no missing value in any column (defaults spelled out).
transactions_cleaned = transactions.dropna(axis=0, how='any')

In [21]:
# Outlier Detection
# Flag amounts more than OUTLIER_N_STD standard deviations above the mean.
# Only the upper tail is checked. This cell only counts the flagged rows;
# nothing here removes them from transactions_cleaned.
OUTLIER_N_STD = 4

# Compute the statistics and the mask from the same frame that is filtered.
# Building the mask from one frame and indexing another only works while both
# happen to share an identical index.
_amounts = transactions_cleaned['amount']
amt_mean = _amounts.mean()
amt_std = _amounts.std()
_outliers = transactions_cleaned[_amounts > amt_mean + OUTLIER_N_STD * amt_std]
print('outliers:', _outliers.shape[0])

outliers: 59


In [22]:
# Save the Cleaned Dataset
# Write each table to CLEAN_DIR as CSV without the index column.
# NOTE(review): no visible cell modifies `users` before this save, and the
# missing-value check reports nulls in it — confirm it needs no cleaning.
_outputs = {
    'users_cleaned.csv': users,
    'transactions_cleaned.csv': transactions_cleaned,
}
for _filename, _table in _outputs.items():
    _table.to_csv(os.path.join(CLEAN_DIR, _filename), index=False)