# Data cleaning

In [9]:
import pandas as pd

# Load the two raw CSV extracts into DataFrames.
# Forward slashes are used as the path separator: they are accepted on Windows,
# macOS and Linux, whereas a hard-coded backslash only works on Windows.
fees = pd.read_csv("project_dataset/extract - fees - data analyst - .csv")
requests = pd.read_csv("project_dataset/extract - cash request - data analyst.csv")

## Fees

Missing data:
- 4 entries have a missing `cash_request_id`. Assuming that this data cannot be recovered, these rows will be dropped.


Format:
- All columns with dates ("created_at", "updated_at", "paid_at", "from_date", "to_date") need to be converted to a valid date format. 
    - Note: at the moment the values are stored as str and float (the float values are most likely NaN, i.e. missing dates)



In [None]:
# Drop the fee rows that have no `cash_request_id`.
# Row count: 21061 before the drop, 21057 after it.
fees = fees.dropna(axis="index", how="any", subset=["cash_request_id"])



(21061, 13)

(21057, 13)

In [16]:
# Parse every date-like column of `fees` into pandas datetimes.
# With errors="coerce", values that cannot be parsed become NaT instead of raising.
date_columns = ["created_at", "updated_at", "paid_at", "from_date", "to_date"]
fees[date_columns] = fees[date_columns].apply(
    lambda column: pd.to_datetime(column, errors="coerce")
)

# The columns are datetimes now, but some rows still have no date (NaT):
# count the NaT values per column.
fees[date_columns].isna().sum()




created_at        0
updated_at        0
paid_at        5619
from_date     14308
to_date       14545
dtype: int64

## Requests

- Format:
    - The columns `user_id` and `deleted_account_id` are stored as float. They need to be converted to a nullable integer type (`Int64`), which can still hold missing values.
    - All columns with dates need to be converted to a valid date format. 
        - Note: at the moment the values are stored as str and float (the float values are most likely NaN, i.e. missing dates)
        - List of columns to convert to date:
            - "created_at"
            - "updated_at"
            - "moderated_at"
            - "reimbursement_date"
            - "cash_request_received_date"
            - "money_back_date"
            - "send_at"
            - "reco_creation"
            - "reco_last_update"




In [22]:
# Cast the two id columns to pandas' nullable integer dtype ("Int64").
for id_column in ("user_id", "deleted_account_id"):
    requests[id_column] = requests[id_column].astype("Int64")

In [None]:
# Parse every date-like column of `requests` into pandas datetimes,
# showing the column dtypes before and after the conversion.
display(requests.dtypes)

date_columns = [
    "created_at", "updated_at", "moderated_at",
    "reimbursement_date", "cash_request_received_date", "money_back_date",
    "send_at", "reco_creation", "reco_last_update",
]

# With errors="coerce", values that cannot be parsed become NaT instead of raising.
requests[date_columns] = requests[date_columns].apply(
    lambda column: pd.to_datetime(column, errors="coerce")
)

display(requests.dtypes)





id                              int64
amount                        float64
status                         object
created_at                     object
updated_at                     object
user_id                         Int64
moderated_at                   object
deleted_account_id              Int64
reimbursement_date             object
cash_request_received_date     object
money_back_date                object
transfer_type                  object
send_at                        object
recovery_status                object
reco_creation                  object
reco_last_update               object
dtype: object

id                                          int64
amount                                    float64
status                                     object
created_at                    datetime64[ns, UTC]
updated_at                    datetime64[ns, UTC]
user_id                                     Int64
moderated_at                  datetime64[ns, UTC]
deleted_account_id                          Int64
reimbursement_date            datetime64[ns, UTC]
cash_request_received_date         datetime64[ns]
money_back_date               datetime64[ns, UTC]
transfer_type                              object
send_at                       datetime64[ns, UTC]
recovery_status                            object
reco_creation                 datetime64[ns, UTC]
reco_last_update              datetime64[ns, UTC]
dtype: object


<br><br>


## Add timezone for `cash_request_received_date` 

- All columns except `cash_request_received_date` have UTC timezone.
- We'll assume that the times in `cash_request_received_date` are also in UTC and modify all the values accordingly.

<br>

In [24]:
# Make `cash_request_received_date` timezone-aware in UTC.
# The stored values are assumed to already be expressed in UTC.
# `tz_localize` raises a TypeError on data that is already tz-aware, so the column
# is only localized while it is still naive; otherwise it is converted to UTC.
# This keeps the cell safe to run more than once.
if requests["cash_request_received_date"].dt.tz is None:
    requests["cash_request_received_date"] = (
        requests["cash_request_received_date"].dt.tz_localize("UTC")
    )
else:
    requests["cash_request_received_date"] = (
        requests["cash_request_received_date"].dt.tz_convert("UTC")
    )

display(requests.dtypes)


id                                          int64
amount                                    float64
status                                     object
created_at                    datetime64[ns, UTC]
updated_at                    datetime64[ns, UTC]
user_id                                     Int64
moderated_at                  datetime64[ns, UTC]
deleted_account_id                          Int64
reimbursement_date            datetime64[ns, UTC]
cash_request_received_date    datetime64[ns, UTC]
money_back_date               datetime64[ns, UTC]
transfer_type                              object
send_at                       datetime64[ns, UTC]
recovery_status                            object
reco_creation                 datetime64[ns, UTC]
reco_last_update              datetime64[ns, UTC]
dtype: object

In [27]:
# Every date column of `requests` is a datetime now, but some rows have no date (NaT):
# count the NaT values per column.
requests.loc[:, date_columns].isna().sum()

created_at                        0
updated_at                        0
moderated_at                   8058
reimbursement_date            20920
cash_request_received_date     7681
money_back_date               11930
send_at                        7504
reco_creation                 20640
reco_last_update              20640
dtype: int64