
# ***Evaluating and cleaning the Transactions Table***






In [3]:
import pandas as pd

# Show every column so wide frames are never truncated horizontally.
pd.set_option('display.max_columns', None)

# Cap the number of rows rendered per output. An unlimited setting (None)
# makes any cell that evaluates to a large frame render every row, which is
# not workable for a table with millions of rows. 100 still leaves room for
# the per-column summaries and the head()/tail() previews in this notebook.
pd.set_option('display.max_rows', 100)

In [None]:
# Load the full transactions table from BigQuery into a pandas DataFrame.
from google.colab import auth
# Authenticate the Colab user first — presumably required so the BigQuery
# read below runs with that user's credentials (confirm against Colab docs).
auth.authenticate_user()
# NOTE(review): `bigquery` is imported but not referenced in this cell.
from google.cloud import bigquery
from pandas_gbq import read_gbq


# Google Cloud project passed to read_gbq as `project_id`.
project_id = "data-analytics-bootcamp-363212"

# Select every column of neo_bank.transactions; the fully-qualified table
# name repeats the project id given above.
query = '''
SELECT *
FROM `data-analytics-bootcamp-363212.neo_bank.transactions`
'''

# Run the query and keep the result as the working frame.
df_transactions = read_gbq(query, project_id=project_id)

Downloading:  96%|[32m█████████▌[0m|

In [None]:
# Preview the first five rows of the freshly loaded transactions table.
df_transactions.head(n=5)

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,ea_cardholderpresence,ea_merchant_mcc,ea_merchant_city,ea_merchant_country,direction,user_id,created_date
0,transaction_505,TOPUP,AED,298.62,COMPLETED,,,,,INBOUND,user_4773,2018-08-02 09:41:07.426363+00:00
1,transaction_1755379,EXCHANGE,AED,3.51,COMPLETED,,,,,OUTBOUND,user_10435,2018-09-06 10:23:15.807148+00:00
2,transaction_1667,EXCHANGE,AED,0.05,COMPLETED,,,,,OUTBOUND,user_397,2018-05-16 06:09:28.063037+00:00
3,transaction_1429,EXCHANGE,AED,24.14,COMPLETED,,,,,OUTBOUND,user_4399,2018-12-19 20:22:03.550130+00:00
4,transaction_345,EXCHANGE,AED,0.21,COMPLETED,,,,,OUTBOUND,user_1731,2018-10-27 13:29:56.524826+00:00


In [None]:
# Column dtypes plus non-null counts; show_counts=True requests the counts
# explicitly rather than leaving it to pandas' size-based default.
df_transactions.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740075 entries, 0 to 2740074
Data columns (total 12 columns):
 #   Column                 Non-Null Count    Dtype              
---  ------                 --------------    -----              
 0   transaction_id         2740075 non-null  object             
 1   transactions_type      2740075 non-null  object             
 2   transactions_currency  2740075 non-null  object             
 3   amount_usd             2740075 non-null  float64            
 4   transactions_state     2740075 non-null  object             
 5   ea_cardholderpresence  1569481 non-null  object             
 6   ea_merchant_mcc        1581417 non-null  float64            
 7   ea_merchant_city       1581281 non-null  object             
 8   ea_merchant_country    1581096 non-null  object             
 9   direction              2740075 non-null  object             
 10  user_id                2740075 non-null  object             
 11  created_date           2

In [None]:
# Count the missing values in each column.
df_transactions.isna().sum()



Unnamed: 0,0
transaction_id,0
transactions_type,0
transactions_currency,0
amount_usd,0
transactions_state,0
ea_cardholderpresence,1170594
ea_merchant_mcc,1158658
ea_merchant_city,1158794
ea_merchant_country,1158979
direction,0


In [None]:
# Number of distinct values per column (cardinality check).
df_transactions.nunique()

Unnamed: 0,0
transaction_id,2740075
transactions_type,10
transactions_currency,35
amount_usd,48548
transactions_state,6
ea_cardholderpresence,3
ea_merchant_mcc,664
ea_merchant_city,63253
ea_merchant_country,198
direction,2


In [None]:
# Text-like merchant / card-presence columns: missing entries become the
# literal placeholder 'unknown'.
cols = ['ea_cardholderpresence', 'ea_merchant_city', 'ea_merchant_country']
for text_field in cols:
    df_transactions[text_field] = df_transactions[text_field].fillna('unknown')

# Merchant category code: missing entries are filled with 0.
mcc_filled = df_transactions['ea_merchant_mcc'].fillna(0)
df_transactions['ea_merchant_mcc'] = mcc_filled


In [None]:
# Re-check dtypes and non-null counts after filling the missing values.
df_transactions.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740075 entries, 0 to 2740074
Data columns (total 12 columns):
 #   Column                 Non-Null Count    Dtype              
---  ------                 --------------    -----              
 0   transaction_id         2740075 non-null  object             
 1   transactions_type      2740075 non-null  object             
 2   transactions_currency  2740075 non-null  object             
 3   amount_usd             2740075 non-null  float64            
 4   transactions_state     2740075 non-null  object             
 5   ea_cardholderpresence  2740075 non-null  object             
 6   ea_merchant_mcc        2740075 non-null  float64            
 7   ea_merchant_city       2740075 non-null  object             
 8   ea_merchant_country    2740075 non-null  object             
 9   direction              2740075 non-null  object             
 10  user_id                2740075 non-null  object             
 11  created_date           2

In [None]:
# Preview the first five rows to confirm the placeholders were applied.
df_transactions.head(n=5)

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,ea_cardholderpresence,ea_merchant_mcc,ea_merchant_city,ea_merchant_country,direction,user_id,created_date
0,transaction_505,TOPUP,AED,298.62,COMPLETED,unknown,0.0,unknown,unknown,INBOUND,user_4773,2018-08-02 09:41:07.426363+00:00
1,transaction_1755379,EXCHANGE,AED,3.51,COMPLETED,unknown,0.0,unknown,unknown,OUTBOUND,user_10435,2018-09-06 10:23:15.807148+00:00
2,transaction_1667,EXCHANGE,AED,0.05,COMPLETED,unknown,0.0,unknown,unknown,OUTBOUND,user_397,2018-05-16 06:09:28.063037+00:00
3,transaction_1429,EXCHANGE,AED,24.14,COMPLETED,unknown,0.0,unknown,unknown,OUTBOUND,user_4399,2018-12-19 20:22:03.550130+00:00
4,transaction_345,EXCHANGE,AED,0.21,COMPLETED,unknown,0.0,unknown,unknown,OUTBOUND,user_1731,2018-10-27 13:29:56.524826+00:00


In [None]:
# Column names: lower-case, with spaces replaced by underscores.
normalized_names = df_transactions.columns.str.lower().str.replace(" ", "_")
df_transactions.columns = normalized_names

# Every object-dtype column is treated as text and normalised in place.
text_cols = df_transactions.select_dtypes(include='object').columns

for col in text_cols:
    # 1) lower-case, 2) trim surrounding whitespace
    lowered = df_transactions[col].str.lower()
    trimmed = lowered.str.strip()
    # 3) collapse each run of whitespace into a single underscore
    underscored = trimmed.str.replace(r'\s+', '_', regex=True)
    # 4) drop every character that is not a word character (\w)
    df_transactions[col] = underscored.str.replace(r'[^\w]', '', regex=True)

In [None]:
# Preview the first five rows after text normalisation.
df_transactions.head(n=5)

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,ea_cardholderpresence,ea_merchant_mcc,ea_merchant_city,ea_merchant_country,direction,user_id,created_date
0,transaction_505,topup,aed,298.62,completed,unknown,0.0,unknown,unknown,inbound,user_4773,2018-08-02 09:41:07.426363+00:00
1,transaction_1755379,exchange,aed,3.51,completed,unknown,0.0,unknown,unknown,outbound,user_10435,2018-09-06 10:23:15.807148+00:00
2,transaction_1667,exchange,aed,0.05,completed,unknown,0.0,unknown,unknown,outbound,user_397,2018-05-16 06:09:28.063037+00:00
3,transaction_1429,exchange,aed,24.14,completed,unknown,0.0,unknown,unknown,outbound,user_4399,2018-12-19 20:22:03.550130+00:00
4,transaction_345,exchange,aed,0.21,completed,unknown,0.0,unknown,unknown,outbound,user_1731,2018-10-27 13:29:56.524826+00:00


In [None]:
# Show every row that has an exact copy elsewhere in the frame;
# keep=False marks all members of a duplicate group, not just the repeats.
# NOTE(review): the comparison spans all columns, including transaction_id,
# which the nunique() output reports as distinct on every row — so this check
# may be unable to find anything. Consider a `subset=` without the id.
duplicate_mask = df_transactions.duplicated(keep=False)
df_transactions[duplicate_mask]

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,ea_cardholderpresence,ea_merchant_mcc,ea_merchant_city,ea_merchant_country,direction,user_id,created_date


In [None]:
# Inspect the last 20 rows of the cleaned transactions table.
df_transactions.tail(n=20)

Unnamed: 0,transaction_id,transactions_type,transactions_currency,amount_usd,transactions_state,ea_cardholderpresence,ea_merchant_mcc,ea_merchant_city,ea_merchant_country,direction,user_id,created_date
2740055,transaction_89344,card_payment,zar,29.62,completed,False,5812.0,mossel_bay,zaf,outbound,user_1805,2018-12-30 03:12:48.330871+00:00
2740056,transaction_1810046,card_payment,zar,37.02,completed,False,5812.0,green_point,zaf,outbound,user_10145,2018-11-28 09:02:48.654125+00:00
2740057,transaction_88964,card_payment,zar,0.2,completed,False,5812.0,johannesburg,zaf,outbound,user_3017,2019-01-17 10:26:11.171715+00:00
2740058,transaction_1810045,card_payment,zar,4.23,completed,False,5812.0,nelspruit,zaf,outbound,user_15106,2018-11-28 07:52:29.879505+00:00
2740059,transaction_1846231,card_payment,zar,3.54,completed,False,5812.0,durban,zaf,outbound,user_13647,2018-12-19 02:12:20.070997+00:00
2740060,transaction_1809880,card_payment,zar,14.54,completed,False,5812.0,hout_bay,zaf,outbound,user_19085,2019-04-04 18:57:33.377703+00:00
2740061,transaction_1846551,card_payment,zar,2.12,completed,False,5813.0,durban,zaf,outbound,user_13647,2018-10-27 11:28:21.753997+00:00
2740062,transaction_227975,card_payment,zar,15.87,completed,False,5813.0,johannesburg,zaf,outbound,user_2818,2018-05-12 09:19:52.595898+00:00
2740063,transaction_1846398,card_payment,zar,23.8,completed,False,5813.0,cape_town_cbd,zaf,outbound,user_18197,2019-03-17 13:55:44.563988+00:00
2740064,transaction_1846752,card_payment,zar,0.0,reverted,False,7349.0,avalon,aus,outbound,user_12577,2018-11-07 12:40:24.093874+00:00


# ***Evaluating and cleaning the Notifications Table***


---




In [None]:
# Load the full notifications table from BigQuery into a pandas DataFrame.
# NOTE(review): this cell repeats the imports and authentication already done
# for the transactions table earlier in the notebook.
from google.colab import auth
# Authenticate the Colab user — presumably required so the BigQuery read
# below runs with that user's credentials (confirm against Colab docs).
auth.authenticate_user()
# NOTE(review): `bigquery` is imported but not referenced in this cell.
from google.cloud import bigquery
import pandas as pd
from pandas_gbq import read_gbq


# Google Cloud project passed to read_gbq as `project_id`.
project_id = "data-analytics-bootcamp-363212"

# Select every column of neo_bank.notifications.
query = '''
SELECT *
FROM `data-analytics-bootcamp-363212.neo_bank.notifications`
'''

# Run the query and keep the result as the working frame.
df_notifications = read_gbq(query, project_id=project_id)

Downloading: 100%|[32m██████████[0m|


In [None]:
# Preview the first five rows of the freshly loaded notifications table.
df_notifications.head(n=5)

Unnamed: 0,reason,channel,status,user_id,created_date
0,METAL_RESERVE_PLAN,SMS,SENT,user_4703,2018-10-14 01:48:13.319987+00:00
1,METAL_RESERVE_PLAN,SMS,SENT,user_2397,2018-10-17 08:07:05.709072+00:00
2,METAL_RESERVE_PLAN,SMS,SENT,user_2411,2018-10-14 03:46:34.244392+00:00
3,METAL_RESERVE_PLAN,SMS,SENT,user_1119,2018-10-14 14:16:35.801185+00:00
4,METAL_RESERVE_PLAN,SMS,SENT,user_3499,2018-10-14 18:41:09.268675+00:00


In [None]:
# Column dtypes plus non-null counts for the notifications table.
df_notifications.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121813 entries, 0 to 121812
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   reason        121813 non-null  object             
 1   channel       121813 non-null  object             
 2   status        121813 non-null  object             
 3   user_id       121813 non-null  object             
 4   created_date  121813 non-null  datetime64[us, UTC]
dtypes: datetime64[us, UTC](1), object(4)
memory usage: 4.6+ MB


In [None]:
# Number of distinct values per column (cardinality check).
df_notifications.nunique()

Unnamed: 0,0
reason,17
channel,3
status,2
user_id,18953
created_date,121810


In [None]:
# Column names: lower-case, with spaces replaced by underscores.
clean_column_names = df_notifications.columns.str.lower().str.replace(" ", "_")
df_notifications.columns = clean_column_names

# Every object-dtype column is treated as text and normalised in place.
text_cols = df_notifications.select_dtypes(include='object').columns

for col in text_cols:
    # lower-case and trim surrounding whitespace
    stripped = df_notifications[col].str.lower().str.strip()
    # collapse each run of whitespace into a single underscore
    joined = stripped.str.replace(r'\s+', '_', regex=True)
    # drop every character that is not a word character (\w)
    df_notifications[col] = joined.str.replace(r'[^\w]', '', regex=True)

In [None]:
# Preview the first five rows after text normalisation.
df_notifications.head(n=5)

Unnamed: 0,reason,channel,status,user_id,created_date
0,metal_reserve_plan,sms,sent,user_4703,2018-10-14 01:48:13.319987+00:00
1,metal_reserve_plan,sms,sent,user_2397,2018-10-17 08:07:05.709072+00:00
2,metal_reserve_plan,sms,sent,user_2411,2018-10-14 03:46:34.244392+00:00
3,metal_reserve_plan,sms,sent,user_1119,2018-10-14 14:16:35.801185+00:00
4,metal_reserve_plan,sms,sent,user_3499,2018-10-14 18:41:09.268675+00:00


In [None]:
# Show every row that has an exact copy elsewhere in the frame;
# keep=False marks all members of a duplicate group, not just the repeats.
repeated_rows = df_notifications.duplicated(keep=False)
df_notifications[repeated_rows]


Unnamed: 0,reason,channel,status,user_id,created_date


In [None]:
# Inspect the last 40 rows of the cleaned notifications table.
df_notifications.tail(n=40)