In [15]:
from sqlalchemy import create_engine
from sqlalchemy import inspect
import psycopg2
import yaml


class DatabaseConnector():

    def __init__(self):
        pass

    def read_db_creds(self):

        with open('db_creds.yaml') as f:
            database = yaml.safe_load(f)

        return database

    def init_db_engine(self):

        db_creds = self.read_db_creds()
        # with psycopg2.connect(host='RDS_HOST', user='RDS_USER', password='RDS_PASSWORD', dbname='RDS_DATABASE', port='RDS_PORT')
        # DBAPI = 'psycopg2'
        # DATABASE_TYPE = 'postgresql'
        # engine = create_engine(f"{DATABASE_TYPE}+{DBAPI}://{RDS_USER}:{RDS_PASSWORD}@{RDS_HOST}:{RDS_PORT}/{RDS_DATABASE}")
        connector = f"postgresql://{db_creds['RDS_USER']}:{db_creds['RDS_PASSWORD']}@{db_creds['RDS_HOST']}:{db_creds['RDS_PORT']}/{db_creds['RDS_DATABASE']}"
        engine = create_engine(connector)
        return engine
        
    def list_db_tables(self):

        data = self.init_db_engine()
        data.connect()
        inspector = inspect(data)
        # table_names = inspector.get_table_names()
        # for table_name in table_names:
        #     print(table_name)
        inspector.get_table_names()
        # print(data.execute('''SELECT * FROM legacy_store_details''').fetchall())
        # connection.close()


db_creds = DatabaseConnector()
# db_creds.init_db_engine()
db_creds.list_db_tables()

In [17]:
from database_utils import DatabaseConnector
import pandas as pd

class DataExtractors():

    def __init__(self):
        pass

    def read_rds_table(self, engine, table):

        legacy_users_df = pd.read_sql_table(table, engine)
        pd.set_option('display.max_columns', None)
        # pd.set_option('display.max_rows', None)
        legacy_users_df.drop(['first_name', 'last_name', '1'], axis=1, inplace=True)
        return print(legacy_users_df.card_number.value_counts())


db_extractor = DataExtractors()
db_connector = DatabaseConnector()
users_df = db_extractor.read_rds_table(table='orders_table', engine=db_connector.engine)

30274732830562         21
4789523459695          19
30174457989736         19
374234434757251        18
676138088478           18
4226858782445430       18
4172853666665420000    18
4255201339575          17
639009951919           17
340365137694978        17
30331296070421         17
6507177323903640       17
4395378955073370       17
38679448310919         17
6011803105132710       17
3547582729288400       17
5275402794270420       17
4676645365442420       16
2310177474915390       16
3546933649575860       16
4945124379648960       16
3501052545741900       16
585639119734           16
6586628156439250       16
3503945633208810       16
30512765824162         16
4093590061777          16
376874677477352        16
213193417496920        16
4555927278045520       16
30243605915719         16
376344538901354        16
2227922848389840       16
4051548564303430000    16
5136718032401590       16
213157557263804        16
4641191742451060       16
3569203785586090       16
420151221517

In [21]:
from data_extraction import DataExtractor
from database_utils import DatabaseConnector
import pandas as pd

class DataCleaning():

    def __init__(self):
        self.db_extractor = DataExtractor()
        self.db_connector = DatabaseConnector()

    def clean_the_user_data(self, table):
        cleaned_users_df = self.db_extractor.read_rds_table(self.db_connector, table=table)

        # perform cleaning operations on the dataframe here
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        return print(cleaned_users_df)


clean_db = DataCleaning()
clean_users_df = clean_db.clean_the_user_data(table='legacy_users')

       index first_name last_name date_of_birth                       company  \
0          0   Sigfried     Noack    1990-09-30            Heydrich Junitz KG   
1          1        Guy     Allen    1940-12-01                       Fox Ltd   
2          2      Harry  Lawrence    1995-08-02     Johnson, Jones and Harris   
3          3     Darren   Hussain    1972-09-23                   Wheeler LLC   
4          4      Garry     Stone    1952-12-20                    Warner Inc   
...      ...        ...       ...           ...                           ...   
15315  14913    Stephen   Jenkins    1943-08-09  Thornton, Carroll and Newman   
15316  14994    Stephen     Smith    1948-08-20               Robinson-Harris   
15317  15012    Stephen  Losekann    1940-10-09                       Rosenow   
15318  15269    Stephen    Rivera    1952-06-04         Taylor, Fry and Jones   
15319   1249    Stephen    Duncan    1994-03-27    Phillips, Brown and Powell   

                      email

In [3]:
import pandas as pd
import requests

def extract_json_from_s3(s3_address):

    response = requests.get(s3_address)
    data = response.json()
    df = pd.DataFrame(data)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    # print(df)

    pattern = r'^[a-zA-Z0-9]*$'

    # create a boolean mask for missing or random values
    mask = (df['timestamp'].isna()) | (df['timestamp'].astype(str).str.contains(pattern))
    

    # drop the rows with missing or random values
    df = df[~mask]
    # print(df)
    return print(df.year.value_counts())

url = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json'
extract_json_from_s3(url)

1998          4102
2003          4082
2017          4082
2005          4074
2015          4055
1997          4044
2014          4039
2004          4037
2006          4033
2000          4026
2018          4024
1996          4024
2001          4023
1994          4021
2019          4019
1999          4009
2007          4008
2010          4006
2009          4002
2012          3994
2020          3991
2021          3984
2011          3972
2016          3963
2008          3961
1995          3952
2002          3927
1993          3876
2013          3831
2022          3516
1992           446
NULL            15
LND1WX0Y6Z       1
FXC3K5LZZX       1
UDHIYJS2GP       1
OVDJZCARJA       1
L1N4X0SVZA       1
QF6S8TDTEA       1
5RZL03AWR6       1
3GJWN253MM       1
9QDY0WMH6K       1
AQLUVY7DA2       1
FIEOPTN7WZ       1
I5367BRUVN       1
RA8D4CIQOV       1
KO7BGRPOKH       1
EB2N507OZ0       1
9DKC6PW41E       1
O17F6WE1TD       1
0M8BGI0CI3       1
G3DEZY8UW6       1
33F45BZPSP       1
14NRQ80L5E  