In [156]:
from data_extraction import DataExtractor
from database_utils import DatabaseConnector

import pandas as pd
from datetime import datetime
import re



In [157]:
connection = DatabaseConnector()
engine = connection.init_db_engine()

extractor = DataExtractor()
df = extractor.read_rds_table(engine, "legacy_users")

In [158]:
# class DataCleaning():
  
#   def clean_user_data(self, pandas_dataframe):
#     df = pandas_dataframe

#     # df.drop("first_name", axis=1, inplace=True)

#     return cleaned df




Checking for missing/null values in the columns

In [159]:

df.isnull().sum()

index            0
first_name       0
last_name        0
date_of_birth    0
company          0
email_address    0
address          0
country          0
country_code     0
phone_number     0
join_date        0
user_uuid        0
dtype: int64

The above shows that there are no missing or null values in any column.

We can check now that the index is unique.

In [160]:
is_unique = df["index"].is_unique
is_unique

True

In [161]:
is_unique = df["user_uuid"].is_unique
is_unique

False

In [162]:
duplicate_uuids = df[df.duplicated("user_uuid", keep=False)]
duplicate_uuids

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
866,867,,,,,,,,,,,
1022,1023,,,,,,,,,,,
1805,1807,,,,,,,,,,,
2103,2103,,,,,,,,,,,
2437,2439,,,,,,,,,,,
2739,6526,,,,,,,,,,,
2764,2764,,,,,,,,,,,
4984,4987,,,,,,,,,,,
5307,5310,,,,,,,,,,,
6920,6927,,,,,,,,,,,


`pd.Series(range(len(df)))` creates a Series (like a column) of numbers from 0 to length of the dataframe -1. The `.equals()`function compares to see if two objects are the same. Essentially, we're comparing the index column to the range. Returns True if equal.

In [163]:
is_sequential = df['index'].equals(pd.Series(range(len(df))))
is_sequential

False

Given that it's not sequential, but there are `(len(df))` unique, non-null values, it's possible that the index column is just messed up. We can sort the dataframe by the index column, and compare it to the Series again.

In [164]:
sorted_df = df.sort_values(by="index")
sorted_df.reset_index(drop=True, inplace=True)
is_sequential = sorted_df["index"].equals(pd.Series(range(len(df))))
# sorted_df.tail()
is_sequential

True

In [165]:
sorted_df.head()

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579


We can drop the index column, as it's not adding any useful information

In [166]:
sorted_df = sorted_df.drop(columns="index")
sorted_df.head()

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579


In [167]:
sorted_df.dtypes

first_name       object
last_name        object
date_of_birth    object
company          object
email_address    object
address          object
country          object
country_code     object
phone_number     object
join_date        object
user_uuid        object
dtype: object

In [168]:
null_search_id = sorted_df.loc[sorted_df["user_uuid"] == "NULL"]
null_search_id

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
867,,,,,,,,,,,
1023,,,,,,,,,,,
1807,,,,,,,,,,,
2103,,,,,,,,,,,
2439,,,,,,,,,,,
2764,,,,,,,,,,,
4987,,,,,,,,,,,
5310,,,,,,,,,,,
6526,,,,,,,,,,,
6927,,,,,,,,,,,


Let's drop all of the values that match "NULL". We're dropping the rows that match our null_search_id by passing it to the drop function.

In [169]:
filtered_df = sorted_df.drop(null_search_id.index)
filtered_df = filtered_df.reset_index(drop=True)

Test again for uniqueness in the filtered dataframe

In [170]:
unique_test = filtered_df["user_uuid"].is_unique
unique_test

True

Let's look at the date of birth column

In [171]:
unique_dob_values = filtered_df["date_of_birth"].nunique()
unique_dob_values

11359

In [172]:
date_of_birth_series = filtered_df["date_of_birth"]
len(date_of_birth_series)

15299

In [173]:
date_of_birth_series = filtered_df["date_of_birth"]

date_of_birth_regex = r'^\d{4}-\d{2}-\d{2}$'

matches_pattern = date_of_birth_series.str.match(date_of_birth_regex)

match_count = matches_pattern.sum()

match_count

np.int64(15257)

In [174]:
non_matching_dates = date_of_birth_series[~matches_pattern]
non_matching_dates

360       1968 October 16
697            1971/10/23
752            KBTI7FI7Y3
1045           OFH8YGZJWN
1629      January 1951 27
1994     November 1958 11
2991           PQPEUO937L
3063      1946 October 18
3533           7KGJ3C5TSW
3610           1974/06/06
3794           2000/01/06
4201     1979 February 01
4588           2003/09/21
5302           RQTF5XSXP4
5345         June 1943 28
5418     November 1963 06
5527           1998/08/14
6105     February 2005 05
6218         July 1966 08
6418           D2OZZHWOLK
7166           2001/07/28
7257      1948 October 24
8118     December 1946 09
8274           1942/05/29
8387           QTVEU5TR8H
8525           2001/10/23
9015           L3E8OV4UAC
9304           1944/11/30
9935      2005 January 27
10212          TLSTUEIKI0
10245        July 1961 14
10360          YTC82GP4XE
11204        July 1939 16
11367          O1LIA1MT1N
12178          RQI3KQXFBQ
13048     1951 January 14
13115          1IA43NTJFB
13164          1944/10/15
14103       

In [175]:
filtered_df.iloc[5302]

first_name       XKDVQD7BH2
last_name        4YSEX8AY1Z
date_of_birth    RQTF5XSXP4
company          GKJZ58DTHT
email_address    789HWCYC7M
address          KJW5WR7W16
country          XN9NGL5C0B
country_code     NTCGYW8LVC
phone_number     MUXPJD0BZY
join_date        LYVWXBBI6F
user_uuid        LNRNI6X6VL
Name: 5302, dtype: object

Filter all rows that contain a number in the first_name column, and drop them.

In [176]:
numbers_in_name = filtered_df[filtered_df["first_name"].str.contains(r"\d", na=False)]
filtered_df = filtered_df.drop(numbers_in_name.index)
filtered_df.reset_index(drop=True)

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579
...,...,...,...,...,...,...,...,...,...,...,...
15279,Marta,Rogge,1981-03-03,Dehmel,baererklothilde@trubin.com,Ziegertstr. 60\n93330 Stollberg,Germany,DE,(05917) 549662,2000-03-29,8a77629e-7ca1-409f-b22c-c24056bd4eb1
15280,Erna,Hoffmann,1967-10-28,Atzler Seifert AG & Co. KGaA,dunjafischer@vollbrecht.de,Henkallee 186\n33456 Sankt Goarshausen,Germany,DE,+49(0)7384 51073,2018-03-13,5f57209e-8695-4863-b9e7-084a4ba02808
15281,Konstantinos,Thanel,1954-08-05,Fritsch Ehlert GmbH,rpruschke@gotthard.com,Steffi-Rose-Platz 16\n12365 Apolda,Germany,DE,(07856) 050049,2007-07-21,6f16e0ce-9b07-4479-a151-7efdd35408aa
15282,Caroline,Fisher,1975-09-27,Coleman Ltd,wardshaun@miah.org,826 Hollie park\nKhanberg\nM9J 1GP,United Kingdom,GB,(0115) 496 0754,2016-11-26,1a202edd-20aa-4787-b3b3-622fc01a9d08


Let's convert the date_of_birth to a datetime.

In [177]:
filtered_df.dtypes


first_name       object
last_name        object
date_of_birth    object
company          object
email_address    object
address          object
country          object
country_code     object
phone_number     object
join_date        object
user_uuid        object
dtype: object

In [178]:
date_of_birth_series = filtered_df["date_of_birth"]

date_of_birth_regex = r"^\d{4}-\d{2}-\d{2}$"

matches_pattern = date_of_birth_series.str.match(date_of_birth_regex)

match_count = matches_pattern.sum()

non_matching_dates = date_of_birth_series[~matches_pattern].index
non_matching_dates

Index([  360,   697,  1629,  1994,  3063,  3610,  3794,  4201,  4588,  5345,
        5418,  5527,  6105,  6218,  7166,  7257,  8118,  8274,  8525,  9304,
        9935, 10245, 11204, 13048, 13164, 14107, 14549],
      dtype='int64')

There are 27 problematic entries - this corresponds to the number of datetimes that didn't fit the YYYY MM DD format previously. We are going to need to convert these dates before we can move on.

In [179]:
filtered_df.loc[non_matching_dates]

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
360,Margaret,Akhtar,1968 October 16,"Turner, Marshall and Clarke",tcoleman@saunders-blake.org,033 Poole gateway\nLake Sharon\nCW62 6SD,United Kingdom,GB,(0191) 496 0493,1994-11-21,90f1f4cf-aca3-4c23-bc74-cc9f8e58c5ae
697,Scott,Jones,1971/10/23,"Jordan, Brown and Evans",tom73@jones.net,44 Irene plaza\nNew Abbie\nM4H 0NH,United Kingdom,GB,+449098790834,2012-10-08,9ac79783-f8e9-4639-b891-ce6b0d65028a
1629,Gertraut,Thanel,January 1951 27,Scholl GbR,kbaehr@berger.de,Werneckering 9/5\n85998 Stendal,Germany,DE,(03961) 82313,1996-01-06,35ac4c9c-ea19-4cc9-b257-100aa61238ce
1994,Peggy,Gibson,November 1958 11,Davis Group,keithespinoza@hall-stevens.info,"06263 Huynh Unions Apt. 079\nMichaelton, GA 98923",United States,US,135-127-1916,1995-05-07,d74d1885-0db3-40ed-bdfd-0548dac12337
3063,Lioba,Dobes,1946 October 18,Heinrich Stiftung & Co. KG,hthanel@lehmann.de,Rosestr. 6\n97463 Griesbach Rottal,Germany,DE,+49(0)0334 246986,1993-07-22,a272077a-e52a-48e4-a213-c93227f34d84
3610,Luke,Glover,1974/06/06,Jones-Brown,bellterence@booth-rahman.net,Flat 5\nFiona fort\nSouth Jeffreybury\nBB1 1WN,United Kingdom,GB,0115 496 0051,1994-04-25,6ef6d2fb-d48e-4385-87a5-fcc52e813a72
3794,Amber,Stewart,2000/01/06,Bennett Group,pblake@bradley.com,233 Lesley expressway\nOwenfurt\nCO7E 5UP,United Kingdom,GB,+44(0)151 4960261,1995-03-26,4a6261ad-c028-4ec6-bc91-2220fbea4887
4201,Dietlind,Karge,1979 February 01,Schlosser Karge KG,hans-josef54@ortmann.de,Alexandros-Hiller-Allee 9\n90857 Pirmasens,Germany,DE,0649651876,1993-05-23,d391bf54-2cd6-40ff-a564-18d20fab21c7
4588,Ricky,Kemp,2003/09/21,Burgess Ltd,bhopkins@morris.co.uk,475 Lane ports\nElizabethside\nCH1W 0EP,United Kingdom,GB,0161 4960570,2005-02-09,e9bb5031-c176-44ec-8d7c-f608307a2763
5345,Jaroslaw,Zimmer,June 1943 28,Wesack Wulf AG,diedrich85@waehner.net,Friedhold-Müller-Allee 84\n57937 Jena,Germany,DE,+49(0) 881581861,2006-02-06,1f5edb02-ff25-41e0-b5ab-b0c979ebdf9e


In [180]:
def preprocess_dob(date_str):
  formats = ["%Y %B %d", "%Y/%m/%d", "%B %Y %d", "%Y-%m-%d"]
  print(date_str, "Date String")
  for format in formats:
    try:
      return datetime.strptime(date_str, format)
    except Exception as e:
      print(e)
      continue
  return pd.NaT


filtered_df["date_of_birth"] = filtered_df["date_of_birth"].apply(preprocess_dob)

filtered_df.loc[non_matching_dates]

1990-09-30 Date String
time data '1990-09-30' does not match format '%Y %B %d'
time data '1990-09-30' does not match format '%Y/%m/%d'
time data '1990-09-30' does not match format '%B %Y %d'
1940-12-01 Date String
time data '1940-12-01' does not match format '%Y %B %d'
time data '1940-12-01' does not match format '%Y/%m/%d'
time data '1940-12-01' does not match format '%B %Y %d'
1995-08-02 Date String
time data '1995-08-02' does not match format '%Y %B %d'
time data '1995-08-02' does not match format '%Y/%m/%d'
time data '1995-08-02' does not match format '%B %Y %d'
1972-09-23 Date String
time data '1972-09-23' does not match format '%Y %B %d'
time data '1972-09-23' does not match format '%Y/%m/%d'
time data '1972-09-23' does not match format '%B %Y %d'
1952-12-20 Date String
time data '1952-12-20' does not match format '%Y %B %d'
time data '1952-12-20' does not match format '%Y/%m/%d'
time data '1952-12-20' does not match format '%B %Y %d'
1949-08-12 Date String
time data '1949-08-12'

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
360,Margaret,Akhtar,1968-10-16,"Turner, Marshall and Clarke",tcoleman@saunders-blake.org,033 Poole gateway\nLake Sharon\nCW62 6SD,United Kingdom,GB,(0191) 496 0493,1994-11-21,90f1f4cf-aca3-4c23-bc74-cc9f8e58c5ae
697,Scott,Jones,1971-10-23,"Jordan, Brown and Evans",tom73@jones.net,44 Irene plaza\nNew Abbie\nM4H 0NH,United Kingdom,GB,+449098790834,2012-10-08,9ac79783-f8e9-4639-b891-ce6b0d65028a
1629,Gertraut,Thanel,1951-01-27,Scholl GbR,kbaehr@berger.de,Werneckering 9/5\n85998 Stendal,Germany,DE,(03961) 82313,1996-01-06,35ac4c9c-ea19-4cc9-b257-100aa61238ce
1994,Peggy,Gibson,1958-11-11,Davis Group,keithespinoza@hall-stevens.info,"06263 Huynh Unions Apt. 079\nMichaelton, GA 98923",United States,US,135-127-1916,1995-05-07,d74d1885-0db3-40ed-bdfd-0548dac12337
3063,Lioba,Dobes,1946-10-18,Heinrich Stiftung & Co. KG,hthanel@lehmann.de,Rosestr. 6\n97463 Griesbach Rottal,Germany,DE,+49(0)0334 246986,1993-07-22,a272077a-e52a-48e4-a213-c93227f34d84
3610,Luke,Glover,1974-06-06,Jones-Brown,bellterence@booth-rahman.net,Flat 5\nFiona fort\nSouth Jeffreybury\nBB1 1WN,United Kingdom,GB,0115 496 0051,1994-04-25,6ef6d2fb-d48e-4385-87a5-fcc52e813a72
3794,Amber,Stewart,2000-01-06,Bennett Group,pblake@bradley.com,233 Lesley expressway\nOwenfurt\nCO7E 5UP,United Kingdom,GB,+44(0)151 4960261,1995-03-26,4a6261ad-c028-4ec6-bc91-2220fbea4887
4201,Dietlind,Karge,1979-02-01,Schlosser Karge KG,hans-josef54@ortmann.de,Alexandros-Hiller-Allee 9\n90857 Pirmasens,Germany,DE,0649651876,1993-05-23,d391bf54-2cd6-40ff-a564-18d20fab21c7
4588,Ricky,Kemp,2003-09-21,Burgess Ltd,bhopkins@morris.co.uk,475 Lane ports\nElizabethside\nCH1W 0EP,United Kingdom,GB,0161 4960570,2005-02-09,e9bb5031-c176-44ec-8d7c-f608307a2763
5345,Jaroslaw,Zimmer,1943-06-28,Wesack Wulf AG,diedrich85@waehner.net,Friedhold-Müller-Allee 84\n57937 Jena,Germany,DE,+49(0) 881581861,2006-02-06,1f5edb02-ff25-41e0-b5ab-b0c979ebdf9e


In [181]:
filtered_df["date_of_birth"]

Unnamed: 0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579
