In [77]:
import pandas as pd
import re

In [57]:
# Load the raw applications. mobile_no is read as text rather than left to
# dtype inference: an all-numeric column would otherwise be parsed as integers,
# dropping leading zeros and handing non-string values to the digit clean-up.
df = pd.read_csv('inputs/applications_dataset_1.csv', dtype={'mobile_no': str})

In [58]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,name,email,date_of_birth,mobile_no
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429


In [59]:
# format mobile_no: keep only the digits, then require exactly 8 of them.
# Vectorised .str methods replace the per-row lambdas; a missing value stays
# NaN and is flagged as unsuccessful instead of raising TypeError.
# NOTE(review): still assumes mobile_no holds strings, as the original
# re.findall-based version did.
df['mobile_no'] = df['mobile_no'].str.replace(r'\D', '', regex=True)
df['still_successful'] = df['mobile_no'].str.len().eq(8)

In [60]:
# Set aside the applications that failed the mobile number check,
# without the helper flag column.
failed_mobile_mask = ~df['still_successful']
unsuccesful_df = df.loc[failed_mobile_mask].drop(columns=['still_successful'])
unsuccesful_df

Unnamed: 0,name,email,date_of_birth,mobile_no
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429
5,Cathy Werner,Cathy_Werner@martinez.net,2018-09-25,7138041
...,...,...,...,...
1994,Sherry Martin,Sherry_Martin@cross-wright.org,1995/05/04,45471
1995,Robert Li,Robert_Li@brock.com,2018/11/07,38151
1996,Jesse Miller,Jesse_Miller@thompson-owens.biz,1961-03-06,854883
1997,Kevin Jones,Kevin_Jones@moore.org,26-01-1974,122217


In [61]:
# Continue with only the applications that passed the mobile number check;
# the helper flag column is no longer needed.
passed_mobile_mask = df['still_successful']
df = df.loc[passed_mobile_mask].drop(columns=['still_successful'])
df

Unnamed: 0,name,email,date_of_birth,mobile_no
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711
11,Scott Lee,Scott_Lee@phillips.com,2010-07-12,52145751
20,Patty Smith,Patty_Smith@ross.com,27-08-1975,59428759
25,Travis Rice,Travis_Rice@bradley.net,05/08/2006,36316303
27,Sarah Jacobson,Sarah_Jacobson@mclean-jacobs.info,03/09/1958,61071779
...,...,...,...,...
1965,Jody Reed,Jody_Reed@winters-taylor.biz,2015/12/30,27588167
1972,Susan Donovan,Susan_Donovan@jordan-anderson.com,30-01-1969,85222335
1974,Michael Serrano,Michael_Serrano@turner-maldonado.com,07/11/1982,12792543
1976,Michael Powell,Michael_Powell@mccall.com,1994/11/27,63894311


In [49]:
# format email: check for [local part]@[emailprovider].[com/net]
# One anchored pattern always yields exactly four helper columns, so an address
# with an extra '@' or '.' (or an empty frame) can no longer make the column
# assignment fail with a length mismatch, as chained str.split(expand=True) could.
# The text after the last '.' is treated as the top-level domain. The local part
# and the provider only have to be non-empty; they are not checked for being
# alphanumeric. Addresses that do not match leave the helper columns as NaN and
# are therefore flagged as unsuccessful.
email_pattern = (
  r'^(?P<first_email_part>[^@]+)@'
  r'(?P<second_email_part>(?P<emailprovider>[^@]+)\.(?P<tld>[^@.]+))$'
)
email_parts = df['email'].str.extract(email_pattern)
# Columns are assigned positionally, in the order of the pattern's groups.
df[['first_email_part', 'second_email_part', 'emailprovider', 'com/net']] = email_parts
df['still_successful'] = df['com/net'].isin(['com', 'net'])

In [51]:
# Helper columns created by the e-mail check; dropped again once rows are sorted.
working_cols = [
  'first_email_part',
  'second_email_part',
  'emailprovider',
  'com/net',
  'still_successful',
]

In [54]:
# Append the applications that failed the e-mail check to the rejected ones.
# Note: re-running this cell appends the same rows again.
failed_email_rows = df.loc[~df['still_successful']].drop(columns=working_cols)
unsuccesful_df = pd.concat([unsuccesful_df, failed_email_rows])
unsuccesful_df

Unnamed: 0,name,email,date_of_birth,mobile_no
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429
5,Cathy Werner,Cathy_Werner@martinez.net,2018-09-25,7138041
...,...,...,...,...
1935,Henry Johnson Jr.,Henry_Johnson@horton.org,2012-05-15,54804439
1941,Joe Moody,Joe_Moody@johnson-watts.biz,1999-04-05,97781031
1950,Robert Cabrera,Robert_Cabrera@kim.org,1995-07-06,99768879
1958,Stephen Campos,Stephen_Campos@lam.biz,2005/01/23,19488863


In [55]:
# Continue with only the applications that passed the e-mail check,
# dropping the helper columns.
passed_email_mask = df['still_successful']
df = df.loc[passed_email_mask].drop(columns=working_cols)
df

Unnamed: 0,name,email,date_of_birth,mobile_no
11,Scott Lee,Scott_Lee@phillips.com,2010-07-12,52145751
20,Patty Smith,Patty_Smith@ross.com,27-08-1975,59428759
25,Travis Rice,Travis_Rice@bradley.net,05/08/2006,36316303
29,Sean Wang DDS,Sean_Wang@gibson-calderon.com,1960-03-11,25595367
30,Richard Estrada,Richard_Estrada@malone.com,1992/10/15,22821527
...,...,...,...,...
1962,Daniel Armstrong,Daniel_Armstrong@brown.com,1982/07/14,18991247
1972,Susan Donovan,Susan_Donovan@jordan-anderson.com,30-01-1969,85222335
1974,Michael Serrano,Michael_Serrano@turner-maldonado.com,07/11/1982,12792543
1976,Michael Powell,Michael_Powell@mccall.com,1994/11/27,63894311


In [None]:
# format name: trim surrounding whitespace, then count the space-separated words
# (a first name and a last name separated by one space are expected).
# A plain two-column split is not used because names with more than two words
# would not fit into two columns.
df['name'] = df['name'].str.strip()
# n single spaces separate n + 1 pieces, which equals len(name.split(' ')).
df['words_in_name'] = df['name'].map(lambda full_name: full_name.count(' ') + 1)

In [84]:
# Honorifics that may precede a name (compared lower-cased, without trailing '.' or ',').
_NAME_TITLES = frozenset({'mr', 'mrs', 'ms', 'miss', 'mx', 'dr', 'prof', 'sir', 'madam'})
# Generational and professional suffixes that may follow a name.
_NAME_SUFFIXES = frozenset({'jr', 'sr', 'ii', 'iii', 'iv', 'md', 'dds', 'dvm', 'phd'})


def clean_name(name):
  """Split a full name into ``(first_name, last_name)``.

  A name made of exactly two words is returned unchanged as a pair. For longer
  names, leading titles (e.g. ``Dr.``, ``Miss``) and trailing suffixes (e.g.
  ``MD``, ``Jr.``, ``II``) are removed first; the name is accepted only if
  exactly two words remain.

  Args:
    name: The full name. Any run of whitespace separates words.

  Returns:
    A ``(first_name, last_name)`` tuple of strings, or ``(None, None)`` when
    the input is not a string or cannot be reduced to exactly two words.
  """
  # Missing values reach this function as NaN/None rather than str.
  if not isinstance(name, str):
    return None, None

  words = name.split()

  if len(words) > 2:
    def _normalise(word):
      return word.rstrip('.,').lower()

    # Previously this branch fell through and the return raised
    # UnboundLocalError; strip common salutations and suffixes instead.
    while words and _normalise(words[0]) in _NAME_TITLES:
      words = words[1:]
    while words and _normalise(words[-1]) in _NAME_SUFFIXES:
      words = words[:-1]

  if len(words) == 2:
    first_name, last_name = words
    return first_name, last_name

  return None, None

In [85]:
# A single-word name cannot be split into first and last name.
clean_name(name='Sean')

(None, None)

In [78]:
# Show the names made of more than two words.
df.loc[df['words_in_name'].gt(2)]

Unnamed: 0,name,email,date_of_birth,mobile_no,words_in_name
29,Sean Wang DDS,Sean_Wang@gibson-calderon.com,1960-03-11,25595367,3
93,Arthur Hall MD,Arthur_Hall@gonzalez.com,03-11-1994,65493407,3
150,Dr. Samuel Thompson,Samuel_Thompson@sanchez-carroll.com,03-04-1984,99063183,3
568,Rebecca Thompson DDS,Rebecca_Thompson@ferrell.biz,20-07-1998,98761031,3
645,Mr. Daniel Smith,Daniel_Smith@cummings.com,1986-03-08,26613823,3
758,Jennifer Martin PhD,Jennifer_Martin@mack.com,2010/01/06,41092831,3
932,Arthur Gibson II,Arthur_Gibson@thompson-thompson.com,03-06-1961,75411807,3
942,Miss Katherine Brennan MD,Katherine_Brennan@garcia.net,11/13/1977,37918255,4
948,Charles Mendoza DVM,Charles_Mendoza@watts.org,1952-11-09,56858727,3
1035,Mr. Bryan Porter,Bryan_Porter@elliott.com,1957/02/14,88089671,3


In [50]:
# Inspect df together with the intermediate e-mail helper columns.
# Per the execution counts this cell ran right after the e-mail split,
# before the helper columns were dropped.
df

Unnamed: 0,name,email,date_of_birth,mobile_no,first_email_part,second_email_part,emailprovider,com/net,still_successful
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,William_Dixon,woodward-fuller.biz,woodward-fuller,biz,False
11,Scott Lee,Scott_Lee@phillips.com,2010-07-12,52145751,Scott_Lee,phillips.com,phillips,com,True
20,Patty Smith,Patty_Smith@ross.com,27-08-1975,59428759,Patty_Smith,ross.com,ross,com,True
25,Travis Rice,Travis_Rice@bradley.net,05/08/2006,36316303,Travis_Rice,bradley.net,bradley,net,True
27,Sarah Jacobson,Sarah_Jacobson@mclean-jacobs.info,03/09/1958,61071779,Sarah_Jacobson,mclean-jacobs.info,mclean-jacobs,info,False
...,...,...,...,...,...,...,...,...,...
1965,Jody Reed,Jody_Reed@winters-taylor.biz,2015/12/30,27588167,Jody_Reed,winters-taylor.biz,winters-taylor,biz,False
1972,Susan Donovan,Susan_Donovan@jordan-anderson.com,30-01-1969,85222335,Susan_Donovan,jordan-anderson.com,jordan-anderson,com,True
1974,Michael Serrano,Michael_Serrano@turner-maldonado.com,07/11/1982,12792543,Michael_Serrano,turner-maldonado.com,turner-maldonado,com,True
1976,Michael Powell,Michael_Powell@mccall.com,1994/11/27,63894311,Michael_Powell,mccall.com,mccall,com,True


In [22]:
# Parse date_of_birth with one explicit format per layout instead of
# format='mixed'. With the default dayfirst=False, 'mixed' reads an ambiguous
# DD-MM-YYYY value such as 03-11-1994 month-first, while the data also holds
# MM/DD/YYYY values, so no single dayfirst setting is right for every row.
# NOTE(review): the layouts below are the ones visible in the sample rows
# (YYYY-MM-DD, YYYY/MM/DD, MM/DD/YYYY, DD-MM-YYYY) - confirm against the full file.
# Values in any other layout come back as NaT instead of raising.
dob_formats = ('%Y-%m-%d', '%Y/%m/%d', '%m/%d/%Y', '%d-%m-%Y')
dob_text = df['date_of_birth'].astype(str).str.strip()
dob_parsed = pd.to_datetime(dob_text, format=dob_formats[0], errors='coerce')
for dob_format in dob_formats[1:]:
  # Only rows that no earlier format could parse are filled in.
  dob_parsed = dob_parsed.fillna(pd.to_datetime(dob_text, format=dob_format, errors='coerce'))
dob_parsed

0     1986-01-10
1     2010-07-12
2     1975-08-27
3     2006-05-08
4     1958-03-09
         ...    
491   2015-12-30
492   1969-01-30
493   1982-07-11
494   1994-11-27
495   1961-05-08
Name: date_of_birth, Length: 496, dtype: datetime64[ns]