# PAWS Data Pipeline
The objective of this script is to create a master data table that links all the PAWS datasources together.
## Pipeline sections
0. Import libraries
1. Create & populate database 
2. Create ***metadata master table*** schema to link all source tables together & populate with one of the dataset (e.g. SalesForce)
3. For each dataset, merge each record with the ***metadata master table***. If a match is found, link the two sources. If not, create a new record. <br/>
    a. Petpoint<br/>
    b. Volgistics<br/>
    c. Other - TBD<br/>
4. Write the new table to the database

### 0. Import libraries

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import re

### 1. Create & populate database 

In [2]:
# connect to or create database

conn = sqlite3.connect("./sample_data/paws.db")

In [3]:
# function for loading a csv into a database table or "updating" the table by dropping it and recreating it with the csv

def load_to_sqlite(csv_name, table_name, connection, drop_first_col=False):
    
    # load csv into a dataframe
    df = pd.read_csv(csv_name, encoding='cp1252')
    
    # drop the first column - so far all csvs have had a first column that's an index and doesn't have a name
    if drop_first_col:
        df = df.drop(df.columns[0], axis=1)
    
    # strip whitespace and periods from headers, convert to lowercase
    df.columns = df.columns.str.lower().str.strip()
    df.columns = df.columns.str.replace(' ', '_')
    df.columns = df.columns.map(lambda x: re.sub(r'\.+', '_', x))
    
    # create a cursor object, and use it to drop the table if it exists
    cursor = connection.cursor()
    cursor.execute(f'DROP TABLE {table_name}')
    connection.commit()
    cursor.close()
    
    # load dataframe into database table
    df.to_sql(table_name, connection, index=False,)

In [4]:
# load petpoint

load_to_sqlite('./sample_data/CfP_PDP_petpoint_deidentified.csv', 'petpoint', conn, True)

In [5]:
# load volgistics

load_to_sqlite('./sample_data/CfP_PDP_volgistics_deidentified.csv', 'volgistics', conn, True)

In [6]:
# load salesforce contacts

load_to_sqlite('./sample_data/CfP_PDP_salesforceContacts_deidentified.csv', 'salesforcecontacts', conn, True)

In [7]:
# load salesforce donations

load_to_sqlite('./sample_data/CfP_PDP_salesforceDonations_deidentified.csv', 'salesforcedonations', conn, True)

  if (await self.run_code(code, result,  async_=asy)):


### 2. Create ***metadata master table*** schema to link all source tables together & populate with one of the dataset (e.g. SalesForce)

In [6]:
pd.read_sql('select * from salesforcecontacts', conn).tail()

Unnamed: 0,account_name,contact_id,first_name,last_name,title,mailing_street,mailing_city,mailing_state_province,mailing_zip_postal_code,mailing_country,phone,fax,mobile,email,account_owner,account_id
60182,Angelica el-Ashraf Bistro,0033p00002UO8dB,Angelica,el-Ashraf,,1417 Estate,Fontana,Pennsvania,19119-3111,US,,,,pxm@bnygeuzhvo.ewu,PAWS Development,0013p00001pVtVy
60183,Cassondra el-Kamal Household,0033p00002UO8ed,Cassondra,el-Kamal,,2210 S. 14st Street,West Portsmouth,NH,19125-3329,US,,,,ske@ciqgr.ndf,PAWS Development,0013p00001pVtWX
60184,Justin Campbell Bistro,0033p00002UO8oS,Justin,Campbell,,4074 S. 41rd St.,Ocean,Texas,19474-0204,,,,,faxh@lcume.enj,Jared Hupp,0013p00001pVtaP
60185,Aslam Wilson Household,0033p00002UO8q2,Aslam,Wilson,,222 n Columbus blvd,New Haven,BC,17009,US,4146143364.0,,,tpik@wotkn.qwi,PAWS Development,0013p00001pVtaj
60186,Dashawn Patterson Household,0033p00002UO8tB,Dashawn,Patterson,,311,High Bridge,WA,19064-3130,,,,,nobcuvj@blyh.zva,Jared Hupp,0013p00001pVtbS


In [7]:
pd.read_sql('select * from volgistics', conn).tail()

Unnamed: 0,last_name_first_name,first_name_last_name,title_first_name_last_name,last_name,first_name,middle_name,title,nickname,status,type,...,spare_checkbox_5,spare_checkbox_6,volunteer_distribution_list,general_volunteer_emails,schedule_reminders,my_availability_is,from,to,i_would_like_to_serve_up_to,hours
1237,"Lo, Max",Max Lo,Ms. Max Lo,Lo,Max,,Ms.,,Active,,...,,,Yes,,Yes,,,,0,
1238,"Johnson, Jessica",Jessica Johnson,Jessica Johnson,Johnson,Jessica,,,,Active,,...,,,Yes,,Yes,,,,0,
1239,"Williams, Bryce",Bryce Williams,Bryce Williams,Williams,Bryce,,,They/them pronouns please,Active,,...,,,Yes,,Yes,,,,0,
1240,"Turner, Kaelyn",Kaelyn Turner,Ms. Kaelyn Turner,Turner,Kaelyn,,Ms.,,Active,,...,,,Yes,,Yes,,,,0,
1241,"el-Majeed, Carolina",Carolina el-Majeed,Ms. Carolina el-Majeed,el-Majeed,Carolina,,Ms.,,Active,,...,,,Yes,,Yes,,,,0,


In [22]:
def clean_entry(entry):
    """
    Function to clean up all values returned from the SQL statement, so this 
    should be performed on every entry in the dataframe with an applymap
    
    1 Change 'None' or 'NaN' value to an empty string
    2 Cast value as string
    3 Lowercase value
    3 Strip leading and trailing white space
    4 Remove punctuation by only keeping letters, numbers and white space
    5 Replace internal multiple consecutive white spaces with a single white space
    """
    
    # convert None and NaN to an empty string
    if entry ==  None or entry == np.nan:
        entry = ''
    
    # convert to string, lowercase, and strip leading and trailing whitespace
    entry = str(entry).lower().strip()
    
    # remove all non alphanumeric characters except white space
    alphanumeric_and_space = ' 1234567890abcdefghijklmnopqrstuvwxyz'
    entry = ''.join([c for c in entry if c in alphanumeric_and_space])
    
    # cut down (internal) consecutive whitespaces to one white space
    entry = re.sub(r'\s+', ' ', entry)
    
    return entry

In [23]:
def create_user_master_df(connection, query, *addl_columns):
    """
    Creates a pandas dataframe placeholder with key meta-data to fuzzy-match
    the users from different datasets.
    
    Pseudo-code:
        Create a blank pandas dataframe (e.g. pd.DataFrame) with columns for
        Name (last, first), address, zip code, phone number, email, etc.
        
        Include "ID" fields for each of the datasets that will be merged.
        
        Populate/Initialize the dataframe with data from one of the datasets
        (e.g. Salesforce)
    """
    
    # pull the dataframe from SQL database, call cleaning function, 
    # and add empty columns for the datasets that will be merged
    df = pd.read_sql(query, connection)
    df = df.applymap(clean_entry)
    
    for col_name in addl_columns:
        df[col_name] = np.nan
    
    return df

In [None]:
def standardize_states(state,  min_score=.8):
    """
    Taking a state or territory's name as its argument, this function returns 
    the 2 letter postal abbreviation. Since the data is human input and 
    often misspelled, it relies on a fuzzy match based on the Levenshtein 
    Distance. 
    
    If the fuzzy match score is above a minimum (defaulting to 80%) it 
    selects the top match, otherwise it returns a blank.
    """
    
    state_abbr_dict = {'alabama': 'al',
                     'alaska': 'ak',
                     'arizona': 'az',
                     'arkansas': 'ar',
                     'california': 'ca',
                     'colorado': 'co',
                     'connecticut': 'ct',
                     'delaware': 'de',
                     'florida': 'fl',
                     'georgia': 'ga',
                     'hawaii': 'hi',
                     'idaho': 'id',
                     'illinois': 'il',
                     'indiana': 'in',
                     'iowa': 'ia',
                     'kansas': 'ks',
                     'kentucky': 'ky',
                     'louisiana': 'la',
                     'maine': 'me',
                     'maryland': 'md',
                     'massachusetts': 'ma',
                     'michigan': 'mi',
                     'minnesota': 'mn',
                     'mississippi': 'ms',
                     'missouri': 'mo',
                     'montana': 'mt',
                     'nebraska': 'ne',
                     'nevada': 'nv',
                     'new hampshire': 'nh',
                     'new jersey': 'nj',
                     'new mexico': 'nm',
                     'new york': 'ny',
                     'north carolina': 'nc',
                     'north dakota': 'nd',
                     'ohio': 'oh',
                     'oklahoma': 'ok',
                     'oregon': 'or',
                     'pennsylvania': 'pa',
                     'rhode island': 'ri',
                     'south carolina': 'sc',
                     'south dakota': 'sd',
                     'tennessee': 'tn',
                     'texas': 'tx',
                     'utah': 'ut',
                     'vermont': 'vt',
                     'virginia': 'va',
                     'washington': 'wa',
                     'west virginia': 'wv',
                     'wisconsin': 'wi',
                     'wyoming': 'wy',
                     'american samoa': 'as',
                     'district of columbia': 'dc',
                     'washington dc': 'dc',
                     'washington district of columbia': 'dc',
                     'federated states of micronesia': 'fm',
                     'guam': 'gu',
                     'marshall islands': 'mh',
                     'northern mariana islands': 'mp',
                     'palau': 'pw',
                     'puerto rico': 'pr',
                     'virgin islands': 'vi'}

In [26]:
# create master dataframe using the 'salesforcecontacts' table

sf_cont_query = """SELECT    last_name
                             , first_name 
                             , mailing_street as street
                             , mailing_city as city
                             , mailing_state_province as state_etc 
                             , mailing_zip_postal_code as zipcode
                             , mailing_country as country
                             , phone
                             , mobile
                             , email
                    FROM     salesforcecontacts"""

### cleanup still to do in pandas ###
# street needs to have formatting standardized (eg 19th st vs 19 st, n vs north)- probably want to always go with the shorter version. this will be onerous, but maybe there's a library on github for this. Some of this will just be stripping to letters and numbers, then looking for names like south, avenue, apartment etec and making them the abbreviation- there must be a list of these things, actually Jonathan might have given me code with that list
# some states are written as full names, some as abbreviations- this won't be so bad, make everything 2 letters, have all fifty states, get a distance score or percentage, take the top one if it's above a cutoff, otherwise leave it blank

master_df = create_user_master_df(conn, sf_cont_query, 'volgistics_id', 'petpoint_id', 'sf_donations_id')

# combine last and first names to make a single name column
master_df['name'] = master_df['last_name'] + ', ' + master_df['first_name']

# standardize state and territory names to their 2 letter postal abbreviation
master_df['state_etc'] = master_df['state_etc'].apply(standardize_states)

# make a single address column
master_df['address'] = (master_df['street'] + ' ' + master_df['city'] + ' ' + master_df['state_etc'] + ' ' + master_df['zipcode'] + ' ' + master_df['country']).str.strip()
# drop extraneous address columns
master_df = master_df[['name', 'address', 'phone', 'mobile', 'email', 'volgistics_id', 'petpoint_id', 'sf_donations_id']]

master_df.head(10)

Unnamed: 0,name,address,phone,mobile,email,volgistics_id,petpoint_id,sf_donations_id
0,"kiyota, loren",704 wynnemoor way orinda co 7701 us,,,pzvbscf,,,
1,"trujillo, lisa",moore rd,,,,,,
2,"thomas, jade",220 annin st malvern pennsylvania 20009 us,1276261767,714 7111110,mvkbtwogprgvqkuedegp,,,
3,"rascon, hannah",150 chestnut st scotch plain in 186403525 us,544 5554550,141 3431454,xebqfclvopqfrhgzkuoxzi,,,
4,"flores, robert",5818 bristol tokyo 191232316 us,2352355555,,rapwxnkoltkpect,,,
5,"wong, kale",6555 north hartland baden wrttemberg 60612 us,3355333533,,hemqwzutgcdyhdy,,,
6,"tafoya, sean",27 edgewater drive home id 60643 us,1421154224,,ajetbxfnszbimqcdumji,,,
7,"donthinani, faadil",404 e redondo ave sagamore hills dc 191461048 us,33453343334,,frmapoyko,,,
8,"frudden, nulong",3313 s quince street reading or 193421415 us,5565151110,,phtbanpzjjhr,,,
9,"hebert, alan",200 n carroll street natick wa 191451655 us,668 1118081,,myxhgodzfdoliabgv,,,


### 3. For each dataset, merge each record with the ***metadata master table***
If a match is found, link the two sources. If not, create a new record. <br/>

In [None]:
def fuzzy_merge(new_df, master_df):
    """
    This function merges each new dataset with the metadata master table by
    going line-by-line on the new dataset and looking for a match in the 
    existing metadata master dataset. If a match is found
    
    Pseudo-code:
        LOOP: For each line in the new_df, compare that line against all lines in 
        the master_df. 
        
        LOGIC: For each comparison, generate (a) a fuzzy-match score on name,
        (b) T/F on whether zip-code matches, (c) T/F on whether email matches,
        (d) T/F on whether phone number matches.
        
        OUTPUT: For each comparison if the fuzzy-match score is above a threshold (e.g. >=90%)
        and (b), (c) or (d) matches, consider it a match and add the new dataset 
        id to the existing record. If it doesn't match, create a new record in the
        master dataset.
        
    Note: there's probably a more efficient way to do this (vs. going line-by-line)
    """

#### 3.A Petpoint merge 
Apply function above the Petpoint dataset

#### 3.B Volgistics merge
Apply function above the Volgistics dataset

#### 3.C Other - TBD - Merge

### 4. Write the new table to the database

In [4]:
# load_to_sqlite(master_df, master_table, conn)

## Other - placeholder - graveyard
Graveyard/placeholder code from previous sections

In [None]:
# simple join to check that it worked and the tables can be queried

df = pd.read_sql('''select * from petpoint as pp 
                    join volgistics as vol 
                    on pp."unnamed:_0" = vol."unnamed:_0"

                    join (SELECT * FROM salesforcecontacts AS sf_contacts
                            JOIN salesforcedonations AS sf_donations
                            ON sf_contacts."Account_ID" = sf_donations."Account_ID") as sf
                    on pp."unnamed:_0" = sf."unnamed:_0"
                    
                    ''', conn)

df.head()

In [None]:
# get all data matching on (first name + last name)

df2 = pd.read_sql('''SELECT * FROM petpoint AS pp
                     INNER JOIN volgistics AS vol ON pp."Intake_Record_Owner" = vol."First_name_Last_name"
                     INNER JOIN (SELECT * FROM salesforcecontacts AS sf_contacts
                            JOIN salesforcedonations AS sf_donations
                            ON sf_contacts."Account_ID" = sf_donations."Account_ID") AS sf
                     ON pp."Intake_Record_Owner" = (sf."First_Name" + " " + sf."Last_Name")
                  ''', conn)
df2.head()

In [None]:
# close database connection

conn.close()