# PAWS Data Pipeline
The objective of this script is to create a master data table that links all the PAWS data sources together.
## Pipeline sections
0. Import libraries
1. Create & populate database 
2. Create ***metadata master table*** schema to link all source tables together & populate it with one of the datasets (e.g. Salesforce)
3. For each dataset, merge each record with the ***metadata master table***. If a match is found, link the two sources. If not, create a new record. <br/>
    a. Petpoint<br/>
    b. Volgistics<br/>
    c. Other - TBD<br/>
4. Write the new table to the database

### 0. Import libraries

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import re

### 1. Create & populate database 

In [2]:
# connect to the SQLite database file used by every cell below;
# sqlite3.connect creates the file if it does not exist yet

conn = sqlite3.connect("./sample_data/paws.db")

In [3]:
# function for loading a csv into a database table or "updating" the table by dropping it and recreating it with the csv

def load_to_sqlite(csv_name, table_name, connection, drop_first_col=False):
    """Load a CSV file into a SQLite table, replacing the table if it exists.

    Args:
        csv_name: Path of the CSV file to read (decoded as cp1252).
        table_name: Name of the destination table.
        connection: Open DB-API connection (e.g. from ``sqlite3.connect``).
        drop_first_col: When true, discard the first CSV column before
            loading (useful when it is an unnamed index column).

    Column headers are normalized before loading: lowercased, stripped of
    surrounding whitespace, spaces replaced by ``_`` and each run of
    periods replaced by a single ``_``.
    """
    # load csv into a dataframe
    df = pd.read_csv(csv_name, encoding='cp1252')

    # drop the first column - so far all csvs have had a first column
    # that's an index and doesn't have a name
    if drop_first_col:
        df = df.drop(df.columns[0], axis=1)

    # normalize headers: lowercase, strip whitespace, spaces/periods -> '_'
    df.columns = df.columns.str.lower().str.strip()
    df.columns = df.columns.str.replace(' ', '_')
    df.columns = df.columns.map(lambda x: re.sub(r'\.+', '_', x))

    # Drop any previous version of the table. IF EXISTS makes the very first
    # load (when the table has never been created) succeed instead of raising
    # sqlite3.OperationalError. The identifier is quoted (with embedded
    # quotes doubled) so it names the same table that to_sql creates below.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'
    cursor = connection.cursor()
    try:
        cursor.execute(f'DROP TABLE IF EXISTS {quoted_name}')
        connection.commit()
    finally:
        cursor.close()

    # load dataframe into database table
    df.to_sql(table_name, connection, index=False)

In [4]:
# import the Petpoint export into the 'petpoint' table (first CSV column dropped)

load_to_sqlite(
    csv_name='./sample_data/CfP_PDP_petpoint_deidentified.csv',
    table_name='petpoint',
    connection=conn,
    drop_first_col=True,
)

In [5]:
# import the Volgistics export into the 'volgistics' table (first CSV column dropped)

load_to_sqlite(
    csv_name='./sample_data/CfP_PDP_volgistics_deidentified.csv',
    table_name='volgistics',
    connection=conn,
    drop_first_col=True,
)

In [6]:
# import the Salesforce contacts export into the 'salesforcecontacts' table (first CSV column dropped)

load_to_sqlite(
    csv_name='./sample_data/CfP_PDP_salesforceContacts_deidentified.csv',
    table_name='salesforcecontacts',
    connection=conn,
    drop_first_col=True,
)

In [7]:
# import the Salesforce donations export into the 'salesforcedonations' table (first CSV column dropped)

load_to_sqlite(
    csv_name='./sample_data/CfP_PDP_salesforceDonations_deidentified.csv',
    table_name='salesforcedonations',
    connection=conn,
    drop_first_col=True,
)

  if (await self.run_code(code, result,  async_=asy)):


### 2. Create ***metadata master table*** schema to link all source tables together & populate it with one of the datasets (e.g. Salesforce)

In [8]:
# sanity check: read the whole salesforcecontacts table back and show its last rows
pd.read_sql('select * from salesforcecontacts', conn).tail()

Unnamed: 0,account_name,contact_id,first_name,last_name,title,mailing_street,mailing_city,mailing_state_province,mailing_zip_postal_code,mailing_country,phone,fax,mobile,email,account_owner,account_id
60182,Angelica el-Ashraf Bistro,0033p00002UO8dB,Angelica,el-Ashraf,,1417 Estate,Fontana,Pennsvania,19119-3111,US,,,,pxm@bnygeuzhvo.ewu,PAWS Development,0013p00001pVtVy
60183,Cassondra el-Kamal Household,0033p00002UO8ed,Cassondra,el-Kamal,,2210 S. 14st Street,West Portsmouth,NH,19125-3329,US,,,,ske@ciqgr.ndf,PAWS Development,0013p00001pVtWX
60184,Justin Campbell Bistro,0033p00002UO8oS,Justin,Campbell,,4074 S. 41rd St.,Ocean,Texas,19474-0204,,,,,faxh@lcume.enj,Jared Hupp,0013p00001pVtaP
60185,Aslam Wilson Household,0033p00002UO8q2,Aslam,Wilson,,222 n Columbus blvd,New Haven,BC,17009,US,4146143364.0,,,tpik@wotkn.qwi,PAWS Development,0013p00001pVtaj
60186,Dashawn Patterson Household,0033p00002UO8tB,Dashawn,Patterson,,311,High Bridge,WA,19064-3130,,,,,nobcuvj@blyh.zva,Jared Hupp,0013p00001pVtbS


In [12]:
def create_user_master_df(connection, query):
    """Run *query* and return its result as a normalized pandas dataframe.

    The returned dataframe is the starting point ("master" table) against
    which users from the other datasets are fuzzy-matched. Normalization
    is applied cell by cell:

    * string values are lowercased;
    * ``None`` values (SQL NULLs in text columns) become empty strings, so
      later string operations such as concatenation do not fail;
    * every other value is left untouched.

    Args:
        connection: Open database connection to run the query on.
        query: SQL ``SELECT`` statement producing the key meta-data columns
            (name, address parts, phone, email, ...), typically taken from
            one of the source datasets (e.g. Salesforce).

    Returns:
        A new dataframe with one row per query result row.
    """

    def _normalize(value):
        # None first: it is not a str, and must end up as '' not None
        if value is None:
            return ''
        if isinstance(value, str):
            return value.lower()
        return value

    df = pd.read_sql(query, connection)
    # Series.map per column instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases; a single pass does both steps.
    return df.apply(lambda column: column.map(_normalize))

In [44]:
# create master dataframe using the 'salesforcecontacts' table

table_name = 'salesforcecontacts'
sf_cont_query = f"""SELECT    trim(last_name) || ', ' || trim(first_name) as name 
                              , trim(mailing_street) as street
                              , trim(mailing_city) as city
                              , trim(mailing_state_province) as state_etc 
                              , substr(trim(mailing_zip_postal_code), 1, 5) as zipcode
                              , trim(mailing_country) as country
                              , trim(phone) as phone
                              , trim(mobile) as mobile
                              , trim(email) as email
                    FROM      {table_name}"""

### cleanup still to do in pandas ###
# street and city need to have formatting standardized (eg n vs n. and 19th st vs 19 st)
# some states are written as full names, some as abbreviations

# values come back lowercased, with NULLs already replaced by '' so the
# string concatenation and per-character filtering below cannot hit None
master_df = create_user_master_df(conn, sf_cont_query)

# make a single address column
master_df['address'] = (master_df['street'] + ' ' + master_df['city'] + ' ' + master_df['state_etc'] + ' ' + master_df['zipcode'] + ' ' + master_df['country']).str.strip()
# drop extraneous address columns; .copy() gives an independent frame so the
# column assignments below do not raise pandas' SettingWithCopyWarning
master_df = master_df[['name', 'address', 'phone', 'mobile', 'email']].copy()

# format phone numbers to be just numbers
# NOTE(review): if a phone value arrives float-formatted (the salesforcecontacts
# preview shows e.g. 4146143364.0), this filter keeps the trailing 0 - confirm
for phone_col in ('phone', 'mobile'):
    master_df[phone_col] = master_df[phone_col].apply(
        lambda phone_string: ''.join(c for c in phone_string if c in '1234567890')
    )

# cut down (internal) consecutive whitespaces to one white space (external whitespace has already been stripped)
for text_col in ('name', 'address'):
    master_df[text_col] = master_df[text_col].apply(
        lambda text_string: re.sub(r'\s+', ' ', text_string)
    )

# add empty columns for the datasets that will be merged
master_df['volgistics_id'] = np.nan
master_df['petpoint_id'] = np.nan
master_df['sf_donations_id'] = np.nan

master_df.head(10)

Unnamed: 0,name,address,phone,mobile,email,volgistics_id,petpoint_id,sf_donations_id
0,"kiyota, loren",704 wynnemoor way orinda co 7701 us,,,pzv@b.scf,,,
1,"trujillo, lisa",moore rd,,,,,,
2,"thomas, jade",220 annin st malvern pennsylvania 20009 us,1276261767.0,7147111110.0,mvkbtwogp@rgvqkued.egp,,,
3,"rascon, hannah",150 chestnut st scotch plain in 18640 us,5445554550.0,1413431454.0,xebqfclvop@qfrhgzkuo.xzi,,,
4,"flores, robert",5818 bristol tokyo 19123 us,2352355555.0,,rapwxnko@ltkp.ect,,,
5,"wong, kale",6555 north hartland baden württemberg 60612 us,3355333533.0,,hemqwzu@tgcdy.hdy,,,
6,"tafoya, sean",27 edgewater drive home id 60643 us,1421154224.0,,ajetbxf@nszbimqcdu.mji,,,
7,"donthinani, faadil",404 e redondo ave sagamore hills dc 19146 us,33453343334.0,,frm@apo.yko,,,
8,"frudden, nulong",3313 s quince street reading or 19342 us,5565151110.0,,p@htbanpzj.jhr,,,
9,"hebert, alan",200 n. carroll street natick wa 19145 us,6681118081.0,,myxhgodz@fdolia.bgv,,,


### 3. For each dataset, merge each record with the ***metadata master table***
If a match is found, link the two sources. If not, create a new record. <br/>

In [None]:
def fuzzy_merge(new_df, master_df):
    """
    Merge a new dataset into the metadata master table (NOT IMPLEMENTED YET).

    This function is a placeholder: its body currently consists of this
    docstring only, so calling it does nothing and returns None.

    Intended behavior: go line-by-line through the new dataset and look for
    a match in the existing metadata master dataset. If a match is found,
    add the new dataset's id to the matching master record; if not, create
    a new record in the master dataset.
    
    Pseudo-code:
        LOOP: For each line in the new_df, compare that line against all lines in 
        the master_df. 
        
        LOGIC: For each comparison, generate (a) a fuzzy-match score on name,
        (b) T/F on whether zip-code matches, (c) T/F on whether email matches,
        (d) T/F on whether phone number matches.
        
        OUTPUT: For each comparison if the fuzzy-match score is above a threshold (e.g. >=90%)
        and (b), (c) or (d) matches, consider it a match and add the new dataset 
        id to the existing record. If it doesn't match, create a new record in the
        master dataset.
        
    Note: there's probably a more efficient way to do this (vs. going line-by-line)
    """

#### 3.A Petpoint merge 
Apply the function above to the Petpoint dataset

#### 3.B Volgistics merge
Apply the function above to the Volgistics dataset

#### 3.C Other - TBD - Merge

### 4. Write the new table to the database

In [4]:
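# load_to_sqlite(master_df, master_table, conn)
# NOTE: load_to_sqlite expects a CSV path as its first argument, not a dataframe;
# to persist master_df directly use e.g.
# master_df.to_sql('master', conn, index=False, if_exists='replace')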

## Other - placeholder - graveyard
Graveyard/placeholder code from previous sections

In [None]:
# simple join to check that it worked and the tables can be queried:
# petpoint is joined to volgistics, and to a salesforce contacts+donations
# subquery (paired on Account_ID), all on the "unnamed:_0" column
# NOTE(review): the loader cells in this notebook pass drop_first_col=True,
# which drops the first CSV column - confirm "unnamed:_0" still exists
# in these tables before reusing this query

df = pd.read_sql('''select * from petpoint as pp 
                    join volgistics as vol 
                    on pp."unnamed:_0" = vol."unnamed:_0"

                    join (SELECT * FROM salesforcecontacts AS sf_contacts
                            JOIN salesforcedonations AS sf_donations
                            ON sf_contacts."Account_ID" = sf_donations."Account_ID") as sf
                    on pp."unnamed:_0" = sf."unnamed:_0"
                    
                    ''', conn)

df.head()

In [None]:
# get all data matching on (first name + last name)
# SQLite concatenates strings with ||, while + is numeric addition, so the
# full name is built with || and a single-quoted ' ' string literal
# (double quotes denote identifiers in SQL)

df2 = pd.read_sql('''SELECT * FROM petpoint AS pp
                     INNER JOIN volgistics AS vol ON pp."Intake_Record_Owner" = vol."First_name_Last_name"
                     INNER JOIN (SELECT * FROM salesforcecontacts AS sf_contacts
                            JOIN salesforcedonations AS sf_donations
                            ON sf_contacts."Account_ID" = sf_donations."Account_ID") AS sf
                     ON pp."Intake_Record_Owner" = (sf."First_Name" || ' ' || sf."Last_Name")
                  ''', conn)
df2.head()

In [None]:
# close database connection

conn.close()