In [5]:
import pandas as pd
import re

In [52]:
def add_source(df, source):
    """Return a copy of ``df`` with 0/1 indicator columns naming the source lab.

    Three integer columns are appended, in this order: 'ana360', 'psilabs',
    'sclabs'. A column holds 1 in every row when ``source`` equals that
    column's name and 0 otherwise, so an unrecognised ``source`` yields
    all zeros. The input frame is not modified.
    """
    lab_labels = ('ana360', 'psilabs', 'sclabs')
    tagged = df.copy()
    for lab in lab_labels:
        # Scalar assignment broadcasts the flag to every row.
        tagged[lab] = int(source == lab)
    return tagged

In [58]:
def drop_columns(df):
    """Return ``df`` without a fixed set of metadata columns.

    Always removed: 'Test Result UID', 'Sample Type', 'Receipt Time',
    'Test Time' and 'Provider'. 'Post Time' is removed as well, but only
    when the frame actually has it. The original author describes these as
    the columns (other than the name) that hold no averageable numeric values.

    Raises KeyError (from ``DataFrame.drop``) if any always-removed column
    is absent. The input frame is not modified.
    """
    unwanted = ['Test Result UID', 'Sample Type', 'Receipt Time', 'Test Time', 'Provider']
    optional = 'Post Time'
    # Only queue the optional column when present, so its absence is not an error.
    if optional in df.columns:
        unwanted.append(optional)
    return df.drop(unwanted, axis=1)

In [7]:
def conform_names(name):
    """Normalise a strain name to a lower-case, hyphen-separated slug.

    Steps, in order:
      1. lower-case the string;
      2. delete any span that opens with one of ``[ ( { <`` and closes with
         the nearest of ``] ) } >`` (non-greedy, opener and closer need not match);
      3. delete every character that is not a-z, 0-9, space, hyphen or
         underscore (quote characters are removed here; the quoted text stays);
      4. treat hyphens and underscores as spaces;
      5. collapse runs of spaces and join the remaining words with '-'.

    ``name`` must be a string; an empty string yields an empty string.
    """
    lowered = name.lower()

    # Remove bracketed asides such as "(indoor)" or "[batch 3]".
    without_asides = re.sub(r'[\[({<].*?[\])}>]', '', lowered)

    # Keep only the characters allowed in a slug, plus the separators.
    allowed_only = re.sub(r'[^a-z0-9 _-]', '', without_asides)

    # Unify all separators to spaces, then split into words.
    words = re.sub(r'[_-]', ' ', allowed_only).split()

    return '-'.join(words)
    
    

In [None]:
def process_dataframes(data_dict):
    """
    Process each lab's dataframe and append them into one dataframe.

    data_dict maps a source label (the string add_source compares against,
    e.g. 'ana360', 'psilabs', 'sclabs') to that lab's raw dataframe.

    Each frame is tagged with source indicator columns, stripped of the
    columns drop_columns removes, has its 'Sample Name' values standardized
    with conform_names, and has 'Sample Name' renamed to 'strain'. The
    processed frames are then stacked row-wise, in dict order, with a fresh
    0..n-1 index. Columns missing from a given frame are filled with NaN.

    Returns the combined dataframe, or an empty dataframe when data_dict
    is empty. The input frames are not modified.
    """
    processed = []
    for source, dataframe in data_dict.items():
        dataframe = add_source(dataframe, source)
        dataframe = drop_columns(dataframe)
        dataframe['Sample Name'] = dataframe['Sample Name'].apply(conform_names)
        # rename() returns a new frame rather than mutating, so keep the result.
        dataframe = dataframe.rename(columns={'Sample Name': 'strain'})
        processed.append(dataframe)

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not processed:
        return pd.DataFrame()

    # sort=False keeps columns in first-seen order instead of alphabetizing them.
    return pd.concat(processed, ignore_index=True, sort=False)
        
    
    

### Create dataframes and combine them into a single dataframe

In [9]:
# Read the analytical360 results CSV (path is relative to the working directory).
ana360 = pd.read_csv('./web_scrapers/analytical360/results.csv')

In [10]:
# Read the psilabs results CSV (path is relative to the working directory).
psilabs = pd.read_csv('./web_scrapers/psilabs/results.csv')

In [11]:
# Read the sclabs results CSV (path is relative to the working directory).
sclabs = pd.read_csv('./web_scrapers/sclabs/results.csv')

In [15]:
# Key each raw frame by the source label that add_source compares against
# ('ana360', 'psilabs', 'sclabs'), then hand the mapping to process_dataframes.
dataframe_dict = {'ana360': ana360, 'psilabs': psilabs, 'sclabs': sclabs}
df = process_dataframes(dataframe_dict)

ana360 <class 'str'>
psilabs <class 'str'>
sclabs <class 'str'>


In [20]:
# Print the shape of df, then display its first rows as the cell output.
print(df.shape)
df.head()

(47, 46, 46, 47, 46, 46)