In [1]:
import pandas as pd
import re

In [2]:
def add_source(df, source):
    """Return a copy of ``df`` with one 0/1 indicator column per lab.

    The columns ``ana360``, ``psilabs`` and ``sclabs`` are added (or
    overwritten if already present). The column whose name equals
    ``source`` is filled with 1 and the others with 0; if ``source``
    matches none of them, all three are 0. The input frame is not modified.
    """
    labelled = df.copy()
    for lab in ('ana360', 'psilabs', 'sclabs'):
        # int(bool) yields 1 for the matching lab and 0 otherwise
        labelled[lab] = int(source == lab)
    return labelled

In [3]:
def drop_columns(df):
    """Return ``df`` without its identifier, timestamp and provider columns.

    These columns hold no numeric values that can be averaged.
    ``'Test Result UID'``, ``'Receipt Time'``, ``'Test Time'`` and
    ``'Provider'`` are always dropped, so ``DataFrame.drop`` raises
    ``KeyError`` if any of them is absent. ``'Post Time'`` is dropped only
    when the frame has it.
    """
    unwanted = ['Test Result UID', 'Receipt Time', 'Test Time', 'Provider']
    # 'Post Time' is optional, so only request it when it exists
    if 'Post Time' in df.columns:
        unwanted.append('Post Time')
    return df.drop(columns=unwanted)

In [4]:
def conform_names(name):
    """Standardize a strain name so it can be matched across data sets.

    The name is lower-cased; any text enclosed in brackets, parentheses,
    braces or angle brackets is removed; every character other than
    ``a-z``, ``0-9``, space, hyphen or underscore is dropped; and the
    remaining words are joined with single hyphens. For example,
    ``'Blue Dream (Indoor) #3'`` becomes ``'blue-dream-3'``.

    Parameters
    ----------
    name : str
        Raw sample name. Non-string values (e.g. ``None`` or the NaN that
        pandas produces for an empty CSV cell) are accepted.

    Returns
    -------
    str
        The normalized name. Non-string input is returned unchanged so a
        missing name stays missing instead of raising ``AttributeError``.
    """
    # Missing values have no .lower(); pass them through untouched
    if not isinstance(name, str):
        return name

    # change any upper case letters to lower case
    name = name.lower()

    # strip out text enclosed in brackets, parentheses, braces, or angle
    # brackets (non-greedy, so each enclosed span is removed separately)
    brackets = r'[\[({<].*?[\])}>]'
    name = re.sub(brackets, '', name)

    # remove characters that are not letters, numbers, spaces, hyphens, or
    # underscores; this drops quote characters but keeps the text inside them
    drop = r'[^a-z0-9 _-]'
    name = re.sub(drop, '', name)

    # replace hyphens and underscores with spaces
    name = re.sub(r'[_-]', ' ', name)

    # split on whitespace (collapsing runs) and rejoin with hyphens
    name = '-'.join(name.split())

    return name
    
    

In [5]:
def process_dataframe(dataframe, source):
    """Prepare one lab's raw results frame for combination with the others.

    Steps, in order: add the lab indicator columns via ``add_source``,
    remove unwanted columns via ``drop_columns``, normalize every value in
    ``'Sample Name'`` with ``conform_names``, and rename that column to
    ``'strain'``. Returns the resulting frame.
    """
    return (
        dataframe
        .pipe(add_source, source)
        .pipe(drop_columns)
        # overwrite 'Sample Name' in place (same column position) with the normalized names
        .assign(**{'Sample Name': lambda frame: frame['Sample Name'].apply(conform_names)})
        .rename(columns={'Sample Name': 'strain'})
    )

### Create dataframes and process them into a single dataframe

In [6]:
# Load the scraped analytical360 results and run them through the shared processing step
ana360 = (
    pd.read_csv('./web_scrapers/analytical360/results.csv')
    .pipe(process_dataframe, 'ana360')
)

In [7]:
# Load the scraped psilabs results and run them through the shared processing step
psilabs = (
    pd.read_csv('./web_scrapers/psilabs/results.csv')
    .pipe(process_dataframe, 'psilabs')
)

In [8]:
# Load the scraped sclabs results and run them through the shared processing step
sclabs = (
    pd.read_csv('./web_scrapers/sclabs/results.csv')
    .pipe(process_dataframe, 'sclabs')
)

In [9]:
# Preview the first rows of the processed ana360 frame
ana360.head()

Unnamed: 0,strain,Sample Type,cis-Nerolidol,trans-Nerolidol,trans-Nerolidol 1,trans-Nerolidol 2,trans-Ocimene,3-Carene,Camphene,Caryophyllene Oxide,...,CBD,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs
0,bho-blackberry-22,Archived,,,,,,,,,...,,,,0.08,,,,1,0,0
1,bho-the-sour-bud,Archived,,,,,,,,,...,,,,0.75,0.34,,,1,0,0
2,northern-lights,Archived,,,,,,,,,...,,,,0.66,0.05,,,1,0,0
3,blueberry-kush,Archived,,,,,,,,,...,,,,0.25,0.19,,,1,0,0
4,white-dawg,Archived,,,,,,,,,...,,,,0.12,0.11,,,1,0,0


In [10]:
# Preview the first rows of the processed psilabs frame
psilabs.head()

Unnamed: 0,strain,Sample Type,cis-Nerolidol,trans-Nerolidol,trans-Nerolidol 1,trans-Nerolidol 2,trans-Ocimene,3-Carene,Camphene,Caryophyllene Oxide,...,CBD,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs
0,cbdees-whole-live-plant-distillate,Concentrate,,,0.0,0.0,,0.0,0.0,0.0,...,68.65,,,,2.2,2.36,,0,1,0
1,girl-scout-cookie,Concentrate,,,0.0,0.0,,0.0,0.05,0.0,...,0.05,,,,2.76,0.69,,0,1,0
2,blueberry-haze-htfse,Concentrate,0.0,0.0,,,,0.0,0.19,0.0,...,0.01,,,7.66,2.52,0.66,,0,1,0
3,011518-02-straw-lem-sce,Concentrate,0.0,0.0,,,,0.0,0.13,0.0,...,0.01,,,8.75,1.55,0.01,,0,1,0
4,011518-02-straw-lem-sce,Concentrate,0.0,0.0,,,,0.0,0.13,0.0,...,0.01,,,8.75,1.55,0.01,,0,1,0


In [11]:
# Preview the first rows of the processed sclabs frame
sclabs.head()

Unnamed: 0,strain,Sample Type,cis-Nerolidol,trans-Nerolidol,trans-Nerolidol 1,trans-Nerolidol 2,trans-Ocimene,3-Carene,Camphene,Caryophyllene Oxide,...,CBD,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs
0,guava-kush,"Flower, Inhalable",,,,,,0.0,0.019,0.011,...,0.0,0.0,0.02,0.39,0.09,0.08,,0,0,1
1,watermelon-rancher,"Flower, Inhalable",,,,,,0.036,0.005,0.0,...,0.0,0.01,0.02,0.11,0.09,0.03,,0,0,1
2,pscr-634,"Concentrate, Inhalable",,,,,,0.0,0.0,0.206,...,0.0,0.0,0.0,1.32,0.12,0.0,,0,0,1
3,stcr-638,"Concentrate, Inhalable",,,,,,0.0,0.0,0.0,...,0.0,0.26,0.0,0.9,0.09,0.11,,0,0,1
4,pscr-639,"Concentrate, Inhalable",,,,,,0.0,0.035,0.038,...,0.0,0.0,0.0,0.97,0.38,0.0,,0,0,1
