In [1]:
import pandas as pd
import re

In [2]:
def add_source(df, source):
    """Return a copy of ``df`` with one 0/1 indicator column per lab.

    The columns ``ana360``, ``psilabs`` and ``sclabs`` are appended in that
    order; the column whose name equals ``source`` is filled with 1 and the
    others with 0. The input frame is left untouched.
    """
    labelled = df.copy()
    for lab in ('ana360', 'psilabs', 'sclabs'):
        # Scalar assignment broadcasts the flag to every row.
        labelled[lab] = int(source == lab)
    return labelled

In [3]:
def drop_columns(df):
    """Return ``df`` without the bookkeeping columns that cannot be averaged.

    'Test Result UID', 'Receipt Time', 'Test Time' and 'Provider' are always
    dropped (a missing one raises ``KeyError``); 'Post Time' is dropped only
    when the frame actually has it.
    """
    always = ['Test Result UID', 'Receipt Time', 'Test Time', 'Provider']
    # 'Post Time' is optional, so it is only listed when present.
    when_present = [col for col in ['Post Time'] if col in df.columns]
    return df.drop(columns=always + when_present)

In [4]:
def conform_names(name):
    """Standardize names of strains across data sets.

    The name is lower-cased, any text enclosed in square brackets,
    parentheses, braces, or angle brackets is removed, every character other
    than a lower-case ASCII letter, digit, space, hyphen, or underscore is
    dropped, and the remaining words are joined with single hyphens.

    Parameters
    ----------
    name : str or missing value
        Raw sample name. ``None``/``NaN`` is treated as a blank name; any
        other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The normalized name, or ``''`` when the input is missing or nothing
        usable remains.
    """
    # Missing values are not strings and have no .lower(); map them to the
    # empty string instead of raising AttributeError.
    if not isinstance(name, str):
        if name is None or pd.isna(name):
            return ''
        name = str(name)

    # change any upper case letters to lower case
    name = name.lower()

    # strip out text enclosed in brackets, parentheses, braces, or angle brackets
    brackets = r'[\[({<].*?[\])}>]'
    name = re.sub(brackets, '', name)

    # remove characters that are not letters, numbers, spaces, hyphens, or underscores
    drop = r'[^a-z0-9 _-]'
    name = re.sub(drop, '', name)

    # replace hyphens and underscores with spaces
    name = re.sub(r'[_-]', ' ', name)

    # split string on spaces and rejoin with hyphens
    name = '-'.join(name.split())

    return name
    
    

In [5]:
def process_dataframe(dataframe, source):
    """Tag, trim, and normalize one lab's results.

    Adds the lab indicator columns for ``source``, drops the non-numeric
    bookkeeping columns, standardizes every 'Sample Name' value, and returns
    the result with 'Sample Name' renamed to 'strain'.
    """
    cleaned = drop_columns(add_source(dataframe, source))
    cleaned['Sample Name'] = cleaned['Sample Name'].apply(conform_names)
    return cleaned.rename(columns={'Sample Name': 'strain'})

### Load each lab's results, process them, and combine them into a single dataframe

In [6]:
# Analytical 360: read the scraped results and run them through the shared pipeline.
ana360 = process_dataframe(
    pd.read_csv('./web_scrapers/analytical360/results.csv'),
    'ana360',
)

In [7]:
# PSI Labs: read the scraped results and run them through the shared pipeline.
psilabs = process_dataframe(
    pd.read_csv('./web_scrapers/psilabs/results.csv'),
    'psilabs',
)

In [8]:
# SC Labs: read the scraped results and run them through the shared pipeline.
sclabs = process_dataframe(
    pd.read_csv('./web_scrapers/sclabs/results.csv'),
    'sclabs',
)

In [9]:
# Stack the three labs' results. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it each lab's own row labels are kept, so the
# same label can refer to rows from different labs and any label-based
# operation downstream would touch all of them.
df = pd.concat([ana360, psilabs, sclabs], ignore_index=True)

In [10]:
# Order rows alphabetically by standardized strain name.
df = df.sort_values('strain')

In [11]:
# Show the rows whose strain name is blank after standardization.
df[df['strain'].str.strip().eq('')]

Unnamed: 0,strain,Sample Type,cis-Nerolidol,trans-Nerolidol,trans-Nerolidol 1,trans-Nerolidol 2,trans-Ocimene,3-Carene,Camphene,Caryophyllene Oxide,...,CBD,CBDV,CBDV-A,delta-9 CBG-A,delta-9 CBG,CBC,Moisture Content,ana360,psilabs,sclabs
10379,,Archived,,,,,,,,0.47,...,0.01,,,0.24,0.61,0.09,,1,0,0
4914,,Archived,,,,,,,,0.26,...,0.14,,,0.6,0.21,0.05,,1,0,0


In [12]:
# remove rows with missing strain value
# Filter with a boolean mask instead of dropping by index label: a mask
# removes exactly the blank-name rows whether or not index labels are unique,
# whereas drop(labels) removes every row that shares a label with one of them.
blank_strain = df['strain'].str.strip() == ''
df = df[~blank_strain]

In [13]:
# Persist the combined results; the row index is not written to the file.
df.to_csv(
    path_or_buf='./Intermediate_data/results_compiled.csv',
    index=False,
)