In [1]:
import datetime
import numpy as np
import pandas as pd

In [2]:
# Load the merged input table; the path is relative to the notebook's working directory.
df = pd.read_csv('left_merged.csv')

In [3]:
def choose_description(row):
    """Pick the wordier of a row's two candidate descriptions.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'Description 1' and 'Description 2'.

    Returns
    -------
    str or float
        The description with more whitespace-separated words
        ('Description 1' wins ties), or NaN when neither description
        reaches 10 words.
    """
    def word_count(text):
        # Non-string values (e.g. the float NaN pandas uses for missing
        # cells) have no .split(); treat them as zero words.
        try:
            return len(text.split())
        except AttributeError:
            return 0

    length_1 = word_count(row['Description 1'])
    # Measure the *second* description here; measuring the first one twice
    # would make it impossible for 'Description 2' to ever be chosen.
    length_2 = word_count(row['Description 2'])

    # return NaN if under 10 words in longest description
    if length_1 < 10 and length_2 < 10:
        return np.nan

    if length_1 >= length_2:
        return row['Description 1']

    return row['Description 2']

In [4]:
# Row-wise pass: each row's chosen description lands in the new 'Description' column.
df['Description'] = df.apply(choose_description, axis='columns')

In [5]:
def combine_descriptions(row):
    """Join a row's two descriptions into one space-separated string.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'Description 1' and 'Description 2'.

    Returns
    -------
    str or float
        'Description 1' + ' ' + 'Description 2', where any non-string value
        is treated as an empty string; NaN when both are missing/empty.
    """
    desc_1 = row['Description 1']
    desc_2 = row['Description 2']

    # If a description is not a string (float NaN, None, ...), treat it as
    # empty. Checking for str rather than float keeps None and other
    # non-string values from raising TypeError in the concatenation below.
    if not isinstance(desc_1, str):
        desc_1 = ''
    if not isinstance(desc_2, str):
        desc_2 = ''

    # Return the combined description unless both parts were empty, in which
    # case only the separator remains; then return NaN.
    combined_desc = desc_1 + ' ' + desc_2
    return combined_desc if combined_desc != ' ' else np.nan

In [6]:
# Row-wise pass: the concatenated text lands in the new 'Generated Description' column.
df['Generated Description'] = df.apply(combine_descriptions, axis='columns')

In [7]:
# Remove the two source description columns from the frame.
df = df.drop(['Description 1', 'Description 2'], axis=1)

## Process Laboratory Results

In [8]:
# `data` is bound to the same DataFrame object as `df`; no copy is made here.
data = df

In [9]:
# Remove the lab overview columns and the unneeded moisture column in one pass.
data = data.drop(
    columns=[
        # lab overview columns
        'ana360', 'psilabs', 'sclabs',
        # unnecessary data column
        'Moisture Content',
    ]
)

In [10]:
# Column groups that are each consolidated into a single summary column later on.
# Cannabinoids:
thc_cols = ['delta-9 THC-A', 'delta-9 THC', 'delta-8 THC', 'THC-A', 'THCV']
cbd_cols = ['CBD-A', 'CBD', 'CBDV', 'CBDV-A']
cbg_cols = ['delta-9 CBG-A', 'delta-9 CBG']
# Terpenes:
nerolidol_cols = ['cis-Nerolidol', 'trans-Nerolidol', 'trans-Nerolidol 1', 'trans-Nerolidol 2']
ocimene_cols = ['trans-Ocimene', 'beta-Ocimene']
caryophyllene_cols = ['beta-Caryophyllene', 'Caryophyllene Oxide']
pinene_cols = ['alpha-Pinene', 'beta-Pinene']
terpinene_cols = ['alpha-Terpinene', 'gamma-Terpinene']


# Working frames, one per group, with missing measurements treated as 0 so
# the totals can be computed. Selecting with a list of labels yields a new
# frame and fillna returns a new frame, so `data` itself is left untouched.
tf = data[thc_cols].fillna(0)
cf = data[cbd_cols].fillna(0)
gf = data[cbg_cols].fillna(0)
nf = data[nerolidol_cols].fillna(0)
of = data[ocimene_cols].fillna(0)
caf = data[caryophyllene_cols].fillna(0)
pf = data[pinene_cols].fillna(0)
tef = data[terpinene_cols].fillna(0)

In [11]:
# Calculate total active component and similar analytes.
# NOTE(review): 0.877 is presumably the acid-to-neutral conversion factor
# applied to the "-A" forms -- confirm. CBG-A is summed without it, unlike
# THC-A and CBD-A -- confirm that is intended.
tf['Total_THC'] = tf[['THC-A', 'delta-9 THC-A']].max(axis=1) * 0.877 + tf['delta-9 THC'] + tf['delta-8 THC'] + tf['THCV']
cf['Total_CBD'] = cf[['CBD-A', 'CBDV-A']].sum(axis=1) * 0.877 + cf['CBD'] + cf['CBDV']
gf['Total_CBG'] = gf['delta-9 CBG-A'] + gf['delta-9 CBG']
# Each terpene frame holds only its source columns at this point, so a
# row-wise sum over the whole frame is the group total.
nf['Nerolidol'] = nf.sum(axis=1)
of['Ocimene'] = of.sum(axis=1)
caf['Caryophyllene'] = caf.sum(axis=1)
pf['Pinene'] = pf.sum(axis=1)
tef['Terpinene'] = tef.sum(axis=1)


# Replace zeroes with NaN: the inputs were zero-filled, so a total of 0 is
# stored as missing rather than as a measured zero.
# Use np.nan (the np.NaN alias was removed in NumPy 2.0).
tf['Total_THC'] = tf['Total_THC'].replace(to_replace=0, value=np.nan)
cf['Total_CBD'] = cf['Total_CBD'].replace(to_replace=0, value=np.nan)
gf['Total_CBG'] = gf['Total_CBG'].replace(to_replace=0, value=np.nan)
nf['Nerolidol'] = nf['Nerolidol'].replace(to_replace=0, value=np.nan)
of['Ocimene'] = of['Ocimene'].replace(to_replace=0, value=np.nan)
caf['Caryophyllene'] = caf['Caryophyllene'].replace(to_replace=0, value=np.nan)
pf['Pinene'] = pf['Pinene'].replace(to_replace=0, value=np.nan)
tef['Terpinene'] = tef['Terpinene'].replace(to_replace=0, value=np.nan)

In [12]:
# Drop the per-analyte working columns from the main frame, then add the
# consolidated summary columns back in their place.
data = data.drop(columns=thc_cols)
data = data.drop(columns=cbd_cols)
data = data.drop(columns=cbg_cols)
data = data.drop(columns=nerolidol_cols)
data = data.drop(columns=ocimene_cols)
data = data.drop(columns=caryophyllene_cols)
data = data.drop(columns=pinene_cols)
data = data.drop(columns=terpinene_cols)

data['Total_THC'] = tf.Total_THC
data['Total_CBD'] = cf.Total_CBD
data['Total_CBG'] = gf.Total_CBG
data['Nerolidol'] = nf.Nerolidol
data['Ocimene'] = of.Ocimene
data['Caryophyllene'] = caf.Caryophyllene
# The pinene source columns are dropped above, so the computed total must be
# written back too; otherwise all pinene data is lost from the dataset.
data['Pinene'] = pf.Pinene
data['Terpinene'] = tef.Terpinene

In [13]:
# Strip isomer/position prefixes from the remaining analyte column names so
# they are easier to search and load.
name_delta_map = dict([
    ('3-Carene', 'Carene'),
    ('p-Cymene', 'Cymene'),
    ('alpha-Bisabolol', 'Bisabolol'),
    ('alpha-Humulene', 'Humulene'),
    ('beta-Myrcene', 'Myrcene'),
    ('delta-Limonene', 'Limonene'),
])

data = data.rename(name_delta_map, axis='columns')

In [14]:
# Prefer the lab-derived Total_THC; where it is missing, fall back to the
# 'THC Percent' column scaled by 100.
# NOTE(review): the x100 presumably converts a fraction to a percentage -- confirm.
data['Total_THC'] = data['Total_THC'].fillna(value=data['THC Percent'].mul(100))

# The fallback source column is not needed after the merge above.
data = data.drop(columns=['THC Percent'])

In [15]:
# Remove the repeated lowercase 'strain' column.
data = data.drop(columns=['strain'])

In [16]:
# Write the full dataset to a timestamped CSV under ../processed_data/.
# `now` is captured once here and reused by the export cell further down.
# NOTE(review): the timestamp puts a space and ':' in the file name, which
# some filesystems (e.g. Windows) reject -- confirm the target platform.
now = datetime.datetime.now()
path = ''.join(['../processed_data/master-', now.strftime("%Y-%m-%d %H:%M"), '.csv'])
data.to_csv(path)

In [18]:
# Write the column subset used for db creation / web integration to a
# timestamped CSV under ../exports/ (same timestamp as the master file).
export_cols = [
    'Strain', 'Type', 'Percent Indica', 'Percent Sativa',
    'Flavor', 'Effects', 'Rating', 'Carene',
    'Camphene', 'Eucalyptol', 'Geraniol', 'Guaiol', 'Isopulegol',
    'Linalool', 'Ocimene', 'Terpinolene', 'Bisabolol', 'Humulene',
    'Myrcene', 'Limonene', 'Cymene', 'CBN', 'CBC', 'Description',
    'Total_THC', 'Total_CBD', 'Total_CBG', 'Nerolidol', 'Caryophyllene',
    'Terpinene',
]
path = ''.join(['../exports/master_export-', now.strftime("%Y-%m-%d %H:%M"), '.csv'])
data.loc[:, export_cols].to_csv(path)