# Clean Lab Data

Consolidate results by family, type.  Clean names

In [151]:
# Load Data
import pandas as pd
import numpy as np

data = pd.read_csv('left_merged.csv')

In [152]:
# Drop Lab Overview Columns
data = data.drop(columns=['ana360', 'psilabs', 'sclabs'])

# Drop unnecessary data columns

data = data.drop(columns=['Moisture Content'])

In [153]:
# Consolidate THC to Total_THC
thc_cols = ['delta-9 THC-A', 'delta-9 THC', 'delta-8 THC', 'THC-A', 'THCV']
cbd_cols = ['CBD-A', 'CBD', 'CBDV', 'CBDV-A']
cbg_cols = ['delta-9 CBG-A','delta-9 CBG']
nerolidol_cols = ['cis-Nerolidol', 'trans-Nerolidol', 'trans-Nerolidol 1', 'trans-Nerolidol 2']
ocimene_cols = ['trans-Ocimene', 'beta-Ocimene']
caryophyllene_cols = ['beta-Caryophyllene', 'Caryophyllene Oxide']
pinene_cols = ['alpha-Pinene','beta-Pinene']
terpinene_cols = ['alpha-Terpinene','gamma-Terpinene']


# Copy data for transformations, calculation of total active components
tf = data[thc_cols].copy().fillna(value=0)
cf = data[cbd_cols].copy().fillna(value=0)
gf = data[cbg_cols].copy().fillna(value=0)
nf = data[nerolidol_cols].copy().fillna(value=0)
of = data[ocimene_cols].copy().fillna(value=0)
caf = data[caryophyllene_cols].copy().fillna(value=0)
pf = data[pinene_cols].copy().fillna(value=0)
tef = data[terpinene_cols].copy().fillna(value=0)

In [154]:
# Calculate total active component and similar analytes
tf['Total_THC'] = tf[['THC-A', 'delta-9 THC-A']].max(axis=1) * 0.877 + tf['delta-9 THC'] + tf['delta-8 THC'] + tf['THCV']
cf['Total_CBD'] = cf[['CBD-A', 'CBDV-A']].sum(axis=1) * 0.877 + cf['CBD'] + cf['CBDV']
gf['Total_CBG'] = gf['delta-9 CBG-A'] + gf['delta-9 CBG']
nf['Nerolidol'] = nf.sum(axis=1)
of['Ocimene'] = of.sum(axis=1)
caf['Caryophyllene'] = caf.sum(axis=1)
pf['Pinene'] = pf.sum(axis=1)
tef['Terpinene'] = tef.sum(axis=1)


# Replace zeroes with NaN
tf.Total_THC = tf.Total_THC.replace(to_replace=0, value=np.NaN)
cf.Total_CBD = cf.Total_CBD.replace(to_replace=0, value=np.NaN)
gf.Total_CBG = gf.Total_CBG.replace(to_replace=0, value=np.NaN)
nf.Nerolidol = nf.Nerolidol.replace(to_replace=0, value=np.NaN)
of.Ocimene = of.Ocimene.replace(to_replace=0, value=np.NaN)
caf.Caryophyllene = caf.Caryophyllene.replace(to_replace=0, value=np.NaN)
pf.Pinene = pf.Pinene.replace(to_replace=0, value=np.NaN)
tef.Terpinene = tef.Terpinene.replace(to_replace=0, value=np.NaN)

In [155]:
# Add summary THC and CBD columns back to original dataframe.  Drop working columns.
data = data.drop(columns=thc_cols)
data = data.drop(columns=cbd_cols)
data = data.drop(columns=cbg_cols)
data = data.drop(columns=nerolidol_cols)
data = data.drop(columns=ocimene_cols)
data = data.drop(columns=caryophyllene_cols)
data = data.drop(columns=pinene_cols)
data = data.drop(columns=terpinene_cols)

data['Total_THC'] = tf.Total_THC
data['Total_CBD'] = cf.Total_CBD
data['Total_CBG'] = gf.Total_CBG
data['Nerolidol'] = nf.Nerolidol
data['Ocimene'] = of.Ocimene
data['Caryophyllene'] = caf.Caryophyllene
data['Terpinene'] = tef.Terpinene
data.head()

Unnamed: 0,Strain,Type,Percent Indica,Percent Sativa,THC Percent,Description 1,Description 2,Generated Description,Flavor,Effects,...,delta-Limonene,p-Cymene,CBN,CBC,Total_THC,Total_CBD,Total_CBG,Nerolidol,Caryophyllene,Terpinene
0,sugar-cane,hybrid,0.4,0.6,0.2,Sugar Cane is a rare slightly sativa dominant ...,,,Earthy Sweet Candy Grape Spicy Fruity Herbal P...,Body High Cerebral Creative Energizing Relaxin...,...,,,,,,,,,,
1,mac1,hybrid,0.5,0.5,0.215,"MAC 1, also known as “Miracle Alien Cookies X1...",,,Sweet Diesel Sour Spicy Herbal Pungent,Creative Euphoria Happy Motivation Relaxing Up...,...,,,,,,,,,,
2,chemdawg,hybrid,0.55,0.45,0.19,With a near-even balance between sativa and in...,Chemdawg has developed quite the name for itse...,,Earthy Pungent Chemical Diesel Pine Diesel Ear...,Cerebral Creative Euphoria Happy Relaxing Cere...,...,0.445571,,0.07,0.069,18.160529,0.239965,0.876875,,0.812522,
3,jack-herer,sativa,,,0.23,Jack Herer is easily one of the best-known str...,Jack Herer is a sativa-dominant cannabis strai...,,Earthy Sweet Spicy Herbal Lemon Pine Woody Ear...,Body High Cerebral Creative Euphoria Happy Bod...,...,0.461663,0.01,0.082,0.046667,15.461573,0.206155,0.962574,,0.656769,0.034
4,nerds,hybrid,0.5,0.5,0.155,"Nerds, also known as “Nerdz,” is an evenly bal...",,,Earthy Sweet Grape Spicy Herbal Fruity Berry W...,Cerebral Creative Euphoria Focus Relaxing Cere...,...,,,,,,,,,,


In [156]:
# Clean remaining names for easier search/load operation

name_delta_map = {
    '3-Carene': 'Carene',
    'p-Cymene': 'Cymene',
    'alpha-Bisabolol': 'Bisabolol',
    'alpha-Humulene': 'Humulene',
    'beta-Myrcene': 'Myrcene',
    'delta-Limonene': 'Limonene',
}

data = data.rename(columns=name_delta_map)

In [158]:
# Compare THC reported by users/site and directly from labs, using lab value where possible

data['Total_THC'] = data['Total_THC'].fillna(value=data['THC Percent']*100)

# Drop original THC Percent column

data = data.drop(columns='THC Percent')
display(data.head(), data.columns)

Unnamed: 0,Strain,Type,Percent Indica,Percent Sativa,Description 1,Description 2,Generated Description,Flavor,Effects,Rating,...,Limonene,Cymene,CBN,CBC,Total_THC,Total_CBD,Total_CBG,Nerolidol,Caryophyllene,Terpinene
0,sugar-cane,hybrid,0.4,0.6,Sugar Cane is a rare slightly sativa dominant ...,,,Earthy Sweet Candy Grape Spicy Fruity Herbal P...,Body High Cerebral Creative Energizing Relaxin...,,...,,,,,20.0,,,,,
1,mac1,hybrid,0.5,0.5,"MAC 1, also known as “Miracle Alien Cookies X1...",,,Sweet Diesel Sour Spicy Herbal Pungent,Creative Euphoria Happy Motivation Relaxing Up...,,...,,,,,21.5,,,,,
2,chemdawg,hybrid,0.55,0.45,With a near-even balance between sativa and in...,Chemdawg has developed quite the name for itse...,,Earthy Pungent Chemical Diesel Pine Diesel Ear...,Cerebral Creative Euphoria Happy Relaxing Cere...,4.3,...,0.445571,,0.07,0.069,18.160529,0.239965,0.876875,,0.812522,
3,jack-herer,sativa,,,Jack Herer is easily one of the best-known str...,Jack Herer is a sativa-dominant cannabis strai...,,Earthy Sweet Spicy Herbal Lemon Pine Woody Ear...,Body High Cerebral Creative Euphoria Happy Bod...,4.4,...,0.461663,0.01,0.082,0.046667,15.461573,0.206155,0.962574,,0.656769,0.034
4,nerds,hybrid,0.5,0.5,"Nerds, also known as “Nerdz,” is an evenly bal...",,,Earthy Sweet Grape Spicy Herbal Fruity Berry W...,Cerebral Creative Euphoria Focus Relaxing Cere...,,...,,,,,15.5,,,,,


Index(['Strain', 'Type', 'Percent Indica', 'Percent Sativa', 'Description 1',
       'Description 2', 'Generated Description', 'Flavor', 'Effects', 'Rating',
       'strain', 'Carene', 'Camphene', 'Eucalyptol', 'Geraniol', 'Guaiol',
       'Isopulegol', 'Linalool', 'Ocimene', 'Terpinolene', 'Bisabolol',
       'Humulene', 'Myrcene', 'Limonene', 'Cymene', 'CBN', 'CBC', 'Total_THC',
       'Total_CBD', 'Total_CBG', 'Nerolidol', 'Caryophyllene', 'Terpinene'],
      dtype='object')

In [132]:
data.columns.to_numpy().reshape(-1,1)

array([['Strain'],
       ['Type'],
       ['Percent Indica'],
       ['Percent Sativa'],
       ['THC Percent'],
       ['Description 1'],
       ['Description 2'],
       ['Generated Description'],
       ['Flavor'],
       ['Effects'],
       ['Rating'],
       ['strain'],
       ['Carene'],
       ['Camphene'],
       ['Eucalyptol'],
       ['Geraniol'],
       ['Guaiol'],
       ['Isopulegol'],
       ['Linalool'],
       ['Ocimene'],
       ['Terpinolene'],
       ['Bisabolol'],
       ['Humulene'],
       ['Myrcene'],
       ['Limonene'],
       ['Cymene'],
       ['CBN'],
       ['CBC'],
       ['Total_THC'],
       ['Total_CBD'],
       ['Total_CBG'],
       ['Nerolidol'],
       ['Caryophyllene'],
       ['Terpinene']], dtype=object)