In [1]:
import pandas as pd
import re

In [2]:
def remove_strain(name):
    """Temp function to remove the excess trailing word 'Strain'.

    Parameters
    ----------
    name : str
        Raw name that may end in the (case-sensitive) text ``Strain``.
        Non-string values (e.g. the ``NaN`` that ``pd.read_csv`` produces
        for an empty cell) are returned unchanged instead of raising
        ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        ``name`` without a trailing ``Strain`` and without the whitespace
        that surrounded it, e.g. ``'Blue Dream Strain'`` -> ``'Blue Dream'``.
        Names that do not end in ``Strain`` are returned as-is.
    """
    # missing values carry no text to clean; pass them through untouched
    if not isinstance(name, str):
        return name

    # also consume the whitespace around the suffix so no dangling space is
    # left behind, and tolerate trailing whitespace after the word
    return re.sub(r'\s*Strain\s*$', '', name)

In [3]:
def conform_names(name):
    """Standardize names of strains across data sets.

    The name is lower-cased, text enclosed in brackets, parentheses, braces,
    or angle brackets is removed, every character other than ``a-z``,
    ``0-9``, space, hyphen, or underscore is dropped, and the remaining
    words are joined with single hyphens.

    Parameters
    ----------
    name : str
        Raw strain / sample name.  Non-string values (e.g. the ``NaN`` that
        ``pd.read_csv`` produces for an empty cell) are returned unchanged
        instead of raising ``AttributeError``.

    Returns
    -------
    str
        The normalized name, e.g. ``'Blue Dream (Sativa)'`` -> ``'blue-dream'``.
        The result is an empty string when nothing survives the clean-up.
    """
    # missing values carry no text to normalize; pass them through untouched
    if not isinstance(name, str):
        return name

    # change any upper case letters to lower case
    name = name.lower()

    # strip out text enclosed in brackets, parentheses, braces, or angle
    # brackets (non-greedy, so each enclosed span is removed on its own)
    brackets = r'[\[({<].*?[\])}>]'
    name = re.sub(brackets, '', name)

    # remove characters that are not letters, numbers, spaces, hyphens, or
    # underscores; quote marks are dropped here while the quoted text is kept
    drop = r'[^a-z0-9 _-]'
    name = re.sub(drop, '', name)

    # replace hyphens and underscores with spaces
    name = re.sub(r'[_-]', ' ', name)

    # split string on whitespace and rejoin with hyphens
    name = '-'.join(name.split())

    return name
    
    

### Add a standardized `strain` column to each CSV file and save the result under a new file name

In [4]:
# load the raw cannabis table (presumably the Kaggle data set, judging by the output file name)
kag = pd.read_csv(filepath_or_buffer='./raw_data/cannabis.csv')

In [5]:
# derive the normalized 'strain' key from the original 'Strain' column
kag['strain'] = kag['Strain'].map(conform_names)

In [6]:
# write the augmented table to a new file, without the row index
kag.to_csv(path_or_buf='./raw_data/cannabis_kaggle.csv', index=False)

In [7]:
# load the analytical360 scraper results
ana = pd.read_csv(filepath_or_buffer='./web_scrapers/analytical360/results.csv')

In [8]:
# derive the normalized 'strain' key from the 'Sample Name' column
ana['strain'] = ana['Sample Name'].map(conform_names)

In [9]:
# write the augmented table to a new file, without the row index
ana.to_csv(path_or_buf='./web_scrapers/analytical360/results_analytical360.csv', index=False)

In [10]:
# load the psilabs scraper results
psi = pd.read_csv(filepath_or_buffer='./web_scrapers/psilabs/results.csv')

In [11]:
# derive the normalized 'strain' key from the 'Sample Name' column
psi['strain'] = psi['Sample Name'].map(conform_names)

In [12]:
# write the augmented table to a new file, without the row index
psi.to_csv(path_or_buf='./web_scrapers/psilabs/results_psilabs.csv', index=False)

In [13]:
# load the sclabs scraper results
scl = pd.read_csv(filepath_or_buffer='./web_scrapers/sclabs/results.csv')

In [14]:
# derive the normalized 'strain' key from the 'Sample Name' column
scl['strain'] = scl['Sample Name'].map(conform_names)

In [15]:
# write the augmented table to a new file, without the row index
scl.to_csv(path_or_buf='./web_scrapers/sclabs/results_sclabs.csv', index=False)

In [16]:
# load the allbuds scrape, labelling its two columns 'name' and 'desc'
# NOTE(review): with names= and no header=, the first line of the file is read
# as data -- confirm the scraped CSV really has no header row
allbuds = pd.read_csv(
    filepath_or_buffer='./web_scrapers/scrapy_morereviews_images/allbuds_strain_data.csv',
    names=['name', 'desc'],
)

In [17]:
# temporary cell, needed only until allbuds is scraped again:
# trims the trailing 'Strain' from each name, overwriting the 'name' column
allbuds['name'] = allbuds['name'].map(remove_strain)

In [18]:
# derive the normalized 'strain' key from the cleaned 'name' column
allbuds['strain'] = allbuds['name'].map(conform_names)

In [19]:
# write the augmented table to a new file, without the row index
allbuds.to_csv(path_or_buf='./web_scrapers/scrapy_morereviews_images/allbuds_rename_data.csv', index=False)