In [16]:
import pandas as pd
import re

In [17]:
def conform_names(name):
    """Standardize a strain name so it matches across data sets.

    The name is lower-cased and then passed through three regex
    substitutions, in order:

    1. Remove any span that opens with one of ``[ ( { <`` and closes with
       the nearest following ``] ) } >`` on the same line (the opener and
       closer do not have to be the same kind of bracket).
    2. Remove every character that is not a lower-case ASCII letter, a
       digit, a space, a hyphen, or an underscore. Quote characters are
       dropped here; the text between them is kept.
    3. Turn hyphens and underscores into spaces.

    Finally the result is split on whitespace and the pieces are joined
    with single hyphens, so leading, trailing, and repeated separators
    collapse.

    Parameters
    ----------
    name : str
        Raw strain name.

    Returns
    -------
    str
        Hyphen-separated, lower-case form of the name, e.g.
        ``'Sour Diesel (Sativa)'`` -> ``'sour-diesel'``.
    """
    # (pattern, replacement) pairs; the order matters because each step
    # works on the output of the previous one.
    substitutions = (
        (r'[\[({<].*?[\])}>]', ''),   # bracketed annotations
        (r'[^a-z0-9 _-]', ''),        # anything outside the allowed set
        (r'[_-]', ' '),               # hyphen/underscore -> word break
    )

    slug = name.lower()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)

    # split() with no argument discards empty pieces, so runs of spaces
    # never produce doubled hyphens.
    words = slug.split()
    return '-'.join(words)
    
    

### Conform strain names and rename the strain-name column to `strain`

In [18]:
# Load the raw Kaggle data and standardize every strain name.
kag = pd.read_csv('./raw_data/kaggle_cannabis_raw.csv')
kag['Strain'] = kag['Strain'].apply(conform_names)

# DataFrame.rename returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise the CSV below is written with the
# old 'Strain' header.
kag = kag.rename(columns={'Strain': 'strain'})

kag.to_csv('./Intermediate_data/cannabis_kaggle_rename.csv', index=False)

In [19]:
# Load the scraped allbuds data (first CSV column is used as the index) and
# standardize every strain name.
allbuds = pd.read_csv('./web_scrapers/scrapy_morereviews_images/allbuds_strain_data.csv', index_col=0)
allbuds['strain_name'] = allbuds['strain_name'].apply(conform_names)

# DataFrame.rename returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise the CSV below is written with the
# old 'strain_name' header.
allbuds = allbuds.rename(columns={'strain_name': 'strain'})

# index=False means the column loaded as the index above is not written out.
allbuds.to_csv('./Intermediate_data/allbuds_strain_data_rename.csv', index=False)