In [1]:
# import your libraries and dataset then instantiate
import pandas as pd
import numpy as np

cannabis_url = 'https://raw.githubusercontent.com/Build-Week-Med-Cabinent-4/data-science/main/data/raw/cannabis.csv'
kushy_url = 'https://raw.githubusercontent.com/Build-Week-Med-Cabinent-4/data-science/main/data/raw/strains-kushy_api.2017-11-14.csv'

cannabis = pd.read_csv(cannabis_url)
kushy = pd.read_csv(kushy_url)
cannabis.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [2]:
kushy.head()

Unnamed: 0,id,status,sort,name,slug,image,description,type,crosses,breeder,...,cbn,cbg,cbgm,cbgv,cbc,cbcv,cbv,cbe,cbt,cbl
0,1,1,0,100 OG,,,<p>This strain is named after it's high price ...,Hybrid,,Old School Breeder's Association,...,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,0,707 Headband,,,,Hybrid,378272.0,Unknown Breeder,...,0,,,,,,,,,
2,3,1,0,A-10,,,,Indica,0.0,,...,0,,,,,,,,,
3,4,1,0,Acapulco Gold,,,,Sativa,0.0,,...,0,,,,,,,,,
4,5,1,0,Afghani Bullrider,,,,Hybrid,0.0,Unknown Breeder,...,0,,,,,,,,,


In [3]:
# Find null values
cannabis.isnull().sum()

Strain          0
Type            0
Rating          0
Effects         0
Flavor         46
Description    33
dtype: int64

In [4]:
kushy.isnull().sum()

id                0
status            0
sort              0
name              1
slug           9524
image          9524
description    9523
type             24
crosses        8960
breeder         545
effects        8509
ailment        8553
flavor         8553
location       8903
terpenes       9523
thc               0
thca            889
thcv            889
cbd               0
cbda            889
cbdv            889
cbn               0
cbg             889
cbgm            889
cbgv            889
cbc             889
cbcv            889
cbv             889
cbe             889
cbt             889
cbl             889
dtype: int64

In [5]:
# shape of dfs
print(cannabis.shape)
print(kushy.shape)

(2351, 6)
(9524, 31)


In [6]:
# Dropping a portion of our data so we can standardize output
cannabis_clean = cannabis.dropna(subset=['Flavor', 'Description'])
kushy_clean = kushy[['name', 'ailment']]
print(cannabis.shape)
print(kushy_clean.shape)

(2351, 6)
(9524, 2)


In [7]:
# Check null values
kushy_clean.isnull().sum()

name          1
ailment    8553
dtype: int64

In [8]:
# Dropping nulls
kushy_clean = kushy_clean.dropna()
kushy_clean.shape

(971, 2)

In [9]:
# Merging the 2 datasets since only 1 of them has ailments
merged = pd.merge(cannabis_clean, kushy_clean, left_on='Strain', right_on='name', how='left')
merged.shape

(2280, 8)

In [10]:
# check a description
merged.loc[2]['Description']

'1024 is a sativa-dominant hybrid bred in Spain by Medical Seeds Co. The breeders claim to guard the secret genetics due to security reasons, but regardless of its genetic heritage, 1024 is a THC powerhouse with a sweet and spicy bouquet. Subtle fruit flavors mix with an herbal musk to produce uplifting sativa effects. One specific phenotype is noted for having a pungent odor that fills a room, similar to burning incense.'

In [11]:
# check a ailment
merged.loc[2]['ailment']

'Stress, Pain, Depression, Inflammation'

In [12]:
# Drop the rating to have only strings
merged_strings = merged.drop(['Rating'], axis=1)
merged_strings = merged_strings.fillna('')
merged_strings.isnull().sum()

Strain         0
Type           0
Effects        0
Flavor         0
Description    0
name           0
ailment        0
dtype: int64

In [13]:
# check data types
merged_strings.dtypes

Strain         object
Type           object
Effects        object
Flavor         object
Description    object
name           object
ailment        object
dtype: object

In [14]:
# Concatenate all text
merged_strings['all_text'] = merged_strings.apply(lambda x:''.join(x), axis=1)
merged_strings['all_text'][0]

'100-OghybridCreative,Energetic,Tingly,Euphoric,RelaxedEarthy,Sweet,Citrus$100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.'

In [15]:
merged_strings.head()

Unnamed: 0,Strain,Type,Effects,Flavor,Description,name,ailment,all_text
0,100-Og,hybrid,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,,,"100-OghybridCreative,Energetic,Tingly,Euphoric..."
1,98-White-Widow,hybrid,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,,,"98-White-WidowhybridRelaxed,Aroused,Creative,H..."
2,1024,sativa,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,1024.0,"Stress, Pain, Depression, Inflammation","1024sativaUplifted,Happy,Relaxed,Energetic,Cre..."
3,13-Dawgs,hybrid,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,,,"13-DawgshybridTingly,Creative,Hungry,Relaxed,U..."
4,24K-Gold,hybrid,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",,,"24K-GoldhybridHappy,Relaxed,Euphoric,Uplifted,..."


In [16]:
# save cleaned and merged dataset as csv for flask
# merged_strings.to_csv('merged_dataset.csv', index=False)

In [17]:
# Import libraries and tokenize the string
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load('en_core_web_lg')
def lemma_producer(text):
    """
    Tokenizes string and returning a list of lemmas.
    """
    lemmas = []
    processed_text = nlp(text)
    
    for token in processed_text:
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_!= 'PRON'):
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)
merged_strings['lemmas'] = merged_strings['all_text'].apply(lemma_producer)
merged_strings['lemmas'][0]

'100-oghybridcreative energetic tingly euphoric RelaxedEarthy Sweet Citrus$100 OG 50/50 hybrid strain pack strong punch supposedly refer strength high price start show Hollywood plant $ 100 OG tend produce large dark green bud stem user report strong body effect indica pain relief alert cerebral feeling thank sativa'

In [18]:
# Vectorize
tfidf = TfidfVectorizer(stop_words="english", min_df=0.025, max_df=0.98, ngram_range=(1,3))
dtm = tfidf.fit_transform(merged_strings['lemmas'])
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
dtm.head()

Unnamed: 0,10,10 week,1st,20,50,60,70,80,active,activity,...,user,variety,way,week,white,white widow,widow,win,woody,yield
0,0.0,0.0,0.0,0.0,0.458184,0.0,0.0,0.0,0.0,0.0,...,0.229781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.132847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.477955,0.48926,0.486584,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198554,0.0
4,0.0,0.0,0.0,0.0,0.0,0.194094,0.0,0.0,0.0,0.0,...,0.205589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# save vectorized data to csv
dtm.to_csv('vectorized.csv', index=False)