In [94]:
import ast
import bs4
import time
import json
import requests
import pandas as pd
import scipy as scipy
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt

%matplotlib inline

## Leafly Dataframe

In [96]:
# leafly_df = pd.read_csv('canna_leafly_df.csv')
leafly_df = pd.read_csv('canna_df.csv')
leafly_df['Feelings'] = leafly_df['Feelings'].map(ast.literal_eval) ## To get dicts instead of strings
leafly_df['Cannabinoids'] = leafly_df['Cannabinoids'].map(ast.literal_eval)

In [97]:
## Open dictionaries in columns ['Feelings','Cannabinoids'] to columns

leafly_df = leafly_df.reset_index(drop=True) 


for i in range(5): 
    leafly_df[f'Feeling_{i+1}'] = leafly_df.apply(lambda x: x['Feelings']['Feelings'][i],axis=1) ## Turn feelings list to columns
    leafly_df[f'Negative_{i+1}'] = leafly_df.apply(lambda x: x['Feelings']['Negatives'][i],axis=1)   
    leafly_df[f'Helps with_{i+1}'] = leafly_df.apply(lambda x: x['Feelings']['Helps with'][i],axis=1) 

    leafly_df[f'Feeling_{i+1}'] = leafly_df[f'Feeling_{i+1}'].str.replace(r'\b $','',regex=True) ## Remove space at the end of values
    leafly_df[f'Negative_{i+1}'] = leafly_df[f'Negative_{i+1}'].str.replace(r'\b $','',regex=True)
    leafly_df[f'Helps with_{i+1}'] = leafly_df[f'Helps with_{i+1}'].str.replace(r'\b $','',regex=True)


leafly_df['THC'] = leafly_df.apply(lambda x: x['Cannabinoids']['THC'],axis=1) ## Turn Cannabinoid dict to columns
leafly_df['CBD'] = leafly_df.apply(lambda x: x['Cannabinoids']['CBD'],axis=1) 
leafly_df['CBG'] = leafly_df.apply(lambda x: x['Cannabinoids']['CBG'],axis=1) 


leafly_df.drop(['Feelings','Cannabinoids'],inplace=True,axis=1) ## Remove old columns

cols = ['Rating','Rating Users','THC','CBD','CBG']

leafly_df['THC'] = leafly_df['THC'].str.replace("%",'',regex=True) # Remove % from numbers
leafly_df['CBD'] = leafly_df['CBD'].str.replace("%",'',regex=True)
leafly_df['CBG'] = leafly_df['CBG'].str.replace("%",'',regex=True)


leafly_df['CBD'].replace("—",np.nan,regex=True,inplace=True) # Replace '-' values with NaN
leafly_df['THC'].replace("—",np.nan,regex=True,inplace=True)
leafly_df['CBG'].replace("—",np.nan,regex=True,inplace=True)

leafly_df[cols] = leafly_df[cols].apply(pd.to_numeric) # Change columns type to numeric




## StrainsOfWeed Dataframe

In [98]:
flavors_df = pd.read_csv('strainofweed_canna_df.csv')
flavors_df['Flavors'] = flavors_df['Flavors'].map(ast.literal_eval) ## To get lists instead of strings

flavors_df[['Flavor_1','Flavor_2','Flavor_3']] = pd.DataFrame(flavors_df.Flavors.tolist(),index=flavors_df.index) ## Open lists to columns
flavors_df.drop(columns=['Flavors'],inplace=True) ## Remove original 'Flavors' column

flavors_df['Flavor_1'].replace("",np.nan,regex=True,inplace=True) ## Replace empty values with NaN
flavors_df['Flavor_2'].replace("",np.nan,regex=True,inplace=True)
flavors_df['Flavor_3'].replace("",np.nan,regex=True,inplace=True)

flavors_df = flavors_df.loc[:,~flavors_df.columns.str.match("Unnamed")] ## Remove 'Unnamed' column that was added


## Combining both Dataframes

In [100]:
for i in range(1,4):
    
    leafly_df[f'Flavor_{i}'] = leafly_df['Strain Name'].map(flavors_df.set_index('Strain Name')[f'Flavor_{i}'])   


In [106]:
df = leafly_df.copy()
# leafly_df.shape

## Data adjustment

In [107]:
def get_nan_replace_value(df,parent,mode,col):
    ## Get value to complete in NaN
    func_df = df.loc[(df['Left Parent'] == parent) | (df['Right Parent'] == parent) | (df['Strain Name'] == parent)] ## All rows with same parent, including the parent

    if mode == 'Feelings' or mode == 'Terpenes' or mode == 'Flavors':
        try:
            feeling = func_df[col].value_counts().idxmax() ## Limits to look only at the same col

        except Exception as e:
            feeling = np.NaN

        return feeling

    elif mode == 'Cannabinoids':

        mean = round(func_df[col].mean(),0)
        
        return mean

def complete_nan(df,col_names):
        ## Create Dataframe that contains rows where all Feeling/Negative/Helps With/Flavor columns are NaN and has at least 1 parent
    for i,col in enumerate(col_names,0):
        print(f'Working on {col}')
        if 'Feeling' in col: ## Create DF that contains 'Feeling' columns with NaN that has at least 1 parent that's not NaN
            nanim = df[(df['Feeling_1'].isnull()) & (df['Feeling_2'].isnull()) & (df['Feeling_3'].isnull()) & (df['Feeling_4'].isnull()) & (df['Feeling_5'].isnull()) & ((df['Left Parent'].notna()) | (df['Right Parent'].notna()))].copy()
            mode = 'Feelings'

        elif 'Negative' in col:
            nanim = df[(df['Negative_1'].isnull()) & (df['Negative_2'].isnull()) & (df['Negative_3'].isnull()) & (df['Negative_4'].isnull()) & (df['Negative_5'].isnull()) & ((df['Left Parent'].notna()) | (df['Right Parent'].notna()))].copy()
            mode = 'Feelings'
        
        elif 'Helps with' in col: 
            nanim = df[(df['Helps with_1'].isnull()) & (df['Helps with_2'].isnull()) & (df['Helps with_3'].isnull()) & (df['Helps with_4'].isnull()) & (df['Helps with_5'].isnull()) & ((df['Left Parent'].notna()) | (df['Right Parent'].notna()))].copy()
            mode = 'Feelings'

        elif 'Flavor' in col:
            nanim = df[(df['Flavor_1'].isnull()) & (df['Flavor_2'].isnull()) & (df['Flavor_3'].isnull()) & ((df['Left Parent'].notna()) | (df['Right Parent'].notna()))].copy()
            mode = 'Flavors'
        
        else: ## Create DF that contains a 'Cannabinoid' column with NaN, with at least 1 parent that's not NaN
            nanim = df[(df[col].isna()) & ((df['Left Parent'].notna()) | (df['Right Parent'].notna()))].copy() # To apply changes on real DF
            if col == 'Top Terpene':
                mode = 'Terpenes'
            else:
                mode = 'Cannabinoids'
        
        try:
            nanim[col] = nanim.apply(lambda x: get_nan_replace_value(df,x['Left Parent'],mode,col),axis=1)
            nanim[col] = nanim.apply(lambda x: get_nan_replace_value(df,x['Right Parent'],mode,col) if pd.isnull(x[col]) else x[col],axis=1)
            
            df[col].fillna(nanim[col],inplace=True)


        except Exception as e:
            print(e)
            pass
    

    return df

In [108]:
df['Flavor_1'].isna().sum()

2925

In [109]:
cols = ['Feeling_1','Feeling_2','Feeling_3','Feeling_4','Feeling_5','Negative_1','Negative_2','Negative_3','Negative_4','Negative_5','Helps with_1','Helps with_2','Helps with_3','Helps with_4','Helps with_5','THC','CBD','CBG','Top Terpene','Flavor_1','Flavor_2','Flavor_3']
feelings_complete = complete_nan(df,cols) ## Fill NaN

Working on Feeling_1
Working on Feeling_2
Working on Feeling_3
Working on Feeling_4
Working on Feeling_5
Working on Negative_1
Working on Negative_2
Working on Negative_3
Working on Negative_4
Working on Negative_5
Working on Helps with_1
Working on Helps with_2
Working on Helps with_3
Working on Helps with_4
Working on Helps with_5
Working on THC
Working on CBD
Working on CBG
Working on Top Terpene
Working on Flavor_1
Working on Flavor_2
Working on Flavor_3


In [111]:
# feelings_complete.head()
# df['Flavor_1'].isna().sum()


df.head()

Unnamed: 0.1,Unnamed: 0,Strain Name,Type,Rating,Rating Users,Left Parent,Right Parent,Left Child,Right Child,Top Terpene,...,Helps with_4,Feeling_5,Negative_5,Helps with_5,THC,CBD,CBG,Flavor_1,Flavor_2,Flavor_3
0,0,Mind Flayer,Hybrid,,,,,,,Caryophyllene,...,,,,,19.0,,1.0,,,
1,1,,,5.0,68.0,,,,,,...,,,,,,,,,,
2,2,Lucid Dream,Hybrid,4.6,108.0,Blue Dream,Amnesia Haze,,,Myrcene,...,Pain,Creative,Headache,Fatigue,20.0,11.0,1.0,Berry,Blueberry,Sage
3,3,Fred Flipn’ Stoned,Indica,5.0,2.0,Pink Champagne,Straight A's Haze,,,Myrcene,...,PMS,Focused,,,18.0,,1.0,Berry,,
4,4,Black Magic,Indica,4.5,17.0,,,,,Myrcene,...,Insomnia,Focused,,Lack of appetite,18.0,,,Earthy,,


In [112]:
df = df.copy()
df.update(feelings_complete)
df = df.loc[:,~df.columns.str.match("Unnamed")]## A new column was created from some reason

In [113]:


df = df[(df['Rating Users'].notna()) & (df['Type'].notna()) & (df['Strain Name'].notna())].copy() ## Dataframe without NaN values in Rating users / Type / Name columns

df.dropna(subset=['THC'],inplace=True) ## Drop rows with NaN in THC column

df.drop(columns=['Left Parent','Right Parent','Left Child','Right Child','CBG'],inplace=True)


In [19]:
df['CBD'].isna().sum()

2339

In [114]:


df.head(10)



Unnamed: 0,Strain Name,Type,Rating,Rating Users,Left Parent,Right Parent,Left Child,Right Child,Top Terpene,Feeling_1,...,Helps with_4,Feeling_5,Negative_5,Helps with_5,THC,CBD,CBG,Flavor_1,Flavor_2,Flavor_3
2,Lucid Dream,Hybrid,4.6,108.0,Blue Dream,Amnesia Haze,,,Myrcene,Uplifted,...,Pain,Creative,Headache,Fatigue,20.0,11.0,1.0,Berry,Blueberry,Sage
3,Fred Flipn’ Stoned,Indica,5.0,2.0,Pink Champagne,Straight A's Haze,,,Myrcene,Relaxed,...,PMS,Focused,,,18.0,,1.0,Berry,,
4,Black Magic,Indica,4.5,17.0,,,,,Myrcene,Relaxed,...,Insomnia,Focused,,Lack of appetite,18.0,,,Earthy,,
6,NYC Diesel,Hybrid,4.2,939.0,Afghani,Mexican,Blue Diesel,Strawberry Diesel,Myrcene,Happy,...,Pain,Energetic,Headache,Nausea,18.0,4.0,1.0,Diesel,Earthy,Grapefruit
7,Purple Goat,Hybrid,4.2,5.0,Trainwreck,Blueberry Skunk,,,Myrcene,Happy,...,Pain,Creative,,Fatigue,15.0,1.0,1.0,Sweet,Orange,Lemon
8,Alphadawg,Hybrid,5.0,1.0,Chemdawg,,,,Myrcene,Relaxed,...,,,,,16.0,1.0,1.0,Earthy,,
11,Sour Sunset,Hybrid,4.7,49.0,Sour Diesel,,,,Myrcene,Relaxed,...,Headaches,Focused,Headache,Pain,18.0,6.0,1.0,Pungent,Lemon,TreeFruit
12,Purple Chemdawg,Indica,4.5,170.0,Granddaddy Purple,Chemdawg,,,Caryophyllene,Relaxed,...,Depression,Sleepy,Anxious,Insomnia,17.0,7.0,1.0,Chemical,Grape,Diesel
13,Vortex,Sativa,4.3,272.0,Space Queen,Apollo 13,Cinex,Timewreck,Myrcene,Euphoric,...,Pain,Creative,Paranoid,Fatigue,16.0,,1.0,Citrus,Lemon,Tropical
15,Gutbuster,Indica,4.7,17.0,Cookies and Cream,Kimbo Kush,,,Limonene,Relaxed,...,Pain,Creative,,Anxiety,18.0,14.0,1.0,Berry,Vanilla,Woody


In [127]:
curr_df = df.copy()
curr_df = curr_df[(curr_df['Feeling_1'].notna()) & (curr_df['Feeling_2'].notna()) & (curr_df['Negative_2'].notna()) & (curr_df['Negative_1'].notna()) & (curr_df['Helps with_2'].notna()) & (curr_df['Helps with_1'].notna()) & (curr_df['Flavor_1'].notna())].copy()

In [128]:
## Fill rest of NaNs with 'N/A' or 0.0

for i in range(1,6):
    curr_df[f'Feeling_{i}'].fillna('N/A',inplace=True)
    curr_df[f'Negative_{i}'].fillna('N/A',inplace=True)
    curr_df[f'Helps with_{i}'].fillna('N/A',inplace=True)

curr_df.dropna(subset=['THC'],inplace=True)
curr_df['CBD'].fillna(0.0,inplace=True)
curr_df['Top Terpene'].fillna('N/A',inplace=True)
curr_df['Flavor_2'].fillna('N/A',inplace=True)
curr_df['Flavor_3'].fillna('N/A',inplace=True)


In [130]:
curr_df.to_csv('clean_df.csv')

In [129]:
curr_df.shape

(2064, 25)