In [1]:
import ast
import bs4
import time
import json
import requests
import pandas as pd
import scipy as scipy
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt

%matplotlib inline

In [27]:
## Load the strains table
df = pd.read_csv('canna_df.csv')

## These two columns are stored as Python-literal strings; parse them to get dicts instead of strings
for dict_col in ('Feelings','Cannabinoids'):
    df[dict_col] = df[dict_col].map(ast.literal_eval)

In [28]:
## Open dictionaries in columns ['Feelings','Cannabinoids'] to columns

df = df.reset_index(drop=True)

## (new column prefix, key inside the 'Feelings' dict). The order of this list,
## nested inside the slot loop, fixes the resulting column order:
## Feeling_1, Negative_1, Helps with_1, Feeling_2, ...
feeling_groups = [('Feeling','Feelings'),('Negative','Negatives'),('Helps with','Helps with')]

for i in range(5):
    for prefix, key in feeling_groups:
        ## Take the i-th entry of the list stored under `key` (the lambda is consumed
        ## immediately by map, so capturing the loop variables is safe)
        slot_values = df['Feelings'].map(lambda feelings: feelings[key][i])
        ## Remove space at the end of values
        df[f'{prefix}_{i+1}'] = slot_values.str.replace(r'\b $','',regex=True)

## Turn Cannabinoid dict to columns
for name in ['THC','CBD','CBG']:
    raw = df['Cannabinoids'].map(lambda cannabinoids: cannabinoids[name])
    raw = raw.str.replace("%",'',regex=False) # Remove % from numbers
    ## Replace '—' values with NaN. Assign the result instead of calling
    ## replace(..., inplace=True) on df[name]: a chained in-place call acts on a
    ## temporary under pandas Copy-on-Write and would leave df unchanged.
    df[name] = raw.replace("—",np.nan,regex=True)

df = df.drop(columns=['Feelings','Cannabinoids']) ## Remove old columns

cols = ['Rating','Rating Users','THC','CBD','CBG']

df[cols] = df[cols].apply(pd.to_numeric) # Change columns type to numeric



In [29]:
def get_nan_replace_value(df,parent,mode,col):
    """Suggest a replacement for a missing `col` value from a strain's relatives.

    The relatives are every row of `df` whose 'Left Parent' or 'Right parent'
    equals `parent`, plus the row whose 'Strain Name' equals `parent`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Left Parent', 'Right parent', 'Strain Name' and `col`.
    parent : object
        Strain name to look up. NaN compares unequal to everything, so a NaN
        parent selects no rows.
    mode : str
        'Feelings'     -> most frequent non-null value of `col` among the relatives.
        'Cannabinoids' -> mean of `col` among the relatives, rounded to 0 decimals.
    col : str
        Column to derive the replacement from.

    Returns
    -------
    The replacement value; NaN when the relatives hold no usable value.
    Any other `mode` falls through and returns None.

    Raises
    ------
    KeyError
        If one of the required columns is missing from `df`.
    """

    ## All rows with same parent, including the parent
    relatives = df.loc[(df['Left Parent'] == parent) | (df['Right parent'] == parent) | (df['Strain Name'] == parent)]

    if mode == 'Feelings':
        counts = relatives[col].value_counts() ## Limits to look only at the same col; NaN is not counted

        ## idxmax() raises on an empty Series, so handle "nothing to vote on" explicitly.
        ## np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
        if counts.empty:
            return np.nan

        return counts.idxmax()

    elif mode == 'Cannabinoids':

        ## mean() of an empty / all-NaN selection is NaN, and rounding keeps it NaN
        return round(relatives[col].mean(),0)



def complete_nan(df,col_names):
    """Fill missing values in `col_names` using values found among each strain's relatives.

    For every column, rows that qualify get the value returned by
    get_nan_replace_value for their 'Left Parent'; if that is null, the
    'Right parent' is tried. Only cells that are currently null are written.

    Which rows qualify:
      * columns whose name contains 'Feeling', 'Negative' or 'Helps with':
        rows where slots _1, _2 and _3 of that group are all null and at
        least one parent is known;
      * any other column (treated as a cannabinoid): rows where the column
        itself is null and at least one parent is known.

    The "all three slots are null" mask is taken once per group, before any
    column of that group is filled. Recomputing it per column would exclude a
    row as soon as its _1 slot was filled, so _2 and _3 would never be filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place; must contain 'Left Parent', 'Right parent' and the
        columns used by get_nan_replace_value.
    col_names : iterable of str
        Columns to fill.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.
    """
    group_prefixes = ('Feeling','Negative','Helps with')
    has_parent = df['Left Parent'].notna() | df['Right parent'].notna()
    group_masks = {} ## group prefix -> mask of rows whose first three slots were all null

    def value_from_parents(row,mode,col):
        ## Prefer the left parent; fall back to the right one only when the left gives nothing
        value = get_nan_replace_value(df,row['Left Parent'],mode,col)
        if pd.isnull(value):
            value = get_nan_replace_value(df,row['Right parent'],mode,col)
        return value

    for col in col_names:
        group = next((prefix for prefix in group_prefixes if prefix in col),None)

        if group is None: ## Cannabinoid column: only the column itself has to be null
            mode = 'Cannabinoids'
            missing = df[col].isna()
        else:
            mode = 'Feelings'
            if group not in group_masks:
                group_masks[group] = (
                    df[f'{group}_1'].isnull()
                    & df[f'{group}_2'].isnull()
                    & df[f'{group}_3'].isnull()
                )
            missing = group_masks[group]

        candidates = df[missing & has_parent]
        if candidates.empty: ## apply(axis=1) on an empty frame returns a DataFrame, not a Series
            continue

        try:
            replacements = candidates.apply(value_from_parents,axis=1,args=(mode,col))
            ## Assign the result back: a chained df[col].fillna(..., inplace=True)
            ## acts on a temporary under pandas Copy-on-Write and would not update df
            df[col] = df[col].fillna(replacements)

        except Exception as e:
            ## Best effort: report the column that failed and carry on with the rest
            print(f'complete_nan: could not fill {col!r}: {e}')

    return df

In [37]:

## Display the row labelled 0 to eyeball the flattened columns.
## NOTE(review): this cell's execution count (37) is higher than that of the cells
## below it, so the shown output reflects df as it was when the cell was last run.
df.loc[0]

Strain Name         Mega Jackpot
Type                      Hybrid
Rating                       4.1
Rating Users                19.0
Left Parent           Jack Herer
Right parent     Northern Lights
Left Child                   NaN
Right Child                  NaN
Feeling_1                Relaxed
Negative_1             Dry mouth
Helps with_1          Depression
Feeling_2                  Happy
Negative_2                   NaN
Helps with_2    Lack of appetite
Feeling_3               Euphoric
Negative_3                   NaN
Helps with_3             Anxiety
Feeling_4               Creative
Negative_4                  None
Helps with_4              Nausea
Feeling_5              Talkative
Negative_5                  None
Helps with_5              Stress
THC                         18.0
CBD                          6.0
CBG                          1.0
Name: 0, dtype: object

In [30]:
## Columns to fill: the first three slots of every feeling group, then the cannabinoids
cols = [
    f'{group}_{slot}'
    for group in ('Feeling','Negative','Helps with')
    for slot in (1,2,3)
] + ['THC','CBD','CBG']

feelings_complete = complete_nan(df,cols) ## Fill NaN

In [31]:
df = df.copy()
df.update(feelings_complete) ## Overwrite with the non-null values of the filled frame

## Drop every column whose name starts with "Unnamed". read_csv gives that name to
## header-less columns — presumably an index written by an earlier to_csv; verify
## against the code that produced canna_df.csv.
df = df.loc[:,~df.columns.str.match("Unnamed")]

In [35]:

## index=False keeps the row index out of the file; written with the index, it would
## come back as an extra "Unnamed: 0" column the next time the CSV is read
df.to_csv('clean_df.csv',index=False)