In [220]:
from datetime import date, time

import pandas as pd
import numpy as np
from glom import glom

import json
import re

from sqlalchemy import create_engine
from sqlalchemy.sql import text


pd.options.display.max_columns = 100
pd.options.display.max_rows = 300
pd.options.display.max_colwidth = 50

In [75]:
def read_mongoextjson_file(filename):
    with open(filename, "r") as f:
        # read the entire input; in a real application,
        # you would want to read a chunk at a time
        bsondata = '['+f.read()+']'

        # convert the TenGen JSON to Strict JSON
        # here, I just convert the ObjectId and Date structures,
        # but it's easy to extend to cover all structures listed at
        # http://www.mongodb.org/display/DOCS/Mongo+Extended+JSON
        jsondata = re.sub(r'ObjectId\s*\(\"(\S+)\"\)',
                          r'"\1"',
                          bsondata)
#         print(jsondata)
        jsondata = re.sub(r'ISODate\s*\(\s*(\S+)\s*\)',
                          r'\1',
                          jsondata)
        jsondata = re.sub(r'NumberInt\s*\(\s*(\S+)\s*\)',
                          r'"\1"',
                          jsondata)
        
        jsondata = re.sub(r'current_state\\.bid_package',
                          r'current_state_bid_package',
                          jsondata)



        # now we can parse this as JSON, and use MongoDB's object_hook
        # function to get rich Python data structures inside a dictionary
#         data = json.loads(jsondata, object_hook=json_util.object_hook)
        data=json.loads(jsondata)

        return data
    
json_data = read_mongoextjson_file("../data/external/project1.json")

In [176]:
##pd.DataFrame.from_dict(json_data)

In [177]:
##pd.DataFrame.from_dict(json_data[0]["scope_collection_items"])

In [178]:
##pd.json_normalize(json_data[0]['scope_collection_items'][0], 
##                  record_path=['scope_items'], meta=['scope_type'], meta_prefix='scope_collection_items')

In [175]:
lst = []

for scope_itm in json_data[0]['scope_collection_items']:
    
    for sub_itm in scope_itm['scope_items']:
        temp = pd.json_normalize(sub_itm, record_path=['sub_items'], 
                                 meta=['is_selected', 'scope_type'])
        
        temp['area'] = scope_itm['scope_type']
    
        lst.append(temp)
    
f_df = pd.concat(lst)
f_df.reset_index(drop=True, inplace=True)
f_df.head(2)

Unnamed: 0,_id,key,value,is_selected,scope_type,area
0,60749774008b0f001846029c,HINGE_HARDWARE,NONE,True,CABINETS,KITCHEN
1,60749774008b0f001846029d,HANDLE_HARDWARE,NONE,True,CABINETS,KITCHEN


In [183]:
f_df[f_df.scope_type == 'CABINETS']

Unnamed: 0,_id,key,value,is_selected,scope_type,area
0,60749774008b0f001846029c,HINGE_HARDWARE,NONE,True,CABINETS,KITCHEN
1,60749774008b0f001846029d,HANDLE_HARDWARE,NONE,True,CABINETS,KITCHEN
2,60749774008b0f001846029e,FINISH,MELAMINE,True,CABINETS,KITCHEN
3,60749774008b0f001846029f,MATERIAL,ALL_WOOD,True,CABINETS,KITCHEN
4,60749774008b0f00184602a0,CABINET_STYLE,SHAKER,True,CABINETS,KITCHEN
5,60749774008b0f00184602a1,CABINETS,NEW_DOORS_/_DRAWERS,True,CABINETS,KITCHEN


In [221]:
df = f_df[f_df.scope_type == 'CABINETS']
df.value[df.key.str.contains('STYLE')]

'SHAKER'

In [274]:
style = 'N'
style_value = ''
if df.key.str.contains('STYLE').any():
    style = 'Y'
    style_value = df.value[df.key.str.contains('STYLE')].values[0]

color = 'N'
color_value = ''
if df.key.str.contains('COLOR').any():
    color = 'Y'
    color_value = df.value[df.key.str.contains('COLOR')].values[0]

finish = 'N'
finish_value = ''
if df.key.str.contains('FINISH').any():
    finish = 'Y'
    finish_value = df.value[df.key.str.contains('FINISH')].values[0]

material = 'N'
material_value = ''
if df.key.str.contains('MATERIAL').any():
    material = 'Y'
    material_value = df.value[df.key.str.contains('MATERIAL')].values[0]
    
if np.sum(df['value'].str.contains('NEW_DOORS')) > 0:
    cabinet_spec = 'FRONTS, DOORS'

In [275]:
cabinet_spec

'FRONTS, DOORS'

In [263]:
engine = create_engine('postgresql://localhost/TB')

In [276]:
s = "SELECT * \
  FROM sku_data \
  WHERE area = 'KITCHEN' \
    AND scope = 'CABINET' \
    AND spec = %(cabinet_spec)s \
    AND (    (%(style)s = 'Y' and style_a = %(style_value)s) OR (%(style)s = 'N' and style_a > '') \
         AND (%(color)s = 'Y' and color = %(color_value)s) OR (%(color)s = 'N' and color > '') \
         AND (%(finish)s = 'Y' and finish = %(finish_value)s) OR (%(finish)s = 'N' and finish > '') \
         AND (%(material)s = 'Y' and material = %(material_value)s) OR (%(material)s = 'N' and material > '') )" 

             
conn = engine.connect()  

conn.execute(s, {'cabinet_spec' : cabinet_spec,
                'style' : style, 'style_value' : style_value,
                'material' : material, 'material_value' : material_value,
                'finish' : finish, 'finish_value' : finish_value,
                'color' : color, 'color_value' : color_value} ).fetchall() 

[('KITCHEN', 'CABINET', 'FRONTS, DOORS', 'MATERIAL', 'CABINET - FRONTS, DOORS, WOOD, SHAKER', 'SUB CATEGORY, MATERIAL, STYLE A', 'ANY', 'CATALYST HOUSING GROUP', 'GREYSTAR NCA', 'NORTHERN CALIFORNIA', 'ANY', 'ANY', 'ANY', 'ANY', 'ANY', 'ANY', 'ANY', 'ANY', 'DNA', 'SHAKER', 'DNA', 'WOOD', 'DNA', 'DNA', 'DNA', 'DNA', 'B+C', 'COUNT', None, 'MI', 'SERENITY RFP', 'H2 KITCHEN CABINETS WOOD SHAKER - NEW FRONTS ')]

In [None]:
def buld_cabinet_lines(df):
    d = {}
    
    style = 'N'
    if df.key.str.contains('STYLE').any():
        style = 'Y'
        style_value = df.value[df.key.str.contains('STYLE')]
        
    color = 'N'
    if df.key.str.contains('COLOR').any():
        color = 'Y'
        color_value = df.value[df.key.str.contains('COLOR')]
        
    finish = 'N'
    if df.key.str.contains('FINISH').any():
        finish = 'Y'
        finish_value = df.value[df.key.str.contains('FINISH')]
        
    material = 'N'
    if df.key.str.contains('MATERIAL').any():
        material = 'Y'
        material_value = df.value[df.key.str.contains('MATERIAL')]
    
    if np.sum(df['value'].str.contains('NEW_DOORS')) > 0:
        cabinet_spec = 'FRONTS, DOORS'
         
        
    
    

In [174]:
f_df.to_csv("../data/interim/scope_spec_mapping.csv", header=True, index=False)

In [172]:
##[x for x in f_df.key if '_' in x]
f_df.key.unique()

array(['HINGE_HARDWARE', 'HANDLE_HARDWARE', 'FINISH', 'MATERIAL',
       'CABINET_STYLE', 'CABINETS', 'LEVEL', 'COUNTER_TOPS',
       'SINK_MATERIAL', 'BASIN_TYPE', 'SINK_BOWL_TYPE', 'FAUCET_STYLE',
       'FAUCET_HANDLES', 'FAUCET_SPOUT_TYPE', 'BACKSPALSH_MATERIAL',
       'BACKSPLASH_TYPE', 'DIMMABLE', 'BULB_TYPE', 'LIGHT_FIXTURES',
       'IN_DOOR_WATER', 'REFRIGERATOR_STYLE', 'FUEL_TYPE', 'RANGE_TYPE',
       'RANGE_STYLE', 'DISHWASHER', 'INSTALLATION_TYPE', 'MICROWAVE',
       'BATHROOMS_HINGE_HARDWARE', 'BATHROOMS_HANDLE_HARDWARE',
       'BATHROOMS_FINISH', 'BATHROOMS_MATERIAL',
       'BATHROOMS_CABINET_STYLE', 'BATHROOMS_CABINETS',
       'COUNTER_TOP_LEVEL', 'BATHROOM_COUNTER_TOPS', 'SINK_TYPE',
       'LAVATORY_SINK', 'MOUNTING_OPTION', 'BATH_FAUCET', 'TOILET',
       'LIGHT_LOCATION', 'BATH_VANITY_LIGHT',
       'BATH_CEILING_LIGHT_LOCATION', 'BATHROOM_BULB_TYPE',
       'BATH_CEILING_LIGHT', 'SCOPE', 'SHOWER@/@TUB_FINISH',
       'TRIM_FIXTURES_PAN_SURROUND_TYPE', 'TRIM_FI

In [None]:
value_mapping = {'NEW_DOORS_/_DRAWERS':'FRONTS, DOORS',
                }

key_mapping = {'NEW_DOORS_/_DRAWERS':'FRONTS, DOORS',
                }

In [93]:
pd.json_normalize(json_data[0]['scope_collection_items'][0]['scope_items'], 
                  record_path=['sub_items'], meta=['name', 'is_selected', 'scope_type'])

Unnamed: 0,_id,key,value,name,is_selected,scope_type
0,60749774008b0f001846029c,HINGE_HARDWARE,NONE,Cabinets,True,CABINETS
1,60749774008b0f001846029d,HANDLE_HARDWARE,NONE,Cabinets,True,CABINETS
2,60749774008b0f001846029e,FINISH,MELAMINE,Cabinets,True,CABINETS
3,60749774008b0f001846029f,MATERIAL,ALL_WOOD,Cabinets,True,CABINETS
4,60749774008b0f00184602a0,CABINET_STYLE,SHAKER,Cabinets,True,CABINETS
5,60749774008b0f00184602a1,CABINETS,NEW_DOORS_/_DRAWERS,Cabinets,True,CABINETS
6,60749774008b0f00184602a3,LEVEL,L2,Counter tops,True,COUNTER_TOPS
7,60749774008b0f00184602a4,COUNTER_TOPS,QUARTZ,Counter tops,True,COUNTER_TOPS
8,60749774008b0f00184602a6,SINK_MATERIAL,STAINLESS_STEEL,Sink Bowl Type,True,SINK_BOWL_TYPE
9,60749774008b0f00184602a7,BASIN_TYPE,SINGLE_BOWL,Sink Bowl Type,True,SINK_BOWL_TYPE


In [101]:
pd.DataFrame(json_data[0]['scope_collection_items'])

Unnamed: 0,_id,scope_type,is_selected,scope_items
0,60749774008b0f001846029a,KITCHEN,False,"[{'_id': '60749774008b0f001846029b', 'name': '..."
1,60749774008b0f00184602b4,APPLIANCES,False,"[{'_id': '60749774008b0f00184602b5', 'name': '..."
2,60749774008b0f00184602c1,BATHROOM,False,"[{'_id': '60749774008b0f00184602c2', 'name': '..."
3,60749774008b0f00184602ea,FLOORING,False,"[{'_id': '60749774008b0f00184602eb', 'name': '..."
4,60749774008b0f00184602f9,PAINT_AND_LIVING_FIXTURES,False,"[{'_id': '60749774008b0f00184602fa', 'name': '..."
5,60749774008b0f0018460306,REPAIR_WINDOWS,False,"[{'_id': '60749774008b0f0018460307', 'name': '..."


In [99]:
pd.DataFrame(json_data)

Unnamed: 0,_id,design_style,is_deleted,name,style_name,org_id,user_id,scope_collection_items,metadata,modification_notes,__v
0,606236b7e5e61e0011beb013,"{'name': 'Studio', 'mainImage': {'url': 'https...",False,Bid Gen Project,Studio,,a60f2a75-9e0a-4302-8f98-6e49370443b3,"[{'_id': '60749774008b0f001846029a', 'scope_ty...","{'estimated_cost': '', 'description': '', 'sum...","[{'_id': '606236b7e5e61e0011beb092', 'modified...",0


In [139]:
pd.DataFrame(json_data[0]['scope_collection_items'])['scope_items'].apply(lambda x: pd.json_normalize(x[0],
                                                            record_path=['sub_items'], 
                                                            meta=['name', 'is_selected', 'scope_type']))

0                            _id              key  ...
1                            _id                 ke...
2                            _id                   ...
3                            _id               key ...
4                            _id    key value   nam...
5                            _id                   ...
Name: scope_items, dtype: object

In [73]:
with open("../data/external/pinecone.json") as f:
    p_data = json.loads(f.read())

In [27]:
pd.json_normalize(p_data, record_path=['project_floor_plans'])

Unnamed: 0,photo_images,uuid,name,units_count,total_units_count,bathrooms_per_unit,bedrooms_per_unit,area_per_unit,scope_id
0,[b2f6614b-7cbf-48ab-bb9b-4582c20ebc8d],e8f837df-c3f3-40a7-a7eb-c2cb7a629079,Pinyon,60,100,1,1,706,
1,[9c17ce17-5c30-4ad9-bff7-30478f748a58],769b6659-a294-4e3a-bff9-6de727cddf9a,Fp-2,50,50,2,2,900,
2,[],2c3b9155-da27-4b9c-b90a-cc905b703618,Fp-1,40,40,2,2,1000,


In [69]:
json_data[0]['current_state']

{'state': 'BID_UPLOADED',
 'bid_package': [{'files': [{'uuid': '3e377b3b-af9a-48f2-9aa8-6354d6678a0d',
     'photo_bucket_path': 'tailorbirdhomes-dev',
     'photo_folder_path': 'Projects/60819acd008b0f00184617a8/cost%20summary%20pdf',
     'photo_file_name': 'projectSummary.pdf',
     'description': '',
     'photo_tag': None,
     'is_archived': False}],
   'modification_note': {'modified_on': '2021-04-22T19:48:26.994Z',
    'modification_note': 'Project state updated',
    'modified_by': ''},
   'description': 'With more pics'}],
 'modification_note': {'modified_on': '2021-04-22T19:48:27.032Z',
  'modification_note': 'Updated ProjectB2B State',
  'modified_by': ''}}

In [72]:
pd.json_normalize(json_data[0]['project_state_log'])

Unnamed: 0,state,bid_package,modification_note.modified_on,modification_note.modification_note,modification_note.modified_by
0,PROJECT_INITIATED,"[{'_id': '60819acd008b0f00184617a9', 'modifica...",2021-04-22T15:48:29.839Z,Project state updated,
1,FLOORPLAN_ADDED,"[{'_id': '60819b31008b0f00184618a8', 'files': ...",2021-04-22T15:50:09.275Z,Updated ProjectB2B State,
2,FLOORPLAN_ADDED,"[{'_id': '60819bba008b0f00184618a9', 'files': ...",2021-04-22T15:52:26.915Z,Updated ProjectB2B State,
3,FLOORPLAN_ADDED,"[{'_id': '60819bbc008b0f00184618aa', 'files': ...",2021-04-22T15:52:28.536Z,Updated ProjectB2B State,
4,SCOPE_CONFIRMED,"[{'_id': '60819d69008b0f00184618bc', 'files': ...",2021-04-22T15:59:37.293Z,Updated ProjectB2B State,
5,BID_UPLOADED,"[{'_id': '60819dd4008b0f00184618c0', 'files': ...",2021-04-22T16:01:24.357Z,Updated ProjectB2B State,
6,BID_APPROVED,"[{'_id': '60819deb008b0f00184618c2', 'files': ...",2021-04-22T16:01:47.819Z,Updated ProjectB2B State,
7,BID_UPLOADED,"[{'_id': '6081d2bb008b0f00184618cc', 'files': ...",2021-04-22T19:47:07.240Z,Updated ProjectB2B State,
8,BID_UPLOADED,"[{'_id': '6081d30b008b0f00184618ce', 'files': ...",2021-04-22T19:48:27.032Z,Updated ProjectB2B State,


In [64]:
re.search('current_state.bid_package', str(json_data[0]))

<re.Match object; span=(561, 573), match="'bid_package">