In [1]:
import pandas as pd
import json

In [2]:
# Load the scraped house-plan records. The encoding is pinned to UTF-8 (the
# standard JSON encoding) so the result does not depend on the platform's
# default locale codec.
with open('./all_scraped_data.json', encoding='utf-8') as file:
    data = json.load(file)

# Show only the first record to inspect its structure without dumping the file.
data[:1]

[{'url': 'https://www.houseplans.com/plan/2047-square-feet-4-bedroom-3-bathroom-2-garage-country-craftsman-farmhouse-ranch-sp346871',
  'image_urls': ['houseplan_images/url_1/image_1.jpg',
   'houseplan_images/url_1/image_2.jpg',
   'houseplan_images/url_1/image_3.jpg',
   'houseplan_images/url_1/image_4.jpg',
   'houseplan_images/url_1/image_5.jpg',
   'houseplan_images/url_1/image_6.jpg',
   'houseplan_images/url_1/image_7.jpg',
   'houseplan_images/url_1/image_8.jpg'],
  'categories': {'Basic Features': ['Bedrooms : 4',
    'Baths : 3',
    'Stories: 1',
    'Garages: 2'],
   'Dimension': ['Depth : 62\' 10"', 'Height : 29\' 3"', "Width : 55'"],
   'Area': ['Garage : 467 sq/ft',
    'Main Floor : 2047 sq/ft',
    'Porch : 423 sq/ft'],
   'Ceiling': ["Garage Ceiling : 9'", "Main Ceiling : 9'"],
   'Roof': ['Primary Pitch : 7:12',
    'Roof Type : stick',
    'Secondary Pitch : 3:12'],
   'Exterior Wall Framing': ['Exterior Wall Finish : siding', 'Framing : 2x4'],
   'Bedroom Features'

In [4]:
def _parse_features(items):
    """Turn ``"Name : value"`` strings into a ``{name: value}`` dict.

    Each item is split on its FIRST colon only, so a value that itself
    contains a colon (e.g. a ratio such as ``7:12``) is kept intact instead
    of being truncated. Items without any colon are skipped rather than
    raising ``IndexError``.

    Parameters
    ----------
    items : iterable of str
        Raw feature strings such as ``"Bedrooms : 4"``.

    Returns
    -------
    dict
        Mapping of stripped feature name to stripped feature value.
    """
    parsed = {}
    for item in items:
        key, sep, value = item.partition(':')
        if sep:
            parsed[key.strip()] = value.strip()
    return parsed


# Extracting the relevant data and forming a structured dictionary
structured_data = []
for entry in data:
    # The image directory is the second path component of the first image,
    # e.g. "url_1" in "houseplan_images/url_1/image_1.jpg".
    if entry["image_urls"]:
        url_directory = entry["image_urls"][0].split('/')[1]
    else:
        url_directory = "No URL"

    # A missing category simply contributes no columns for this record.
    categories = entry["categories"]
    basic_features = _parse_features(categories.get("Basic Features", []))
    dimensions = _parse_features(categories.get("Dimension", []))

    structured_data.append({
        "Image Directory": url_directory,
        **basic_features,
        **dimensions
    })

# Converting the structured data into a pandas DataFrame
df = pd.DataFrame(structured_data)

# Displaying the DataFrame
df.head()

Unnamed: 0,Image Directory,Bedrooms,Baths,Stories,Garages,Depth,Height,Width
0,url_1,4,3.0,1,2,"62' 10""","29' 3""",55'
1,url_2,3,2.5,1,2,"47' 8""","25' 1""",45'
2,url_3,4,3.5,1,3,"69' 10""","29' 9""","83' 1"""
3,url_4,1,1.0,1,3,45',"29' 10""",66'
4,url_5,1,1.0,1,2,38',"26' 7""",48'


In [5]:
# Summarise each column; every column is still object dtype at this point,
# so the summary reports count / unique / top / freq rather than numeric stats.
df.describe()

Unnamed: 0,Image Directory,Bedrooms,Baths,Stories,Garages,Depth,Height,Width
count,1004,995,995,995,995,995,953,995
unique,996,8,14,4,9,272,224,260
top,No URL,3,2,2,2,50',26',40'
freq,9,467,318,529,440,34,34,61


In [6]:
# Inspect dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Image Directory  1004 non-null   object
 1   Bedrooms         995 non-null    object
 2   Baths            995 non-null    object
 3   Stories          995 non-null    object
 4   Garages          995 non-null    object
 5   Depth            995 non-null    object
 6   Height           953 non-null    object
 7   Width            995 non-null    object
dtypes: object(8)
memory usage: 62.9+ KB


In [7]:
# Missing-value policy for this example: discard every row that has at least
# one NaN (axis=0 / how="any" are the pandas defaults, spelled out for clarity).
df = df.dropna(axis=0, how="any")

In [8]:
# Count the remaining missing values per column.
df.isna().sum()

Image Directory    0
Bedrooms           0
Baths              0
Stories            0
Garages            0
Depth              0
Height             0
Width              0
dtype: int64

In [9]:
# Re-check row count and dtypes after the rows with missing values were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 953 entries, 0 to 1003
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Image Directory  953 non-null    object
 1   Bedrooms         953 non-null    object
 2   Baths            953 non-null    object
 3   Stories          953 non-null    object
 4   Garages          953 non-null    object
 5   Depth            953 non-null    object
 6   Height           953 non-null    object
 7   Width            953 non-null    object
dtypes: object(8)
memory usage: 67.0+ KB


In [10]:
# Remove non-numeric characters and convert to float
import numpy as np
def convert_to_float(value):
    """Convert a scraped feature value to a float.

    Strings may be plain numbers (``"4"``, ``"2.5"``, ``"1,200"``) or
    feet-and-inches measurements (``62' 10"``, ``55'``, ``6"``). Measurements
    are returned in decimal feet, i.e. ``feet + inches / 12``.

    The foot mark (``'``) is used to separate feet from inches, so the result
    does not depend on whitespace between the two parts (``62'10"`` and
    ``62' 10"`` are equal) and an inches-only value is not mistaken for feet.

    Parameters
    ----------
    value : str or number
        The raw value. Non-string values are passed straight to ``float``.

    Returns
    -------
    float
        The numeric value (decimal feet for feet-and-inches input).

    Raises
    ------
    ValueError
        If the text cannot be interpreted as a number or measurement.
    """
    if not isinstance(value, str):
        return float(value)

    # Thousands separators carry no information for float().
    text = value.replace(",", "").strip()

    if "'" in text or '"' in text:
        feet_text, foot_mark, remainder = text.partition("'")
        if not foot_mark:
            # No foot mark at all: the whole value is a number of inches.
            feet_text, remainder = "", text
        inches_text = remainder.replace('"', "").strip()
        feet = float(feet_text) if feet_text.strip() else 0.0
        inches = float(inches_text) if inches_text else 0.0
        return feet + inches / 12

    # Unmarked "feet inches" pairs (two whitespace-separated numbers).
    parts = text.split()
    if len(parts) == 2:
        return float(parts[0]) + float(parts[1]) / 12
    return float(text)


In [11]:
# Every feature column was parsed as text, so cast each one to a numeric value.
columns_to_convert = [
    "Bedrooms", "Baths", "Stories", "Garages",
    "Depth", "Height", "Width",
]
for column in columns_to_convert:
    # Missing entries stay NaN; everything else goes through convert_to_float.
    df[column] = df[column].apply(
        lambda x: np.nan if pd.isnull(x) else convert_to_float(x)
    )


In [12]:
# Verify the conversion: the seven feature columns should now be float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 953 entries, 0 to 1003
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Image Directory  953 non-null    object 
 1   Bedrooms         953 non-null    float64
 2   Baths            953 non-null    float64
 3   Stories          953 non-null    float64
 4   Garages          953 non-null    float64
 5   Depth            953 non-null    float64
 6   Height           953 non-null    float64
 7   Width            953 non-null    float64
dtypes: float64(7), object(1)
memory usage: 67.0+ KB


In [13]:
# Preview the converted frame; feet-and-inches dimensions now appear as decimal feet.
df.head()

Unnamed: 0,Image Directory,Bedrooms,Baths,Stories,Garages,Depth,Height,Width
0,url_1,4.0,3.0,1.0,2.0,62.833333,29.25,55.0
1,url_2,3.0,2.5,1.0,2.0,47.666667,25.083333,45.0
2,url_3,4.0,3.5,1.0,3.0,69.833333,29.75,83.083333
3,url_4,1.0,1.0,1.0,3.0,45.0,29.833333,66.0
4,url_5,1.0,1.0,1.0,2.0,38.0,26.583333,48.0
