In [1]:
import glob
import os

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [8]:
class PreProcessing():
    """Load and prepare house images and tabular features.

    Parameters
    ----------
    im_path : str
        Directory holding the room images. File names are expected to look
        like ``<prefix>_<location>.<ext>`` where ``<location>`` is one of
        ``bathroom``, ``kitchen``, ``frontal`` or ``bedroom``.
    txt_path : str
        Space-separated text file with five unnamed columns. The first four
        are used as input features, the fifth as the price label.
    """

    # Room types recognised in file names, in the order load_images returns them.
    _LOCATIONS = ("bathroom", "kitchen", "frontal", "bedroom")
    # Headers attached to the (header-less) text file.
    _COLUMNS = ("F1", "F2", "F3", "F4", "Price")

    def __init__(self, im_path, txt_path):
        self.im_path = im_path
        self.txt_path = txt_path

    @staticmethod
    def _house_sort_key(path):
        """Sort key: numeric file-name prefix first, then the file stem."""
        stem = os.path.splitext(os.path.basename(path))[0]
        prefix = stem.split("_")[0]
        if prefix.isdigit():
            return (0, int(prefix), stem)
        return (1, 0, stem)

    def _read_frame(self):
        """Read the text file into a DataFrame with F1..F4 and Price headers."""
        return pd.read_csv(self.txt_path, sep=" ", names=list(self._COLUMNS))

    ## Step 1: Create Dataset of Images
    def load_images(self):
        """Read, resize and normalise every image found in ``im_path``.

        Each readable image is resized to 32x32 and scaled to [0, 1] by
        dividing by 255. Images stay in the channel order cv2 loads them in
        (BGR); no RGB conversion is applied.

        Returns
        -------
        tuple of four lists
            ``(bathroom_data, kitchen_data, frontal_data, bedroom_data)``.
            Files whose location is not one of the four are ignored. Files
            cv2 cannot decode are skipped with a warning, in which case the
            four lists may end up with different lengths.
        """
        per_room = {location: [] for location in self._LOCATIONS}

        # glob makes no ordering promise and a plain string sort would put
        # "10_..." before "2_...". Sorting on the numeric prefix keeps each
        # list in ascending prefix order.
        # NOTE(review): assumes the prefix is the house number and that the
        # text rows follow the same order - confirm against the dataset.
        paths = sorted(glob.glob(os.path.join(self.im_path, "*")),
                       key=self._house_sort_key)
        total = len(paths)

        for i, name in enumerate(paths):
            img = cv2.imread(name)
            if img is None:
                # cv2.imread signals an unreadable / non-image file with None.
                print(f"[WARNING]: could not read {name}, skipped!")
            else:
                # Resize and Normalize
                img = cv2.resize(img, (32, 32)) / 255.0
                # Location is the text after the last "_" of the file stem.
                stem = os.path.splitext(os.path.basename(name))[0]
                location = stem.split("_")[-1]
                if location in per_room:
                    per_room[location].append(img)

            if i % 100 == 0:
                print(f"[INFO]: {i}/{total} processed!")

        return (per_room["bathroom"], per_room["kitchen"],
                per_room["frontal"], per_room["bedroom"])

    ## Step 2: Create Dataset of Features
    def read_text(self):
        """Return the four feature columns (F1..F4) of the text file."""
        df = self._read_frame()
        # Consider the four first columns as inputs
        return df.loc[:, ["F1", "F2", "F3", "F4"]]

    ## Step 3: Create Labels
    def labels(self, df=None):
        """Return the ``Price`` column as a one-column DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame, optional
            Frame containing a ``Price`` column. When omitted, the text file
            at ``txt_path`` is read instead.
        """
        if df is None:
            df = self._read_frame()
        # Consider the last column as label
        return df.loc[:, ["Price"]]

    ## Step 4: Label binarizer
    def label_binarizer(self, labels):
        """Fit a LabelBinarizer on ``labels`` and return the encoded array.

        NOTE(review): this treats every distinct label value as its own
        class - confirm that is intended for a price column.
        """
        binarizer = LabelBinarizer()
        return binarizer.fit_transform(labels)

    ## Step 5: Train-Test Split
    @staticmethod
    def train_test_split(txt_data, bathroom_data, kitchen_data, frontal_data, bedroom_data, labels):
        """Split all six datasets 80/20 with the same shuffled row indices.

        Declared static so it can be called on the class or on an instance.
        The arrays are split in a single call with ``random_state=42``, so
        every dataset gets the same row selection, and inputs of different
        lengths are rejected by scikit-learn rather than silently
        misaligned.

        Returns
        -------
        tuple
            ``(txt_train, txt_test, bathroom_train, bathroom_test,
            kitchen_train, kitchen_test, frontal_train, frontal_test,
            bedroom_train, bedroom_test, labels_train, labels_test)``
        """
        # Inside a method body this name resolves to the module-level
        # scikit-learn function, not to this method.
        return tuple(train_test_split(
            txt_data, bathroom_data, kitchen_data, frontal_data, bedroom_data, labels,
            test_size=0.2, random_state=42))

In [9]:
im_path = r"house_dataset"
txt_path = r"HousesInfo.txt"

Data = PreProcessing(im_path, txt_path)
# load_images() returns four lists, one per room type, in this fixed order;
# unpacking them into three names would raise a ValueError.
bathroom_data, kitchen_data, frontal_data, bedroom_data = Data.load_images()
# Stack the room lists along a new axis 1 so each row holds four room views.
# np.stack raises if the lists differ in length.
# NOTE(review): assumes the i-th entry of every list belongs to the same
# house - confirm against the file ordering.
im_data = np.stack([bathroom_data, kitchen_data, frontal_data, bedroom_data], axis=1)

[INFO]: 0/21000 processed!
[INFO]: 100/21000 processed!
[INFO]: 200/21000 processed!
[INFO]: 300/21000 processed!
[INFO]: 400/21000 processed!
[INFO]: 500/21000 processed!
[INFO]: 600/21000 processed!
[INFO]: 700/21000 processed!
[INFO]: 800/21000 processed!
[INFO]: 900/21000 processed!
[INFO]: 1000/21000 processed!
[INFO]: 1100/21000 processed!
[INFO]: 1200/21000 processed!
[INFO]: 1300/21000 processed!
[INFO]: 1400/21000 processed!
[INFO]: 1500/21000 processed!
[INFO]: 1600/21000 processed!
[INFO]: 1700/21000 processed!
[INFO]: 1800/21000 processed!
[INFO]: 1900/21000 processed!
[INFO]: 2000/21000 processed!
[INFO]: 2100/21000 processed!


In [13]:
# Arrange the image data as 535 groups of 4 images, each 32x32 with 3 channels.
# NOTE(review): presumably 535 houses x 4 room views - confirm.
im_data = np.array(im_data).reshape((535, 4, 32, 32, 3))
print(f"The shape of image Dataset is: {im_data.shape} ")

The shape of image Dataset is: (535, 4, 32, 32, 3) 


In [10]:
# read_text() returns a single DataFrame with only the F1..F4 feature columns,
# so it cannot be unpacked into (features, labels).
txt_data = Data.read_text()
# labels() needs a frame that still has the "Price" column, which read_text()
# does not expose, so the file is read here with the same headers.
houses_df = pd.read_csv(txt_path, sep=" ", names=["F1", "F2", "F3", "F4", "Price"])
labels = Data.labels(houses_df)
print(f"The shape of text Datase is: {np.array(txt_data).shape}")
print(f"The shape of labes is: {np.array(labels).shape}")

The shape of text Datase is: (535, 4)
The shape of labes is: (535, 1)


In [11]:
# Replace the raw labels with their binarized encoding.
# Note: `labels` is overwritten, so this cell must run exactly once after the
# labels are loaded.
labels = Data.label_binarizer(labels)
# Show one randomly chosen encoded row as a sanity check.
sample_idx = np.random.randint(0, len(labels))
print(f"a sample of binarized label is: {labels[sample_idx]}")

a sample of binarized label is: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [14]:
# scikit-learn returns one (train, test) pair per input, in the order the
# inputs are passed: images first, then text features, then labels. The target
# names must follow that order, otherwise txt_* would hold image data.
# random_state is fixed so the split is reproducible across runs.
img_train, img_test, txt_train, txt_test, labels_train, labels_test = train_test_split(
    im_data, txt_data, labels, random_state=42
)