In [7]:
import glob
import os

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [8]:
class PreProcessing():
    """Load the image folder and the text file and prepare them for training.

    Image file names are parsed as ``<house>_<room>.<ext>``: the token before
    the first underscore is the house identifier and the token between the
    last underscore and the extension is the room name.
    """

    def __init__(self, im_path, txt_path):
        """Store the image directory and the path of the text (tabular) file."""
        self.im_path = im_path
        self.txt_path = txt_path

    @staticmethod
    def _parse_image_name(path):
        """Return ``(room, house)`` parsed from the file-name part of ``path``."""
        # os.path.basename honours the platform separator, so this works with
        # "/" as well as "\\" instead of assuming Windows-style paths.
        base = os.path.basename(path)
        room = base.split('_')[-1].split('.')[-2]
        house = base.split('_')[0]
        return room, house

    @staticmethod
    def _sort_key(path):
        """Sort key grouping files by house (numerically when possible), then room."""
        room, house = PreProcessing._parse_image_name(path)
        if house.isdecimal():
            return (0, int(house), room)
        # Non-numeric identifiers sort after numeric ones; the leading flag
        # guarantees an int is never compared with a str.
        return (1, house, room)

    def load_images(self, size=(32, 32)):
        """Read, resize and scale every file in the image directory.

        Args:
            size: ``(width, height)`` passed to ``cv2.resize``. Defaults to
                ``(32, 32)``.

        Returns:
            Tuple ``(im_data, rooms, house_number)`` of three parallel lists:
            the images scaled to ``[0, 1]``, the room name and the house
            identifier parsed from each file name.

        Raises:
            ValueError: if a file cannot be decoded as an image.
        """
        # glob returns files in arbitrary order; sort so that all images of a
        # house are adjacent and the result is reproducible across runs.
        # NOTE(review): presumably the text file rows follow the same house
        # order -- confirm before pairing images with rows by position.
        paths = sorted(glob.glob(os.path.join(self.im_path, "*")),
                       key=self._sort_key)
        total = len(paths)

        im_data = []
        rooms = []
        house_number = []
        for i, name in enumerate(paths):
            img = cv2.imread(name)
            # cv2.imread signals failure by returning None rather than raising.
            if img is None:
                raise ValueError(f"Could not read image file: {name}")
            # Resize and scale pixel values to [0, 1].
            # OpenCV loads images in BGR channel order; no conversion to RGB
            # is applied here.
            img = cv2.resize(img, size) / 255.0
            im_data.append(img)

            room, house = self._parse_image_name(name)
            rooms.append(room)
            house_number.append(house)

            if i % 100 == 0:
                print(f"[INFO]: {i}/{total} processed!")

        return im_data, rooms, house_number

    def read_text(self):
        """Read the space-separated text file.

        Returns:
            Tuple ``(txt_data, labels)``: a DataFrame with the columns
            ``F1``..``F4`` and a single-column DataFrame ``Price``.
        """
        df = pd.read_csv(self.txt_path, sep=" ",
                         names=["F1", "F2", "F3", "F4", "Price"])
        # The first four columns are the inputs, the last one is the label.
        txt_data = df.loc[:, ["F1", "F2", "F3", "F4"]]
        labels = df.loc[:, ["Price"]]

        return txt_data, labels

    def label_binarizer(self, labels):
        """Encode ``labels`` with a freshly fitted ``LabelBinarizer``."""
        LB = LabelBinarizer()
        all_labels = LB.fit_transform(labels)

        return all_labels

    @staticmethod
    def train_test_split(txt_data, im_data, labels, test_size=0.2, random_state=None):
        """Split text features, images and labels with one shared shuffle.

        Declared as a static method so that both
        ``PreProcessing.train_test_split(...)`` and
        ``instance.train_test_split(...)`` accept the three data arguments.

        Args:
            txt_data: tabular features, one row per sample.
            im_data: image data, one entry per sample.
            labels: targets, one entry per sample.
            test_size: fraction of samples placed in the test split.
            random_state: optional seed forwarded to scikit-learn.

        Returns:
            ``(txt_train, txt_test, img_train, img_test, labels_train,
            labels_test)``.
        """
        # A single call keeps the three inputs aligned; splitting them in
        # separate calls would shuffle each one differently.
        # Inside the method body this name resolves to the module-level
        # scikit-learn function, not to this method.
        (txt_train, txt_test,
         img_train, img_test,
         labels_train, labels_test) = train_test_split(
            txt_data, im_data, labels,
            test_size=test_size, random_state=random_state)
        return txt_train, txt_test, img_train, img_test, labels_train, labels_test

In [9]:
# Dataset locations, relative to the notebook's working directory.
im_path, txt_path = r"house_dataset", r"HousesInfo.txt"

# Build the loader, then read every image together with its room / house tags.
Data = PreProcessing(im_path, txt_path)
im_data, rooms, house_number = Data.load_images()

[INFO]: 0/21000 processed!
[INFO]: 100/21000 processed!
[INFO]: 200/21000 processed!
[INFO]: 300/21000 processed!
[INFO]: 400/21000 processed!
[INFO]: 500/21000 processed!
[INFO]: 600/21000 processed!
[INFO]: 700/21000 processed!
[INFO]: 800/21000 processed!
[INFO]: 900/21000 processed!
[INFO]: 1000/21000 processed!
[INFO]: 1100/21000 processed!
[INFO]: 1200/21000 processed!
[INFO]: 1300/21000 processed!
[INFO]: 1400/21000 processed!
[INFO]: 1500/21000 processed!
[INFO]: 1600/21000 processed!
[INFO]: 1700/21000 processed!
[INFO]: 1800/21000 processed!
[INFO]: 1900/21000 processed!
[INFO]: 2000/21000 processed!
[INFO]: 2100/21000 processed!


In [13]:
# Regroup the flat list of 32x32x3 images into blocks of four.
# -1 lets NumPy infer the number of groups instead of hard-coding 535, so the
# cell keeps working if the number of loaded images changes.
# NOTE(review): presumably each block of four is one house's images -- this
# relies on the order in which load_images returned them; confirm.
im_data = np.asarray(im_data).reshape(-1, 4, 32, 32, 3)
print(f"The shape of image Dataset is: {im_data.shape} ")

The shape of image Dataset is: (535, 4, 32, 32, 3) 


In [10]:
# Load the tabular inputs and the price column, then report their shapes.
txt_data, labels = Data.read_text()
print(f"The shape of text Datase is: {np.shape(txt_data)}")
print(f"The shape of labes is: {np.shape(labels)}")

The shape of text Datase is: (535, 4)
The shape of labes is: (535, 1)


In [11]:
# Encode the labels with the loader's LabelBinarizer helper and show one
# randomly chosen encoded row.
labels = Data.label_binarizer(labels)
sample_row = labels[np.random.randint(0, len(labels))]
print(f"a sample of binarized label is: {sample_row}")

a sample of binarized label is: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [14]:
# scikit-learn returns one (train, test) pair per input array, in input order:
# images first, then text features, then labels. The target names must follow
# that order, otherwise txt_* would hold images and img_* the text features.
img_train, img_test, txt_train, txt_test, labels_train, labels_test = train_test_split(im_data, txt_data, labels)