In [1]:
# Import Necessary libraries
import re
import pandas as pd
from collections import OrderedDict

In [8]:
# Feature extraction: convert a raw-data text file into a CSV feature file
def features_extraction(txt_file_path,save_csv_file_path):
    ''' Feature extraction: convert a raw-data text file into a CSV feature file.

        The text file is treated as a sequence of records separated by empty
        lines. Every record is scanned line by line for known key phrases and
        becomes one CSV row. A field whose phrase never appears in the record
        keeps its default value (False, 0.0, 0 or 'NA').

        ----------  Parameter ----------
        txt_file_path : Pass Raw data File Path ( source )
        save_csv_file_path : Pass Destination File Path ( Target )

        ----------  Returns ----------
        None. A header plus one row per record is written to
        save_csv_file_path (no index column).

        ----------  Raises ----------
        IndexError : a matched line does not contain a number in the expected
                     format, or a 'Date Built' line has fewer than 12
                     space-separated tokens.
    '''

    # CSV column order; must match the tuple returned by in_data().
    # NOTE: 'curse_orcerer' is kept exactly as spelled because it is the
    # column name written to the CSV.
    column_names = (
        'house_id', 'date_built', 'date_price', 'garden', 'dock', 'capital',
        'royal_market', 'guarding_tower', 'river', 'renovation', 'dining_rooms',
        'bedrooms', 'bathrooms', 'king_visit', 'curse_orcerer', 'king_blessed',
        'land_of_farm', 'location', 'holy_tree', 'knight',
    )

    def last_decimal(line):
        ''' Return the last decimal number (digits.digits) in line as a float '''
        return float(re.findall(r'\d+\.\d+', line)[-1])

    def last_integer(line):
        ''' Return the last run of digits in line as an int '''
        return int(re.findall(r'\d+', line)[-1])

    # Function return feature Values of each raw_data
    def in_data(raw_data):
        ''' Function return feature Values of each raw_data.

            raw_data is the list of lines of one record. Each line is matched
            against the key phrases below in order; only the first matching
            phrase is applied to a line.
        '''

        # Default feature values, used when a record has no matching line
        garden = renovation = king_visit = curse_orcerer = holy_tree = False
        dock = capital = royal_market = guarding_tower = river = knight = 0.0
        house_id = dining_rooms = bedrooms = bathrooms = king_blessed = 0
        location = land_of_farm = date_built = date_price = 'NA'

        for line in raw_data:
            if 'House ID' in line:
                # Kept as text: the last space-separated token of the line
                house_id = line.split(' ')[-1]
            elif 'Date Built' in line:
                # Assumes a fixed token layout: built date is token 3 and
                # priced date is token 11 -- TODO confirm against raw files
                tokens = line.split(' ')
                date_built = tokens[3]
                date_price = tokens[11]
            elif 'There a beautiful garden in the front' in line:
                # Other lines mentioning 'garden' fall through to the checks below
                garden = True
            elif 'Dock' in line:
                dock = last_decimal(line)
            elif 'Capital' in line:
                capital = last_decimal(line)
            elif 'Royal Market' in line:
                royal_market = last_decimal(line)
            elif 'Guarding Tower' in line:
                guarding_tower = last_decimal(line)
            elif 'River' in line:
                river = last_decimal(line)
            elif 'underwent' in line:
                renovation = True
            elif 'dining rooms' in line:
                dining_rooms = last_integer(line)
            elif 'bedroom' in line:
                bedrooms = last_integer(line)
            elif 'bathrooms' in line:
                bathrooms = last_integer(line)
            elif 'Visited' in line:
                king_visit = True
            elif 'cursed by sorcerer' in line:
                curse_orcerer = True
            elif 'King blessed' in line:
                king_blessed = last_integer(line)
            elif 'land of farm' in line:
                if 'huge' in line:
                    land_of_farm = "huge"
                elif 'small' in line:
                    land_of_farm = "small"
                else:
                    land_of_farm = "no"
            elif 'Location' in line:
                # Text after the last ':' is stored unchanged (not stripped)
                location = line.split(':')[-1]
            elif 'Holy tree stands' in line:
                holy_tree = True
            elif 'Distance from Knight' in line:
                knight = last_decimal(line)

        # return features, in column_names order
        return (house_id, date_built, date_price, garden, dock, capital, royal_market, guarding_tower,
                river, renovation, dining_rooms, bedrooms, bathrooms, king_visit, curse_orcerer,
                king_blessed, land_of_farm, location, holy_tree, knight)

    # read text file and store each line as a list; 'with' closes the file
    with open(txt_file_path) as txt_file:
        file_context = [line.rstrip('\n') for line in txt_file]

    records = []
    raw_data = []

    # split each record on empty lines and extract its features
    for line in file_context:
        if line != '':
            raw_data.append(line)
        elif raw_data:
            records.append(in_data(raw_data))
            raw_data = []

    # A file that does not end with an empty line still has one pending
    # record here; flush it so the last record is not lost.
    if raw_data:
        records.append(in_data(raw_data))

    # One list per column, in column_names order
    dict_csv = OrderedDict(
        (name, [record[position] for record in records])
        for position, name in enumerate(column_names)
    )

    df = pd.DataFrame.from_dict(dict_csv)
    df.to_csv(save_csv_file_path, index=False)

In [7]:
# features extraction for every raw file:
# 'Housing Prices/<name>.txt' --> 'Dataset/<name>.csv'
# One loop replaces eleven copy-pasted cells; files are processed in the
# same order as before.
source_names = [
    'Bob',
    'Bright_Brothers',
    'Masters_of_Stones',
    'The_Greens',
    'The_Kings',
    'The_Lannisters',
    'The_Ollivers',
    'The_Overlords',
    'The_Starks',
    'Wood_Priest',
    'Not_Known',
]

for source_name in source_names:
    features_extraction(
        'Housing Prices/{}.txt'.format(source_name),
        'Dataset/{}.csv'.format(source_name)
    )