# Dataset Preparation

This notebook provides translation and preliminary data cleaning for our original dataset.

#### First we import the Python libraries and load the original dataset

In [1]:
import pandas as pd
import numpy as np

# Read the raw export that still carries the Chinese category values.
Original_Dataset = pd.read_csv("2020_H2_Resident_CHI.csv")

# First remove the two columns sitting at positions 5 and 6 (selected by position, not by name).
positional_columns = list(Original_Dataset.columns[5:7])
Original_Dataset = Original_Dataset.drop(columns=positional_columns)

# Then remove the named columns that are not carried over to the translated dataset.
unused_columns = [
    "land sector position building sector house number plate",
    "the use zoning or compiles and checks",
    "main use",
    "shifting level",
    "the note",
    "serial number",
]
Original_Dataset = Original_Dataset.drop(columns=unused_columns)

# Empty frame that the translation cell fills column by column.
Transformed_Dataset = pd.DataFrame()

This cell translates each column of the original dataset.

In [10]:
#Translation of district column

Transformed_Dataset["district"] = Original_Dataset["district"].map({
    "文山區": "Wenshan District",
    "中正區": "ZhongZheng District",
    "內湖區": "Neihu District",
    "萬華區": "Wanhua District",
    "中山區": "Zhongshan District",
    "南港區": "Nangang District",
    "大同區": "Datong District",
    "松山區": "Songshan District",
    "大安區": "Daan District",
    "信義區": "Xinyi District",
    "北投區": "Beitou District",
    "士林區": "Shilin District",
})

# Transaction type: only the three categories listed here are translated.
Transformed_Dataset["transaction type"] = Original_Dataset["transaction type"].map({
    "房地(土地+建物)": "Land+Building",
    "房地(土地+建物)+車位": "Land+Building+Garage",
    "建物": "Building",
})

# Land area: values are copied as-is; only the column name is shortened.
Transformed_Dataset["land shifting total area"] = Original_Dataset["land shifting total area square meter"]

#Translation of transaction year column
def get_year(a):
    """Return the leading year digits of a numeric date value, or None.

    The value is converted to an integer, the last four digits of its decimal
    text are dropped, and whatever remains is returned as an int
    (for example 1090701 -> 109).
    NOTE(review): presumably the dropped digits are month and day (MMDD) and
    the year is left in the calendar used by the source data - confirm.

    Parameters
    ----------
    a : any value accepted by int()
        The combined date value taken from the original column.

    Returns
    -------
    int or None
        None when the value cannot be converted (NaN, None, non-numeric text)
        or when it has four digits or fewer, so no year part remains.
    """
    try:
        return int(str(int(a))[:-4])
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no year". A bare except would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return None
    
# Derive the year of each transaction from the combined date column.
transaction_dates = Original_Dataset['transaction year month and day']
Transformed_Dataset["transaction year"] = transaction_dates.apply(get_year)

#Translation of total pen number column and divides the information into three additional columns
# Accumulators filled by get_nums, one entry per processed row.
num_building = []
num_land = []
num_garage = []
def get_nums(item):
    """Parse one 'transaction pen number' value and record its three counts.

    The land count is the run of digits immediately before "建物", the
    building count is the text between "建物" and "車位", and the garage
    count is the text after "車位".
    NOTE(review): assumes values look like "土地1建物1車位0" - confirm
    against the raw data.

    The counts are appended to the module-level lists num_land, num_building
    and num_garage; nothing is returned.

    Parameters
    ----------
    item : str
        Raw cell value.

    Raises
    ------
    ValueError
        If any of the three counts is missing or not a whole number.
    IndexError
        If "建物" or "車位" is missing from the text.
    """
    land_text = item.split("建物")[0]
    building_text = item.split("車位")[0].split("建物")[1]
    garage_text = item.split("車位")[1]

    # Take the whole trailing run of digits. Reading only the last character
    # would turn a land count of 12 into 2.
    digits_start = len(land_text)
    while digits_start > 0 and land_text[digits_start - 1].isdecimal():
        digits_start -= 1

    # Convert everything before appending, so a malformed row cannot leave
    # the three lists with different lengths.
    land_count = int(land_text[digits_start:])
    building_count = int(building_text)
    garage_count = int(garage_text)

    num_land.append(land_count)
    num_building.append(building_count)
    num_garage.append(garage_count)

# Parse every row; get_nums appends its results to the three module-level lists.
Original_Dataset['transaction pen number'].apply(get_nums)

# Attach the collected counts as new columns, in the order building, land, garage.
for count_column, collected_counts in (
    ("num_building", num_building),
    ("num_land", num_land),
    ("num_garage", num_garage),
):
    Transformed_Dataset[count_column] = collected_counts

#Translation of total floor number column
def get_chi_num(item):
    """Return the text that precedes the first "層" in a value.

    For example "十二層" gives "十二". Text that contains no "層" is returned
    unchanged.

    Parameters
    ----------
    item : str
        Raw cell value.

    Returns
    -------
    str or None
        None when the value is not text that can be split (for example a
        missing value stored as a float NaN).
    """
    try:
        return item.split("層")[0]
    except (AttributeError, TypeError):
        # Non-text values have no usable split(). Anything else, including
        # KeyboardInterrupt, is allowed to propagate instead of being hidden.
        return None
    
def _chinese_numeral(number):
    """Return the Chinese numeral text for an integer from 1 to 99."""
    unit_chars = "一二三四五六七八九"
    tens, units = divmod(number, 10)
    text = ""
    if tens == 1:
        # 10-19 are written without a leading "一": "十", "十一", ...
        text = "十"
    elif tens > 1:
        text = unit_chars[tens - 1] + "十"
    if units:
        text += unit_chars[units - 1]
    return text

# Lookup from Chinese numeral text to its integer value.
# Entries 1-50 are the same as the former hand-typed table, in the same order.
# 51-99 are added so that taller buildings are no longer mapped to NaN.
chi_num_dict = {_chinese_numeral(number): number for number in range(1, 100)}

# Keep the numeral text in front of "層", then look it up in chi_num_dict.
floor_text = Original_Dataset['total floor number']
chi_num = floor_text.apply(get_chi_num)
Transformed_Dataset["total floor number"] = chi_num.map(chi_num_dict)

#Translation of building state column
# Lookup from the original building-state category to its English label.
building_state_dict = {
    "透天厝": "House",
    "公寓(5樓含以下無電梯)": "Apartment_5storey",
    "華廈(10層含以下有電梯)": "Apartment_5to10storey",
    "住宅大樓(11層含以上有電梯)": "Apartment_11storeyorgreater",
    "套房(1房1廳1衛)": "Suite",
    "店面(店鋪)": "Storefront",
    "辦公商業大樓": "Commercial Building",
    "其他": "Other",
}

building_state_raw = Original_Dataset["building state"]
Transformed_Dataset["building state"] = building_state_raw.map(building_state_dict)

#Translation of building material column
# Lookup from the original building-material category to its English label.
# Two source categories ("見其他登記事項" and "見使用執照") both map to "Other".
building_material_dict = {
    '鋼筋混凝土造': "Reinforced Concrete",
    "加強磚造": "Strengthened Brickwork",
    "鋼骨鋼筋混凝土造": "Steel Reinforced Concrete",
    "見其他登記事項": "Other",
    "磚造": "Brickwork",
    "壁式預鑄鋼筋混凝土造": "Precast Reinforced Concrete",
    "見使用執照": "Other",
    "木造": "Wood",
    "石造": "Stone",
    "鋼筋混凝土加強磚造": "Reinforced Concrete and Strengthened Brickwork",
    "預力混凝土造": "Prestressed Concrete",
    "鋼骨混凝土造": "Steel Construction",
    "土造": "Clay",
}

building_material_raw = Original_Dataset["main building materials"]
Transformed_Dataset["main building materials"] = building_material_raw.map(building_material_dict)

#Translation of construction to complete the years column
Transformed_Dataset["complete year"] = Original_Dataset['construction to complete the years'].apply(get_year)

# Columns copied across unchanged, as (new name, original name) pairs.
for new_name, original_name in (
    ('building shifting total area', 'building shifting total area'),
    ('num_room', 'Building present situation pattern - room'),
    ('num_bathroom', 'building present situation pattern - health'),
):
    Transformed_Dataset[new_name] = Original_Dataset[original_name]

# Yes/no columns: "有" becomes 1 and "無" becomes 0.
for new_name, original_name in (
    ('compartment', 'building present situation pattern - compartmented'),
    ('management org', 'Whether there is manages the organization'),
):
    Transformed_Dataset[new_name] = Original_Dataset[original_name].map({"有":1,"無":0})

#Translation of carpark type column
# Lookup from the original carpark category to its English label.
# A missing category (NaN) is labelled "No carpark".
# "機械" pairs with "Machinery" and "平面" with "Plane", matching the two
# "坡道" entries; the two "升降" labels were previously swapped.
carpark_dict = {
    np.nan: "No carpark",
    '坡道平面': "Ramp Plane",
    '坡道機械': "Ramp Machinery",
    '升降機械': "Lifting Machinery",
    '升降平面': "Lifting Plane",
    '塔式車位': "Tower",
    '其他': "other",
    '一樓平面': "Ground Floor",
}
Transformed_Dataset['carpark type'] = Original_Dataset['the berth category'].map(carpark_dict)

# Carpark and area columns copied across unchanged, as (new name, original name) pairs.
for new_name, original_name in (
    ('carpark shifting area', 'berth shifting total area square meter'),
    ('carpark total price', 'the berth total price NTD'),
    ('main building area', 'main building area'),
    ('subsidiary building area', 'subsidiary building area'),
    ('balcony area', 'balcony area'),
):
    Transformed_Dataset[new_name] = Original_Dataset[original_name]

# Elevator flag: "有" becomes 1 and "無" becomes 0.
Transformed_Dataset['elevator'] = Original_Dataset['elevator'].map({"有":1,"無":0})

# Price columns copied across unchanged; only the names are shortened.
for new_name, original_name in (
    ('total ntd', 'total price NTD'),
    ('unit ntd', 'the unit price (NTD / square meter)'),
):
    Transformed_Dataset[new_name] = Original_Dataset[original_name]

#### Export transformed dataset to a csv file

In [12]:
# Write the translated table to the working directory; the row index is left out of the file.
Transformed_Dataset.to_csv(path_or_buf="2020_H2_Resident_ENG.csv", index=False)

### Reference

Source of data: Ministry of the Interior, Republic of China

https://plvr.land.moi.gov.tw/DownloadOpenData