## Data introduction and importing the necessary libraries

**Resolution:** 30m * 30m<br>
**Target:** Target data set<br>
**Point of Interest (POI):** POI Kernel Density, Resolution range: 250 m - 2500 m, interval = 250 m<br>
**Road Network (RN):** Road Kernel Density, Resolution range: 250 m - 2500 m, interval = 250 m<br>
**XM_Boundary:** mask of all the layers

In [1]:
import math
import json
import copy
import time
import numpy as np
import pandas as pd
from sl_1 import * # my custom module 

In [2]:
# Extract the header of the 'Target' layer and write it to begin.txt.
# NOTE(review): extrat_begin is not defined in this file; presumably it is
# provided by the custom sl_1 module imported above - confirm.
extrat_begin('Target', 'begin')

Generate header named begin.txt from layer Target


## Data preprocessing

### Read ASCII to array and generate header

In [3]:
def read_ASC_Data(file_name, ly_names):
    '''Load a group of ASCII layers into a dictionary of arrays.

       Every layer is read from "<file_name>/<layer name>.txt"; the first
       six lines of each file are skipped before the values are parsed.

       input: file_name, the folder that holds the layer files
       input: ly_names, a list of layer names (file names without ".txt")
       output: a dictionary that maps each layer name to the array loaded
               from its file
    '''
    layers = {
        layer: np.loadtxt('%s/%s.txt' % (file_name, layer), skiprows=6)
        for layer in ly_names
    }
    print('All ASCII data has been read')
    return layers

In [4]:
def get_name_sme(start,mid,end):
    '''Build the ordered layer name list: start, bandwidth layers, end.

    input: start, name of the first layer in the list
    input: mid, prefix of the bandwidth layers; the bandwidths
           250, 500, ..., 2500 are appended to it
    input: end, name of the last layer in the list
    output: a list of layer names
    '''
    bandwidth_names = [mid + str(width) for width in range(250, 2750, 250)]
    return [start] + bandwidth_names + [end]
# get_name_sme('Target','poi_all_','XM_Boundary') # test example

### Data consistency test
Check whether each layer has missing values at positions where the standard (mask) layer has data, <br>
and automatically fill them if it does.

In [5]:
def Na_Test(layer_names,stan_layer,ly_dict):
    '''Align the missing-value cells of each layer with the standard layer and verify the result.

       The arrays stored in ly_dict are modified in place. For every layer in
       layer_names:
         1. every -9999 cell of the layer is filled with 0;
         2. every cell that is -9999 in the standard layer is set to -9999.
       The layer is then checked: it must be -9999 everywhere the standard
       layer is -9999 and nowhere else. Layers that fail are printed.

       input: layer_names, a list of layer names to be fixed and checked
       input: stan_layer, the name of the standard layer whose -9999 cells
              define where every other layer must be missing
       input: ly_dict, the dict data read by read_ASC_Data function
       output: None; the outcome is reported with print
    '''
    nodata = -9999
    # Take the mask once, before any layer is touched: it does not change
    # between iterations, and it stays correct even if the standard layer
    # itself appears in layer_names.
    outside = ly_dict[stan_layer] == nodata
    inside = ~outside
    failed = []
    for name in layer_names:
        layer = ly_dict[name]
        # Fill in missing values start
        layer[layer == nodata] = 0 # Fill all missing values (-9999) with 0
        layer[outside] = nodata # Make -9999 consistent with the standardized layer
        # Fill in missing values end
        # np.all / np.any also work when a selection is empty (standard layer
        # with no -9999 cells, or with only -9999 cells), where the builtin
        # max() / min() would raise ValueError.
        masked_ok = np.all(layer[outside] == nodata)
        valid_ok = not np.any(layer[inside] == nodata)
        if not (masked_ok and valid_ok):
            failed.append(name)
    if not failed:
        print('All layers pass the inspection, no missing values exist, and are consistent with the standardized layer')
    else:
        print('The following layers have missing values')
        print(failed)
    return
### test
# test_name = get_name_sme('Target','poi_all_','XM_Boundary')
# ly_dict = copy.deepcopy(read_ASC_Data('ASCII_POI', test_name))
# Na_Test(test_name[:-1],test_name[-1],ly_dict)

### Dimension reduction

In [6]:
def revel_array(dict_one):
    '''Flatten every layer to one dimension and add a running ID column.

       input: dict_one, a dictionary whose key is the layer name and the value is stored in an array format
       output: A new dict whose values are one-dimensional copies of the
               input arrays (flattened row by row), plus the key 'ID'
               holding 0, 1, ..., n-1 where n is the length of the first
               flattened layer (0 when dict_one is empty).
               dict_one and its arrays are left untouched.
    '''
    # np.array() copies, so the flattened arrays never share memory with the input
    flat = {
        key: np.array(value).ravel(order='C') # Expand to one dimension by row
        for key, value in dict_one.items()
    }
    # An empty input has no layer to take the length from
    num_rows = len(next(iter(flat.values()))) if flat else 0
    # np.arange builds the index directly instead of going through a Python list
    flat['ID'] = np.arange(num_rows) # Build index with name ID
    return flat

# ly_dict_reval = revel_array(ly_dict)
# ly_dict_reval
# len(ly_dict_reval['Target']) # show total number of records

### Conversion to DataFrame and export to CSV

In [None]:
def ly_df_csv(start,mid,end):
    '''
    Read one family of layers, align their missing values and write them as one csv table.

    input: start, start name of layer name list
    input: mid, middle name prefix of layer name list; 'poi_all_' and
           'poi_sel_' are read from the folder ASCII_POI, any other prefix
           from ASCII_RN
    input: end, end name of layer name list; this layer is used as the
           standard layer in Na_Test
    output: None; writes data\bw_<mid without its last character>.csv with
            one column per layer plus the ID column added by revel_array
    '''
    ly_names = get_name_sme(start,mid,end)
    name_asc = 'ASCII_POI' if mid in ('poi_all_', 'poi_sel_') else 'ASCII_RN'
    # read_ASC_Data returns freshly loaded arrays that nothing else refers to,
    # so Na_Test can fix them in place without a defensive deep copy
    ly_dict = read_ASC_Data(name_asc, ly_names) # read ASCII data
    Na_Test(ly_names[:-1], ly_names[-1], ly_dict)
    ly_dict_reval = revel_array(ly_dict) # Dimension reduction
    ly_df = pd.DataFrame(ly_dict_reval) # columns follow the dict key order
    write_name = 'bw_' + mid[:-1]
    # NOTE(review): the backslash makes this path Windows-specific. BW_POI_RN
    # reads the files back with the same literal, so change both together.
    ly_df.to_csv(r'data\%s.csv' % (write_name),index = False)
    return
# start = 'Target'
# end = 'XM_Boundary'
# mid = 'poi_all_'
# ly_df_csv(start,mid,end)

### Write the three CSV files
- bw_poi_all
- bw_poi_sel
- bw_RN

In [16]:
# This step takes about five minutes overall: one csv file is written per layer family
start, end = 'Target', 'XM_Boundary'
for mid in ('poi_all_', 'poi_sel_', 'RN_'):
    ly_df_csv(start, mid, end)

## Classification of Urban Built-up Areas

### import sklearn libraries and read data

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# import graphviz 
# import pydotplus
from sklearn import tree
from sklearn.metrics import confusion_matrix

### The effect of bandwidth of POI and Road network on accuracy

In [3]:
def get_acc(ind, node_num, input_data,Train,Test):
    '''
    Return the F1 score of a decision tree trained on a single feature column.

    Despite the function name, the value returned is the F1 score, not the
    accuracy.

    input: ind, position of the feature column in input_data
    input: node_num, max_leaf_nodes of the decision tree
    input: input_data, dataframe that is only used to look up the name of column ind
    input: Train, dataframe used to fit the tree; must contain the feature column and 'Target'
    input: Test, dataframe used to score the tree; same columns as Train
    output: F1 score on Test, or 0.0 when the tree produces no true positives
    '''
    y_col = 'Target'
    x_features = input_data.columns[[ind]] # one-element Index with the feature name
    X_Train = Train[x_features]
    X_Test = Test[x_features]
    Y_Train = Train[[y_col]]
    Y_Test = Test[[y_col]] # Get sub dataframe
    clf = DecisionTreeClassifier(max_leaf_nodes=node_num, random_state=1)
    clf.fit(X_Train, Y_Train)
    predictions = clf.predict(X_Test)
    # The unpacking expects a 2 x 2 confusion matrix, i.e. a binary Target
    tn, fp, fn, tp = confusion_matrix(y_true = Y_Test, y_pred = predictions).ravel()
    if tp == 0:
        # Without true positives precision and recall are 0 or 0/0, and the
        # formula below would evaluate to NaN. Report 0.0 instead, which is
        # also what sklearn.metrics.f1_score returns by default in this case.
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    score = 2*precision*recall/(precision+recall)
    return score
# [get_acc(i,2) for i in range(1,11)]
# max_leaf_nodes=2, random_state=1

### results to csv

In [5]:
def BW_POI_RN():
    '''
    Score every bandwidth of the POI and road network layers and write the scores to csv.

    For each of data\bw_poi_all.csv, data\bw_poi_sel.csv and data\bw_RN.csv
    the rows whose Target is -9999 are dropped, the rest is split into a
    train and a test part, and get_acc is evaluated once per bandwidth
    column with max_leaf_nodes = 2.

    output: None; writes data\BandWidth_POI_RN.csv with the columns
            BandWidth, POI_All, POI_Sel and RN
    '''
    bandwidths = list(range(250,2750,250))
    results = {'BandWidth': bandwidths}
    sources = [('bw_poi_all', 'POI_All'), ('bw_poi_sel', 'POI_Sel'), ('bw_RN', 'RN')]
    for csv_name, column_name in sources:
        ly_df = pd.read_csv(r'data\%s.csv' % (csv_name))
        # Boolean indexing already yields a new frame and nothing below
        # writes to it, so no extra copies are needed
        clean_data = ly_df[ly_df['Target'] != -9999]
        # Perform Test and Train split
        Train, Test = train_test_split(clean_data, test_size=0.33, random_state=160)
        # Columns 1..len(bandwidths) are taken as the bandwidth layers; this
        # assumes the layout written by ly_df_csv (Target first, then one
        # column per bandwidth) - verify if the csv files come from elsewhere
        results[column_name] = [
            get_acc(i, 2, clean_data, Train, Test)
            for i in range(1, len(bandwidths) + 1)
        ]
    df_POI_RN = pd.DataFrame(results)
    df_POI_RN.to_csv(r'data\BandWidth_POI_RN.csv',index = False)
# Run the evaluation: reads the three data\bw_*.csv files and writes data\BandWidth_POI_RN.csv
BW_POI_RN()