In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn import preprocessing

In [2]:
# Load the feature-engineered train and test frames, then stack them
# (train rows first) so both are transformed together.
# NOTE: only unpickle files you produced yourself; unpickling untrusted
# data can execute arbitrary code.
train_path = '../resources/trainFE1.pkl'
test_path = '../resources/testFE1.pkl'
df_train, df_test = pd.read_pickle(train_path), pd.read_pickle(test_path)
combine = pd.concat((df_train, df_test), sort=False)

In [3]:
# Show (rows, columns) of the stacked train + test frame.
combine.shape

(581012, 67)

Standardize features:

In [4]:
# Names of the columns to standardize. The order is significant: the scaled
# array's columns follow this order. Entries are grouped by the raw
# measurement each name appears to derive from.
stdList = [
    # Elevation group
    'Elevation',
    'Elev_Band_5',
    'Elev_Band_3',
    'Elev_GT_2600',
    # Aspect / slope group
    'Aspect',
    'Slope',
    'Slope_GT_22',
    # Horizontal distance to hydrology
    'Horizontal_Distance_To_Hydrology',
    'HDH4',
    # Vertical distance to hydrology
    'Vertical_Distance_To_Hydrology',
    'VDH_non0',
    'VDH_Sign',
    # Horizontal distance to roadways
    'Horizontal_Distance_To_Roadways',
    'HDR3',
    # Hillshade
    'Hillshade_9am_Noon',
    'Hillshade_3pm',
    # Horizontal distance to fire points
    'Horizontal_Distance_To_Fire_Points',
    'HDFP3',
]

In [5]:
# Columns that will be standardized, kept in stdList order.
df_to_scale = combine.loc[:, stdList]

# Every remaining column is passed through unscaled. The labels come from
# Index.difference, so their order is whatever that call returns rather than
# the original column order of `combine`.
not_scaled_cols = combine.columns.difference(stdList)
df_not_scaled = combine.loc[:, not_scaled_cols]

In [6]:
# Learn per-column mean / standard deviation, then apply the transform.
# NOTE(review): df_to_scale is taken from `combine`, so the scaler is fit on
# train and test rows together; confirm that letting test-set statistics
# influence the training features is intended.
scaler = preprocessing.StandardScaler().fit(df_to_scale)
data_scaled = scaler.transform(df_to_scale)

`data_scaled` contains the standardized features as an unlabeled array; the column names are restored in the next cell.

In [7]:
# Wrap the scaled array in a DataFrame, restoring the column names.
# The columns of `data_scaled` are in the same order as `stdList` (the frame
# that was scaled was selected with that list), so the list is reused
# directly instead of a hand-maintained position -> name mapping that could
# drift out of sync with it.
# The default integer row index is kept: converting the labels to strings
# would stop them from lining up with `df_not_scaled` in a column-wise concat.
df_scaled = pd.DataFrame(data_scaled, columns=stdList)

In [8]:
# Preview the first five rows of the standardized features.
df_scaled.head()

Unnamed: 0,Elevation,Elev_Band_5,Elev_Band_3,Elev_GT_2600,Aspect,Slope,Slope_GT_22,Horizontal_Distance_To_Hydrology,HDH4,Vertical_Distance_To_Hydrology,VDH_non0,VDH_Sign,Horizontal_Distance_To_Roadways,HDR3,Hillshade_9am_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,HDFP3
0,-1.297805,0.275753,-2.860756,-2.860756,-0.935157,-1.48282,-0.399164,-0.053767,0.903527,-0.796273,-3.74524,-1.205608,-1.180146,-0.998597,0.524422,0.14296,3.246283,0.705698
1,-1.319235,0.275753,-2.860756,-2.860756,-0.89048,-1.616363,-0.399164,-0.270188,0.903527,-0.899197,0.267006,-2.826843,-1.257106,-0.998597,0.584236,0.221342,3.205504,0.705698
2,-0.554907,0.275753,0.349558,0.349558,-0.148836,-0.681563,-0.399164,-0.006719,0.903527,0.318742,0.267006,0.415628,0.532212,1.001405,1.092651,-0.196691,3.126965,0.705698
3,-0.622768,0.275753,0.349558,0.349558,-0.005869,0.520322,-0.399164,-0.129044,0.903527,1.227908,0.267006,0.415628,0.474492,1.001405,1.212278,-0.536343,3.194931,0.705698
4,-1.301377,0.275753,-2.860756,-2.860756,-0.98877,-1.616363,-0.399164,-0.547771,-1.106774,-0.813427,0.267006,-2.826843,-1.256464,-0.998597,0.554329,0.195215,3.165479,0.705698


**To do**  
Join df_scaled and df_not_scaled

In [9]:
# Compare shapes: both frames must have the same number of rows before they
# can be joined column-wise.
(df_scaled.shape, df_not_scaled.shape)

((581012, 18), (581012, 49))

df_scaled has dtype float64 for every column  
df_not_scaled is mostly int64 (Cover_Type is already float64)

In [10]:
# Show the column dtypes of both frames side by side (as a tuple).
df_scaled.dtypes, df_not_scaled.dtypes

(Elevation                             float64
 Elev_Band_5                           float64
 Elev_Band_3                           float64
 Elev_GT_2600                          float64
 Aspect                                float64
 Slope                                 float64
 Slope_GT_22                           float64
 Horizontal_Distance_To_Hydrology      float64
 HDH4                                  float64
 Vertical_Distance_To_Hydrology        float64
 VDH_non0                              float64
 VDH_Sign                              float64
 Horizontal_Distance_To_Roadways       float64
 HDR3                                  float64
 Hillshade_9am_Noon                    float64
 Hillshade_3pm                         float64
 Horizontal_Distance_To_Fire_Points    float64
 HDFP3                                 float64
 dtype: object, Cover_Type                float64
 Hillshade_9am               int64
 Hillshade_9am_Noon_3pm      int64
 Hillshade_Noon              int64

In [11]:
# Cast every unscaled column to float64 so both halves share one dtype.
df_not_scaled = df_not_scaled.astype(np.float64)

Reset indices

In [12]:
# Give both frames the same fresh 0..n-1 row index (old labels discarded) so
# that rows pair up positionally when the frames are joined column-wise.
df_scaled = df_scaled.reset_index(drop=True)
df_not_scaled = df_not_scaled.reset_index(drop=True)

In [13]:
# Split the stacked frames back into their train and test portions.
# `combine` was built as concat([df_train, df_test]), so the first
# len(df_train) rows are the training rows. Deriving the boundary from the
# data avoids a hard-coded row count that silently breaks if the input changes.
n_train = len(df_train)
assert len(df_scaled) == len(df_not_scaled) == n_train + len(df_test)

# .iloc makes the positional (not label-based) slicing explicit.
df_train_scaled = df_scaled.iloc[:n_train]
df_test_scaled = df_scaled.iloc[n_train:]

df_train_not_scaled = df_not_scaled.iloc[:n_train]
df_test_not_scaled = df_not_scaled.iloc[n_train:]

In [14]:
# Re-attach the scaled and unscaled columns side by side. Rows are matched on
# the index, which both halves share after the index reset.
# `sort` must be a real boolean: None is not a documented value and recent
# pandas releases are expected to reject it (verify against the installed
# version). With identical indices nothing needs sorting, so False yields the
# same frames.
df_train_all = pd.concat([df_train_scaled, df_train_not_scaled], axis=1, sort=False)
df_test_all = pd.concat([df_test_scaled, df_test_not_scaled], axis=1, sort=False)

In [15]:
# Remove the Cover_Type column from the test frame.
df_test_all = df_test_all.drop(columns='Cover_Type')

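**Solved NaN values problem**  
The column-wise `pd.concat` matches rows by index label, so `df_scaled` and `df_not_scaled` must share the same index; mismatched labels produce NaN-filled rows. Fixes applied:  
- Converted df_not_scaled to float64  
- Reset indices in df_scaled and df_not_scaled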

In [16]:
# Persist the standardized train and test frames for the next notebook.
for frame, out_path in (
    (df_train_all, '../resources/trainSTD1.pkl'),
    (df_test_all, '../resources/testSTD1.pkl'),
):
    frame.to_pickle(out_path)

Cell for column name reference

In [17]:
# List every column of the stacked frame, in its original order, for reference.
combine.columns

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_