In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble   import RandomForestClassifier
from sklearn.svm import NuSVC

In [2]:
# Notebook display configuration: allow up to 500 columns and a 1000-character
# display width so the 55-column frame can be inspected in full.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# None disables truncation of cell text. The former -1 sentinel was deprecated
# in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the raw Covertype table. The file has no header row (header=None), so
# pandas labels the columns with integers; they are renamed further below.
# Forward slashes are used so the relative path resolves on both Windows and
# POSIX systems (a backslash is not a path separator on POSIX).
df = pd.read_csv(
    filepath_or_buffer='../data/covtype.data',
    sep=',',
    header=None
)

In [4]:
# Preview the first rows of the freshly loaded frame (columns still carry
# their default integer labels at this point).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [5]:
# Positional column index -> descriptive column name for the header-less file.
# The position of each name in the list below is its column index in the data.
columns_map = dict(enumerate([

    # 10 quantitative variables (indices 0-9)
    'elevation', 'aspect', 'slope',
    'hor_dist_to_hydr', 'vert_dist_to_hydr', 'hor_dist_to_roadway',
    'hillshade_9am', 'hillshade_noon', 'hillshade_3pm',
    'hor_dist_to_ignition',

    # 4 binary wilderness areas (indices 10-13)
    'rawah', 'neota', 'comanche', 'cache_la_poudre',

    # Soil types 1 to 40 (indices 14-53), named by their USFS Ecological
    # Landtype Unit (ELU) code for this study area
    '2702', '2703', '2704', '2705', '2706', '2717', '3501', '3502', '4201', '4703',
    '4704', '4744', '4758', '5101', '5151', '6101', '6102', '6731', '7101', '7102',
    '7103', '7201', '7202', '7700', '7701', '7702', '7709', '7710', '7745', '7746',
    '7755', '7756', '7757', '7790', '8703', '8707', '8708', '8771', '8772', '8776',

    # Forest cover type class (index 54)
    'cover_type',

]))

In [6]:
# Swap the integer column labels for the descriptive names in columns_map.
df = df.rename(columns=columns_map)

In [7]:
# Preview the first rows again to check the descriptive column names.
df.head()

Unnamed: 0,elevation,aspect,slope,hor_dist_to_hydr,vert_dist_to_hydr,hor_dist_to_roadway,hillshade_9am,hillshade_noon,hillshade_3pm,hor_dist_to_ignition,rawah,neota,comanche,cache_la_poudre,2702,2703,2704,2705,2706,2717,3501,3502,4201,4703,4704,4744,4758,5101,5151,6101,6102,6731,7101,7102,7103,7201,7202,7700,7701,7702,7709,7710,7745,7746,7755,7756,7757,7790,8703,8707,8708,8771,8772,8776,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


In [8]:
# Euclidean distance to hydrology: combine the horizontal and vertical
# offsets as sqrt(horizontal^2 + vertical^2).
horizontal_sq = df['hor_dist_to_hydr'].pow(2)
vertical_sq = df['vert_dist_to_hydr'].pow(2)
df['dist_to_hydr'] = np.sqrt(horizontal_sq + vertical_sq)

In [11]:
# The two component distances are replaced by the combined 'dist_to_hydr'.
df = df.drop(columns=['hor_dist_to_hydr', 'vert_dist_to_hydr'])

In [12]:
# Print the frame summary (row count, column dtypes, non-null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 54 columns):
elevation               581012 non-null int64
aspect                  581012 non-null int64
slope                   581012 non-null int64
hor_dist_to_roadway     581012 non-null int64
hillshade_9am           581012 non-null int64
hillshade_noon          581012 non-null int64
hillshade_3pm           581012 non-null int64
hor_dist_to_ignition    581012 non-null int64
rawah                   581012 non-null int64
neota                   581012 non-null int64
comanche                581012 non-null int64
cache_la_poudre         581012 non-null int64
2702                    581012 non-null int64
2703                    581012 non-null int64
2704                    581012 non-null int64
2705                    581012 non-null int64
2706                    581012 non-null int64
2717                    581012 non-null int64
3501                    581012 non-null int64
3502           

In [13]:
# Combined hillshade feature: Euclidean norm of the three time-of-day readings.
hillshade_cols = ['hillshade_9am', 'hillshade_noon', 'hillshade_3pm']
# skipna=False keeps the same missing-value propagation as plain `+` addition.
hillshade_sq_total = df[hillshade_cols].pow(2).sum(axis=1, skipna=False)
df['hillshade'] = np.sqrt(hillshade_sq_total)

In [14]:
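# NOTE(review): dropping the three raw hillshade columns is disabled, so they are
# kept alongside the derived 'hillshade' feature — confirm this is intended.
# df.drop(['hillshade_9am','hillshade_noon','hillshade_3pm'], axis=1, inplace=True)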

In [15]:
# Persist the frame while the wilderness/soil indicators are still one-hot
# encoded. Forward slashes keep the relative path valid on both Windows and
# POSIX; with backslashes, POSIX would create a file literally named
# '..\data\OHE.pkl' in the working directory.
df.to_pickle('../data/OHE.pkl')

In [16]:
# Collapse the four wilderness indicator columns into one label column:
# idxmax along axis 1 returns, for each row, the name of the column holding
# the row maximum. The indicator columns are then removed.
wilderness_cols = ['rawah', 'neota', 'comanche', 'cache_la_poudre']
df['wilderness_area'] = df[wilderness_cols].idxmax(axis=1)
df = df.drop(columns=wilderness_cols)

In [17]:
# ELU codes of the 40 soil indicator columns, in file order.
soil = ['2702', '2703', '2704', '2705', '2706', '2717', '3501', '3502', '4201', '4703',
        '4704', '4744', '4758', '5101', '5151', '6101', '6102', '6731', '7101', '7102',
        '7103', '7201', '7202', '7700', '7701', '7702', '7709', '7710', '7745', '7746',
        '7755', '7756', '7757', '7790', '8703', '8707', '8708', '8771', '8772', '8776']
# Collapse the soil indicators into one column holding, per row, the name
# (ELU code) of the column with the row maximum.
df['soil'] = df[soil].idxmax(axis=1)
# Per the Covertype dataset description, the FIRST ELU digit is the climatic
# zone and the SECOND digit is the geologic zone. The original code had the
# two digits swapped between these columns.
df['geo_soil'] = df['soil'].astype(str).str[1]
df['climate_soil'] = df['soil'].astype(str).str[0]
# The one-hot soil columns are now redundant.
df.drop(soil, axis=1, inplace=True)

In [18]:
# Persist the frame with wilderness area and soil collapsed into label
# columns. Forward slashes keep the relative path valid on both Windows and
# POSIX; with backslashes, POSIX would create a file literally named
# '..\data\no_OHE.pkl' in the working directory.
df.to_pickle('../data/no_OHE.pkl')