# CMF Data Analysis Training

## Initialisation

In [5]:
import pandas as pd
import numpy as np
%matplotlib inline

## Loading and preparing data

In [6]:
# Read the training set, the test set and the sample submission from ../data.
csv_paths = ('../data/train.csv', '../data/test.csv', '../data/sample.csv')
train, test, sample = map(pd.read_csv, csv_paths)

In [7]:
# Report (rows, columns) for each loaded frame, one line per frame.
labelled_frames = (
    ('Train shape: ', train),
    ('Test shape: ', test),
    ('Sample shape ', sample),
)
for label, frame in labelled_frames:
    print(label, frame.shape)

Train shape:  (551012, 57)
Test shape:  (30000, 55)
Sample shape  (30000, 8)


In [8]:
# Column names, non-null counts and dtypes of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551012 entries, 0 to 551011
Data columns (total 57 columns):
Unnamed: 0                            551012 non-null int64
Cover_Type                            551012 non-null object
ID                                    551012 non-null int64
Elevation                             551012 non-null int64
Aspect                                551012 non-null int64
Slope                                 551012 non-null int64
Horizontal_Distance_To_Hydrology      551012 non-null int64
Vertical_Distance_To_Hydrology        551012 non-null int64
Horizontal_Distance_To_Roadways       551012 non-null int64
Hillshade_9am                         551012 non-null int64
Hillshade_Noon                        551012 non-null int64
Hillshade_3pm                         551012 non-null int64
Horizontal_Distance_To_Fire_Points    551012 non-null int64
Wilderness_Area_0                     551012 non-null int64
Wilderness_Area_1                     551012 non

#### Data Fields

* Elevation - Elevation in meters
* Aspect - Aspect in degrees azimuth
* Slope - Slope in degrees
* Horizontal_Distance_To_Hydrology - Horizontal distance to the nearest surface water feature
* Vertical_Distance_To_Hydrology - Vertical distance to the nearest surface water feature
* Horizontal_Distance_To_Roadways - Horizontal distance to the nearest roadway
* Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice
* Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice
* Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice
* Horizontal_Distance_To_Fire_Points - Horizontal distance to the nearest wildfire ignition point
* Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation
* Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation
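* Cover_Type (7 types; documented as integers 1 to 7, but stored in `train.csv` as species-name strings such as `Aspen` or `Lodgepole Pine`) - Forest Cover Type designation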

In [9]:
# Preview the first five training rows.
train.head()

Unnamed: 0.1,Unnamed: 0,Cover_Type,ID,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,0,Aspen,30000,2708,83,28,120,58,120,245,...,0,0,0,0,0,0,0,0,0,0
1,1,Lodgepole Pine,30001,2572,110,15,60,8,934,245,...,0,0,0,0,0,0,0,0,0,0
2,2,Lodgepole Pine,30002,3200,64,16,124,24,1442,233,...,0,0,0,0,0,0,0,0,0,0
3,3,Lodgepole Pine,30003,2625,46,19,371,7,1020,222,...,0,0,0,0,0,0,0,0,0,0
4,4,Ponderosa Pine,30004,2340,94,16,30,8,421,243,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Preview the first five test rows.
test.head()

Unnamed: 0,ID,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,0,3237,39,6,190,4,2992,221,226,141,...,1,0,0,0,0,0,0,0,0,0
1,1,2206,300,36,42,16,1387,99,200,228,...,0,0,0,0,0,0,0,0,0,0
2,2,2982,210,8,60,-5,5343,214,247,169,...,0,0,0,0,0,0,0,0,0,0
3,3,3067,66,9,283,17,3777,229,221,124,...,0,0,0,0,0,0,0,0,0,0
4,4,2452,107,13,67,8,446,242,224,109,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Preview the first five rows of the sample frame.
sample.head()

Unnamed: 0,ID,Aspen,Cottonwood/Willow,Douglas fir,Krummholz,Lodgepole Pine,Ponderosa Pine,Spruce fir
0,0,0.016384,0.004711,0.029947,0.035235,0.487387,0.061516,0.36482
1,1,0.016384,0.004711,0.029947,0.035235,0.487387,0.061516,0.36482
2,2,0.016384,0.004711,0.029947,0.035235,0.487387,0.061516,0.36482
3,3,0.016384,0.004711,0.029947,0.035235,0.487387,0.061516,0.36482
4,4,0.016384,0.004711,0.029947,0.035235,0.487387,0.061516,0.36482


In [12]:
# Keep a handle on the Cover_Type column and show how many rows fall in each class.
coverType = train['Cover_Type']
coverType.value_counts()

Lodgepole Pine       268556
Spruce fir           201020
Ponderosa Pine        33896
Krummholz             19415
Douglas fir           16501
Aspen                  9028
Cottonwood/Willow      2596
Name: Cover_Type, dtype: int64

In [13]:
# Keep the test IDs aside: the ID column is excluded from the combined frame
# built below.
testId = test['ID']

# Stack train and test into a single frame so the wrangling steps are applied
# to both consistently. `drop(columns=...)` and `assign` return new frames
# instead of mutating `train`/`test` in place, so this cell can be re-run
# without a KeyError on columns that were already dropped. The boolean `Test`
# column records which set each row came from.
data = pd.concat(
    [
        train.drop(columns=['Unnamed: 0', 'Cover_Type', 'ID']).assign(Test=False),
        test.drop(columns=['ID']).assign(Test=True),
    ],
    ignore_index=True,
)

## Data wrangling

In [14]:
# Column names, non-null counts and dtypes of the combined train + test frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 581012 entries, 0 to 581011
Data columns (total 55 columns):
Elevation                             581012 non-null int64
Aspect                                581012 non-null int64
Slope                                 581012 non-null int64
Horizontal_Distance_To_Hydrology      581012 non-null int64
Vertical_Distance_To_Hydrology        581012 non-null int64
Horizontal_Distance_To_Roadways       581012 non-null int64
Hillshade_9am                         581012 non-null int64
Hillshade_Noon                        581012 non-null int64
Hillshade_3pm                         581012 non-null int64
Horizontal_Distance_To_Fire_Points    581012 non-null int64
Wilderness_Area_0                     581012 non-null int64
Wilderness_Area_1                     581012 non-null int64
Wilderness_Area_2                     581012 non-null int64
Wilderness_Area_3                     581012 non-null int64
Soil_Type_0                           581012 non-

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for the combined frame.
data.describe()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,2959.365301,155.656807,14.103704,269.428217,46.418855,2350.146611,212.146049,223.318716,142.528263,1980.291226,...,0.044175,0.090392,0.077716,0.002773,0.003255,0.000205,0.000513,0.026803,0.023762,0.01506
std,279.984734,111.913721,7.488242,212.549356,58.295232,1559.25487,26.769889,19.768697,38.274529,1324.19521,...,0.205483,0.286743,0.267725,0.052584,0.056957,0.01431,0.022641,0.161508,0.152307,0.121791
min,1859.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2996.0,127.0,13.0,218.0,30.0,1997.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3858.0,360.0,66.0,1397.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Count missing values per column.
data.isnull().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area_0                     0
Wilderness_Area_1                     0
Wilderness_Area_2                     0
Wilderness_Area_3                     0
Soil_Type_0                           0
Soil_Type_1                           0
Soil_Type_2                           0
Soil_Type_3                           0
Soil_Type_4                           0
Soil_Type_5                           0
Soil_Type_6                           0
Soil_Type_7                           0
Soil_Type_8                           0
Soil_Type_9                           0
Soil_Type_10                          0


### Memory optimisation

In [17]:
# Convert every column whose distinct values are exactly {0, 1} to bool.
# Columns holding a single constant value (only 0 or only 1) are left as-is.
for col in data.columns:
    # Compare the distinct values only: `unique()` avoids building a Python
    # set from every row of the column, which is slow on a frame this size.
    if set(data[col].unique()) == {0, 1}:
        data[col] = data[col].astype(bool)

In [18]:
# Re-inspect dtypes after the bool conversion above.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 581012 entries, 0 to 581011
Data columns (total 55 columns):
Elevation                             581012 non-null int64
Aspect                                581012 non-null int64
Slope                                 581012 non-null int64
Horizontal_Distance_To_Hydrology      581012 non-null int64
Vertical_Distance_To_Hydrology        581012 non-null int64
Horizontal_Distance_To_Roadways       581012 non-null int64
Hillshade_9am                         581012 non-null int64
Hillshade_Noon                        581012 non-null int64
Hillshade_3pm                         581012 non-null int64
Horizontal_Distance_To_Fire_Points    581012 non-null int64
Wilderness_Area_0                     581012 non-null bool
Wilderness_Area_1                     581012 non-null bool
Wilderness_Area_2                     581012 non-null bool
Wilderness_Area_3                     581012 non-null bool
Soil_Type_0                           581012 non-null

### Split the data back to training and testing sets

In [19]:
# Use the `Test` marker column to separate the combined frame again, then
# discard the marker from both halves.
# NOTE(review): row labels from `data` are preserved here (no reset_index), so
# `test` is not re-indexed from 0 — confirm downstream code does not rely on
# index alignment with `testId`.
test_flag = data['Test']
train = data.loc[test_flag.eq(0)].drop(columns='Test').copy()
test = data.loc[test_flag.eq(1)].drop(columns='Test').copy()

### Save prepared datasets to pickle

In [21]:
# Persist the prepared frames as pickles next to the raw CSVs.
pickle_targets = (
    (train, '../data/train_prep.pkl'),
    (test, '../data/test_prep.pkl'),
)
for frame, path in pickle_targets:
    frame.to_pickle(path)