# CMF Data Analysis Training

## Initialisation

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline

## Loading and preparing data

In [3]:
# Read the three input files, in order: training set, test set, sample file.
train, test, sample = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sample.csv'),
)

In [13]:
# Report (rows, columns) for each loaded frame, one line per frame.
for label, frame in (
    ('Train shape: ', train),
    ('Test shape: ', test),
    ('Sample shape ', sample),
):
    print(label, frame.shape)

Train shape:  (551012, 57)
Test shape:  (30000, 55)
Sample shape  (30000, 8)


In [16]:
# Column-by-column overview of the training frame: dtype and non-null count per column.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551012 entries, 0 to 551011
Data columns (total 57 columns):
Unnamed: 0                            551012 non-null int64
Cover_Type                            551012 non-null object
ID                                    551012 non-null int64
Elevation                             551012 non-null int64
Aspect                                551012 non-null int64
Slope                                 551012 non-null int64
Horizontal_Distance_To_Hydrology      551012 non-null int64
Vertical_Distance_To_Hydrology        551012 non-null int64
Horizontal_Distance_To_Roadways       551012 non-null int64
Hillshade_9am                         551012 non-null int64
Hillshade_Noon                        551012 non-null int64
Hillshade_3pm                         551012 non-null int64
Horizontal_Distance_To_Fire_Points    551012 non-null int64
Wilderness_Area_0                     551012 non-null int64
Wilderness_Area_1                     551012 non

#### Data Fields

* Elevation - Elevation in meters
* Aspect - Aspect in degrees azimuth
* Slope - Slope in degrees
* Horizontal_Distance_To_Hydrology - Horizontal distance to the nearest surface water feature
* Vertical_Distance_To_Hydrology - Vertical distance to the nearest surface water feature
* Horizontal_Distance_To_Roadways - Horizontal distance to the nearest roadway
* Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice
* Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice
* Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice
* Horizontal_Distance_To_Fire_Points - Horizontal distance to the nearest wildfire ignition point
* Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation
* Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation
* Cover_Type (7 types; originally encoded as integers 1 to 7, but stored in `train.csv` as type-name strings such as `Lodgepole Pine`) - Forest Cover Type designation

In [26]:
# Preview the first rows of the training frame (displayed as the cell's last expression).
train.head()

Unnamed: 0.1,Unnamed: 0,Cover_Type,ID,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,0,Aspen,30000,2708,83,28,120,58,120,245,...,0,0,0,0,0,0,0,0,0,0
1,1,Lodgepole Pine,30001,2572,110,15,60,8,934,245,...,0,0,0,0,0,0,0,0,0,0
2,2,Lodgepole Pine,30002,3200,64,16,124,24,1442,233,...,0,0,0,0,0,0,0,0,0,0
3,3,Lodgepole Pine,30003,2625,46,19,371,7,1020,222,...,0,0,0,0,0,0,0,0,0,0
4,4,Ponderosa Pine,30004,2340,94,16,30,8,421,243,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Keep a handle on the target labels, then show how many rows each cover type has.
coverType = train['Cover_Type']
coverType.value_counts()

Lodgepole Pine       268556
Spruce fir           201020
Ponderosa Pine        33896
Krummholz             19415
Douglas fir           16501
Aspen                  9028
Cottonwood/Willow      2596
Name: Cover_Type, dtype: int64

In [31]:
# Build one combined frame (`data`) from train and test; the boolean `Test`
# column marks which rows came from the test set.

# Keep the test identifiers aside, since `ID` is removed from the features below.
# Guarded so that re-running this cell (when `ID` is already gone) keeps the
# previously saved value instead of raising AttributeError.
if 'ID' in test.columns:
    testId = test.ID

# Drop `Unnamed: 0` (looks like a saved row index), the target and the
# identifiers. Rebinding with errors='ignore' instead of inplace=True keeps the
# cell re-runnable: a second execution is a no-op rather than failing on labels
# that were already removed.
train = train.drop(['Unnamed: 0', 'Cover_Type', 'ID'], axis=1, errors='ignore')
test = test.drop(['ID'], axis=1, errors='ignore')
train['Test'] = False
test['Test'] = True

# pd.concat aligns on column names and fills any gaps with NaN, so fail loudly
# if the two frames do not share exactly the same columns.
assert set(train.columns) == set(test.columns), 'train/test columns differ'
data = pd.concat([train, test], ignore_index=True)

## Data wrangling

In [32]:
# Inspect the combined frame: the entry count should equal train + test rows,
# and every column should be fully non-null after the concat.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 581012 entries, 0 to 581011
Data columns (total 55 columns):
Elevation                             581012 non-null int64
Aspect                                581012 non-null int64
Slope                                 581012 non-null int64
Horizontal_Distance_To_Hydrology      581012 non-null int64
Vertical_Distance_To_Hydrology        581012 non-null int64
Horizontal_Distance_To_Roadways       581012 non-null int64
Hillshade_9am                         581012 non-null int64
Hillshade_Noon                        581012 non-null int64
Hillshade_3pm                         581012 non-null int64
Horizontal_Distance_To_Fire_Points    581012 non-null int64
Wilderness_Area_0                     581012 non-null int64
Wilderness_Area_1                     581012 non-null int64
Wilderness_Area_2                     581012 non-null int64
Wilderness_Area_3                     581012 non-null int64
Soil_Type_0                           581012 non-