In [1]:
import pandas as pd
import numpy as np

# Load the training set and the test set from CSV.
# Both paths are relative, so the notebook has to be run from the directory
# that contains the 'all/' folder.
train_df = pd.read_csv('all/train-set.csv')
test_df = pd.read_csv('all/test-set.csv')

In [2]:
# Print the column/dtype/non-null summary of the raw train and test frames.
for frame in (train_df, test_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528720 entries, 0 to 528719
Data columns (total 56 columns):
Id                                    528720 non-null int64
Elevation                             528720 non-null int64
Aspect                                528720 non-null int64
Slope                                 528720 non-null int64
Horizontal_Distance_To_Hydrology      528720 non-null int64
Vertical_Distance_To_Hydrology        528720 non-null int64
Horizontal_Distance_To_Roadways       528720 non-null int64
Hillshade_9am                         528720 non-null int64
Hillshade_Noon                        528720 non-null int64
Hillshade_3pm                         528720 non-null int64
Horizontal_Distance_To_Fire_Points    528720 non-null int64
Wilderness_Area1                      528720 non-null int64
Wilderness_Area2                      528720 non-null int64
Wilderness_Area3                      528720 non-null int64
Wilderness_Area4                      528720 non-

In [3]:
# A notebook cell only renders its *last* bare expression, so a bare
# `train_df.head(10)` followed by another expression is silently discarded.
# Call display() explicitly so the first rows of both frames are shown.
display(train_df.head(10))
display(test_df.head(10))

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,528721,2953,224,9,55,11,1600,206,251,176,...,0,1,0,0,0,0,0,0,0,0
1,528722,2977,233,14,42,1,1282,196,252,192,...,0,0,0,0,0,0,0,0,0,0
2,528723,3055,227,9,120,5,2724,207,249,178,...,0,0,0,0,0,0,0,0,0,0
3,528724,2615,107,13,277,74,967,241,224,110,...,0,0,0,0,0,0,0,0,0,0
4,528725,3033,59,15,457,94,4086,233,213,99,...,0,0,0,0,0,0,0,0,0,0
5,528726,3054,336,12,381,80,3277,193,229,179,...,1,0,0,0,0,0,0,0,0,0
6,528727,3094,9,9,424,-80,1470,210,222,149,...,0,0,0,0,0,0,0,0,0,0
7,528728,2010,84,35,300,143,330,244,158,13,...,0,0,0,0,0,0,0,0,0,0
8,528729,2672,338,25,60,33,1422,158,196,172,...,0,0,1,0,0,0,0,0,0,0
9,528730,2929,88,8,42,0,3515,233,227,126,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def _one_hot_to_index(frame, prefix, count):
    """Collapse the indicator columns `<prefix>1` .. `<prefix><count>` into one Series.

    Each row receives sum(i * frame[prefix + str(i)]): when exactly one indicator
    in the row is 1 this is that column's 1-based number, and it is 0 when all
    indicators in the row are 0.
    """
    return sum(i * frame[prefix + str(i)] for i in range(1, count + 1))


def change_feature(data_input):
    """Return a feature-engineered copy of a forest-cover frame.

    The input frame is left untouched. The returned frame keeps every input
    column except the 40 `Soil_Type*` and 4 `Wilderness_Area*` indicator
    columns, and appends (in this order):

    * `Ele_minus_VDH`, `Ele_plus_VDH` -- Elevation -/+ Vertical_Distance_To_Hydrology
    * `Distanse_to_Hydrolody` -- Euclidean norm of the horizontal and vertical
      distances to hydrology (the misspelled name is kept because other cells
      look the column up by this exact string)
    * `Hydro_plus_Fire`, `Hydro_minus_Fire`, `Hydro_plus_Road`, `Hydro_minus_Road`,
      `Fire_plus_Road`, `Fire_minus_Road` -- pairwise sums/differences of the
      horizontal distances to hydrology, fire points and roadways
    * `Soil`, `Wilderness_Area` -- the indicator columns collapsed into a single
      integer column each

    Parameters
    ----------
    data_input : pandas.DataFrame
        Must contain `Elevation`, the three `Horizontal_Distance_To_*` columns,
        `Vertical_Distance_To_Hydrology`, `Soil_Type1..40` and
        `Wilderness_Area1..4`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `data_input` is not modified.

    Raises
    ------
    KeyError
        If any required column is missing (for example when the function is
        applied a second time to its own output).
    """
    # Compute the collapsed columns from the input before the indicators are dropped.
    soil = _one_hot_to_index(data_input, 'Soil_Type', 40)
    wilderness = _one_hot_to_index(data_input, 'Wilderness_Area', 4)

    one_hot_columns = (['Soil_Type' + str(i) for i in range(1, 41)]
                       + ['Wilderness_Area' + str(i) for i in range(1, 5)])
    # A single drop() returns a new frame, so every column added below lands on
    # the result only and the caller's frame is never mutated. It also avoids
    # re-copying the frame once per dropped column.
    data = data_input.drop(one_hot_columns, axis=1)

    elevation = data['Elevation']
    hydro_h = data['Horizontal_Distance_To_Hydrology']
    hydro_v = data['Vertical_Distance_To_Hydrology']
    fire = data['Horizontal_Distance_To_Fire_Points']
    road = data['Horizontal_Distance_To_Roadways']

    # Relations between elevation and the vertical distance to hydrology.
    data['Ele_minus_VDH'] = elevation - hydro_v
    data['Ele_plus_VDH'] = elevation + hydro_v

    # Straight-line distance to hydrology.
    data['Distanse_to_Hydrolody'] = (hydro_h ** 2 + hydro_v ** 2) ** 0.5

    # Pairwise relations between hydrology, fire points and roadways.
    data['Hydro_plus_Fire'] = hydro_h + fire
    data['Hydro_minus_Fire'] = hydro_h - fire
    data['Hydro_plus_Road'] = hydro_h + road
    data['Hydro_minus_Road'] = hydro_h - road
    data['Fire_plus_Road'] = fire + road
    data['Fire_minus_Road'] = fire - road

    # One integer column each instead of 40 + 4 binary indicator columns.
    data['Soil'] = soil
    data['Wilderness_Area'] = wilderness
    return data

In [5]:
def get_features():
    """Return the ordered list of column names used as model inputs.

    A fresh list is built on every call: first the raw measurement columns,
    then the engineered distance/elevation columns, then the two collapsed
    categorical columns.
    """
    raw_columns = [
        'Elevation',
        'Aspect',
        'Slope',
        'Horizontal_Distance_To_Hydrology',
        'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways',
        'Hillshade_9am',
        'Hillshade_Noon',
        'Hillshade_3pm',
        'Horizontal_Distance_To_Fire_Points',
    ]
    engineered_columns = [
        'Ele_minus_VDH',
        'Ele_plus_VDH',
        'Distanse_to_Hydrolody',
        'Hydro_plus_Fire',
        'Hydro_minus_Fire',
        'Hydro_plus_Road',
        'Hydro_minus_Road',
        'Fire_plus_Road',
        'Fire_minus_Road',
    ]
    collapsed_columns = ['Soil', 'Wilderness_Area']
    return raw_columns + engineered_columns + collapsed_columns

In [6]:
# Replace both frames with their feature-engineered versions.
# NOTE: this cell cannot be re-run on its own. change_feature reads the
# Soil_Type*/Wilderness_Area* columns and returns a frame without them, so a
# second pass over the already-transformed frames raises KeyError. Re-run the
# loading cell first.
train_df = change_feature(train_df)
test_df = change_feature(test_df)

In [7]:
# Print the column/dtype/non-null summary of both frames after feature engineering.
for frame in (train_df, test_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528720 entries, 0 to 528719
Data columns (total 23 columns):
Id                                    528720 non-null int64
Elevation                             528720 non-null int64
Aspect                                528720 non-null int64
Slope                                 528720 non-null int64
Horizontal_Distance_To_Hydrology      528720 non-null int64
Vertical_Distance_To_Hydrology        528720 non-null int64
Horizontal_Distance_To_Roadways       528720 non-null int64
Hillshade_9am                         528720 non-null int64
Hillshade_Noon                        528720 non-null int64
Hillshade_3pm                         528720 non-null int64
Horizontal_Distance_To_Fire_Points    528720 non-null int64
Cover_Type                            528720 non-null int64
Ele_minus_VDH                         528720 non-null int64
Ele_plus_VDH                          528720 non-null int64
Distanse_to_Hydrolody                 528720 non-

In [8]:
# Column names fed to the classifier
features = get_features()

# Training design matrix and target vector as plain NumPy arrays
y_train = train_df['Cover_Type'].values
x_train = train_df[features].values

# Test design matrix, plus the Id column kept aside for the submission file
x_test = test_df[features].values
test_id = test_df['Id']

In [9]:
# Train an extremely-randomized-trees ensemble and predict the test set.
from sklearn.ensemble import ExtraTreesClassifier

# random_state pins the random feature/threshold draws so that a fresh
# "Restart & Run All" reproduces the same model and the same predictions.
# n_jobs=-1 builds the 500 trees on all available cores; with a fixed
# random_state the fitted forest does not depend on the number of workers.
clf = ExtraTreesClassifier(max_features=0.3, n_estimators=500,
                           random_state=42, n_jobs=-1)

print("Begin")

# Training
clf.fit(x_train, y_train)

# Predicting
output = clf.predict(x_test)

print("Over")

Begin
Over


In [10]:
# Pair every test Id with its predicted class (both as integers) in a two-column
# array, wrap it in a labelled frame and write it out without the index column.
id_column = test_id.astype(int)
prediction_column = output.astype(int)
result = np.column_stack((id_column, prediction_column))
df_result = pd.DataFrame(result, columns=['Id', 'Cover_Type'])
df_result.to_csv('all/forest.csv', index=False)