In [57]:
import sys
import sklearn

# Common imports
import numpy as np
import os
import pandas as pd

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
PROJECT_ID = "forest"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [59]:
FOREST_PATH = os.path.join("datasets", "forest")

def load_forest_data(filename, forest_path=FOREST_PATH):
    csv_path = os.path.join(forest_path, filename)
    return pd.read_csv(csv_path)

In [61]:
train_data = load_forest_data("train.csv")
test_data = load_forest_data("test.csv")

In [65]:
train_data.size, test_data.size

(846720, 31124060)

In [66]:
train_data.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [68]:
#no missing data, all of them numerical
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   Id                                  15120 non-null  int64
 1   Elevation                           15120 non-null  int64
 2   Aspect                              15120 non-null  int64
 3   Slope                               15120 non-null  int64
 4   Horizontal_Distance_To_Hydrology    15120 non-null  int64
 5   Vertical_Distance_To_Hydrology      15120 non-null  int64
 6   Horizontal_Distance_To_Roadways     15120 non-null  int64
 7   Hillshade_9am                       15120 non-null  int64
 8   Hillshade_Noon                      15120 non-null  int64
 9   Hillshade_3pm                       15120 non-null  int64
 10  Horizontal_Distance_To_Fire_Points  15120 non-null  int64
 11  Wilderness_Area1                    15120 non-null  int64
 12  Wild

In [70]:
train_data.describe()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
count,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,...,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0,15120.0
mean,7560.5,2749.322553,156.676653,16.501587,227.195701,51.076521,1714.023214,212.704299,218.965608,135.091997,...,0.045635,0.040741,0.001455,0.006746,0.000661,0.002249,0.048148,0.043452,0.030357,4.0
std,4364.91237,417.678187,110.085801,8.453927,210.075296,61.239406,1325.066358,30.561287,22.801966,45.895189,...,0.208699,0.197696,0.038118,0.081859,0.02571,0.047368,0.214086,0.20388,0.171574,2.000066
min,1.0,1863.0,0.0,0.0,0.0,-146.0,0.0,0.0,99.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,3780.75,2376.0,65.0,10.0,67.0,5.0,764.0,196.0,207.0,106.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,7560.5,2752.0,126.0,15.0,180.0,32.0,1316.0,220.0,223.0,138.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
75%,11340.25,3104.0,261.0,22.0,330.0,79.0,2270.0,235.0,235.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
max,15120.0,3849.0,360.0,52.0,1343.0,554.0,6890.0,254.0,254.0,248.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


In [77]:
y=train_data[['Cover_Type']]
X=train_data.copy().drop(['Id','Cover_Type'], axis=1)

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
len(X_train),len(X_test)

(12096, 3024)

In [81]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler # for standardisation of data, another option would be the MinMaxScaler

#Create a pipeline for numerical attributes
num_pipeline=Pipeline([
    ('std_scaler',StandardScaler()),
])
X_train_prepared=num_pipeline.fit_transform(X_train)
X_test_prepared=num_pipeline.fit_transform(X_test)

In [83]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4,5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=3, verbose=3)
grid_search.fit(X_train_prepared, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=3, weights=uniform, score=0.761, total=   1.4s
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=3, weights=uniform, score=0.768, total=   1.4s
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s
  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=3, weights=uniform, score=0.764, total=   1.4s
[CV] n_neighbors=3, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=3, weights=distance, score=0.769, total=   1.4s
[CV] n_neighbors=3, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=3, weights=distance, score=0.782, total=   1.4s
[CV] n_neighbors=3, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=3, weights=distance, score=0.779, total=   1.4s
[CV] n_neighbors=4, weights=uniform ..................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=4, weights=uniform, score=0.751, total=   1.4s
[CV] n_neighbors=4, weights=uniform ..................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=4, weights=uniform, score=0.759, total=   1.5s
[CV] n_neighbors=4, weights=uniform ..................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=4, weights=uniform, score=0.757, total=   1.5s
[CV] n_neighbors=4, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=4, weights=distance, score=0.769, total=   1.4s
[CV] n_neighbors=4, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=4, weights=distance, score=0.786, total=   1.4s
[CV] n_neighbors=4, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=4, weights=distance, score=0.778, total=   1.4s
[CV] n_neighbors=5, weights=uniform ..................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=5, weights=uniform, score=0.756, total=   1.5s
[CV] n_neighbors=5, weights=uniform ..................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=5, weights=uniform, score=0.762, total=   1.5s
[CV] n_neighbors=5, weights=uniform ..................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ...... n_neighbors=5, weights=uniform, score=0.755, total=   1.5s
[CV] n_neighbors=5, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=5, weights=distance, score=0.768, total=   1.4s
[CV] n_neighbors=5, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=5, weights=distance, score=0.779, total=   1.4s
[CV] n_neighbors=5, weights=distance .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ..... n_neighbors=5, weights=distance, score=0.773, total=   1.4s


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   25.7s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid=[{'n_neighbors': [3, 4, 5],
                          'weights': ['uniform', 'distance']}],
             verbose=3)

In [96]:
grid_search.best_params_, grid_search.best_score_

({'n_neighbors': 4, 'weights': 'distance'}, 0.7776951058201057)

In [85]:
final_model=grid_search.best_estimator_

In [94]:
y_predictions=final_model.predict(X_test_prepared)
sum(y_predictions==y_test["Cover_Type"])/len(y_predictions)

0.8068783068783069