In [1]:
# data manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.compose import ColumnTransformer

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# saving
import joblib

In [2]:
# Reading the dataset
df = pd.read_csv('data/covtype.csv')
df.head(10)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
5,2579,132,6,300,-15,67,230,237,140,6031,...,0,0,0,0,0,0,0,0,0,2
6,2606,45,7,270,5,633,222,225,138,6256,...,0,0,0,0,0,0,0,0,0,5
7,2605,49,4,234,7,573,222,230,144,6228,...,0,0,0,0,0,0,0,0,0,5
8,2617,45,9,240,56,666,223,221,133,6244,...,0,0,0,0,0,0,0,0,0,5
9,2612,59,10,247,11,636,228,219,124,6230,...,0,0,0,0,0,0,0,0,0,5


First, let's split the dataset into train and test subsets

In [3]:
X = df.drop('Cover_Type', axis=1)
y = df['Cover_Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [4]:
X_train.shape, X_test.shape

((464809, 54), (116203, 54))

Checking for missing values

In [5]:
X_train.isnull().sum()

Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area_1                     0
Wilderness_Area_2                     0
Wilderness_Area_3                     0
Wilderness_Area_4                     0
Soil_Type_1                           0
Soil_Type_2                           0
Soil_Type_3                           0
Soil_Type_4                           0
Soil_Type_5                           0
Soil_Type_6                           0
Soil_Type_7                           0
Soil_Type_8                           0
Soil_Type_9                           0
Soil_Type_10                          0
Soil_Type_11                          0


Checking for duplicated data

In [6]:
X_train[X_train.duplicated()]

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40


Checking for outliers using Isolation Forest

In [7]:
outliers = IsolationForest(random_state=42).fit_predict(X_train)

In [8]:
X_train.iloc[outliers == -1]

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40
80925,3028,319,40,30,56,5022,91,171,202,3997,...,0,0,0,0,0,0,0,0,0,0
80638,2977,342,38,0,0,5005,118,156,160,4026,...,0,0,0,0,0,0,0,0,0,0
247724,1987,23,45,67,66,95,153,107,55,350,...,0,0,0,0,0,0,0,0,0,0
247186,1945,31,43,30,24,60,170,110,40,309,...,0,0,0,0,0,0,0,0,0,0
248291,2108,309,45,295,221,285,66,167,219,67,...,0,0,0,0,0,0,0,0,0,0
3372,2289,105,41,277,185,876,250,157,0,335,...,0,0,0,0,0,0,0,0,0,0
3035,2096,86,42,60,45,408,240,137,0,256,...,0,0,0,0,0,0,0,0,0,0
241164,2033,81,35,330,166,360,241,154,12,997,...,0,0,0,0,0,0,0,0,0,0
241624,2054,96,35,360,187,390,250,168,16,1020,...,0,0,0,0,0,0,0,0,0,0
80639,2977,0,40,30,8,4980,132,135,117,4018,...,0,0,0,0,0,0,0,0,0,0


Apparently we have only 12 outliers in the training set. Since they're only a small portion of the dataset (12 out of +400k), we can remove them.

In [9]:
X_train = X_train.iloc[outliers == 1]
y_train = y_train.iloc[outliers == 1]

Feature Scaling with min-max

In [10]:
numeric_cols = X_train.iloc[:, :10].columns

In [11]:
preprocessing = ColumnTransformer(
    transformers=[
        ('scaler', MinMaxScaler(), numeric_cols)
    ],
    remainder='passthrough'
)

In [12]:
X_train_scaled = preprocessing.fit_transform(X_train)

In [13]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

In [14]:
X_train_scaled.head(10)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40
0,0.656156,0.497222,0.30303,0.193271,0.279009,0.387523,0.877953,0.976378,0.57874,0.445343,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.604104,0.563889,0.393939,0.0,0.216428,0.477167,0.779528,0.992126,0.692913,0.110429,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.761762,0.930556,0.060606,0.115963,0.246415,0.484052,0.830709,0.92126,0.637795,0.314975,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.604104,0.675,0.136364,0.107373,0.237288,0.395251,0.799213,0.972441,0.716535,0.089375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.43043,0.163889,0.287879,0.115963,0.269883,0.024589,0.905512,0.771654,0.354331,0.258226,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.842843,0.286111,0.424242,0.614889,0.554107,0.335394,0.996063,0.751969,0.181102,0.182236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,0.390891,0.8,0.075758,0.163207,0.302477,0.161023,0.814961,0.940945,0.673228,0.12744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.448949,0.188889,0.060606,0.085898,0.232073,0.057328,0.88189,0.909449,0.559055,0.370608,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.535536,0.569444,0.348485,0.077309,0.276402,0.261768,0.787402,1.0,0.704724,0.163971,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.372372,0.091667,0.287879,0.09592,0.267275,0.229732,0.838583,0.767717,0.437008,0.185722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# saving the column_transformer
joblib.dump(preprocessing, 'models and parameters/preprocessing.pkl')

['models and parameters/preprocessing.pkl']

In [16]:
X_train_scaled.to_csv('data/x_train_processed.csv', index=False)
X_test.to_csv('data/x_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)