In [1]:
from itertools import combinations
from pathlib import Path

import featuretools as ft
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_validate

In [2]:
# Training rows: the "Id" column is discarded.
data = pd.read_csv("data/train.csv").drop(columns=["Id"])
# Test rows: "Id" becomes the index (it is written back out by make_submission).
test = pd.read_csv("data/test.csv").set_index("Id")

In [3]:
def make_name(function, columns):
    """Build the name of a derived feature column.

    The result is ``"<function>_"`` followed by the last ``_``-separated
    token of every entry in ``columns``, joined with ``_``.

    Note: only the final token of each column is kept, so two different
    columns that end in the same token contribute the same suffix.
    """
    suffixes = [column.rsplit("_", 1)[-1] for column in columns]
    return "{}_{}".format(function, "_".join(suffixes))

def means(columns):
    """Add one row-wise mean column over ``columns`` to ``data`` and ``test``.

    Both module-level frames are modified in place; the new column is named
    by ``make_name("mean", columns)``.
    """
    selected = list(columns)
    target = make_name("mean", selected)
    for frame in (data, test):
        frame[target] = frame[selected].mean(axis=1)
        
def pairwise_sums(columns):
    """Add a ``sum_*`` column for every unordered pair drawn from ``columns``.

    Applied in place to the module-level frames ``data`` and ``test``. The
    sum is taken with ``DataFrame.sum(axis=1)`` over the two columns.
    """
    for frame in (data, test):
        for left, right in combinations(columns, 2):
            both = frame[[left, right]]
            frame[make_name("sum", (left, right))] = both.sum(axis=1)
            
def pairwise_diffs_abs(columns):
    """Add a ``diff_*`` column for every unordered pair drawn from ``columns``.

    Each new column holds the absolute difference of the two source columns.
    Applied in place to the module-level frames ``data`` and ``test``.
    """
    for frame in (data, test):
        for left, right in combinations(columns, 2):
            gap = frame[left] - frame[right]
            frame[make_name("diff", (left, right))] = gap.abs()
             
def make_pairwise(columns):
    """Add pairwise sum columns, then pairwise absolute-difference columns.

    Delegates to ``pairwise_sums`` and ``pairwise_diffs_abs`` in that order.
    """
    for build in (pairwise_sums, pairwise_diffs_abs):
        build(columns)
            
def eucl_distance(columns):
    """Add an ``eucl_*`` column for every unordered pair drawn from ``columns``.

    Each new column is ``sqrt(a**2 + b**2)`` of the two source columns.
    Applied in place to the module-level frames ``data`` and ``test``.
    """
    for frame in (data, test):
        for left, right in combinations(columns, 2):
            squared_sum = frame[left] ** 2 + frame[right] ** 2
            frame[make_name("eucl", (left, right))] = np.sqrt(squared_sum)
            
def deal_with_inf():
    """Remove infinities and missing values from ``data`` and ``test`` in place.

    For each of the two module-level frames, per column:

    * ``-inf`` is replaced by the column's smallest finite value,
    * ``+inf`` and NaN are replaced by the column's largest finite value.

    Statistics are computed separately for each frame. A column with no
    finite values at all is left with NaN, since there is nothing to fill with.
    """
    for df in [data, test]:
        # Column extremes computed with every infinity masked out, so the
        # fill values themselves are always finite.
        finite = df.replace([np.inf, -np.inf], np.nan)
        col_min, col_max = finite.min(), finite.max()

        # Previously -inf was left untouched and stayed in the frame; map it
        # to the finite minimum, touching only columns that contain it.
        neg_inf = df == -np.inf
        for column in neg_inf.columns[neg_inf.any().to_numpy()]:
            df.loc[neg_inf[column], column] = col_min[column]

        # +inf is treated like a missing value and filled with the finite max.
        df[df == np.inf] = np.nan
        df.fillna(col_max, inplace=True)

In [4]:
# Column groups used for feature construction.
distances_h = [
    "Horizontal_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Horizontal_Distance_To_Fire_Points",
]
distances_v = [
    "Elevation",
    "Vertical_Distance_To_Hydrology",
]
hillshades = [
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
]
angles = [
    "Aspect",
    "Slope",
]
# Horizontal columns first, then vertical ones.
intervals = [*distances_h, *distances_v]

In [5]:
# Pairwise features over the distance/elevation columns: sums and absolute
# differences first, then sqrt(a**2 + b**2). The call order fixes the order
# in which the new columns are appended.
make_pairwise(intervals)
eucl_distance(intervals)

# One row-wise mean per column group.
mean_groups = (distances_h, distances_v, intervals, hillshades, angles)
for group in mean_groups:
    means(group)

# Final clean-up of infinities / missing values in both frames.
deal_with_inf()

In [6]:
# Target is the "Cover_Type" column; every other column is a feature.
y = data["Cover_Type"]
X = data.drop(columns=["Cover_Type"])

In [7]:
# Drop features that are almost perfectly correlated with another feature.
X = ft.selection.remove_highly_correlated_features(X, pct_corr_threshold=0.992)
# Give the test frame exactly the training columns, in the training order.
# Selecting by X.columns raises KeyError if test lacks a training feature,
# whereas the previous columns.intersection(...) would silently drop it and
# take its ordering from test instead of X.
test = test[X.columns]

In [8]:
# Fit on the selected training features, then label the test rows.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
clf = ExtraTreesClassifier(
    n_estimators=280,
    max_features=15,
    random_state=42,
).fit(X, y)
prediction = clf.predict(test)

In [9]:
def make_submission(df, pred, n):
    """Write predictions to ``data/sub/submission_<n>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the ``Id`` column of the submission.
    pred : array-like
        Predicted labels, one per row of ``df``; written as ``Cover_Type``.
    n : int or str
        Submission number, interpolated into the output file name.

    The output directory is created if it does not exist yet, so the function
    also works on a fresh checkout where ``data/sub/`` is missing.
    """
    submission = pd.DataFrame({
        "Id": df.index,
        "Cover_Type": pred,
    })
    out_path = Path('data/sub/submission_{}.csv'.format(n))
    # to_csv does not create missing parent directories on its own.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(out_path, index=False)

In [10]:
# Write the predictions for the test rows as submission number 24.
make_submission(df=test, pred=prediction, n=24)