In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool

In [2]:
train_df = pd.read_csv('./input/train.csv')
test_df = pd.read_csv('./input/test.csv')
FEATURES = [col for col in train_df.columns if col not in ['Id', 'Cover_Type']]

In [3]:
data = [] 
for f in train_df.columns:
    if f == 'Cover_Typet':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'
        
    if 'Type' in f or 'Area' in f or f =='Cover_Typet' or f =='Id':
        level = 'nominal'
    elif 'cat' in f or f =='Id':
        level = 'nominal'
    elif train_df[f].dtype == float:
        level = 'interval'
    elif train_df[f].dtype == int:
        level = 'ordinal'
        
    keep = True
    
    if f =='Id':
        keep == False
        
    dtype = train_df[f].dtype
    
    f_dict = {
        'varname': f, 
        'role': role, 
        'level': level, 
        'keep': keep, 
        'dtype': dtype
    }
    
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [4]:
meta

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Id,input,nominal,True,int64
Elevation,input,nominal,True,int64
Aspect,input,nominal,True,int64
Slope,input,nominal,True,int64
Horizontal_Distance_To_Hydrology,input,nominal,True,int64
Vertical_Distance_To_Hydrology,input,nominal,True,int64
Horizontal_Distance_To_Roadways,input,nominal,True,int64
Hillshade_9am,input,nominal,True,int64
Hillshade_Noon,input,nominal,True,int64
Hillshade_3pm,input,nominal,True,int64


In [5]:
def reduce_mem_usage(df, varbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
            else:
                if c_min > np.iinfo(np.float16).min and c_max < np.iinfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.iinfo(np.float32).min and c_max < np.iinfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} Mb'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
        

In [6]:
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Memory usage after optimization is: 259.40 Mb
Decreased by 84.8%
Memory usage after optimization is: 63.90 Mb
Decreased by 84.8%


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Data columns (total 56 columns):
 #   Column                              Dtype
---  ------                              -----
 0   Id                                  int32
 1   Elevation                           int16
 2   Aspect                              int16
 3   Slope                               int8 
 4   Horizontal_Distance_To_Hydrology    int16
 5   Vertical_Distance_To_Hydrology      int16
 6   Horizontal_Distance_To_Roadways     int16
 7   Hillshade_9am                       int16
 8   Hillshade_Noon                      int16
 9   Hillshade_3pm                       int16
 10  Horizontal_Distance_To_Fire_Points  int16
 11  Wilderness_Area1                    int8 
 12  Wilderness_Area2                    int8 
 13  Wilderness_Area3                    int8 
 14  Wilderness_Area4                    int8 
 15  Soil_Type1                          int8 
 16  Soil_Type2                          

In [8]:
v = train_df.columns
for f in v:
    dist_value = train_df[f].value_counts().shape[0]
    print('Variables {:>40} has {} distinct values'.format(f, dist_value))
    

Variables                                       Id has 4000000 distinct values
Variables                                Elevation has 2525 distinct values
Variables                                   Aspect has 440 distinct values
Variables                                    Slope has 68 distinct values
Variables         Horizontal_Distance_To_Hydrology has 1636 distinct values
Variables           Vertical_Distance_To_Hydrology has 916 distinct values
Variables          Horizontal_Distance_To_Roadways has 7760 distinct values
Variables                            Hillshade_9am has 301 distinct values
Variables                           Hillshade_Noon has 221 distinct values
Variables                            Hillshade_3pm has 326 distinct values
Variables       Horizontal_Distance_To_Fire_Points has 8112 distinct values
Variables                         Wilderness_Area1 has 2 distinct values
Variables                         Wilderness_Area2 has 2 distinct values
Variables             