In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool

In [2]:
train_df = pd.read_csv('./input/train.csv')
test_df = pd.read_csv('./input/test.csv')
FEATURES = [col for col in train_df.columns if col not in ['Id', 'Cover_Type']]

In [3]:
data = [] 
for f in train_df.columns:
    if f == 'Cover_Typet':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'
        
    if 'Type' in f or 'Area' in f or f =='Cover_Typet' or f =='Id':
        level = 'nominal'
    elif 'cat' in f or f =='Id':
        level = 'nominal'
    elif train_df[f].dtype == float:
        level = 'interval'
    elif train_df[f].dtype == int:
        level = 'ordinal'
        
    keep = True
    
    if f =='Id':
        keep == False
        
    dtype = train_df[f].dtype
    
    f_dict = {
        'varname': f, 
        'role': role, 
        'level': level, 
        'keep': keep, 
        'dtype': dtype
    }
    
    data.append(f_dict)
    
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [4]:
meta

Unnamed: 0_level_0,role,level,keep,dtype
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Id,input,nominal,True,int64
Elevation,input,nominal,True,int64
Aspect,input,nominal,True,int64
Slope,input,nominal,True,int64
Horizontal_Distance_To_Hydrology,input,nominal,True,int64
Vertical_Distance_To_Hydrology,input,nominal,True,int64
Horizontal_Distance_To_Roadways,input,nominal,True,int64
Hillshade_9am,input,nominal,True,int64
Hillshade_Noon,input,nominal,True,int64
Hillshade_3pm,input,nominal,True,int64


In [5]:
def reduce_mem_usage(df, varbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
            else:
                if c_min > np.iinfo(np.float16).min and c_max < np.iinfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.iinfo(np.float32).min and c_max < np.iinfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} Mb'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
        

In [6]:
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Memory usage after optimization is: 259.40 Mb
Decreased by 84.8%
Memory usage after optimization is: 63.90 Mb
Decreased by 84.8%


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Data columns (total 56 columns):
 #   Column                              Dtype
---  ------                              -----
 0   Id                                  int32
 1   Elevation                           int16
 2   Aspect                              int16
 3   Slope                               int8 
 4   Horizontal_Distance_To_Hydrology    int16
 5   Vertical_Distance_To_Hydrology      int16
 6   Horizontal_Distance_To_Roadways     int16
 7   Hillshade_9am                       int16
 8   Hillshade_Noon                      int16
 9   Hillshade_3pm                       int16
 10  Horizontal_Distance_To_Fire_Points  int16
 11  Wilderness_Area1                    int8 
 12  Wilderness_Area2                    int8 
 13  Wilderness_Area3                    int8 
 14  Wilderness_Area4                    int8 
 15  Soil_Type1                          int8 
 16  Soil_Type2                          

In [8]:
v = train_df.columns
for f in v:
    dist_value = train_df[f].value_counts().shape[0]
    print('Variables {:>40} has {} distinct values'.format(f, dist_value))
    

Variables                                       Id has 4000000 distinct values
Variables                                Elevation has 2525 distinct values
Variables                                   Aspect has 440 distinct values
Variables                                    Slope has 68 distinct values
Variables         Horizontal_Distance_To_Hydrology has 1636 distinct values
Variables           Vertical_Distance_To_Hydrology has 916 distinct values
Variables          Horizontal_Distance_To_Roadways has 7760 distinct values
Variables                            Hillshade_9am has 301 distinct values
Variables                           Hillshade_Noon has 221 distinct values
Variables                            Hillshade_3pm has 326 distinct values
Variables       Horizontal_Distance_To_Fire_Points has 8112 distinct values
Variables                         Wilderness_Area1 has 2 distinct values
Variables                         Wilderness_Area2 has 2 distinct values
Variables             

In [9]:
train_df = train_df.drop(index=int(np.where(train_df['Cover_Type'] == 5)[0]))
train_df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,206,234,193,...,0,0,0,0,0,0,0,0,0,1
1,1,3026,182,5,280,29,3270,233,240,106,...,0,0,0,0,0,0,0,0,0,2
2,2,3106,13,7,351,37,2914,208,234,137,...,0,0,0,0,0,0,0,0,0,1
3,3,3022,276,13,192,16,3034,207,238,156,...,0,0,0,0,0,0,0,0,0,2
4,4,2906,186,13,266,22,2916,231,231,154,...,0,0,0,0,0,0,0,0,0,2


In [10]:
v = test_df.columns
for f in v:
    dist_value = test_df[f].value_counts().shape[0]
    print('Variables {:>40} has {} distinct values'.format(f, dist_value))

Variables                                       Id has 1000000 distinct values
Variables                                Elevation has 2488 distinct values
Variables                                   Aspect has 433 distinct values
Variables                                    Slope has 67 distinct values
Variables         Horizontal_Distance_To_Hydrology has 1612 distinct values
Variables           Vertical_Distance_To_Hydrology has 858 distinct values
Variables          Horizontal_Distance_To_Roadways has 7576 distinct values
Variables                            Hillshade_9am has 280 distinct values
Variables                           Hillshade_Noon has 218 distinct values
Variables                            Hillshade_3pm has 320 distinct values
Variables       Horizontal_Distance_To_Fire_Points has 7929 distinct values
Variables                         Wilderness_Area1 has 2 distinct values
Variables                         Wilderness_Area2 has 2 distinct values
Variables             

In [12]:
missing = 0
for f in train_df.columns:
    missing += train_df[f].isnull().sum()
    print('Variables : {:>30}\t missings : {}'.format(f, train_df[f].isnull().sum()))
    print('Sum of missing value : {}'.format(missing))

Variables :                             Id	 missings : 0
Sum of missing value : 0
Variables :                      Elevation	 missings : 0
Sum of missing value : 0
Variables :                         Aspect	 missings : 0
Sum of missing value : 0
Variables :                          Slope	 missings : 0
Sum of missing value : 0
Variables : Horizontal_Distance_To_Hydrology	 missings : 0
Sum of missing value : 0
Variables : Vertical_Distance_To_Hydrology	 missings : 0
Sum of missing value : 0
Variables : Horizontal_Distance_To_Roadways	 missings : 0
Sum of missing value : 0
Variables :                  Hillshade_9am	 missings : 0
Sum of missing value : 0
Variables :                 Hillshade_Noon	 missings : 0
Sum of missing value : 0
Variables :                  Hillshade_3pm	 missings : 0
Sum of missing value : 0
Variables : Horizontal_Distance_To_Fire_Points	 missings : 0
Sum of missing value : 0
Variables :               Wilderness_Area1	 missings : 0
Sum of missing value : 0
Variables

In [13]:
v = meta[(meta.level == 'nominal') & meta.keep].index
train_df[v].describe()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
count,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,...,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0,3999999.0
mean,1999999.0,2980.192,151.5857,15.09753,271.3155,51.66261,1766.642,211.8375,221.0614,140.8109,...,0.03746201,0.03782076,0.011995,0.0160535,0.01071275,0.0122075,0.04075151,0.03923926,0.03161851,1.771335
std,1154701.0,289.0482,109.9611,8.546724,226.5497,68.21597,1315.61,30.75996,22.23134,43.69864,...,0.1898911,0.1907626,0.1088629,0.1256813,0.1029466,0.1098111,0.197714,0.1941637,0.1749822,0.8938047
min,0.0,1773.0,-33.0,-3.0,-92.0,-317.0,-287.0,-4.0,49.0,-53.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,999999.5,2760.0,60.0,9.0,110.0,4.0,822.0,198.0,210.0,115.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1999999.0,2966.0,123.0,14.0,213.0,31.0,1436.0,218.0,224.0,142.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,2999998.0,3217.0,247.0,20.0,361.0,78.0,2365.0,233.0,237.0,169.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,3999999.0,4383.0,407.0,64.0,1602.0,647.0,7666.0,301.0,279.0,272.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


In [14]:
for i in v:
    print(i)

Id
Elevation
Aspect
Slope
Horizontal_Distance_To_Hydrology
Vertical_Distance_To_Hydrology
Horizontal_Distance_To_Roadways
Hillshade_9am
Hillshade_Noon
Hillshade_3pm
Horizontal_Distance_To_Fire_Points
Wilderness_Area1
Wilderness_Area2
Wilderness_Area3
Wilderness_Area4
Soil_Type1
Soil_Type2
Soil_Type3
Soil_Type4
Soil_Type5
Soil_Type6
Soil_Type7
Soil_Type8
Soil_Type9
Soil_Type10
Soil_Type11
Soil_Type12
Soil_Type13
Soil_Type14
Soil_Type15
Soil_Type16
Soil_Type17
Soil_Type18
Soil_Type19
Soil_Type20
Soil_Type21
Soil_Type22
Soil_Type23
Soil_Type24
Soil_Type25
Soil_Type26
Soil_Type27
Soil_Type28
Soil_Type29
Soil_Type30
Soil_Type31
Soil_Type32
Soil_Type33
Soil_Type34
Soil_Type35
Soil_Type36
Soil_Type37
Soil_Type38
Soil_Type39
Soil_Type40
Cover_Type


In [16]:
v = meta[(meta.level == 'ordinal') & meta.keep].index
print(v)

Index([], dtype='object', name='varname')


In [None]:
s1 = train_df.sample(frac=0.2)
s2 = train_df.sample(frac=0.2)

i = 1 
plt.figure()
fig, ax = plt.subplots(2, 5, figsize=(20,12))
for f in v: