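# Discretize numerical features
Each spending feature (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck) is
converted into a binary attribute indicating whether the passenger spent any
money on that service. `Age` is discretized into three equal-width bins.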

In [1]:
import os
import sys
# Project root: absolute path of the parent of the current working directory.
ROOT = os.path.abspath(os.pardir)
print(ROOT)

/Users/javigamero/MyMac/DS_Master/DM_Preprocess&Classif/Spaceship-Titanic


In [2]:
import utils 
import numpy as np
import pandas as pd
from mixed_naive_bayes import MixedNB 
from sklearn.preprocessing import Normalizer, StandardScaler

In [3]:
# Read the training data (the file name suggests outliers were already removed).
train_raw = pd.read_csv(ROOT + '/data/train_nooutliers.csv')

# Predictors only: the target and the identifier are dropped before the
# project helper one-hot encodes the remaining columns.
train_predictors = train_raw.drop(columns=['Transported', 'PassengerId'])
train_X = utils.one_hot_encode(train_predictors)

# Numerical features: raw columns whose dtype is neither object nor bool.
num_features = train_raw.select_dtypes(exclude=['object', 'bool']).columns
# Categorical features: every remaining column of the encoded frame.
cat_features = train_X.drop(columns=num_features).columns

In [4]:
# Summary statistics (count, mean, std, quartiles, extremes) for every column
# of the encoded training frame.
train_X.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_1.0,VIP_1.0,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_S
count,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0,7781.0
mean,28.84218,176.682689,410.19599,127.290965,272.618687,267.646832,0.09716,0.69207,0.248426,0.199203,0.396222,0.021977,0.092148,0.084822,0.051921,0.095103,0.340959,0.30446,0.000386,0.520756
std,14.247889,525.727157,1449.553758,386.893359,996.170485,1001.617675,0.296194,0.461667,0.432128,0.399427,0.489143,0.146617,0.289253,0.278635,0.221883,0.293377,0.474062,0.460208,0.019633,0.499601
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,37.0,18.0,25.0,9.0,29.0,19.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
max,79.0,8168.0,21066.0,6331.0,16594.0,12708.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


For every spending feature, at least half of the passengers have not spent any money (the median of each spending column is 0).

In [5]:
# Work with a plain NumPy array of the numerical column names.
num_features = np.array(num_features)
# Exclude 'Age' by name rather than by position, so the selection stays
# correct even if 'Age' is not the first numerical column of the input file.
num_features_exceptAge = num_features[num_features != 'Age']
num_features_exceptAge

array(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
      dtype=object)

In [6]:
# Turn each spending column into a 0/1 indicator: 1 if the passenger spent
# anything on that service, 0 otherwise.
# The comparison is vectorised, and the column is reassigned as a whole so the
# result is always an integer column; an in-place `.loc[:, col] = list`
# assignment keeps the column's previous dtype on recent pandas versions.
for col in num_features_exceptAge:
    train_X[col] = train_X[col].ne(0).astype(int)

In [7]:
# Preview the first rows to check that the spending columns now hold 0/1 flags.
train_X.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_1.0,VIP_1.0,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_S
0,39,0,0,0,0,0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,1,1,1,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,58,1,1,0,1,1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,33,0,1,1,1,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,16,1,1,1,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [8]:
from sklearn.preprocessing import KBinsDiscretizer

In [9]:
# Age is split into three equal-width intervals and stored as ordinal bin codes.
discretizer = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='uniform',
    random_state=1234,
)
# Learn the bin edges from the training ages and replace the column by its code.
train_X['Age'] = discretizer.fit_transform(train_X[['Age']])

# Discretized training frame with the target column attached (aligned on index).
train_discretized = train_X.assign(Transported=train_raw['Transported'])

In [10]:
# Write the discretized training set to disk without the row index.
train_out_path = ROOT + '/data/train_discretize_oh.csv'
train_discretized.to_csv(train_out_path, index=False)

Apply the same transformations to the test set, reusing the discretizer fitted on the training data:

In [11]:
# Load the test set through the project helper, drop the identifier column
# and one-hot encode the remaining predictors.
# NOTE(review): the encoding is computed independently of the training frame;
# confirm that `utils.one_hot_encode` yields the same columns for both sets.
test_raw = utils.load_test()
test_predictors = test_raw.drop(columns=['PassengerId'])
test = utils.one_hot_encode(test_predictors)

In [12]:
# Turn each spending column of the test set into a 0/1 indicator: 1 if the
# passenger spent anything on that service, 0 otherwise.
# The comparison is vectorised, and the column is reassigned as a whole so the
# result is always an integer column; an in-place `.loc[:, col] = list`
# assignment keeps the column's previous dtype on recent pandas versions.
for col in num_features_exceptAge:
    test[col] = test[col].ne(0).astype(int)

# Reuse the already-fitted discretizer: only `transform` is called here, so
# the bin edges are not re-estimated from the test data.
test['Age'] = discretizer.transform(test[['Age']])

In [13]:
# Write the discretized test set to disk without the row index.
test_out_path = ROOT + '/data/test_discretize_oh.csv'
test.to_csv(test_out_path, index=False)