In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [32]:
data = pd.read_csv("../data/cat_in_the_dat.csv")
pd.set_option('display.max_rows', 1000000) 
pd.set_option('display.max_columns', 1000000) 

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 25 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      600000 non-null  int64  
 1   bin_0   582106 non-null  float64
 2   bin_1   581997 non-null  float64
 3   bin_2   582070 non-null  float64
 4   bin_3   581986 non-null  object 
 5   bin_4   581953 non-null  object 
 6   nom_0   581748 non-null  object 
 7   nom_1   581844 non-null  object 
 8   nom_2   581965 non-null  object 
 9   nom_3   581879 non-null  object 
 10  nom_4   581965 non-null  object 
 11  nom_5   582222 non-null  object 
 12  nom_6   581869 non-null  object 
 13  nom_7   581997 non-null  object 
 14  nom_8   582245 non-null  object 
 15  nom_9   581927 non-null  object 
 16  ord_0   581712 non-null  float64
 17  ord_1   581959 non-null  object 
 18  ord_2   581925 non-null  object 
 19  ord_3   582084 non-null  object 
 20  ord_4   582070 non-null  object 
 21  ord_5   58

In [4]:
for col in data.columns:
    print(f'col : {col}')
    print()
    print(data[col].value_counts())
    print()

col : id

0         1
400012    1
399996    1
399997    1
399998    1
         ..
200000    1
200001    1
200002    1
200003    1
599999    1
Name: id, Length: 600000, dtype: int64

col : bin_0

0.0    528377
1.0     53729
Name: bin_0, dtype: int64

col : bin_1

0.0    474018
1.0    107979
Name: bin_1, dtype: int64

col : bin_2

0.0    419845
1.0    162225
Name: bin_2, dtype: int64

col : bin_3

F    366212
T    215774
Name: bin_3, dtype: int64

col : bin_4

N    312344
Y    269609
Name: bin_4, dtype: int64

col : nom_0

Red      323286
Blue     205861
Green     52601
Name: nom_0, dtype: int64

col : nom_1

Triangle     164190
Polygon      152563
Trapezoid    119438
Circle       104995
Square        26503
Star          14155
Name: nom_1, dtype: int64

col : nom_2

Hamster    164897
Axolotl    152319
Lion       119504
Dog        104825
Cat         26276
Snake       14144
Name: nom_2, dtype: int64

col : nom_3

India         164869
Costa Rica    151827
Russia        119840
Finland       

<font size=5> Manual Label Encoding </font>

In [5]:
data['ord_2'].value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [6]:
mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5
}

data.loc[:, "ord_2"] = data["ord_2"].map(mapping)


In [7]:
data.ord_2.value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

<font size=5> sklearn preprocessing Label Encoding </font>

In [8]:
from sklearn import preprocessing

data = pd.read_csv("../data/cat_in_the_dat.csv")

 # fill NaN values in ord_2 column
data.loc[:, "ord_2"] = data.ord_2.fillna("NONE") # initialize LabelEncoder
#The reason is LabelEncoder from scikit- learn does not handle NaN values,
# and ord_2 column has NaN values in it.

data.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

In [9]:
lbl_enc = preprocessing.LabelEncoder()
# fit label encoder and transform values on ord_2 column
# P.S: do not use this directly. fit first, then transform
data.loc[:, "ord_2"] = lbl_enc.fit_transform(data.ord_2.values)

In [10]:
data.ord_2.value_counts()

2    142726
6    124239
1     97822
0     84790
3     67508
4     64840
5     18075
Name: ord_2, dtype: int64

In [12]:
#MAKE SURE YOU FILL NA BEFORE LABEL ENCODING

def label_encoding_transformer(data, cols):
    label_encoder_dict = {}
    lbl_enc = preprocessing.LabelEncoder()
    for col in cols:
        data.loc[:, col] = lbl_enc.fit_transform(data[col].values)
        le_name_mapping = dict(zip(lbl_enc.classes_, lbl_enc.transform(lbl_enc.classes_)))
        print(le_name_mapping)
        label_encoder_dict[col] = le_name_mapping
        
    return data, label_encoder_dict
        
        

In [13]:
data, le_dict = label_encoding_transformer(data, cols=['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5'])

{'Contributor': 0, 'Expert': 1, 'Grandmaster': 2, 'Master': 3, 'Novice': 4, nan: 5}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, nan: 15}
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25, nan: 26}
{'AG': 0, 'AI': 1, 'AU': 2, 'AW': 3, 'Ay': 4, 'BL': 5, 'BX': 6, 'Bx': 7, 'CN': 8, 'CU': 9, 'Cn': 10, 'DI': 11, 'DN': 12, 'DR': 13, 'DT': 14, 'Dj': 15, 'Dn': 16, 'EC': 17, 'Ey': 18, 'FB': 19, 'FH': 20, 'Fl': 21, 'GZ': 22, 'HF': 23, 'HK': 24, 'HO': 25, 'Hk': 26, 'IA': 27, 'IS': 28, 'Ib': 29, 'In': 30, 'Io': 31, 'Iq': 32, 'JQ': 33, 'JT': 34, 'Ji': 35, 'Kq': 36, 'LS': 37, 'LY': 38, 'Lo': 39, 'MF': 40, 'MU': 41, 'MV': 42, 'MX': 43, 'Mg': 44, 'Mq': 45, 'NS': 46, 'NT': 47, 'Nh': 48, 'OM': 49, 'OZ': 50, 'Oe

<font size=5> One Hot Encoding </font>

In [None]:
# create random 1-d array with 1001 different categories (int)
example = np.random.randint(1000, size=1000000)
# initialize OneHotEncoder from scikit-learn
# keep sparse = False to get dense array
ohe = preprocessing.OneHotEncoder(sparse=False)
# fit and transform data with dense one hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1)) # print size in bytes for dense array
print(f"Size of dense array: {ohe_example.nbytes}")

In [None]:
# initialize OneHotEncoder from scikit-learn # keep sparse = True to get sparse array
ohe = preprocessing.OneHotEncoder(sparse=True)
# fit and transform data with sparse one-hot encoder
ohe_example = ohe.fit_transform(example.reshape(-1, 1)) # print size of this sparse matrix
print(f"Size of sparse array: {ohe_example.data.nbytes}")
full_size = (
ohe_example.data.nbytes +
ohe_example.indptr.nbytes + ohe_example.indices.nbytes
)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

In [169]:

def ohe_encoding_transformer(data, cols):
    df = data.copy(deep=True)
    ohe = preprocessing.OneHotEncoder(sparse=False)
    for col in cols:
        df.loc[:, col] = df[col].fillna("NONE").values.astype(str) #converts the column to string
        ohe_df = pd.DataFrame(ohe.fit_transform(df[col].values.reshape(-1,1)))
        ohe_df.columns = [col + '_' + category.split("_")[1] for category in ohe.get_feature_names_out()]
        print(ohe_df.columns)
        df.reset_index(drop=True, inplace=True)
        df = pd.concat([df, ohe_df], axis=1)
        df.drop(columns=col, inplace=True)
    return df

In [54]:
data = pd.read_csv("../data/cat_in_the_dat.csv")
new_data = ohe_encoding_transformer(data, ['nom_1', 'nom_2'])

Index(['nom_1_Circle', 'nom_1_NONE', 'nom_1_Polygon', 'nom_1_Square',
       'nom_1_Star', 'nom_1_Trapezoid', 'nom_1_Triangle'],
      dtype='object')
Index(['nom_2_Axolotl', 'nom_2_Cat', 'nom_2_Dog', 'nom_2_Hamster',
       'nom_2_Lion', 'nom_2_NONE', 'nom_2_Snake'],
      dtype='object')


In [55]:
new_data.head(1000)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,nom_1_Circle,nom_1_NONE,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Axolotl,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,nom_2_NONE,nom_2_Snake
0,0,0.0,0.0,0.0,F,N,Red,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,1.0,1.0,0.0,F,Y,Red,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,1.0,0.0,F,N,Red,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3,,0.0,0.0,F,N,Red,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,0.0,,0.0,T,N,Red,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,5,0.0,,1.0,T,N,Red,China,Bassoon,a2e1bf0b1,ae6737c29,8c30b9b0b,690411ac0,05afc0f8b,2.0,Expert,Hot,b,Q,wa,3.0,4.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,6,0.0,0.0,0.0,F,N,Red,Costa Rica,Bassoon,87a5be0d7,cdc35bd00,1cba571fa,b8e63cace,4d3766412,1.0,Grandmaster,Cold,c,R,rg,5.0,6.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,7,0.0,0.0,1.0,T,N,Red,Finland,Bassoon,104aee31d,2a50808ba,81d67e1bb,bd9643a20,a651dec43,3.0,Expert,Cold,b,Y,PS,1.0,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,0.0,0.0,0.0,F,N,Blue,Russia,Oboe,024efa364,a4a81ab45,429114096,94c5fd40c,,1.0,Novice,Boiling Hot,c,N,mX,6.0,3.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,9,0.0,0.0,,F,Y,Red,Finland,Theremin,9fa084b36,e7aa94f40,56d35c774,0279391c5,79b29d54c,3.0,Contributor,Lava Hot,n,I,OZ,1.0,8.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


<font size=5> Frequency - Count Encoding </font>

In [74]:
def frequency_encoding_transformer(data, cols):
    for col in cols:
        data.loc[:, col] = data[col].fillna("NONE")
        data[col] = data.groupby(col)[col].transform('count')
        
    return data

In [75]:
data = pd.read_csv("../data/cat_in_the_dat.csv")
new_data = frequency_encoding_transformer(data, ['nom_0', 'nom_3'])

In [76]:
new_data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,323286,Trapezoid,Hamster,119840,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,323286,Star,Axolotl,18121,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,323286,,Hamster,26425,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,323286,Circle,Hamster,104601,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,323286,Triangle,Hamster,151827,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


<font size=5> Concatenating columns and counting </font>

In [77]:
data = pd.read_csv("../data/cat_in_the_dat.csv")
data.loc[:, 'nom_1__nom_2'] = data['nom_1'].astype(str) + "__" + data['nom_2'].astype(str)
data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,nom_1__nom_2
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,Trapezoid__Hamster
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,Star__Axolotl
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0,nan__Hamster
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0,Circle__Hamster
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,Triangle__Hamster


In [78]:
new_data = frequency_encoding_transformer(data, ['nom_1__nom_2'])

In [79]:
new_data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,nom_1__nom_2
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,32675
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,3619
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0,5045
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0,28879
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,45326


<font size=5> Code Rare Category </font>

<font size=5> Used for two purposes </font>

1. To **ensure the ML model DOES NOT memorize some rare categories (target) based on the few rows of data it sees for that target**
2. To **encode unknown categories** as Rare in production

In [162]:

rare_count_threshold = 1000 #rows
def rare_encoding_transformer(data, cols, rare_count_threshold = 100000):
    rare_label_dict = {}
    for col in cols:
        data.loc[:, col] = data[col].fillna("NONE")
        series = data[col].value_counts() > rare_count_threshold
        non_rare_cats = list(series[series==True].index)
        rare_cats = list(series[series==False].index)
        
        rare_label_dict[col + '_rare'] = rare_cats
        rare_label_dict[col + '_non_rare'] = non_rare_cats
        
        print(f'rare_cats: {rare_cats}')
        print(f'non_rare_cats: {non_rare_cats}')
        
        data.loc[:, col][data[col].isin(rare_cats)] = 'Rare'
    return data, rare_label_dict
        

In [163]:
data = pd.read_csv("../data/cat_in_the_dat.csv")
new_data, rare_label_dict = rare_encoding_transformer(data, ['ord_1'])

rare_cats: ['Grandmaster', 'Master', 'NONE']
non_rare_cats: ['Novice', 'Expert', 'Contributor']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, col][data[col].isin(rare_cats)] = 'Rare'


In [164]:
new_data['ord_1'].value_counts()

Rare           189905
Novice         160597
Expert         139677
Contributor    109821
Name: ord_1, dtype: int64

In [165]:
rare_label_dict

{'ord_1_rare': ['Grandmaster', 'Master', 'NONE'],
 'ord_1_non_rare': ['Novice', 'Expert', 'Contributor']}

<font size=5> OHE sparse and SVD </font>

In [176]:
def ohe_encoding_sparse_transformer(data, cols):
    ohe = preprocessing.OneHotEncoder(sparse=True)
    sparse = ohe.fit_transform(data[cols]) #assumes more than one col or will throw error for reshape
    return sparse


In [183]:
from sklearn import decomposition

def svd_encoding_transformer(sparse_ohe_matrix, n_components = 120):
    svd = decomposition.TruncatedSVD(n_components=n_components)
    svd_components = svd.fit_transform(sparse_ohe_matrix)
    return svd_components

In [184]:
svd_components = svd_encoding_transformer(sparse)

<font size=5> RF Model Building with SVD features from OHE encoding</font>

In [185]:
data = pd.read_csv("../data/cat_in_the_dat.csv")
sparse = ohe_encoding_sparse_transformer(data, cols = list(set(data.columns) - set(['id', 'target'])))
svd_components = svd_encoding_transformer(sparse)

In [186]:
type(svd_components)

numpy.ndarray

In [189]:
svd_df = pd.DataFrame(svd_components)
svd_df.columns = ['svd_dim_' + str(col) for col in svd_df.columns]
svd_df.head()

Unnamed: 0,svd_dim_0,svd_dim_1,svd_dim_2,svd_dim_3,svd_dim_4,svd_dim_5,svd_dim_6,svd_dim_7,svd_dim_8,svd_dim_9,svd_dim_10,svd_dim_11,svd_dim_12,svd_dim_13,svd_dim_14,svd_dim_15,svd_dim_16,svd_dim_17,svd_dim_18,svd_dim_19,svd_dim_20,svd_dim_21,svd_dim_22,svd_dim_23,svd_dim_24,svd_dim_25,svd_dim_26,svd_dim_27,svd_dim_28,svd_dim_29,svd_dim_30,svd_dim_31,svd_dim_32,svd_dim_33,svd_dim_34,svd_dim_35,svd_dim_36,svd_dim_37,svd_dim_38,svd_dim_39,svd_dim_40,svd_dim_41,svd_dim_42,svd_dim_43,svd_dim_44,svd_dim_45,svd_dim_46,svd_dim_47,svd_dim_48,svd_dim_49,svd_dim_50,svd_dim_51,svd_dim_52,svd_dim_53,svd_dim_54,svd_dim_55,svd_dim_56,svd_dim_57,svd_dim_58,svd_dim_59,svd_dim_60,svd_dim_61,svd_dim_62,svd_dim_63,svd_dim_64,svd_dim_65,svd_dim_66,svd_dim_67,svd_dim_68,svd_dim_69,svd_dim_70,svd_dim_71,svd_dim_72,svd_dim_73,svd_dim_74,svd_dim_75,svd_dim_76,svd_dim_77,svd_dim_78,svd_dim_79,svd_dim_80,svd_dim_81,svd_dim_82,svd_dim_83,svd_dim_84,svd_dim_85,svd_dim_86,svd_dim_87,svd_dim_88,svd_dim_89,svd_dim_90,svd_dim_91,svd_dim_92,svd_dim_93,svd_dim_94,svd_dim_95,svd_dim_96,svd_dim_97,svd_dim_98,svd_dim_99,svd_dim_100,svd_dim_101,svd_dim_102,svd_dim_103,svd_dim_104,svd_dim_105,svd_dim_106,svd_dim_107,svd_dim_108,svd_dim_109,svd_dim_110,svd_dim_111,svd_dim_112,svd_dim_113,svd_dim_114,svd_dim_115,svd_dim_116,svd_dim_117,svd_dim_118,svd_dim_119
0,2.609447,-0.63587,-0.478159,-0.5802,0.862028,-0.308048,-0.690824,-0.111764,-0.50031,0.60542,0.084631,-0.477631,-0.137144,-0.232458,-0.099026,0.858419,0.579492,0.710856,-0.163638,-0.28742,-0.336129,0.326473,-0.471304,-0.590535,-0.602506,-0.10026,-0.635938,0.187313,-0.065029,-0.148035,-0.299553,0.676178,-0.130481,-0.105288,0.730563,-0.036859,-0.383837,0.061456,-0.639237,0.047387,-0.42002,-0.524771,0.232043,-0.588154,0.066224,-0.033847,-0.338065,-0.022039,-0.051499,-0.094931,-0.063833,-0.016833,-0.060642,-0.005658,0.013945,-0.251573,-0.062202,-0.057673,-0.124638,-0.562413,-0.37551,0.57756,-0.126287,-0.106143,-0.012359,-0.149735,-0.117355,-0.033373,0.028891,-0.056356,-0.098173,-0.006475,-0.022313,-0.021725,-0.015816,-0.02482,-0.008904,-0.005648,-0.06385,-0.027194,-0.004996,-0.01964,-0.011769,0.011372,0.006915,-0.00382,-0.026708,-0.023786,0.031904,-0.067272,-0.00991,-0.089919,-0.070161,-0.023099,-0.010835,-0.001245,-0.004654,0.00314,-0.006167,-0.031169,-0.009502,-0.010346,-0.006917,-0.007454,-0.023657,-0.015681,-0.015114,-0.029231,-0.021815,0.013232,-0.040186,-0.035504,-0.041334,-0.040001,0.026478,-0.030523,0.003796,-0.078005,-0.045511,-0.077416
1,1.891295,0.756349,-0.523643,-0.56981,-0.612412,-0.769693,-0.584967,0.973533,-0.244634,-0.579556,-0.037194,0.260947,-0.141936,-0.689396,-0.046094,0.33484,-0.2378,0.288107,-0.000918,0.652891,0.11052,-0.218615,-0.256467,-0.213307,0.550514,1.307842,0.636281,-0.081737,-0.635404,-0.30913,-0.078637,-0.077514,0.000468,0.062473,-0.060496,-0.097835,-0.136895,-0.032074,-0.016197,-0.198825,0.104786,-0.100515,-0.561073,0.013056,0.690311,-0.134567,0.27612,0.042893,-0.191785,0.546293,0.014956,-0.692237,-0.073101,0.013123,-0.23307,-0.229553,0.119219,0.444421,0.319767,-0.104839,0.357597,-0.110046,0.69977,-0.466795,-0.194473,-0.214143,-0.153402,-0.053406,0.012112,-0.034163,-0.131692,-0.014549,0.013039,0.060445,-0.010066,0.115192,0.160984,0.067363,-0.061488,-0.0939,0.038613,-0.372335,-0.489772,0.409473,-0.494963,-0.320276,-0.016548,0.021812,-0.120466,0.147814,-0.011458,0.06915,0.000533,-0.051298,-0.00328,-0.00913,-0.045683,0.010209,-0.02547,-0.047923,0.014658,-0.289241,-0.637952,0.629502,0.391764,-0.022324,-0.025474,-0.013255,-0.014388,-0.023711,0.008058,0.064775,-0.018891,-0.064284,-0.059126,0.012269,-0.053822,-0.179583,0.023428,0.102604
2,2.193049,-0.650929,-0.533774,-0.628237,0.809735,-0.45579,-0.648283,1.221235,-0.125032,0.603452,0.090332,-0.380727,-0.063289,0.681028,0.003297,0.473922,-0.139254,0.103331,-0.633241,0.253658,0.092179,-0.165792,-0.002733,0.548223,0.134153,-0.104739,-0.185605,-0.092197,-0.012214,0.098178,-0.042114,-0.050835,-0.096206,-0.100594,-0.235303,0.789863,-0.295901,-0.27266,-0.006563,0.165292,0.171595,0.015036,-0.059332,-0.031922,-0.009567,0.011359,-0.102962,-0.007456,0.053983,-0.043712,-0.247124,-0.040877,0.894775,0.00378,-0.06629,0.280245,-0.042192,0.849072,0.461954,-0.029831,0.314422,0.233803,0.018712,-0.007114,-0.010724,-0.005384,-0.046073,-0.014988,0.010609,-0.048316,-0.062709,-0.009458,0.02887,0.049963,-0.012223,-0.113966,0.068205,0.42066,0.107456,0.793425,-0.021466,-0.492362,0.535821,-0.987694,-0.227009,-0.50955,0.037672,-0.083152,-0.264893,0.514509,-0.341094,0.241566,0.103923,-0.140215,0.769911,-0.179133,0.447972,-0.125944,-0.192823,-0.020894,-0.055505,-0.047076,0.071797,-0.266278,-0.202097,-0.014179,-0.012084,-0.016771,-0.018728,-0.007916,-0.001824,-0.003122,-0.019477,0.022086,-0.0359,-0.009914,0.003225,0.065493,0.059358,0.026404
3,2.393485,-0.646227,-0.439632,-0.522557,-0.515092,-0.514025,0.754822,-0.340583,-0.33858,0.643559,0.073035,-0.409595,0.80007,-0.142667,-0.080204,0.544326,0.10715,-0.286275,0.87911,0.035668,0.191122,-0.602884,0.911742,0.475835,-0.171709,0.626874,-0.111768,0.188308,-0.049609,-0.057495,-0.281013,0.468922,-0.026142,-0.114731,0.857744,-0.289697,-0.280098,0.134686,0.792751,0.73483,0.39789,0.071244,0.136098,-0.023658,0.032338,-0.037503,-0.155564,-0.017034,-0.050653,-0.071439,-0.054343,-0.021269,-0.053181,-0.002832,-0.002465,-0.18753,-0.019683,-0.073147,-0.047864,-0.046344,-0.077739,-0.261362,0.151025,0.820301,-0.000797,-0.341332,-0.17389,-0.048013,0.051586,-0.037395,-0.1122,-0.027998,-0.07632,-0.081517,-0.016829,-0.739855,-0.683581,-0.224568,-0.12391,0.020394,-0.053222,0.237913,0.07397,0.026466,-0.014555,-0.044714,0.00616,-0.00179,-0.08417,0.129176,-0.01945,0.149372,0.03834,-0.106851,-0.006229,-0.078928,-0.140616,-0.55966,0.704269,0.223055,-0.054055,-0.024145,0.001053,-0.004612,-0.001422,0.016231,0.005593,-0.006716,0.011644,-0.004235,-0.009099,0.001808,0.003057,0.006117,0.004206,0.00457,-0.005119,0.001553,0.011768,-0.001999
4,2.128249,-0.646862,0.905511,-0.546571,0.222153,-0.434922,-0.653134,0.599376,-0.287141,0.673464,1.026851,-0.129748,-0.141518,-0.194909,-0.124269,-0.215479,-0.639582,0.294311,-0.656308,-0.582551,-0.203699,0.00076,0.068055,0.507571,0.877341,0.001319,-0.216605,-0.64527,0.006609,-0.366256,-0.154813,-0.1762,0.387417,0.224942,-0.218876,-0.276382,0.314286,-0.634434,0.01827,0.068657,-0.44292,-0.221034,0.155693,0.783916,0.028216,-0.07934,-0.397576,-0.025088,-0.039568,-0.126207,-0.061112,-0.007508,-0.041173,0.012408,-0.021492,-0.203722,-0.057796,-0.062695,-0.312224,-0.039221,0.196654,-0.136247,0.170209,0.842333,-0.024112,-0.32369,-0.156714,-0.052313,0.014255,-0.105795,-0.024835,0.02756,-0.236948,-0.964469,-0.025289,0.470411,-0.125783,-0.114236,-0.082296,-0.017616,0.003703,0.064223,-0.033284,-0.012611,0.057267,0.377945,-0.764576,-0.164037,-0.238558,0.374591,0.016285,0.301613,0.086166,0.011058,0.067051,-0.206724,-0.036839,0.792334,0.41495,0.255119,0.001383,0.005448,0.00841,0.005094,0.023387,-0.021856,-0.018035,-0.016824,-0.007324,-0.035719,-0.033741,-0.079675,0.140501,0.093765,-0.020168,-0.092175,-0.013417,-0.11331,-0.045672,-0.026235


In [198]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(n_jobs=-1)
cross_val_score(clf, svd_df, data['target'], cv=5)

array([0.81414167, 0.81455   , 0.81443333, 0.81468333, 0.81370833])

<font size=5> XGB with Label Encoding </font>

In [200]:
import xgboost
data = pd.read_csv("../data/cat_in_the_dat.csv")
new_data, _ = label_encoding_transformer(data, cols = list(set(data.columns) - set(['id', 'target'])))
new_data.drop(columns='id', inplace=True)
xgb = xgboost.XGBClassifier(n_estimators=10, max_depth=7)
cross_val_score(xgb, new_data[list(set(data.columns) - set(['id', 'target']))], new_data['target'], cv=5)


{1.0: 0, 2.0: 1, 3.0: 2, 4.0: 3, 5.0: 4, 6.0: 5, 7.0: 6, nan: 7}
{0.0: 0, 1.0: 1, nan: 2}
{'Axolotl': 0, 'Cat': 1, 'Dog': 2, 'Hamster': 3, 'Lion': 4, 'Snake': 5, nan: 6}
{'007bff22a': 0, '0256c7a4b': 1, '0279391c5': 2, '0602256df': 3, '060a21580': 4, '0e5249dbe': 5, '0f269cbc8': 6, '0f94eb834': 7, '10fd52d10': 8, '115d9fd8b': 9, '1195f603e': 10, '153864851': 11, '154aacc1c': 12, '158183c63': 13, '15f03b1f4': 14, '1653ceb33': 15, '165e81a00': 16, '1a59581be': 17, '1b6796452': 18, '1c4931f19': 19, '1ce5cf721': 20, '1d88b0a79': 21, '1df855856': 22, '1e722bf45': 23, '1f2cd223e': 24, '1fe15a999': 25, '2061350b9': 26, '210039295': 27, '21595d82e': 28, '220190c9e': 29, '2218d9dfe': 30, '22e0a764d': 31, '23dc2e63d': 32, '25c087d05': 33, '2694748e3': 34, '28869f8aa': 35, '28c042258': 36, '29964d969': 37, '2b34f5aca': 38, '2c58293c4': 39, '2db5a9486': 40, '31db9b004': 41, '3210cd7c1': 42, '32412c81c': 43, '32b33a4b4': 44, '32ff72b97': 45, '35309e01b': 46, '3776bb1ae': 47, '3d0ea8194': 48, '3d70f

{1.0: 0, 2.0: 1, 3.0: 2, 4.0: 3, 5.0: 4, 6.0: 5, 7.0: 6, 8.0: 7, 9.0: 8, 10.0: 9, 11.0: 10, 12.0: 11, nan: 12}
{0.0: 0, 1.0: 1, nan: 2}
{'Circle': 0, 'Polygon': 1, 'Square': 2, 'Star': 3, 'Trapezoid': 4, 'Triangle': 5, nan: 6}
{'N': 0, 'Y': 1, nan: 2}
{0.0: 0, 1.0: 1, nan: 2}
{'Bassoon': 0, 'Oboe': 1, 'Piano': 2, 'Theremin': 3, nan: 4}
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25, nan: 26}
{'AG': 0, 'AI': 1, 'AU': 2, 'AW': 3, 'Ay': 4, 'BL': 5, 'BX': 6, 'Bx': 7, 'CN': 8, 'CU': 9, 'Cn': 10, 'DI': 11, 'DN': 12, 'DR': 13, 'DT': 14, 'Dj': 15, 'Dn': 16, 'EC': 17, 'Ey': 18, 'FB': 19, 'FH': 20, 'Fl': 21, 'GZ': 22, 'HF': 23, 'HK': 24, 'HO': 25, 'Hk': 26, 'IA': 27, 'IS': 28, 'Ib': 29, 'In': 30, 'Io': 31, 'Iq': 32, 'JQ': 33, 'JT': 34, 'Ji': 35, 'Kq': 36, 'LS': 37, 'LY': 38, 'Lo': 39, 'MF': 40, 'MU': 41, 'MV': 42, 'MX': 4

{'000b3c20a': 0, '0054c0c3a': 1, '00a731d2e': 2, '014770cf0': 3, '0165aa0c3': 4, '023ed7074': 5, '024efa364': 6, '0276cf712': 7, '0286dc1e1': 8, '0289ab250': 9, '028a6acde': 10, '029d67ae5': 11, '0385d0739': 12, '0388c582c': 13, '03c739608': 14, '03cbd5a22': 15, '03ea75c83': 16, '03f2a3450': 17, '045558e43': 18, '053a1f28a': 19, '0549ab935': 20, '0568087f1': 21, '05c7afaf4': 22, '05d793448': 23, '05eecb19a': 24, '061887f9d': 25, '0664ab302': 26, '06a30b4b2': 27, '06f40a982': 28, '06f9a4b85': 29, '07134e36b': 30, '0714feb00': 31, '071f9b28d': 32, '075ba4f4e': 33, '07648dd95': 34, '0779c1f16': 35, '079b76328': 36, '08ab6d513': 37, '09372c968': 38, '09446ec9b': 39, '094b85efd': 40, '09759f3f8': 41, '09832efe8': 42, '09b7f013b': 43, '0a29d20e1': 44, '0a82099d4': 45, '0aad1d7ff': 46, '0b1be2ef7': 47, '0b436e288': 48, '0b5f349fe': 49, '0ba49ebc6': 50, '0ba584587': 51, '0be8a204f': 52, '0c02cb003': 53, '0c2d009af': 54, '0c4c4bde8': 55, '0c5f52fde': 56, '0c64d0fe2': 57, '0c6e89e14': 58, '0c906

{'Blue': 0, 'Green': 1, 'Red': 2, nan: 3}
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, nan: 15}






















array([0.81520833, 0.81643333, 0.81595833, 0.81605833, 0.81524167])