In [1]:
# Label Encoding
# Can be used in tree-based models: XGBoost,GBM,LightGBM
# Cannot be used in linear models, svm or NN where the data are
# expected to be normalised (standardized)
mapping = {
 "Freezing": 0,
 "Warm": 1,
 "Cold": 2,
 "Boiling Hot": 3,
 "Hot": 4,
 "Lava Hot": 5 
}

In [2]:
import pandas as pd
df = pd.read_csv("./test.csv")
df.loc[:, "ord_2"] = df.ord_2.map(mapping)

In [3]:
df.ord_2.value_counts()

ord_2
0.0    95287
1.0    82940
2.0    65042
3.0    56624
4.0    44509
5.0    43493
Name: count, dtype: int64

In [5]:
# Another way to do so
import pandas as pd
from sklearn import preprocessing
# read the data
df = pd.read_csv("./test.csv")
# fill NaN values in ord_2 column
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")
# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()
# fit label encoder and transform values on ord_2 column
# P.S: do not use this directly. fit first, then transform
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)

In [6]:
df.ord_2.value_counts()

ord_2
2    95287
6    82940
1    65042
0    56624
3    44509
4    43493
5    12105
Name: count, dtype: int64

In [7]:
# to fit this for svm/neural network, we can binarize the label
# by splitting them into n features 2^n

# There are normal binary data storation and sparse version

import numpy as np
from scipy import sparse
# create our example feature matrix
example = np.array(
 [
 [0, 0, 1],
 [1, 0, 0],
 [1, 0, 1]
 ]
)
# print size of the origin matrix
print(example.nbytes)
# convert numpy array to sparse CSR matrix
# record the position of 1s
sparse_example = sparse.csr_matrix(example)
# print size of this sparse matrix
print(sparse_example.data.nbytes)
# total storage space for sparse matrix
print(
 sparse_example.data.nbytes + 
 sparse_example.indptr.nbytes + 
 sparse_example.indices.nbytes
)

72
32


In [9]:
import numpy as np
from scipy import sparse
# create binary matrix
example = np.array(
 [
 [0, 0, 0, 0, 1, 0],
 [0, 1, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0]
 ]
)
# print size in bytes
print(f"Size of dense array: {example.nbytes}")
# convert numpy array to sparse CSR matrix
sparse_example = sparse.csr_matrix(example)
# print size of this sparse matrix
print(f"Size of sparse array: {sparse_example.data.nbytes}")
full_size = (
 sparse_example.data.nbytes + 
 sparse_example.indptr.nbytes + 
 sparse_example.indices.nbytes
)
# print full size of this sparse matrix
print(f"Full size of sparse array: {full_size}")

Size of dense array: 144
Size of sparse array: 24
Full size of sparse array: 52


In [12]:
df = pd.read_csv("./test.csv")
df[df.ord_2 == "Boiling Hot"].shape

(56624, 24)

In [13]:
df.groupby(["ord_2"])["id"].count()

ord_2
Boiling Hot    56624
Cold           65042
Freezing       95287
Hot            44509
Lava Hot       43493
Warm           82940
Name: id, dtype: int64

In [14]:
df.groupby(["ord_2"])["id"].transform("count")

0         56624.0
1         65042.0
2         82940.0
3         44509.0
4         43493.0
           ...   
399995    82940.0
399996        NaN
399997    56624.0
399998    43493.0
399999    95287.0
Name: id, Length: 400000, dtype: float64

# Deal with NaN

In [16]:
df = pd.read_csv("./test.csv")
df.ord_2.value_counts()

ord_2
Freezing       95287
Warm           82940
Cold           65042
Boiling Hot    56624
Hot            44509
Lava Hot       43493
Name: count, dtype: int64

In [17]:
df.ord_2.fillna("NONE").value_counts()

ord_2
Freezing       95287
Warm           82940
Cold           65042
Boiling Hot    56624
Hot            44509
Lava Hot       43493
NONE           12105
Name: count, dtype: int64

# Rare Category


In [18]:
df.ord_4.fillna("NONE").value_counts()

ord_4
N       26725
P       25391
Y       24871
A       24478
R       22053
U       22017
M       21487
X       21419
C       21133
H       20501
Q       19963
T       19765
O       17347
B       16705
E       14654
K       14322
I       13123
NONE    11933
D       11401
F       11154
W        5634
Z        3973
S        3197
G        2290
V        2042
J        1331
L        1091
Name: count, dtype: int64

In [19]:
'''
We say that wherever the value count for a certain category is less than 2000, 
replace it with rare. So, now, when it comes to test data, all the new, unseen 
categories will be mapped to “RARE”, and all missing values will be mapped to 
“NONE”.
This approach will also ensure that the model works in a live setting, even if you 
have new categories.
'''

df.ord_4 = df.ord_4.fillna("NONE")
df.loc[df["ord_4"].value_counts()[df["ord_4"]].values < 2000, "ord_4"] = "RARE"

In [20]:
df.ord_4.value_counts()

ord_4
N       26725
P       25391
Y       24871
A       24478
R       22053
U       22017
M       21487
X       21419
C       21133
H       20501
Q       19963
T       19765
O       17347
B       16705
E       14654
K       14322
I       13123
NONE    11933
D       11401
F       11154
W        5634
Z        3973
S        3197
RARE     2422
G        2290
V        2042
Name: count, dtype: int64

# Categorical Model

In [21]:
import pandas as pd
from sklearn import preprocessing
# read training data
train = pd.read_csv("./train.csv")
#read test data
test = pd.read_csv("./test.csv")
# create a fake target column for test data
# since this column doesn't exist
test.loc[:, "target"] = -1
# concatenate both training and test data
data = pd.concat([train, test]).reset_index(drop=True)
# make a list of features we are interested in
# id and target is something we should not encode
features = [x for x in train.columns if x not in ["id", "target"]]

# loop over the features list
for feat in features:
    # create a new instance of LabelEncoder for each feature
    lbl_enc = preprocessing.LabelEncoder()
    # note the trick here
    # since its categorical data, we fillna with a string
    # and we convert all the data to string type
    # so, no matter its int or float, its converted to string
    # int/float but categorical!!!
    temp_col = data[feat].fillna("NONE").astype(str).values
    # we can use fit_transform here as we do not
    # have any extra test data that we need to
    # transform on separately
    data.loc[:, feat] = lbl_enc.fit_transform(temp_col)
 
# split the training and test data again 
train = data[data.target != -1].reset_index(drop=True)


test = data[data.target == -1].reset_index(drop=True)

Before going to any kind of model building, it’s essential to take care of cross validation. 

In [1]:
# create_folds.py
# import pandas and model_selection module of scikit-learn
import pandas as pd
from sklearn import model_selection
if __name__ == "__main__":
    # Read training data
    df = pd.read_csv("./train.csv")
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1

    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)

    # fetch labels
    y = df.target.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f

    # save the new csv with kfold column
    df.to_csv("./cat_train_folds.csv", index=False)

In [4]:
df

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,kfold
0,262970,0.0,0.0,0.0,F,N,Red,Polygon,Dog,Costa Rica,...,1.0,Master,Boiling Hot,d,Q,kB,6.0,,0,0
1,320939,0.0,,0.0,F,Y,Red,Trapezoid,Lion,Finland,...,1.0,Novice,Cold,m,N,MV,7.0,5.0,0,0
2,584290,0.0,0.0,0.0,F,,Green,Trapezoid,Axolotl,Finland,...,3.0,Novice,Warm,b,P,yE,1.0,8.0,0,0
3,413237,0.0,0.0,1.0,F,N,Blue,Triangle,Hamster,India,...,1.0,Novice,Hot,k,F,OZ,6.0,12.0,0,0
4,391282,0.0,0.0,0.0,F,N,Blue,Polygon,Hamster,India,...,3.0,Expert,Boiling Hot,g,B,vx,1.0,5.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,308001,0.0,0.0,0.0,F,Y,Red,Trapezoid,Hamster,Finland,...,2.0,Grandmaster,Warm,e,F,YJ,5.0,6.0,0,4
599996,8879,0.0,0.0,1.0,T,N,Red,Trapezoid,Lion,India,...,1.0,Master,Cold,i,T,nj,5.0,12.0,0,4
599997,481886,0.0,0.0,0.0,F,Y,Green,Polygon,Snake,India,...,1.0,Expert,Warm,m,,hG,3.0,2.0,0,4
599998,157226,0.0,0.0,1.0,F,Y,Red,Triangle,Hamster,Costa Rica,...,1.0,Grandmaster,Boiling Hot,k,Y,rg,3.0,4.0,0,4


In [28]:
import pandas as pd
df = pd.read_csv("./cat_train_folds.csv")

In [29]:
df.kfold.value_counts()

kfold
0    120000
1    120000
2    120000
3    120000
4    120000
Name: count, dtype: int64

In [30]:
df[df.kfold==0].target.value_counts()

target
0    97536
1    22464
Name: count, dtype: int64

Build simple logistic regression model with one-hot encoding

In [32]:
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
def run(fold):
    # load the full training data with folds
    df = pd.read_csv("../input/cat_train_folds.csv")
    # all columns are features except id, target and kfold columns
    features = [
    f for f in df.columns if f not in ("id", "target", "kfold")
    ]
    # fill all NaN values with NONE
    # note that I am converting all columns to "strings"
    # it doesn’t matter because all are categories
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

        # get training data using folds
        df_train = df[df.kfold != fold].reset_index(drop=True)
        # get validation data using folds
        df_valid = df[df.kfold == fold].reset_index(drop=True)
        # initialize OneHotEncoder from scikit-learn
        ohe = preprocessing.OneHotEncoder()
        # fit ohe on training + validation features
        full_data = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0
        )
        ohe.fit(full_data[features])
        # transform training data
        x_train = ohe.transform(df_train[features])
        # transform validation data
        x_valid = ohe.transform(df_valid[features])
        
    # initialize Logistic Regression model
    model = linear_model.LogisticRegression()
    # fit model on training data (ohe)
    model.fit(x_train, df_train.target.values)
    # predict on validation data
    # we need the probability values as we are calculating AUC
    # we will use the probability of 1s
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)
    # print auc
    print(auc)
    
    
if __name__ == "__main__":
    # run function for fold = 0
    # we can just replace this number and 
    # run this for any fold
    run(0)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 39)