In [None]:
import collections
import logging
import pprint
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import sparse
from sklearn import decomposition, ensemble, linear_model, metrics, preprocessing

In [2]:
# First look at the raw CSVs using the default integer index; `id` is loaded as an ordinary column.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

df_train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [3]:
# Reload both splits with the `id` column used as the row index rather than as a data column.
df_train = pd.read_csv("../data/train.csv", index_col="id")
df_test = pd.read_csv("../data/test.csv", index_col="id")

df_train.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [4]:
# Distinct labels present in ord_2 (missing values show up as nan).
df_train["ord_2"].unique()

array(['Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot', nan],
      dtype=object)

In [5]:
# Frequency of each ord_2 label (value_counts drops missing values by default).
df_train["ord_2"].value_counts()

ord_2
Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: count, dtype: int64

In [6]:
# Map every ord_2 label to an integer rank 0-5, following the order of the list below.
# Labels that are not in the mapping (including missing values) become NaN.
# NOTE: not idempotent -- re-running this cell maps the already-encoded numbers to NaN.
ord_2_mapping = {
    label: rank
    for rank, label in enumerate(
        ["Freezing", "Cold", "Warm", "Hot", "Boiling Hot", "Lava Hot"]
    )
}
df_train.loc[:, "ord_2"] = df_train["ord_2"].map(ord_2_mapping)
df_train["ord_2"].value_counts()

ord_2
0.0    142726
2.0    124239
1.0     97822
4.0     84790
3.0     67508
5.0     64840
Name: count, dtype: int64

In [7]:
# Combine ord_1 and ord_2 into one "<ord_1>_<ord_2>" string feature.
# Both sides go through astype(str), so a missing value contributes the literal text "nan".
df_train["new_feature"] = (
    df_train["ord_1"].astype(str).str.cat(df_train["ord_2"].astype(str), sep="_")
)
df_train["new_feature"].head()

id
0    Contributor_3.0
1    Grandmaster_2.0
2            nan_0.0
3         Novice_5.0
4    Grandmaster_1.0
Name: new_feature, dtype: object

In [8]:
# Playing with sparse embeddings
# Only the `preprocessing` submodule is imported at the top of the notebook; the bare
# name `sklearn` is never bound, so the encoder has to be reached through `preprocessing`.
ohe = preprocessing.OneHotEncoder(sparse_output=True)
# Give missing ord_1 values an explicit "Unknown" category before fitting.
df_train.loc[:, "ord_1"] = df_train.loc[:, "ord_1"].fillna("Unknown")
# The encoder expects a 2-D input, hence the reshape to a single column.
ohe.fit(df_train.loc[:, "ord_1"].values.reshape(-1, 1))


In [9]:
# Categories learned by the fitted encoder: a list holding one array per encoded column.
ohe.categories_

[array(['Contributor', 'Expert', 'Grandmaster', 'Master', 'Novice',
        'Unknown'], dtype=object)]

In [10]:
# One-hot encode ord_1; the double brackets keep the input 2-D (one column).
ohe.transform(df_train[["ord_1"]].to_numpy())

<600000x6 sparse matrix of type '<class 'numpy.float64'>'
	with 600000 stored elements in Compressed Sparse Row format>

## Rare categories test

In [11]:
# Start again from the raw files for this experiment.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Check which of the two splits carries the label column.
print("target" in df_test.columns)
print("target" in df_train.columns)

False
True


In [12]:
# Tag the test rows with a sentinel label (-1) so both splits can be stacked into one frame.
df_test["target"] = -1
df_full = pd.concat([df_train, df_test], ignore_index=True)

# Every column except the identifier and the label is treated as a feature.
features = [name for name in df_train.columns if name not in ("id", "target")]

In [13]:
# Ordinal-encode every feature over the stacked train + test frame.
# Missing values are filled with "UNKNOWN" first so they get their own category, and
# everything is cast to str so each column has a single, sortable type.
for feature in features:
    # `preprocessing` is the name imported at the top of the notebook; `sklearn` itself is not bound.
    encoder = preprocessing.OrdinalEncoder()
    column = df_full[feature].fillna("UNKNOWN").astype(str).to_numpy().reshape(-1, 1)
    # fit_transform returns an (n, 1) array; flatten it before storing it as a column.
    df_full[feature] = encoder.fit_transform(column).ravel()

In [14]:
# Split the stacked frame back apart using the -1 sentinel label.
df_train = df_full.loc[df_full["target"] != -1].reset_index(drop=True)
df_test = df_full.loc[df_full["target"] == -1].reset_index(drop=True)

In [15]:
# Sanity check: the two splits together account for every row of the stacked frame.
df_train.shape[0] + df_test.shape[0] == df_full.shape[0]

True

## (Basic) Feature Engineering for Rare Categories

In [16]:
# Reload the raw splits and stack them into a single frame with a fresh 0..n-1 index.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

df_full = pd.concat([df_train, df_test], ignore_index=True)

In [17]:
# Frequency of each ord_4 category, listed in category order (missing values are excluded).
df_full["ord_4"].value_counts().sort_index()

ord_4
A    61111
B    41917
C    53245
D    28685
E    36525
F    27875
G     5694
H    51690
I    32928
J     3281
K    35998
L     2748
M    53991
N    66703
O    42957
P    63281
Q    50108
R    55098
S     7792
T    49488
U    54914
V     5149
W    13902
X    53766
Y    61528
Z     9763
Name: count, dtype: int64

In [18]:
# Same count, but with missing values shown as an explicit "NONE" category.
df_full["ord_4"].fillna("NONE").value_counts().sort_index()

ord_4
A       61111
B       41917
C       53245
D       28685
E       36525
F       27875
G        5694
H       51690
I       32928
J        3281
K       35998
L        2748
M       53991
N       66703
NONE    29863
O       42957
P       63281
Q       50108
R       55098
S        7792
T       49488
U       54914
V        5149
W       13902
X       53766
Y       61528
Z        9763
Name: count, dtype: int64

In [19]:
# Store the "NONE" placeholder for missing ord_4 values in the frame itself.
df_full["ord_4"] = df_full["ord_4"].fillna("NONE")

In [20]:
# Flag the ord_4 categories that occur fewer than 2000 times.
df_full["ord_4"].value_counts().lt(2000)

ord_4
N       False
P       False
Y       False
A       False
R       False
U       False
M       False
X       False
C       False
H       False
Q       False
T       False
O       False
B       False
E       False
K       False
I       False
NONE    False
D       False
F       False
W       False
Z       False
S       False
G       False
V       False
J       False
L       False
Name: count, dtype: bool

In [21]:
# Look up each row's category frequency and relabel categories seen fewer than 2000 times as "RARE".
ord_4_counts = df_full["ord_4"].value_counts()
is_rare = ord_4_counts.loc[df_full["ord_4"]].to_numpy() < 2000
df_full.loc[is_rare, "ord_4"] = "RARE"

# Model Training and Evaluation

In [22]:
from ml_misc import create_folds

# Presumably writes ../data/train_5_folds.csv, which the following cells load -- TODO confirm against ml_misc.
create_folds.main()

In [23]:
# Load the fold-annotated training data and count the rows assigned to each fold.
data = pd.read_csv("../data/train_5_folds.csv")
data["fold_id"].value_counts()

fold_id
0    120000
1    120000
2    120000
3    120000
4    120000
Name: count, dtype: int64

In [24]:
# Class balance of the target within fold 0.
data.loc[data["fold_id"] == 0, "target"].value_counts()

target
0    97536
1    22464
Name: count, dtype: int64

In [25]:
# Class balance of the target within fold 1.
data.loc[data["fold_id"] == 1, "target"].value_counts()

target
0    97536
1    22464
Name: count, dtype: int64

In [26]:
# Class balance of the target within fold 2.
data.loc[data["fold_id"] == 2, "target"].value_counts()

target
0    97535
1    22465
Name: count, dtype: int64

In [27]:
# Class balance of the target within fold 3.
data.loc[data["fold_id"] == 3, "target"].value_counts()

target
0    97535
1    22465
Name: count, dtype: int64

In [28]:
# Class balance of the target within fold 4.
data.loc[data["fold_id"] == 4, "target"].value_counts()

target
0    97535
1    22465
Name: count, dtype: int64

In [7]:
def run_linear_regression(fold_id: int):
    """Fit and score a one-hot + logistic-regression model on one CV fold.

    Despite the name, the estimator is ``linear_model.LogisticRegression``.
    Rows whose ``fold_id`` column equals the argument are held out for
    validation; all remaining rows are used for training.

    Args:
        fold_id: Value of the ``fold_id`` column to hold out.

    Returns:
        The validation ROC AUC, which is also printed.
    """
    df = pd.read_csv("../data/train_5_folds.csv")
    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill before casting: astype(str) turns NaN into the literal "nan", after
    # which fillna("UNKNOWN") would have nothing left to fill.
    for feature in features:
        df[feature] = df[feature].fillna("UNKNOWN").astype(str)

    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)

    # The category vocabulary is learned from training AND validation rows so
    # that transform() never meets an unseen category. Only feature values are
    # used here, never the target.
    # `preprocessing` is the name imported by the notebook; `sklearn` is not bound.
    encoder = preprocessing.OneHotEncoder(sparse_output=True)
    encoder.fit(df[features])
    X_train = encoder.transform(df_train[features])
    X_valid = encoder.transform(df_valid[features])

    model = linear_model.LogisticRegression()
    model.fit(X_train, df_train.loc[:, "target"].values)
    # Column 1 of predict_proba is the probability of the positive class.
    validation_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.loc[:, "target"].values, validation_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")
    return auc

In [13]:
# Evaluate the model on folds 0-4. Every warning raised inside the block is silenced.
# NOTE(review): this presumably also hides solver convergence warnings -- confirm that is intended.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_linear_regression(fold_id)

AUC = 0.7813202462705275
AUC = 0.7766943500129516
AUC = 0.7819135452537362
AUC = 0.7787575441738795
AUC = 0.7779618497362158


## Using Random Forest
The performance here is worse than the one-hot logistic regression above (validation AUC of roughly 0.71 versus roughly 0.78).

In [22]:
def run_rf(fold_id: int, random_state=None):
    """Fit and score a random forest on ordinal-encoded features for one CV fold.

    Rows whose ``fold_id`` column equals the argument are held out for
    validation; all remaining rows are used for training.

    Args:
        fold_id: Value of the ``fold_id`` column to hold out.
        random_state: Optional seed forwarded to ``RandomForestClassifier``.
            The default ``None`` leaves the forest unseeded, as before.

    Returns:
        The validation ROC AUC, which is also printed.
    """
    df = pd.read_csv("../data/train_5_folds.csv")
    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill before casting: astype(str) turns NaN into the literal "nan", after
    # which fillna("UNKNOWN") would have nothing left to fill.
    for feature in features:
        df[feature] = df[feature].fillna("UNKNOWN").astype(str)

    # A single OrdinalEncoder learns a separate category list for each column.
    # It is fitted on every row (train and validation folds alike); only feature
    # values are involved, never the target.
    # `preprocessing` is the name imported by the notebook; `sklearn` is not bound.
    encoded = preprocessing.OrdinalEncoder().fit_transform(df[features])

    # Split data:
    is_valid = (df["fold_id"] == fold_id).to_numpy()
    X_train, X_valid = encoded[~is_valid], encoded[is_valid]
    y_train = df.loc[~is_valid, "target"].to_numpy()
    y_valid = df.loc[is_valid, "target"].to_numpy()

    model = ensemble.RandomForestClassifier(n_jobs=-1, random_state=random_state)
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    validation_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(y_valid, validation_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")
    return auc

In [23]:
# Evaluate the random forest on folds 0-4 with every warning inside the block silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_rf(fold_id)

Fold = 0, AUC = 0.7056549136390169
Fold = 1, AUC = 0.7149112856963676
Fold = 2, AUC = 0.7162345860630353
Fold = 3, AUC = 0.7149732397020793
Fold = 4, AUC = 0.7167638231664938


### Random Forest, but with SVD

In [33]:
def run_rf_with_svd(fold_id: int, random_state=None):
    """Fit and score a random forest on SVD-reduced one-hot features for one CV fold.

    Features are one-hot encoded into a sparse matrix, reduced to 128 dense
    components with ``TruncatedSVD``, and fed to a random forest. Rows whose
    ``fold_id`` column equals the argument are held out for validation.

    Args:
        fold_id: Value of the ``fold_id`` column to hold out.
        random_state: Optional seed forwarded to both ``TruncatedSVD`` and
            ``RandomForestClassifier``. The default ``None`` leaves both
            unseeded, as before.

    Returns:
        The validation ROC AUC, which is also printed.
    """
    df = pd.read_csv("../data/train_5_folds.csv")

    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill before casting: astype(str) turns NaN into the literal "nan", after
    # which fillna("UNKNOWN") would have nothing left to fill.
    for feature in features:
        df[feature] = df[feature].fillna("UNKNOWN").astype(str)

    # Split data:
    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)

    # The category vocabulary is learned from all rows (train and validation).
    encoder = preprocessing.OneHotEncoder(sparse_output=True)
    encoder.fit(df[features])
    X_train = encoder.transform(df_train[features])
    X_valid = encoder.transform(df_valid[features])

    # Abhishek used 120, I'm using 128 because powers of two
    svd = decomposition.TruncatedSVD(n_components=128, random_state=random_state)
    # The projection is likewise fitted on train and validation rows stacked together.
    svd.fit(sparse.vstack((X_train, X_valid)))

    X_train = svd.transform(X_train)
    X_valid = svd.transform(X_valid)

    model = ensemble.RandomForestClassifier(n_jobs=-1, random_state=random_state)
    model.fit(X_train, df_train.loc[:, "target"].values)

    # Column 1 of predict_proba is the probability of the positive class.
    valid_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.loc[:, "target"].values, valid_preds)

    print(f"Fold = {fold_id}, AUC = {auc}")
    return auc

In [None]:
# Evaluate the SVD + random forest pipeline on folds 0-4 with every warning inside the block silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_rf_with_svd(fold_id)

### XGBoost with ordinal encoding

In [None]:
def run_xgb(fold_id: int):
    """Fit and score an XGBoost classifier on ordinal-encoded features for one CV fold.

    Rows whose ``fold_id`` column equals the argument are held out for
    validation; all remaining rows are used for training.

    Args:
        fold_id: Value of the ``fold_id`` column to hold out.

    Returns:
        The validation ROC AUC, which is also printed.
    """
    df = pd.read_csv("../data/train_5_folds.csv")
    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill before casting: astype(str) turns NaN into the literal "nan", after
    # which fillna("UNKNOWN") would have nothing left to fill.
    for feature in features:
        df[feature] = df[feature].fillna("UNKNOWN").astype(str)

    # A single OrdinalEncoder learns a separate category list for each column.
    # It is fitted on every row (train and validation folds alike); only feature
    # values are involved, never the target.
    # `preprocessing` is the name imported by the notebook; `sklearn` is not bound.
    encoded = preprocessing.OrdinalEncoder().fit_transform(df[features])

    # Split data:
    is_valid = (df["fold_id"] == fold_id).to_numpy()
    X_train, X_valid = encoded[~is_valid], encoded[is_valid]
    y_train = df.loc[~is_valid, "target"].to_numpy()
    y_valid = df.loc[is_valid, "target"].to_numpy()

    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200,
    )
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    validation_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(y_valid, validation_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")
    return auc

In [None]:
# Evaluate the XGBoost model on folds 0-4 with every warning inside the block silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_xgb(fold_id)