In [18]:
import collections
import importlib
import logging
import pprint
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import sparse
from sklearn import decomposition, ensemble, linear_model, metrics, preprocessing

In [2]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

df_train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [3]:
df_train = pd.read_csv("../data/train.csv", index_col="id")
df_test = pd.read_csv("../data/test.csv", index_col="id")

df_train.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [4]:
df_train.loc[:, "ord_2"].unique()

array(['Hot', 'Warm', 'Freezing', 'Lava Hot', 'Cold', 'Boiling Hot', nan],
      dtype=object)

In [5]:
df_train.loc[:, "ord_2"].value_counts()

ord_2
Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: count, dtype: int64

In [6]:
ord_2_mapping = {
    "Freezing": 0,
    "Cold": 1,
    "Warm": 2,
    "Hot": 3,
    "Boiling Hot": 4,
    "Lava Hot": 5,
}
df_train.loc[:, "ord_2"] = df_train.loc[:, "ord_2"].map(ord_2_mapping)
df_train.loc[:, "ord_2"].value_counts()

ord_2
0.0    142726
2.0    124239
1.0     97822
4.0     84790
3.0     67508
5.0     64840
Name: count, dtype: int64

In [7]:
df_train["new_feature"] = df_train.loc[:, "ord_1"].astype(str) \
    + "_" + df_train.loc[:, "ord_2"].astype(str)
df_train.loc[:, "new_feature"].head()

id
0    Contributor_3.0
1    Grandmaster_2.0
2            nan_0.0
3         Novice_5.0
4    Grandmaster_1.0
Name: new_feature, dtype: object

In [8]:
# Playing with sparse embeddings
ohe = sklearn.preprocessing.OneHotEncoder(sparse_output=True)
# df_train.loc[:, "ord_1"].values
df_train.loc[:, "ord_1"] = df_train.loc[:, "ord_1"].fillna("Unknown")
ohe.fit(df_train.loc[:, "ord_1"].values.reshape(-1, 1))


In [9]:
ohe.categories_

[array(['Contributor', 'Expert', 'Grandmaster', 'Master', 'Novice',
        'Unknown'], dtype=object)]

In [10]:
ohe.transform(df_train.loc[:, "ord_1"].values.reshape(-1, 1))

<600000x6 sparse matrix of type '<class 'numpy.float64'>'
	with 600000 stored elements in Compressed Sparse Row format>

## Rare categories test

In [11]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

print(f"{'target' in df_test.columns}")
print(f"{'target' in df_train.columns}")

False
True


In [12]:
df_test.loc[:, "target"] = -1
df_full = pd.concat([df_train, df_test]).reset_index(drop=True)

features = [column for column in df_train.columns if column not in ["id", "target"]]

In [13]:
for feature in features:
    encoder = sklearn.preprocessing.OrdinalEncoder()
    temp_feature = df_full.loc[:, feature].fillna("UNKNOWN").astype(str).values.reshape(-1, 1)
    df_full.loc[:, feature] = encoder.fit_transform(temp_feature)

In [14]:
df_train = df_full[df_full.target != -1].reset_index(drop=True)
df_test = df_full[df_full.target == -1].reset_index(drop=True)

In [15]:
len(df_train) + len(df_test) == len(df_full)

True

## (Basic) Feature Engineering for Rare Categories

In [16]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

df_full = pd.concat([df_train, df_test]).reset_index(drop=True)

In [17]:
df_full.loc[:, "ord_4"].value_counts().sort_index()

ord_4
A    61111
B    41917
C    53245
D    28685
E    36525
F    27875
G     5694
H    51690
I    32928
J     3281
K    35998
L     2748
M    53991
N    66703
O    42957
P    63281
Q    50108
R    55098
S     7792
T    49488
U    54914
V     5149
W    13902
X    53766
Y    61528
Z     9763
Name: count, dtype: int64

In [18]:
df_full.ord_4.fillna("NONE").value_counts().sort_index()

ord_4
A       61111
B       41917
C       53245
D       28685
E       36525
F       27875
G        5694
H       51690
I       32928
J        3281
K       35998
L        2748
M       53991
N       66703
NONE    29863
O       42957
P       63281
Q       50108
R       55098
S        7792
T       49488
U       54914
V        5149
W       13902
X       53766
Y       61528
Z        9763
Name: count, dtype: int64

In [19]:
df_full.loc[:, "ord_4"] = df_full.loc[:, "ord_4"].fillna("NONE")

In [20]:
df_full.loc[:, "ord_4"].value_counts() < 2000

ord_4
N       False
P       False
Y       False
A       False
R       False
U       False
M       False
X       False
C       False
H       False
Q       False
T       False
O       False
B       False
E       False
K       False
I       False
NONE    False
D       False
F       False
W       False
Z       False
S       False
G       False
V       False
J       False
L       False
Name: count, dtype: bool

In [21]:
df_full.loc[
    df_full.loc[:, "ord_4"].value_counts()[df_full["ord_4"]].values < 2000
    , "ord_4"
] = "RARE"

# Model Training and Evaluation

In [22]:
from ml_misc import create_folds

create_folds.main()

In [23]:
data = pd.read_csv("../data/train_5_folds.csv")
data.loc[:, "fold_id"].value_counts()

fold_id
0    120000
1    120000
2    120000
3    120000
4    120000
Name: count, dtype: int64

In [24]:
data[data["fold_id"] == 0].target.value_counts()

target
0    97536
1    22464
Name: count, dtype: int64

In [25]:
data[data["fold_id"] == 1].target.value_counts()

target
0    97536
1    22464
Name: count, dtype: int64

In [26]:
data[data["fold_id"] == 2].target.value_counts()

target
0    97535
1    22465
Name: count, dtype: int64

In [27]:
data[data["fold_id"] == 3].target.value_counts()

target
0    97535
1    22465
Name: count, dtype: int64

In [28]:
data[data["fold_id"] == 4].target.value_counts()

target
0    97535
1    22465
Name: count, dtype: int64

In [7]:
def run_linear_regression(fold_id: int):
    df = pd.read_csv("../data/train_5_folds.csv")
    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]
    
    for feature in features:
        df.loc[:, feature] = df.loc[:, feature].astype(str).fillna("UNKNOWN")
        
    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)
    
    encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=True)
    X_full = pd.concat(
        [df_train[features], df_valid[features]],
        axis=0
    )
    # not sure we should be fitting an encoder on the train AND test sets...
    encoder.fit(X_full[features])
    X_train = encoder.transform(df_train[features])
    X_valid = encoder.transform(df_valid[features])

    model = linear_model.LogisticRegression()
    model.fit(X_train, df_train.loc[:, "target"].values)
    validation_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.loc[:, "target"].values, validation_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")

In [13]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_linear_regression(fold_id)

AUC = 0.7813202462705275
AUC = 0.7766943500129516
AUC = 0.7819135452537362
AUC = 0.7787575441738795
AUC = 0.7779618497362158


## Using Random Forest
The performance here is worse.

In [22]:
def run_rf(fold_id: int):
    df = pd.read_csv("../data/train_5_folds.csv")
    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill NA's in with "UNKNOWN"
    for feature in features:
        df.loc[:, feature] = df.loc[:, feature].astype(str).fillna("UNKNOWN")

    for feature in features:
        # not sure we should be fitting an encoder on the train AND test sets...
        encoder = sklearn.preprocessing.OrdinalEncoder()
        encoder.fit(df.loc[:, feature].values.reshape(-1, 1))
        df.loc[:, feature] = encoder.transform(df.loc[:, feature].values.reshape(-1, 1))

    # Split data:
    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)
    X_train = df_train.loc[:, features].values
    X_valid = df_valid.loc[:, features].values

    model = ensemble.RandomForestClassifier(n_jobs=-1)
    model.fit(X_train, df_train.loc[:, "target"].values)
    validation_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.loc[:, "target"].values, validation_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")

In [23]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_rf(fold_id)

Fold = 0, AUC = 0.7056549136390169
Fold = 1, AUC = 0.7149112856963676
Fold = 2, AUC = 0.7162345860630353
Fold = 3, AUC = 0.7149732397020793
Fold = 4, AUC = 0.7167638231664938


### Random Forest, but with SVD

In [33]:
def run_rf_with_svd(fold_id: int):
    df = pd.read_csv("../data/train_5_folds.csv")

    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill NAs in with "UNKNOWN"
    for feature in features:
        df.loc[:, feature] = df.loc[:, feature].astype(str).fillna("UNKNOWN")

    # Split data:
    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)

    encoder = preprocessing.OneHotEncoder(sparse_output=True)
    encoder.fit(df.loc[:, features])
    X_train = encoder.transform(df_train.loc[:, features])
    X_valid = encoder.transform(df_valid.loc[:, features])

    # Abhishek used 120, I'm using 128 because powers of two
    svd = decomposition.TruncatedSVD(n_components=128)
    full_sparse = sparse.vstack((X_train, X_valid))
    svd.fit(full_sparse)

    X_train = svd.transform(X_train)
    X_valid = svd.transform(X_valid)

    model = ensemble.RandomForestClassifier(n_jobs=-1)
    model.fit(X_train, df_train.loc[:, "target"].values)

    valid_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.loc[:, "target"].values, valid_preds)

    print(f"Fold = {fold_id}, AUC = {auc}")

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_rf_with_svd(fold_id)

### XGBoost with one-hot encoding

In [None]:
def run_xgb(fold_id: int):
    df = pd.read_csv("../data/train_5_folds.csv")
    features = [f for f in df.columns if f not in {"id", "target", "fold_id"}]

    # Fill NA's in with "UNKNOWN"
    for feature in features:
        df.loc[:, feature] = df.loc[:, feature].astype(str).fillna("UNKNOWN")

    for feature in features:
        # not sure we should be fitting an encoder on the train AND test sets...
        encoder = sklearn.preprocessing.OrdinalEncoder()
        encoder.fit(df.loc[:, feature].values.reshape(-1, 1))
        df.loc[:, feature] = encoder.transform(df.loc[:, feature].values.reshape(-1, 1))

    # Split data:
    df_train = df[df.fold_id != fold_id].reset_index(drop=True)
    df_valid = df[df.fold_id == fold_id].reset_index(drop=True)
    X_train = df_train.loc[:, features].values
    X_valid = df_valid.loc[:, features].values

    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=200,
    )
    model.fit(X_train, df_train.loc[:, "target"].values)
    validation_preds = model.predict_proba(X_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.loc[:, "target"].values, validation_preds)
    print(f"Fold = {fold_id}, AUC = {auc}")

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for fold_id in range(5):
        run_xgb(fold_id)

In [2]:
df = pd.read_csv("../data/adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
df.shape

(48842, 15)

In [20]:
# Folding:
from ml_misc import create_folds
importlib.reload(create_folds)

create_folds.main(input_file_name="../data/adult.csv", n_splits=5, target_column="income")

In [21]:
df = pd.read_csv("../data/adult_5_folds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,fold_id
0,0,56,Local-gov,305767,HS-grad,9,Married-civ-spouse,Other-service,Husband,Asian-Pac-Islander,Male,0,0,40,China,<=50K,0
1,1,41,Private,247752,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K,0
2,2,61,Private,48549,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,38,United-States,>50K,0
3,3,51,Private,249339,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,0
4,4,18,Self-emp-not-inc,132986,HS-grad,9,Never-married,Farming-fishing,Own-child,White,Male,0,0,40,United-States,<=50K,0


In [22]:
# initial processing
features = [
    "age", "workclass", "educational-num", "marital-status", "occupation", "relationship", "race", "gender",
    "capital-gain", "capital-loss", "hours-per-week", "native-country"
]
target = ["income"]
X_full = df.loc[:, features].reset_index(drop=True)
y_full = df.loc[:, target].reset_index(drop=True)
print(f"{len(X_full) == len(y_full) and len(X_full) == len(df)}")

True


In [23]:
X_full

Unnamed: 0,age,workclass,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,56,Local-gov,9,Married-civ-spouse,Other-service,Husband,Asian-Pac-Islander,Male,0,0,40,China
1,41,Private,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States
2,61,Private,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,38,United-States
3,51,Private,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
4,18,Self-emp-not-inc,9,Never-married,Farming-fishing,Own-child,White,Male,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
48837,29,Private,9,Divorced,Machine-op-inspct,Unmarried,White,Male,0,0,56,United-States
48838,21,?,10,Never-married,?,Other-relative,Asian-Pac-Islander,Male,0,0,10,Vietnam
48839,42,?,5,Never-married,?,Own-child,Black,Male,0,0,35,United-States
48840,65,Private,10,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States


In [24]:
categorical_features = ["workclass", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

In [26]:
X_full.loc[:, "workclass"].unique()

array(['Local-gov', 'Private', 'Self-emp-not-inc', 'Federal-gov', '?',
       'Self-emp-inc', 'State-gov', 'Never-worked', 'Without-pay'],
      dtype=object)

In [27]:
y_full.loc[:, "income"].unique()

array(['<=50K', '>50K'], dtype=object)

In [30]:
for cf in categorical_features:
    print(X_full.loc[:, cf].value_counts())

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64
marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse           37
Name: count, dtype: int64
occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64
relationship
Husband           19716
Not-in-f

In [32]:
X_full.loc[:, "workclass"].fillna(value="NONE").value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [35]:
# Encoding:
for cf in categorical_features:
    encoder = preprocessing.OrdinalEncoder()
    X_full.loc[:, cf] = encoder.fit_transform(X_full.loc[:, cf].values.reshape(-1, 1))

In [37]:
X_full.loc[:, "workclass"].value_counts()

workclass
4.0    33906
6.0     3862
2.0     3136
0.0     2799
7.0     1981
5.0     1695
1.0     1432
8.0       21
3.0       10
Name: count, dtype: int64

In [43]:
X_full.dtypes

age                 int64
workclass          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
dtype: object

In [44]:
X_full_2 = X_full.convert_dtypes()
X_full_2.dtypes

age                Int64
workclass          Int64
educational-num    Int64
marital-status     Int64
occupation         Int64
relationship       Int64
race               Int64
gender             Int64
capital-gain       Int64
capital-loss       Int64
hours-per-week     Int64
native-country     Int64
dtype: object

In [51]:
# Encoding, part 2:
for cf in categorical_features:
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(X_full.loc[:, cf].values.reshape(-1, 1))

In [52]:
cf

'native-country'

In [53]:
native_country_encodings = encoder.transform(X_full.loc[:, cf].values.reshape(-1, 1))

In [54]:
native_country_encodings

<48842x42 sparse matrix of type '<class 'numpy.float64'>'
	with 48842 stored elements in Compressed Sparse Row format>

In [76]:
X_full_2  = np.block([[X_full.values], [native_country_encodings]])

ValueError: Mismatched array shapes in block along axis 0.

In [60]:
X_full.values.shape

(48842, 12)

In [61]:
native_country_encodings.shape

(48842, 42)

In [66]:
a = np.array((1, 2, 3))
b = np.array((1, 2, 3))
print(f"{a.shape = }, {b.shape = }")

a.shape = (3,), b.shape = (3,)


In [68]:
X_full.values.reshape((48842, 12, 1))

array([[[56],
        [2.0],
        [9],
        ...,
        [0],
        [40],
        [3.0]],

       [[41],
        [4.0],
        [9],
        ...,
        [0],
        [40],
        [39.0]],

       [[61],
        [4.0],
        [10],
        ...,
        [0],
        [38],
        [39.0]],

       ...,

       [[42],
        [0.0],
        [5],
        ...,
        [0],
        [35],
        [39.0]],

       [[65],
        [4.0],
        [10],
        ...,
        [0],
        [40],
        [39.0]],

       [[22],
        [4.0],
        [7],
        ...,
        [0],
        [30],
        [39.0]]], dtype=object)