# 1. Imports

In [48]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from xgboost import XGBClassifier, XGBRFClassifier
from category_encoders import TargetEncoder
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from lightgbm import LGBMClassifier
import lightgbm
from sklearn.metrics import accuracy_score, log_loss
from catboost import CatBoostClassifier



plt.rcdefaults()
plt.rcParams['figure.figsize'] = [12, 12]
%config InlineBackend.figure_format = 'retina'

# 2. Load Data

In [64]:
cols = pd.read_csv('data/census-bureau.columns', header=None)[0].str.replace(' ', '_', regex=True).tolist()
df = pd.read_csv('data/census-bureau.data',names=cols)
df.head()

Unnamed: 0,age,class_of_worker,detailed_industry_recode,detailed_occupation_recode,education,wage_per_hour,enroll_in_edu_inst_last_wk,marital_stat,major_industry_code,major_occupation_code,...,country_of_birth_father,country_of_birth_mother,country_of_birth_self,citizenship,own_business_or_self_employed,fill_inc_questionnaire_for_veteran's_admin,veterans_benefits,weeks_worked_in_year,year,label
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [65]:
def create_numeric_label(label):
    if label == '- 50000.':
        return 0
    return 1
x = df.drop(columns=['label']).copy()
y = pd.DataFrame(df['label'])
y.label = y.label.apply(create_numeric_label)

In [66]:
x_dev, x_test, y_dev, y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)
x_train, x_val, y_train, y_val = train_test_split(X_dev,y_dev,test_size=0.25,random_state=42,stratify=y_dev)

# 3. Initial Data Exploration

## 3.1 Overall Data

In [52]:
cols = x.columns
num_cols = x._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))

In [53]:
df.dtypes.value_counts()

object     29
int64      12
float64     1
dtype: int64

In [54]:
missing_data = pd.DataFrame(df.isnull().sum(), columns=['missing_cnt']).reset_index()
missing_data.columns = ['col_name', 'missing_cnt']
missing_data['percentage'] = (missing_data['missing_cnt'] / len(df)) * 100
missing_data.sort_values(by='missing_cnt', ascending=False, inplace=True)
missing_data.head()

Unnamed: 0,col_name,missing_cnt,percentage
11,hispanic_origin,874,0.438045
0,age,0,0.0
24,weight,0,0.0
25,migration_code-change_in_msa,0,0.0
26,migration_code-change_in_reg,0,0.0


Negligible missing data. Only 0.438%

# 3.2 Only on dev Data

# 4. Data Preprocessing

In [73]:
x_train.hispanic_origin.fillna('Do not know',inplace=True)
x_val.hispanic_origin.fillna('Do not know',inplace=True)
x_test.hispanic_origin.fillna('Do not know',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


# 5. Model Training

In [74]:
cat = CatBoostClassifier(allow_writing_files=False,learning_rate = 0.07,depth=8)

In [75]:
cat.fit(x_train,y_train,cat_features=cat_cols)

0:	learn: 0.5661331	total: 325ms	remaining: 5m 25s
1:	learn: 0.4724712	total: 485ms	remaining: 4m 2s
2:	learn: 0.3940073	total: 749ms	remaining: 4m 8s
3:	learn: 0.3367966	total: 1s	remaining: 4m 10s
4:	learn: 0.2909727	total: 1.38s	remaining: 4m 35s
5:	learn: 0.2523826	total: 1.59s	remaining: 4m 23s
6:	learn: 0.2246169	total: 1.77s	remaining: 4m 10s
7:	learn: 0.2057925	total: 1.96s	remaining: 4m 3s
8:	learn: 0.1903676	total: 2.24s	remaining: 4m 6s
9:	learn: 0.1809283	total: 2.52s	remaining: 4m 9s
10:	learn: 0.1712410	total: 2.73s	remaining: 4m 5s
11:	learn: 0.1641042	total: 2.92s	remaining: 4m
12:	learn: 0.1582336	total: 3.11s	remaining: 3m 56s
13:	learn: 0.1531340	total: 3.29s	remaining: 3m 51s
14:	learn: 0.1494235	total: 3.46s	remaining: 3m 47s
15:	learn: 0.1460058	total: 3.64s	remaining: 3m 43s
16:	learn: 0.1433436	total: 3.83s	remaining: 3m 41s
17:	learn: 0.1404939	total: 3.98s	remaining: 3m 37s
18:	learn: 0.1380478	total: 4.15s	remaining: 3m 34s
19:	learn: 0.1358774	total: 4.32s	r

<catboost.core.CatBoostClassifier at 0x7fcbab403580>