In [1]:
# imports
import os
import pandas as pd
import numpy as np

# constants
DATA_DIR = os.path.join('data','ad_placement')
train_path = os.path.join(DATA_DIR, 'census-income.csv')
test_path = os.path.join(DATA_DIR, 'census-income-test.csv')

with open(train_path, 'r') as train_file:
    df_train = pd.read_csv(train_file, header=None)
with open(test_path, 'r') as test_file:
    df_test = pd.read_csv(test_file, header=None)


In [2]:
df_train.apply(pd.Series.nunique,axis=0)

0        91
1         9
2        52
3        47
4        17
5      1240
6         3
7         7
8        24
9        15
10        5
11       10
12        2
13        3
14        6
15        8
16      132
17      113
18     1478
19        6
20        6
21       51
22       38
23        8
24    99800
25       10
26        9
27       10
28        3
29        4
30        7
31        5
32       43
33       43
34       43
35        5
36        3
37        3
38        3
39       53
40        2
41        2
dtype: int64

In [3]:
# all columns (except the 25th, with 99800 unique values) match in order the following list
# https://kdd.ics.uci.edu/databases/census-income/census-income.names
#   91 distinct values for attribute #0 (age) continuous
#    9 distinct values for attribute #1 (class of worker) nominal
#   52 distinct values for attribute #2 (detailed industry recode) nominal
#   47 distinct values for attribute #3 (detailed occupation recode) nominal
#   17 distinct values for attribute #4 (education) nominal
# 1240 distinct values for attribute #5 (wage per hour) continuous
#    3 distinct values for attribute #6 (enroll in edu inst last wk) nominal
#    7 distinct values for attribute #7 (marital stat) nominal
#   24 distinct values for attribute #8 (major industry code) nominal
#   15 distinct values for attribute #9 (major occupation code) nominal
#    5 distinct values for attribute #10 (race) nominal
#   10 distinct values for attribute #11 (hispanic origin) nominal
#    2 distinct values for attribute #12 (sex) nominal
#    3 distinct values for attribute #13 (member of a labor union) nominal
#    6 distinct values for attribute #14 (reason for unemployment) nominal
#    8 distinct values for attribute #15 (full or part time employment stat) nominal
#  132 distinct values for attribute #16 (capital gains) continuous
#  113 distinct values for attribute #17 (capital losses) continuous
# 1478 distinct values for attribute #18 (dividends from stocks) continuous
#    6 distinct values for attribute #19 (tax filer stat) nominal
#    6 distinct values for attribute #20 (region of previous residence) nominal
#   51 distinct values for attribute #21 (state of previous residence) nominal
#   38 distinct values for attribute #22 (detailed household and family stat) nominal
#    8 distinct values for attribute #23 (detailed household summary in household) nominal
#   10 distinct values for attribute #24 (migration code-change in msa) nominal
#    9 distinct values for attribute #25 (migration code-change in reg) nominal
#   10 distinct values for attribute #26 (migration code-move within reg) nominal
#    3 distinct values for attribute #27 (live in this house 1 year ago) nominal
#    4 distinct values for attribute #28 (migration prev res in sunbelt) nominal
#    7 distinct values for attribute #29 (num persons worked for employer) continuous
#    5 distinct values for attribute #30 (family members under 18) nominal
#   43 distinct values for attribute #31 (country of birth father) nominal
#   43 distinct values for attribute #32 (country of birth mother) nominal
#   43 distinct values for attribute #33 (country of birth self) nominal
#    5 distinct values for attribute #34 (citizenship) nominal
#    3 distinct values for attribute #35 (own business or self employed) nominal
#    3 distinct values for attribute #36 (fill inc questionnaire for veteran's admin) nominal
#    3 distinct values for attribute #37 (veterans benefits) nominal
#   53 distinct values for attribute #38 (weeks worked in year) continuous
#    2 distinct values for attribute #39 (year) nominal

In [4]:
list_var = [
    "age",
    "class_of_worker",
    "detailed_industry_recode",
    "detailed_occupation_recode",
    "education",
    "wage_per_hour",
    "enroll_in_edu_inst_last_wk",
    "marital_stat",
    "major_industry_code",
    "major_occupation_code",
    "race",
    "hispanic_origin",
    "sex",
    "member_of_a_labor_union",
    "reason_for_unemployment",
    "full_or_part_time_employment_stat",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "tax_filer_stat",
    "region_of_previous_residence",
    "state_of_previous_residence",
    "detailed_household_and_family_stat",
    "detailed_household_summary_in_household",
    "instance_weight",
    "migration_code_change_in_msa",
    "migration_code_change_in_reg",
    "migration_code_move_within_reg",
    "live_in_this_house_1_year_ago",
    "migration_prev_res_in_sunbelt",
    "num_persons_worked_for_employer",
    "family_members_under_18",
    "country_of_birth_father",
    "country_of_birth_mother",
    "country_of_birth_self",
    "citizenship",
    "own_business_or_self_employed",
    "fill_inc_questionnaire_for_veterans_admin",
    "veterans_benefits",
    "weeks_worked_in_year",
    "year",
    "income_bucket"
]

In [5]:
df_train.columns = list_var
df_test.columns = list_var

In [6]:
var_y = 'income_bucket'
list_var_num = [
    "age",
    "wage_per_hour",
    "capital_gains",
    "capital_losses",
    "dividends_from_stocks",
    "num_persons_worked_for_employer",
    "weeks_worked_in_year"
]
list_var_cat =  [x for x in list_var if x not in list_var_num and x != var_y]

In [7]:
import patsy
patsy_cat_formula = ""
for var in list_var_cat:
    if not patsy_cat_formula:
        patsy_cat_formula = var
    else:
        patsy_cat_formula = patsy_cat_formula + "+" + var

df_train_cat = patsy.dmatrix(patsy_cat_formula+"-1", data=df_train, return_type='dataframe')
df_test_cat = patsy.dmatrix(patsy_cat_formula+"-1", data=df_test, return_type='dataframe')


In [15]:
X_train = pd.concat([df_train[list_var_num], df_train_cat], axis=1)
X_test = pd.concat([df_test[list_var_num], df_test_cat], axis=1)
y_train = df_train[var_y]
y_test = df_test[var_y]

In [19]:
print df_train.shape
print df_train_cat.shape
print X_train.shape
print y_train.shape
print "\n"
print df_test.shape
print df_test_cat.shape
print X_test.shape
print y_test.shape
#seems like df_test is missing a value in one of its categoric variables (376 = 375 + 1)

(199523, 383)
(199523, 42)
(199523, 376)


(99762, 382)
(99762, 42)
(99762, 375)


In [None]:
missing = [x for x in X_train.columns if x not in X_test.columns]
print missing
#add missing column to test
X_test[missing[0]] = 0

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# build pipe: first standardize by substracting mean and dividing std
# next do classificaiton
pipe = Pipeline(steps=[('StandardScaler', StandardScaler()), ('logistic', LogisticRegression())])

Cs_try1 = np.logspace(-2, 2, 5)
print Cs_try1

cv_c_try1 = GridSearchCV(pipe, dict(logistic__C=Cs_try1))
cv_c_try1.fit(X_train, y_train)

print cv_c_try1.grid_scores_
print "best C: " + str(cv_c_try1.best_estimator_.named_steps['logistic'].C)


In [None]:
Cs_try2 = np.arange(0.1, 1, 0.1)
print Cs_try2

cv_c_try2 = GridSearchCV(pipe, dict(logistic__C=new_Cs))
cv_c_try2.fit(X_train, y_train)

print cv_c_try2.grid_scores_
print "best C: " + str(cv_c_try2.best_estimator_.named_steps['logistic'].C)


In [None]:
logistic_lasso = LogisticRegression(penalty='l1', C=cv_c_try2.best_estimator_.named_steps['logistic'].C)
logistic_lasso.fit(X_train, y_train)
print sum(logistic_lasso.coef_[0] != 0.0)
print sum(logistic_lasso.coef_[0] == 0.0)
len(X_train.columns)


In [None]:
X_train_reduced = X_train.loc[:,logistic_lasso.coef_[0] != 0.0]