In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [55]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,#NAME?,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [56]:
df.shape

(5000, 15)

In [57]:
df.dtypes

age               object
workclass         object
fnlwgt            object
education         object
education_num     object
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [58]:
df['age'] = pd.to_numeric(df['age'],errors='coerce')
df['fnlwgt'] = pd.to_numeric(df['fnlwgt'],errors='coerce')
df['fnlwgt'] = pd.to_numeric(df['fnlwgt'],errors='coerce')
df['education_num'] = pd.to_numeric(df['education_num'],errors='coerce')

In [59]:
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
4995,43.0,Private,222971.0,5th-6th,3.0,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,Mexico,<=50K
4996,31.0,Private,259425.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
4997,47.0,Self-emp-inc,212120.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
4998,,Private,245880.0,HS-grad,9.0,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,60,United-States,<=50K
4999,58.0,Local-gov,54947.0,Some-college,10.0,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,55,United-States,<=50K


In [60]:
df['income'].unique().sum()

'<=50K>50K'

In [61]:
df.duplicated().sum()

2

In [62]:
df.drop_duplicates(inplace=True)

In [63]:
df.isnull().sum()

age                48
workclass           0
fnlwgt            106
education           0
education_num      57
marital_status      0
occupation          0
relationship        0
race                0
sex                 0
capital_gain        0
capital_loss        0
hours_per_week      0
native_country      0
income              0
dtype: int64

In [64]:
df['income'] = [0 if x == '<=50K' else 1 for x in df['income']]

In [65]:

for col_name in df.columns:
    if df[col_name].dtypes == 'object':
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} unique categories".format(col_name=col_name, unique_cat=unique_cat))

Feature 'workclass' has 8 unique categories
Feature 'education' has 17 unique categories
Feature 'marital_status' has 7 unique categories
Feature 'occupation' has 15 unique categories
Feature 'relationship' has 6 unique categories
Feature 'race' has 6 unique categories
Feature 'sex' has 3 unique categories
Feature 'native_country' has 40 unique categories


In [66]:
df['native_country'].value_counts().sort_values(ascending=False).head(10)

United-States    4464
Mexico            103
?                  97
Canada             28
Philippines        22
Germany            22
England            16
Puerto-Rico        16
El-Salvador        16
China              15
Name: native_country, dtype: int64

In [67]:
# In this case, bucket low frequecy categories as "Other"
df['native_country'] = ['United-States ' if x == 'United-States' else 'Other' for x in df['native_country']]

print(df['native_country'].value_counts().sort_values(ascending=False))

United-States     4464
Other              534
Name: native_country, dtype: int64


In [68]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
UL = Q3 + 1.5*(IQR)
LL = Q1 - 1.5*(IQR)

In [69]:
UL

age                   75.5
fnlwgt            428039.0
education_num         16.5
capital_gain           0.0
capital_loss           0.0
hours_per_week        52.5
income                 0.0
dtype: float64

In [70]:
LL

age                  -0.5
fnlwgt           -68447.0
education_num         4.5
capital_gain          0.0
capital_loss          0.0
hours_per_week       32.5
income                0.0
dtype: float64

In [71]:
a = df['age']>75.5
b = df['fnlwgt']>428039
c = df['hours_per_week']>52.5

In [72]:
df['age'].drop(a.index,inplace=True)

In [73]:
df['fnlwgt'].drop(b.index,inplace=True)

In [74]:
df['hours_per_week'].drop(c.index,inplace=True)

In [75]:
df_num = df[df.select_dtypes(include = np.number).columns]

In [76]:
df_cat = df[df.select_dtypes(include='object').columns]

In [77]:
df_cat

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,#NAME?,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Other
...,...,...,...,...,...,...,...,...
4995,Private,5th-6th,Never-married,Machine-op-inspct,Unmarried,White,Female,Other
4996,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States
4997,Self-emp-inc,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States
4998,Private,HS-grad,Never-married,Adm-clerical,Not-in-family,White,Male,United-States


In [78]:
df_dummy = pd.get_dummies(df_cat,drop_first=True)

In [79]:
data = pd.concat([df_num,df_dummy],axis=1)

In [80]:
data

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,...,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_United-States
0,39.0,77516.0,13.0,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
1,50.0,83311.0,13.0,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
2,38.0,215646.0,9.0,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,1,1
3,53.0,234721.0,7.0,0,0,40,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,28.0,338409.0,13.0,0,0,40,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43.0,222971.0,3.0,0,0,40,0,0,0,1,...,1,0,0,0,0,0,1,1,0,0
4996,31.0,259425.0,9.0,0,0,40,1,0,0,1,...,0,0,0,0,0,0,1,0,1,1
4997,47.0,212120.0,9.0,0,0,40,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
4998,,245880.0,9.0,0,0,60,0,0,0,1,...,0,0,0,0,0,0,1,0,1,1


In [81]:
from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(data)
data = pd.DataFrame(data=imp.transform(data) , columns=data.columns)



In [82]:
x= data.drop(['income'],axis=1)
y=data['income']

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.7,random_state=8)

In [85]:
from sklearn.preprocessing import StandardScaler

In [86]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [88]:
lr = LogisticRegression()

In [89]:
lr.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [90]:
y_pred = lr.predict(x_test)

In [91]:
accuracy =  accuracy_score(y_test, y_pred)

In [92]:
accuracy

0.838

In [95]:
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV 

In [94]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)
print("Accuracy:",accuracy_score(y_test, y_pred))

Accuracy: 0.7746666666666666


In [97]:
depth = np.arange(1,30)
param_dist = {"max_depth":depth, 
              "max_features": randint(1, 9), 
              "min_samples_leaf": randint(1, 9), 
              "criterion": ["gini", "entropy"]}
tree_cv = RandomizedSearchCV(dt, param_dist, cv = 5)
tree_cv.fit(x_train, y_train)
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_)) 
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 29, 'max_features': 7, 'min_samples_leaf': 8}
Best score is 0.8198970840480274


In [40]:
import statsmodels.api as sm

In [41]:
logit_model=sm.Logit(y_train,x_train)
result=logit_model.fit()
print(result.summary())

         Current function value: 0.443082
         Iterations: 35




                           Logit Regression Results                           
Dep. Variable:                 income   No. Observations:                 3498
Model:                          Logit   Df Residuals:                     3438
Method:                           MLE   Df Model:                           59
Date:                Mon, 10 Aug 2020   Pseudo R-squ.:                  0.1972
Time:                        21:10:05   Log-Likelihood:                -1549.9
converged:                      False   LL-Null:                       -1930.5
Covariance Type:            nonrobust   LLR p-value:                1.146e-122
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.2086      0.058      3.606      0.000       0.095       0.322
x2             0.0195      0.047      0.415      0.678      -0.073       0.112
x3             0.3516        nan        nan        n

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [42]:
# Use PolynomialFeatures in sklearn.preprocessing to create two-way interactions for all features
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    # Get feature names
    combos = list(combinations(list(df.columns), 2))
    colnames = list(df.columns) + ['_'.join(x) for x in combos]
    
    # Find interactions
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transform(df)
    df = pd.DataFrame(df)
    df.columns = colnames
    
    # Remove interaction terms with all 0 values            
    noint_indicies = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indicies], axis=1)
    
    return df

In [43]:
data = add_interactions(data)
print(data.head(5))

    age    fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0  39.0   77516.0           13.0        2174.0           0.0            40.0   
1  50.0   83311.0           13.0           0.0           0.0            13.0   
2  38.0  215646.0            9.0           0.0           0.0            40.0   
3  53.0  234721.0            7.0           0.0           0.0            40.0   
4  28.0  338409.0           13.0           0.0           0.0            40.0   

   income  workclass_Federal-gov  workclass_Local-gov  workclass_Private  ...  \
0     0.0                    0.0                  0.0                0.0  ...   
1     0.0                    0.0                  0.0                0.0  ...   
2     0.0                    0.0                  0.0                1.0  ...   
3     0.0                    0.0                  0.0                1.0  ...   
4     0.0                    0.0                  0.0                1.0  ...   

   race_Black_sex_Male  race_Bla

In [44]:
x= data.drop(['income'],axis=1)
y=data['income']
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.7,random_state=8)
lr = LogisticRegression()
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)
accuracy =  accuracy_score(y_test, y_pred)
accuracy



1.0

In [45]:
y_train

2530    0.0
2239    0.0
2699    0.0
3474    0.0
3226    0.0
       ... 
2181    1.0
2409    0.0
2033    0.0
1364    0.0
4547    0.0
Name: income, Length: 3498, dtype: float64

In [46]:
x_train

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,race_Black_sex_Male,race_Black_native_country_United-States,race_Other_sex_Female,race_Other_sex_Male,race_Other_native_country_United-States,race_White_sex_Female,race_White_sex_Male,race_White_native_country_United-States,sex_Female_native_country_United-States,sex_Male_native_country_United-States
2530,21.0,294789.0,10.0,0.0,0.0,25.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2239,36.0,184659.0,10.0,0.0,0.0,52.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2699,40.0,229364.0,14.0,0.0,0.0,45.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3474,43.0,108945.0,10.0,0.0,0.0,48.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3226,58.0,310085.0,6.0,0.0,0.0,40.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,34.0,244147.0,13.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2409,27.0,387776.0,10.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2033,47.0,185465.0,9.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1364,31.0,168312.0,11.0,0.0,0.0,40.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [47]:
# Such a large set of features can cause overfitting and also slow computing
# Use feature selection to select the most important features
import sklearn.feature_selection

select = sklearn.feature_selection.SelectKBest(k=20)
selected_features = select.fit(x_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [x.columns[i] for i in indices_selected]

X_train_selected = x_train[colnames_selected]
X_test_selected = x_test[colnames_selected]

  817  841  842  855  863  864  897  906 1004 1006 1020 1077 1081 1090
 1098 1188 1193 1267 1269 1282 1288 1313 1474] are constant.
  f = msb / msw


In [48]:
print(colnames_selected)

['marital_status_Married-civ-spouse', 'age_income', 'age_marital_status_Married-civ-spouse', 'fnlwgt_income', 'education_num_income', 'education_num_marital_status_Married-civ-spouse', 'hours_per_week_income', 'hours_per_week_marital_status_Married-civ-spouse', 'income_workclass_Private', 'income_education_Bachelors', 'income_education_HS-grad', 'income_marital_status_Married-civ-spouse', 'income_occupation_Exec-managerial', 'income_occupation_Prof-specialty', 'income_race_White', 'income_sex_Male', 'income_native_country_United-States ', 'marital_status_Married-civ-spouse_race_White', 'marital_status_Married-civ-spouse_sex_Male', 'marital_status_Married-civ-spouse_native_country_United-States ']


In [49]:
lr = LogisticRegression()
lr.fit(X_train_selected,y_train)
y_pred = lr.predict(X_test_selected)
accuracy =  accuracy_score(y_test, y_pred)
accuracy



1.0

In [50]:
# Find performance of model using preprocessed data
auc_processed = find_model_perf(X_train_selected, y_train, X_test_selected, y_test)
print(auc_processed)

NameError: name 'find_model_perf' is not defined