In [1]:
# Имеются данные adult.csv (см. в материалах к занятию или на гитхабе).
# Целевой переменной является уровень дохода income (крайний правый столбец).
# Описание признаков можно найти по ссылке http://www.cs.toronto.edu/~delve/data/adult/adultDetail.html

# Вам необходимо построить модель логистической регрессии, которая предсказывает 
# уровень дохода человека. При возможности попробуйте улучшить точность предсказаний 
# (метод score) с помощью перебора признаков.

In [2]:
import pandas as pd

In [3]:
data_ = pd.read_csv("adult.csv")

In [4]:
data_.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
age                48842 non-null int64
workclass          48842 non-null object
fnlwgt             48842 non-null int64
education          48842 non-null object
educational-num    48842 non-null int64
marital-status     48842 non-null object
occupation         48842 non-null object
relationship       48842 non-null object
race               48842 non-null object
gender             48842 non-null object
capital-gain       48842 non-null int64
capital-loss       48842 non-null int64
hours-per-week     48842 non-null int64
native-country     48842 non-null object
income             48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [6]:
data_ = pd.read_csv("adult.csv")
data_['educational_num'] = data_['educational-num']
data_['marital_status'] = data_['marital-status']
data_['native_country'] = data_['native-country']
data_['capital_gain'] = data_['capital-gain']
data_['capital_loss'] = data_['capital-loss']
data_['hours_per_week'] = data_['hours-per-week']

data = data_.drop(['educational-num', 'marital-status', 'native-country', 
                   'capital-gain', 'capital-loss', 'hours-per-week'], axis =1)

In [7]:
categorical_columns = ['workclass','education','occupation',
                       'relationship','race','gender','income',
                       'marital_status','native_country']

data[categorical_columns].head()

Unnamed: 0,workclass,education,occupation,relationship,race,gender,income,marital_status,native_country
0,Private,11th,Machine-op-inspct,Own-child,Black,Male,<=50K,Never-married,United-States
1,Private,HS-grad,Farming-fishing,Husband,White,Male,<=50K,Married-civ-spouse,United-States
2,Local-gov,Assoc-acdm,Protective-serv,Husband,White,Male,>50K,Married-civ-spouse,United-States
3,Private,Some-college,Machine-op-inspct,Husband,Black,Male,>50K,Married-civ-spouse,United-States
4,?,Some-college,?,Own-child,White,Female,<=50K,Never-married,United-States


In [8]:
numerical_cloumns = ['age','fnlwgt','educational_num',
                     'capital_gain','capital_loss','hours_per_week']

data[numerical_cloumns].head()

Unnamed: 0,age,fnlwgt,educational_num,capital_gain,capital_loss,hours_per_week
0,25,226802,7,0,0,40
1,38,89814,9,0,0,50
2,28,336951,12,0,0,40
3,44,160323,10,7688,0,40
4,18,103497,10,0,0,30


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
_data = data

In [11]:
le = LabelEncoder()
for i in categorical_columns:
    le.fit(_data[i])
    tmp = pd.Series( le.transform(_data[i]) )
    _data[i+'_encoded'] = tmp
    _data = _data.drop(i,axis=1) 

In [12]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'occupation', 'relationship',
       'race', 'gender', 'income', 'educational_num', 'marital_status',
       'native_country', 'capital_gain', 'capital_loss', 'hours_per_week',
       'workclass_encoded'],
      dtype='object')

In [13]:
_data.head()

Unnamed: 0,age,fnlwgt,educational_num,capital_gain,capital_loss,hours_per_week,workclass_encoded,education_encoded,occupation_encoded,relationship_encoded,race_encoded,gender_encoded,income_encoded,marital_status_encoded,native_country_encoded
0,25,226802,7,0,0,40,4,1,7,3,2,1,0,4,39
1,38,89814,9,0,0,50,4,11,5,0,4,1,0,2,39
2,28,336951,12,0,0,40,2,7,11,0,4,1,1,2,39
3,44,160323,10,7688,0,40,4,15,7,0,2,1,1,2,39
4,18,103497,10,0,0,30,0,15,0,3,4,0,0,4,39


In [14]:
def get_formula(_cols):
    return 'income_encoded ~ ' + " + ".join(_cols)

In [15]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

In [16]:
_iteration_cols = _data.columns.drop(['income_encoded'])
_iteration_cols

Index(['age', 'fnlwgt', 'educational_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'workclass_encoded', 'education_encoded',
       'occupation_encoded', 'relationship_encoded', 'race_encoded',
       'gender_encoded', 'marital_status_encoded', 'native_country_encoded'],
      dtype='object')

In [17]:
line = get_formula(_iteration_cols)
line

'income_encoded ~ age + fnlwgt + educational_num + capital_gain + capital_loss + hours_per_week + workclass_encoded + education_encoded + occupation_encoded + relationship_encoded + race_encoded + gender_encoded + marital_status_encoded + native_country_encoded'

In [18]:
lm = smf.ols(line, _data).fit()
lm_res = lm.summary()
lm_res.tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5729,0.017,-33.812,0.000,-0.606,-0.540
age,0.0046,0.000,35.182,0.000,0.004,0.005
fnlwgt,7.278e-08,1.58e-08,4.598,0.000,4.18e-08,1.04e-07
educational_num,0.0468,0.001,65.560,0.000,0.045,0.048
capital_gain,9.119e-06,2.26e-07,40.307,0.000,8.68e-06,9.56e-06
capital_loss,0.0001,4.15e-06,26.511,0.000,0.000,0.000
hours_per_week,0.0035,0.000,24.503,0.000,0.003,0.004
workclass_encoded,-0.0044,0.001,-3.743,0.000,-0.007,-0.002
education_encoded,-0.0034,0.000,-7.469,0.000,-0.004,-0.003


In [19]:
_data=_data.drop(['native_country_encoded'],axis=1)

In [20]:
_selectedColumns = _data.columns
_selectedColumns

Index(['age', 'fnlwgt', 'educational_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'workclass_encoded', 'education_encoded',
       'occupation_encoded', 'relationship_encoded', 'race_encoded',
       'gender_encoded', 'income_encoded', 'marital_status_encoded'],
      dtype='object')

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# попробуем выбрать какие-нибудь признаки

X = _data[ ['age','fnlwgt', 'educational_num', 'capital_gain', 'capital_loss',
            'hours_per_week', 'workclass_encoded', 'education_encoded',
            'occupation_encoded', 'relationship_encoded', 'race_encoded',
            'gender_encoded', 'marital_status_encoded'] ]

# столбец  является целевой переменной, удаляем его из X
y = _data['income_encoded']
del _data['income_encoded']

In [23]:
#_data.columns
X.columns

Index(['age', 'fnlwgt', 'educational_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'workclass_encoded', 'education_encoded',
       'occupation_encoded', 'relationship_encoded', 'race_encoded',
       'gender_encoded', 'marital_status_encoded'],
      dtype='object')

In [24]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
print('train_X = ',len(train_X))
print('test_X  = ',len(test_X))

train_X =  39073
test_X  =  9769


In [26]:
model = LogisticRegression()

In [27]:
train_y 

37193    0
31093    0
33814    0
14500    0
23399    0
27487    0
14049    0
4949     0
47205    0
3316     0
19854    0
28918    0
43176    0
33920    0
21968    0
46996    0
32072    0
34987    0
6237     0
34232    0
28345    0
3524     0
37460    0
22000    0
25699    1
16545    0
23462    0
42008    0
19487    1
28580    1
        ..
189      0
2747     0
18431    0
18942    0
25658    0
41434    0
27480    0
6396     0
28693    1
19769    0
17568    0
39188    0
37819    0
5311     1
2433     0
769      1
1685     0
41090    0
16023    0
44131    0
47191    1
21962    0
37194    0
16850    0
6265     1
11284    0
44732    0
38158    0
860      0
15795    1
Name: income_encoded, Length: 39073, dtype: int64

In [28]:
# обучаем модель

model.fit( train_X, train_y )
y_predicted = model.predict( test_X )
y_predicted

array([0, 0, 1, ..., 0, 0, 0])

In [29]:
model.score(train_X, train_y)

0.7977375681416835

In [30]:
X = _data[ ['age','fnlwgt', 'educational_num', 'capital_gain', 'capital_loss',
            'hours_per_week', 'workclass_encoded', 'education_encoded',
            'occupation_encoded', 'relationship_encoded', 'race_encoded'] ]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

model_1 = LogisticRegression()

model_1.fit( train_X, train_y )
y_predicted = model_1.predict( test_X )
y_predicted
model_1.score(train_X, train_y)

0.7976096025388376

In [31]:
X = _data[ ['age','fnlwgt', 'educational_num', 'capital_gain', 'capital_loss',
            'hours_per_week', 'workclass_encoded', 'education_encoded'] ]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

model_2 = LogisticRegression()

model_2.fit( train_X, train_y )
y_predicted = model_2.predict( test_X )
y_predicted
model_2.score(train_X, train_y)

0.79753282317713

In [32]:
X = _data[ ['age','fnlwgt', 'educational_num', 'capital_gain', 'capital_loss'] ]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

model_3 = LogisticRegression()

model_3.fit( train_X, train_y )
y_predicted = model_3.predict( test_X )
y_predicted
model_3.score(train_X, train_y)

0.7964323189926548

In [33]:
X = _data[ ['age','fnlwgt', 'educational_num'] ]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

model_4 = LogisticRegression()

model_4.fit( train_X, train_y )
y_predicted = model_4.predict( test_X )
y_predicted
model_4.score(train_X, train_y)

0.7595014460113122

In [34]:
X = _data[ ['age'] ]

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

model_5 = LogisticRegression()

model_5.fit( train_X, train_y )
y_predicted = model_5.predict( test_X )
y_predicted
model_5.score(train_X, train_y)

0.7462442095564712