In [545]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [546]:
# Load the Adult census dataset from the working directory and preview it.
adult = pd.read_csv(
    "adult.csv",
)
adult

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


В таблице есть значения '?' — это явно пропуски (np.nan). Их не очень много, и строки с ними можно было бы удалить.

Но для категориальных признаков я буду выделять '?' в отдельный класс: это даёт модели чуть больше информации и немного повышает качество предсказаний.

In [547]:
# Alternative that is intentionally left disabled: treat '?' as missing values
# and drop the affected rows. The notebook keeps '?' as its own category instead.
# adult.replace('?', np.nan, inplace=True)
# print(adult.isna().sum())
# adult.dropna(inplace=True)
# adult.isna().sum()

In [548]:
# Class balance of the target column.
adult["income"].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [549]:
# Encode the target as integers; the counts printed below show which class
# received which code.
le = LabelEncoder()
income_labels = adult["income"].copy()
y = le.fit_transform(income_labels)

np.unique(y, return_counts=True)

(array([0, 1]), array([37155, 11687]))

In [550]:
# Feature matrix: every column except the target.
# `columns=` replaces the positional `axis` argument (`drop('income', 1)`),
# which is deprecated since pandas 1.3 and rejected by pandas 2.0+.
X = adult.drop(columns='income')
X.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States


In [551]:
# Column dtypes and non-null counts of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.2+ MB


## Категориальные типы данных:
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.

education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.

marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.

occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.

relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.

race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.

gender: Female, Male.

native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. 

In [552]:
# `educational-num` and `education` appear to be the same data in two encodings:
# their sorted per-category frequencies match element-wise.
# NOTE(review): equal frequencies are suggestive, not a proof of a 1:1 mapping.
edu_num_freq = X['educational-num'].value_counts().sort_values().values
edu_name_freq = X['education'].value_counts().sort_values().values
edu_num_freq == edu_name_freq


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [553]:
# Names of the categorical (object-dtype) columns that need encoding.
cat_features = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
]
X[cat_features]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States
1,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States
2,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States
3,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States
4,?,Some-college,Never-married,?,Own-child,White,Female,United-States
...,...,...,...,...,...,...,...,...
48837,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
48838,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
48839,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
48840,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [554]:
# Quick demo: integer-encode the first categorical column.
first_cat_column = X[cat_features[0]]
le = LabelEncoder()
le.fit(first_cat_column)
le.transform(first_cat_column)

array([4, 4, 2, ..., 4, 4, 5])

В категориальных переменных есть значения "?" — их я буду кодировать как отдельный класс.

In [555]:
# Distinct values of the first categorical column (includes the '?' placeholder).
X[cat_features[0]].unique()

array(['Private', 'Local-gov', '?', 'Self-emp-not-inc', 'Federal-gov',
       'State-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

Если количество уникальных значений категориального признака больше двух, используем OneHotEncoder.

В остальных случаях — LabelEncoder.

In [556]:
# Encode every categorical column:
#   * more than `count_features` distinct values -> one-hot columns `<feature>_<i>`
#   * otherwise                                  -> one integer column `<feature>_encoded`
# The encoded columns are collected first and attached with a single concat.
# Assigning ~100 columns one at a time fragments the DataFrame and makes
# pandas emit a PerformanceWarning.
count_features = 2
encoded_parts = []
for feature in cat_features:
    if X[feature].unique().size > count_features:
        ohe = OneHotEncoder()
        encoded = ohe.fit_transform(np.array(X[feature]).reshape(-1, 1)).toarray()
        part = pd.DataFrame(
            encoded,
            index=X.index,
            columns=[f'{feature}_{i}' for i in range(encoded.shape[1])],
        )
    else:
        le = LabelEncoder()
        part = pd.DataFrame(
            {f'{feature}_encoded': le.fit_transform(np.array(X[feature]))},
            index=X.index,
        )
    encoded_parts.append(part)

# Drop columns left over from a previous run of this cell so that re-running it
# does not create duplicate column names, then attach all encoded columns at once.
encoded_names = [name for part in encoded_parts for name in part.columns]
X = pd.concat([X.drop(columns=encoded_names, errors='ignore'), *encoded_parts], axis=1)
        


In [557]:
# Remove the raw categorical columns now that their encoded versions exist.
# `educational-num` is dropped as well, since it appears to duplicate `education`.
drop_features = ['workclass',
                 'education',
                 'educational-num',
                 'marital-status',
                 'occupation',
                 'relationship',
                 'race',
                 'gender',
                 'native-country']
# `columns=` replaces the positional `axis=1` argument, which pandas 2.0+ rejects.
# Reassigning instead of `inplace=True` keeps the step explicit.
X = X.drop(columns=drop_features)

In [558]:
# Preview the fully numeric feature frame after encoding.
X.head(3)

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,...,native-country_32,native-country_33,native-country_34,native-country_35,native-country_36,native-country_37,native-country_38,native-country_39,native-country_40,native-country_41
0,25,226802,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,0,0,50,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,28,336951,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [559]:
## !! We end up with 106 features (5 numeric + 101 encoded categorical columns); for a dataset of 48842 rows this is presumably still acceptable

In [560]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Обучаем модель

In [561]:
# Baseline model: standardize all features, then fit a logistic regression.
# class_weight={0: 1.2} gives class 0 a slightly larger weight.
model = LogisticRegression(solver='liblinear', max_iter=1000, class_weight={0: 1.2})
pipeline = make_pipeline(
    StandardScaler(),
    model,
)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight={0: 1.2}, max_iter=1000,
                                    solver='liblinear'))])

In [562]:
# Evaluate on the hold-out set: accuracy plus the confusion matrix.
predict = pipeline.predict(X_test)
(
    accuracy_score(y_test, predict),
    confusion_matrix(y_test, predict),
)

(0.858459018630997,
 array([[10641,   592],
        [ 1482,  1938]]))

In [563]:
# Most influential features: the ten largest positive coefficients of the
# logistic regression (fit on standardized inputs).
coef_table = pd.DataFrame(
    {'Coefficient': pipeline['logisticregression'].coef_.ravel()},
    index=X.columns,
)
coef_table.sort_values('Coefficient', ascending=False).head(10)

Unnamed: 0,Coefficient
capital-gain,2.410483
marital-status_2,0.843724
hours-per-week,0.349305
age,0.344324
gender_encoded,0.323249
education_9,0.289138
capital-loss,0.261654
education_12,0.256532
occupation_4,0.231147
relationship_1,0.225635


In [564]:
# Second model for comparison: a support vector classifier on standardized features.
model = SVC(gamma='auto')
pipeline = make_pipeline(
    StandardScaler(),
    model,
)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [565]:
# Hold-out accuracy and confusion matrix for the SVC pipeline.
predict = pipeline.predict(X_test)
(
    accuracy_score(y_test, predict),
    confusion_matrix(y_test, predict),
)

(0.8545007848222207,
 array([[10593,   640],
        [ 1492,  1928]]))

In [566]:
# Show the encoded feature frame before the polynomial-feature experiment.
X

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,...,native-country_32,native-country_33,native-country_34,native-country_35,native-country_36,native-country_37,native-country_38,native-country_39,native-country_40,native-country_41
0,25,226802,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,0,0,50,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,28,336951,0,0,40,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,44,160323,7688,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,18,103497,0,0,30,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,0,0,38,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,40,154374,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,58,151910,0,0,40,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,22,201490,0,0,20,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Polynomial Features

In [567]:
# Reference point for the polynomial-feature experiment: the same scaled
# logistic regression as above, refit and re-evaluated on the current split.
model = LogisticRegression(solver='liblinear', max_iter=1000, class_weight={0: 1.2})
pipeline = make_pipeline(
    StandardScaler(),
    model,
)
pipeline.fit(X_train, y_train)

predict = pipeline.predict(X_test)
(
    accuracy_score(y_test, predict),
    confusion_matrix(y_test, predict),
)

(0.858459018630997,
 array([[10641,   592],
        [ 1482,  1938]]))

In [568]:
# Numeric columns that will be expanded into polynomial features.
num_features = [
    'age',
    'fnlwgt',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [569]:
# Degree-2 polynomial expansion of the numeric columns.
# NOTE(review): the default settings also emit a constant bias column, which
# becomes all zeros after standardization — consider include_bias=False.
featurer = PolynomialFeatures(degree=2)
numeric_block = X[num_features]
X_poly = featurer.fit_transform(numeric_block)
X_poly.shape

(48842, 21)

In [570]:
# Remove the raw numeric columns; their polynomial expansion lives in `X_poly`.
# `columns=` replaces the positional `axis=1` argument, which pandas 2.0+ rejects.
X = X.drop(columns=num_features)
X

Unnamed: 0,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,workclass_8,education_0,...,native-country_32,native-country_33,native-country_34,native-country_35,native-country_36,native-country_37,native-country_38,native-country_39,native-country_40,native-country_41
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [571]:
# Sanity check: both parts must have the same number of rows before they are joined.
X.values.shape, X_poly.shape

((48842, 101), (48842, 21))

In [572]:
# Standardize the polynomial features (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split.
scaler = StandardScaler()
scaler.fit(X_poly)
X_poly_scaled = scaler.transform(X_poly)
X_poly_scaled

array([[ 0.        , -0.99512893,  0.35167453, ..., -0.20469554,
        -0.20793128, -0.17162618],
       [ 0.        , -0.04694151, -0.94552415, ..., -0.20469554,
        -0.20793128,  0.65211914],
       [ 0.        , -0.77631645,  1.3947231 , ..., -0.20469554,
        -0.20793128, -0.17162618],
       ...,
       [ 0.        ,  1.41180837, -0.35751025, ..., -0.20469554,
        -0.20793128, -0.17162618],
       [ 0.        , -1.21394141,  0.11198424, ..., -0.20469554,
        -0.20793128, -1.26995328],
       [ 0.        ,  0.97418341,  0.93049361, ..., -0.20469554,
        -0.20793128, -0.17162618]])

In [573]:
# Final design matrix: encoded categorical columns followed by the scaled
# polynomial features, joined column-wise.
design_parts = (X, X_poly_scaled)
X_ = np.concatenate(design_parts, axis=1)

In [574]:
# Same 70/30 split and seed as before, now on the extended design matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y,
    test_size=0.3,
    random_state=42,
)

In [575]:
# Logistic regression on the extended features. No scaler is applied here:
# the polynomial block was standardized earlier and the encoded block is used as is.
model = LogisticRegression(solver='liblinear', max_iter=1000, class_weight={0: 1.2})

model.fit(X_train, y_train)

predict = model.predict(X_test)
(
    accuracy_score(y_test, predict),
    confusion_matrix(y_test, predict),
)

(0.8621442707977889,
 array([[10647,   586],
        [ 1434,  1986]]))

### С полиномиальными признаками модель стала ещё немного точнее