### Строим логистическую регрессию - предсказываем уровень дохода человека по признакам

In [45]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [5]:
# Load the dataset from the local CSV file and preview the first five rows.
data = pd.read_csv('adult.csv')
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


**Описание: датасет представляет собой набор данных о людях с определёнными признаками, в том числе с уровнем дохода, который мы и будем прогнозировать**
1. age: continuous.
2. workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
3. fnlwgt: continuous.
4. education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
5. educational-num (education-num): continuous.
6. marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
7. occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
8. relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
9. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
10. gender (sex): Female, Male.
11. capital-gain: continuous.
12. capital-loss: continuous.
13. hours-per-week: continuous.
14. native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
15. income (class): >50K, <=50K


In [6]:
# Show column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


**Явных пропусков (NaN) в данных нет, но часть столбцов является не числовыми, а строковыми; кроме того, в некоторых строках встречается значение «?»**

*Уберём непонятный столбец fnlwgt, т.к. его роль неясна*

In [7]:
# Remove the 'fnlwgt' column in place, then preview the first five rows.
data.drop(columns='fnlwgt', inplace=True)
data.head()

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [8]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,age,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,10.078089,1079.067626,87.502314,40.422382
std,13.71051,2.570973,7452.019058,403.004552,12.391444
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [11]:
# Summary statistics for all columns, including the non-numeric ones.
data.describe(include='all')

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,48842,48842,48842.0,48842,48842,48842,48842,48842,48842.0,48842.0,48842.0,48842,48842
unique,,9,16,,7,15,6,5,2,,,,42,2
top,,Private,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,33906,15784,,22379,6172,19716,41762,32650,,,,43832,37155
mean,38.643585,,,10.078089,,,,,,1079.067626,87.502314,40.422382,,
std,13.71051,,,2.570973,,,,,,7452.019058,403.004552,12.391444,,
min,17.0,,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,,12.0,,,,,,0.0,0.0,45.0,,


**Проверим, сколько знаков вопроса («?») в каждом столбце**

In [18]:
# Count the '?' placeholders in every column with one vectorized membership
# test instead of a cell-by-cell Python loop with a manually reset counter.
# The printed lines keep the same "<column> = <count>" format.
question_mark_counts = data.isin(['?']).sum()
for column, count in question_mark_counts.items():
    print(column, '=', count)

age = 0
workclass = 2799
education = 0
educational-num = 0
marital-status = 0
occupation = 2809
relationship = 0
race = 0
gender = 0
capital-gain = 0
capital-loss = 0
hours-per-week = 0
native-country = 857
income = 0


*Посмотрим на столбцы "семейный статус" и "отношения"; думаю, от одного из них надо отказаться*

In [19]:
# Cross-tabulate marital-status against relationship: each cell is the number
# of rows with that pair (counted through the 'age' column).
data.pivot_table(index='marital-status', columns='relationship', values='age',  dropna=False, aggfunc='count')

relationship,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
marital-status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Divorced,,3628.0,181.0,455.0,2369.0,
Married-AF-spouse,12.0,,1.0,1.0,,23.0
Married-civ-spouse,19704.0,23.0,201.0,143.0,,2308.0
Married-spouse-absent,,330.0,54.0,61.0,183.0,
Never-married,,7114.0,920.0,6750.0,1333.0,
Separated,,637.0,79.0,146.0,668.0,
Widowed,,851.0,70.0,25.0,572.0,


*Удаляем малоинформативные столбцы, столбцы, в которых много отсутствующих данных, и столбцы, смысла которых я не понимаю*

In [20]:
# Columns excluded from the feature set; they are removed in place and the
# first five remaining rows are previewed.
dropped_columns = [
    'educational-num',
    'relationship',
    'capital-gain',
    'capital-loss',
    'native-country',
]
data.drop(columns=dropped_columns, inplace=True)
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,gender,hours-per-week,income
0,25,Private,11th,Never-married,Machine-op-inspct,Black,Male,40,<=50K
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,White,Male,50,<=50K
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,White,Male,40,>50K
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Black,Male,40,>50K
4,18,?,Some-college,Never-married,?,White,Female,30,<=50K


*проверим данные о занятости*

In [21]:
# Cross-tabulate workclass against occupation: each cell is the number of rows
# with that pair (counted through the 'age' column).
data.pivot_table(index='workclass', columns='occupation', values='age',  dropna=False, aggfunc='count')

occupation,?,Adm-clerical,Armed-Forces,Craft-repair,Exec-managerial,Farming-fishing,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving
workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
?,2799.0,,,,,,,,,,,,,,
Federal-gov,,487.0,15.0,93.0,268.0,9.0,36.0,19.0,55.0,,253.0,47.0,17.0,96.0,37.0
Local-gov,,421.0,,211.0,331.0,43.0,65.0,24.0,300.0,,1061.0,450.0,16.0,58.0,156.0
Never-worked,10.0,,,,,,,,,,,,,,
Private,,4208.0,,4748.0,3995.0,670.0,1923.0,2882.0,4057.0,242.0,3409.0,299.0,4439.0,1154.0,1880.0
Self-emp-inc,,47.0,,167.0,617.0,82.0,6.0,17.0,42.0,,245.0,5.0,420.0,9.0,38.0
Self-emp-not-inc,,70.0,,798.0,587.0,653.0,21.0,59.0,276.0,,575.0,7.0,591.0,42.0,183.0
State-gov,,375.0,,94.0,287.0,25.0,19.0,19.0,191.0,,629.0,175.0,20.0,87.0,60.0
Without-pay,,3.0,,1.0,1.0,8.0,2.0,2.0,2.0,,,,1.0,,1.0


*Можно было бы поглубже повникать в подготовку данных, но пока оставим как есть на текущий момент*

**Отделим мух от котлет X - признаки, Y - цель предсказания**

In [36]:
# Separate features and target by column name rather than by position, so the
# split stays correct even if the column order of `data` changes.
X = data.drop(columns='income')
Y = data['income']

In [37]:
# Preview the feature table.
X.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,race,gender,hours-per-week
0,25,Private,11th,Never-married,Machine-op-inspct,Black,Male,40
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,White,Male,50
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,White,Male,40
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Black,Male,40
4,18,?,Some-college,Never-married,?,White,Female,30


In [38]:
# Preview the target column.
Y.head()

0    <=50K
1    <=50K
2     >50K
3     >50K
4    <=50K
Name: income, dtype: object

**Приводим данные к машиночитаемому виду, переводя категориальные признаки в индикаторные переменные с помощью one-hot encoding**

In [40]:
# One-hot encode the string columns into indicator columns; the numeric
# columns (age, hours-per-week) are kept as they are.
X = pd.get_dummies(X)
X.head()

Unnamed: 0,age,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Female,gender_Male
0,25,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,38,50,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,28,40,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,44,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,18,30,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


**Примерно то же делаем с целью предсказания: переведём значения столбца в числа, используя LabelEncoder**

In [41]:
# Fit a label encoder on the target so that each class string is mapped to an integer.
le = LabelEncoder()
le.fit(Y)

LabelEncoder()

In [43]:
# Check the encoding on a few sample labels.
le.transform(['<=50K', '<=50K', '>50K', '>50K', '<=50K'])

array([0, 0, 1, 1, 0], dtype=int64)

In [42]:
# Encode the target with the fitted encoder and wrap the resulting array in a
# Series, then preview it.
Y = pd.Series(le.transform(Y))
Y.head()

0    0
1    0
2    1
3    1
4    0
dtype: int32

In [None]:
# Option two: Y could instead be converted to a binary label with this expression:
# Y = [1 if x == '>50K' else 0 for x in Y]

**Делим выборку на обучающую и тестовую**

In [46]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible between runs; stratify=Y
# keeps the ratio of the two income classes the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42, stratify=Y
)

**Вариант 1. Используем модель логистической регрессии**

In [47]:
# Logistic regression classifier with default hyperparameters.
model = LogisticRegression()

In [61]:
# Fit on the training part, then get class probabilities for the test part
# (fit returns the estimator itself, so the two calls can be chained).
predictions = model.fit(X_train, Y_train).predict_proba(X_test)



In [63]:
# Second probability column (class encoded as 1) for the first ten test rows.
predictions[0:10, 1]

array([0, 1, 0, ..., 0, 0, 0])

In [58]:
# Print the predicted probability of class 1 next to the true label for test
# rows 10-19. Y_test is a Series whose integer index was shuffled by the
# split, so .iloc makes the positional selection explicit.
for proba, label in zip(predictions[10:20, 1], Y_test.iloc[10:20]):
    print(proba, label)

0.8028101053802188 1
0.21173170762614546 0
0.09030031456710452 0
0.004865888751310556 0
0.02940748751652503 0
0.13281800156849952 0
0.7589148868375318 1
0.2406436687233056 0
0.006438042188550486 0
0.13226874780116 0


In [57]:
# Mean accuracy of the logistic regression on the test set.
model.score(X_test, Y_test)

0.8307513819695626

*Средняя точность составила 83%*

**Вариант 2. Используем модель SVM с линейным ядром**

In [59]:
# Support vector classifier with a linear kernel.
# NOTE(review): the features are not scaled before fitting; SVMs are sensitive
# to feature scale, so consider standardizing age / hours-per-week — confirm.
model_2 = SVC(kernel='linear')

In [64]:
# Train the linear-kernel SVM on the training part.
model_2.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [65]:
# Predicted class labels for the test set.
predictions_2 = model_2.predict(X_test)

In [66]:
# Mean accuracy of the linear-kernel SVM on the test set.
model_2.score(X_test, Y_test)

0.8321162901794854

*Средняя точность составила 83%*

**Вариант 3. Используем модель SVM с сигмовидным ядром**

In [70]:
# Support vector classifier with a sigmoid kernel.
# NOTE(review): the features are not scaled before fitting; the lower score of
# this kernel may partly come from that — confirm with standardized features.
model_3 = SVC(kernel='sigmoid')

In [71]:
# Train the sigmoid-kernel SVM and predict class labels for the test part
# (fit returns the estimator itself, so the two calls can be chained).
predictions_3 = model_3.fit(X_train, Y_train).predict(X_test)



In [72]:
# Mean accuracy of the sigmoid-kernel SVM on the test set.
model_3.score(X_test, Y_test)

0.7541800313928888

*Средняя точность составила 75%*

**Логистическая регрессия и метод опорных векторов с линейным ядром показывают примерно одинаковую среднюю точность - 83%. Метод опорных векторов с сигмовидным ядром показал меньшую точность - 75%**