In [2]:
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#Читаем данные

In [3]:
df = pd.read_csv('adult.csv', sep = ',')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


#Обрабатывам данные

1) Обработка NaN

In [4]:
for col in df.columns:
  df[col] = df[col].apply(lambda x: np.NaN if x == '?' else x)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [6]:
df = df.dropna(axis = 0)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


In [7]:
df['workclass'].value_counts()

Private             22286
Self-emp-not-inc     2499
Local-gov            2067
State-gov            1279
Self-emp-inc         1074
Federal-gov           943
Without-pay            14
Name: workclass, dtype: int64

In [8]:
df['race'].value_counts()

White                 25933
Black                  2817
Asian-Pac-Islander      895
Amer-Indian-Eskimo      286
Other                   231
Name: race, dtype: int64

In [9]:
df['sex'].value_counts()

Male      20380
Female     9782
Name: sex, dtype: int64

2) Обработка категориальных данных

In [10]:
df['income'] = df['income'].apply(lambda x: 0 if x == '<=50K' else 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,0


In [11]:
num_columns = df.select_dtypes(include=np.number).columns
cat_columns = list(set(df.columns) - set(num_columns))
cat_columns

['occupation',
 'marital.status',
 'sex',
 'education',
 'workclass',
 'native.country',
 'relationship',
 'race']

In [12]:
for col in cat_columns:
  print(col, len(df[col].unique()))

occupation 14
marital.status 7
sex 2
education 16
workclass 7
native.country 41
relationship 6
race 5


In [13]:
df = pd.get_dummies(df, cat_columns)
df.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,income,occupation_Federal-gov,occupation_Local-gov,occupation_Private,...,race_Portugal,race_Puerto-Rico,race_Scotland,race_South,race_Taiwan,race_Thailand,race_Trinadad&Tobago,race_United-States,race_Vietnam,race_Yugoslavia
1,82,132870,9,0,4356,18,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,54,140359,4,0,3900,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,41,264663,10,0,3900,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,34,216864,9,0,3770,45,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,38,150601,6,0,3770,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


#Обучаем модель и вычисляем метрики

In [15]:
X = df.drop(columns = 'income')
y = df['income']

In [16]:
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0, k_neighbors=10)

columns = X.columns

os_data_X, os_data_y = os.fit_resample(X, y)
os_data_X = pd.DataFrame(os_data_X, columns = columns)
os_data_y = pd.DataFrame(os_data_y, columns = ['income'])

In [17]:
os_data_y.value_counts()

income
0         22654
1         22654
dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(os_data_X, os_data_y, test_size = 0.3)

In [19]:
Misato = LogisticRegression(class_weight = 'balanced')
Misato.fit(X_train, y_train)
answers_pred = Misato.predict(X_test)

  y = column_or_1d(y, warn=True)


In [20]:
Misato.coef_

array([[ 4.27152427e-03, -2.61182993e-06,  9.53947662e-04,
         3.12883104e-04,  7.21356206e-04,  3.45929247e-03,
        -1.00939137e-05, -4.25819889e-05, -3.36693369e-04,
         4.35230367e-06, -5.41246140e-05, -2.99117685e-05,
        -9.24970283e-07, -3.43377209e-05, -4.60564352e-05,
        -1.63873362e-05, -7.28817408e-06, -1.17655807e-05,
        -2.59602630e-05, -1.94612881e-05, -2.43103587e-05,
        -2.96952065e-05,  6.96360954e-06,  7.70937239e-06,
        -3.01659398e-04,  1.31416213e-05, -1.85415644e-06,
         9.67387889e-06, -1.84786673e-04, -1.73854915e-04,
         2.97999640e-07,  4.61487455e-04, -1.64097096e-05,
        -4.44956959e-04, -4.08941585e-05, -3.32356622e-05,
        -1.37294099e-04, -3.71650956e-07, -1.06193929e-04,
         2.27269452e-05, -3.98930545e-05, -5.65982809e-05,
        -7.34475574e-05, -1.48237189e-04, -7.31812403e-06,
         4.75944620e-06, -1.04144874e-05, -7.36525814e-05,
        -1.53839536e-05, -4.44551976e-05,  3.58028392e-0

In [None]:
Misato.intercept_

array([-2.22673438e-07])

In [None]:
print('y =', Misato.intercept_[0], 4, end = ' ')
for i in range(len(Misato.coef_[0])):
  print('+', Misato.coef_[0][i], 4, f'* {X_train.columns.values[i]}',  end = ' ')

y = -2.2267343788865953e-07 4 + 2.3766280434813364e-05 4 * age + -1.3285165629398294e-06 4 * fnlwgt + 6.521040726477169e-06 4 * education.num + 0.00031758822553372483 4 * capital.gain + 0.000783290730707859 4 * capital.loss + 2.0149282078595426e-05 4 * hours.per.week + 1.1535609614821944e-07 4 * sex_Federal-gov + 9.659158998841267e-08 4 * sex_Local-gov + -7.18519639714188e-07 4 * sex_Private + 2.388921154873324e-07 4 * sex_Self-emp-inc + 3.599219647708147e-08 4 * sex_Self-emp-not-inc + 1.0968845277393358e-08 4 * sex_State-gov + -1.954641552834297e-09 4 * sex_Without-pay + -1.173204984801942e-07 4 * native.country_10th + -1.761297325677025e-07 4 * native.country_11th + -5.361687019210874e-08 4 * native.country_12th + -2.5432579430546554e-08 4 * native.country_1st-4th + -4.566175653986029e-08 4 * native.country_5th-6th + -9.418512532433393e-08 4 * native.country_7th-8th + -7.455889923770258e-08 4 * native.country_9th + -1.1122977271136916e-08 4 * native.country_Assoc-acdm + -4.5991685016

In [21]:
tn, fp, fn, tp = confusion_matrix(y_test, answers_pred).ravel()

In [22]:
(tp + tn) / (tp + tn + fp + fn)

0.609652026778489

In [None]:
# Accuracy
(tp + tn) / (tp + tn + fp + fn)

0.7775444800530446

In [None]:
accuracy_score(y_test, answers_pred)

0.7775444800530446

In [23]:
recall = tp / (tp + fn)
recall

0.5287881018995729

In [None]:
# Recall 
recall = tp / (tp + fn)
recall

0.31339285714285714

In [None]:
recall_score(y_test, answers_pred)

0.31339285714285714

In [24]:
precision = tp / (tp + fp)
precision

0.6303317535545023

In [None]:
# Precision 
precision = tp / (tp + fp)
precision

0.5964316057774002

In [None]:
precision_score(y_test, answers_pred)

0.5964316057774002

In [25]:
f1 = 2 * precision * recall / (recall + precision)
f1

0.5751121076233183

In [None]:
# F1
f1 = 2 * precision * recall / (recall + precision)
f1

0.4108867427568042

In [None]:
f1_score(y_test, answers_pred)

0.4108867427568042

После балансировки метрики стали немногим лучше