In [1]:

# Подгрузим библиотеки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Подгружаем данные
# Для данного исследования возьмем данные банка

data = pd.read_csv('bank-full.csv', header=0, sep=';')

In [3]:
# Посмотрим структуру файла
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Удаляем все пропуски
data = data.dropna()

In [5]:
# Смотрим размерность
print(data.shape)

(45211, 17)


In [6]:
# Выводим все столбцы которые есть в данной выборке
print(list(data.columns))

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [7]:
#  Устраняем поля которые не нужны для анализа

data.drop(data.columns[[0,3,5,8,9,10,11,12,13,14]], 
   axis = 1, inplace = True)

In [8]:
# Проверяем столбец по номеру
data.columns[1]

'marital'

In [9]:
# Смотрим текущую выборку
data.head()

Unnamed: 0,job,marital,default,housing,loan,poutcome,y
0,management,married,no,yes,no,unknown,no
1,technician,single,no,yes,no,unknown,no
2,entrepreneur,married,no,yes,yes,unknown,no
3,blue-collar,married,no,yes,no,unknown,no
4,unknown,single,no,no,no,unknown,no


In [10]:
# Посмотрим какие значения есть в столбце y
data['y'].unique()

array(['no', 'yes'], dtype=object)

In [11]:
# Заменим в столбце y значения "yes" и "no" на 1 и 0
data.loc[data["y"] == "no","y"] = 0
data.loc[data["y"] == "yes","y"] = 1

In [12]:
# Приведем значения к типу int

data["y"]=data["y"].astype('int')

In [13]:
# Проверим какие теперь значения в столбце y
data['y'].unique()

array([0, 1])

In [14]:
# Создадим конкантенацию по уникальным признакам в каждом столбце
data_concatenate = pd.get_dummies(data, columns =['job', 
                                                'marital', 
                                                'default', 
                                                'housing', 
                                                'loan', 
                                                'poutcome'])

In [15]:
# Посмотрим какие получились столбцы
data_concatenate.columns

Index(['y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'default_no', 'default_yes', 'housing_no', 'housing_yes', 'loan_no',
       'loan_yes', 'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

In [16]:
# Посмотрим в обобщенном виде
data_concatenate

Unnamed: 0,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1
45207,1,0,0,0,0,0,1,0,0,0,...,1,0,1,0,1,0,0,0,0,1
45208,1,0,0,0,0,0,1,0,0,0,...,1,0,1,0,1,0,0,0,1,0
45209,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1


In [17]:
# Удалим ненужные столбцы
# Основной критерий показатель unknown  в графе job
data_concatenate.drop(data_concatenate.columns[[12, 16, 18, 21, 24]], axis=1, inplace=True)

In [18]:
# Посмотрим актуальные столбцы которые получились после преобразований
data_concatenate.columns

Index(['y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'marital_divorced', 'marital_married', 'marital_single', 'default_yes',
       'housing_yes', 'loan_no', 'poutcome_failure', 'poutcome_other',
       'poutcome_unknown'],
      dtype='object')

In [19]:
# Возьмем данные для анализа
X = data_concatenate.iloc[:,1:]

In [20]:
# Название столбцов
column_data_concatenate = data_concatenate.columns


# Вывод уникальных значений для каждого столбца
for i in range(len(column_data_concatenate)):
    unic_value = data_concatenate.iloc[i].unique()   # Уникальные значения в отдельно взятом столбце
    print(column_data_concatenate[i], ' = ', unic_value, ' =// ', type(unic_value[0]))
   
       

y  =  [0 1]  =//  <class 'numpy.intc'>
job_admin.  =  [0 1]  =//  <class 'numpy.intc'>
job_blue-collar  =  [0 1]  =//  <class 'numpy.intc'>
job_entrepreneur  =  [0 1]  =//  <class 'numpy.intc'>
job_housemaid  =  [0 1]  =//  <class 'numpy.intc'>
job_management  =  [0 1]  =//  <class 'numpy.intc'>
job_retired  =  [0 1]  =//  <class 'numpy.intc'>
job_self-employed  =  [0 1]  =//  <class 'numpy.intc'>
job_services  =  [0 1]  =//  <class 'numpy.intc'>
job_student  =  [0 1]  =//  <class 'numpy.intc'>
job_technician  =  [0 1]  =//  <class 'numpy.intc'>
job_unemployed  =  [0 1]  =//  <class 'numpy.intc'>
marital_divorced  =  [0 1]  =//  <class 'numpy.intc'>
marital_married  =  [0 1]  =//  <class 'numpy.intc'>
marital_single  =  [0 1]  =//  <class 'numpy.intc'>
default_yes  =  [0 1]  =//  <class 'numpy.intc'>
housing_yes  =  [0 1]  =//  <class 'numpy.intc'>
loan_no  =  [0 1]  =//  <class 'numpy.intc'>
poutcome_failure  =  [0 1]  =//  <class 'numpy.intc'>
poutcome_other  =  [0 1]  =//  <class 'n

In [21]:
# Посмотрим что получилось
X.head ()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,default_yes,housing_yes,loan_no,poutcome_failure,poutcome_other,poutcome_unknown
0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1
1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1


In [22]:
# Создадим целевой набор
Y = data_concatenate.iloc[:,0]

In [23]:
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int32

In [24]:
# Разделим набор данных на обучающую и тестовую выборку
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [25]:
# type(X_train[1])
X_train.iloc[0, 0]

0

In [26]:
# Для анализа с помощью Логистической Регресси воспользуемся готовым решением Sklearn
# И зададим первоначальные параметры параметры
CL_LR = LogisticRegression(solver='lbfgs',random_state=0)

In [27]:
# Обучим модель
CL_LR.fit(X_train, Y_train)

LogisticRegression(random_state=0)

In [28]:
#  Зададим параметры
LogisticRegression(C = 1.0, 
                   class_weight = None, 
                   dual = False, 
                   fit_intercept=True, 
                   intercept_scaling=1, 
                   max_iter=100, 
                   multi_class='warn', 
                   n_jobs=None, 
                   penalty='l2', 
                   random_state=0, 
                   solver='lbfgs',
                   tol=0.0001, 
                   verbose=0, 
                   warm_start=False)

LogisticRegression(multi_class='warn', random_state=0)

In [29]:
# Прогнозирование по обученной модели
predicted_y = CL_LR.predict(X_test)

In [30]:
# Вывод на экран
predicted_y

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
# Провека всего массива
for x in range(len(predicted_y)):
   if (predicted_y[x] == 1):
      print(x, end="\t")

26	84	115	160	210	259	302	304	318	339	364	371	381	393	447	544	594	673	709	825	837	862	868	888	941	988	1060	1074	1179	1223	1278	1311	1377	1379	1405	1414	1441	1494	1540	1567	1578	1592	1599	1614	1671	1678	1689	1770	1772	1783	1784	1863	1872	1889	1908	1928	1935	1939	1956	1957	1970	1990	1994	2017	2030	2109	2115	2122	2123	2148	2245	2280	2337	2428	2431	2433	2492	2493	2513	2520	2531	2582	2620	2692	2720	2742	2781	2784	2796	2851	2895	2897	2964	2994	3000	3065	3076	3104	3116	3123	3144	3159	3169	3214	3228	3270	3281	3354	3369	3392	3451	3488	3537	3539	3614	3681	3690	3711	3752	3761	3863	3917	3930	3934	3941	3945	3958	3974	3995	4057	4092	4111	4178	4208	4219	4231	4232	4270	4285	4290	4352	4355	4369	4380	4430	4459	4478	4491	4516	4538	4552	4566	4567	4607	4610	4628	4646	4732	4748	4760	4892	4946	5010	5013	5029	5037	5108	5129	5169	5250	5266	5287	5324	5380	5382	5403	5416	5495	5519	5549	5573	5604	5686	5713	5733	5776	5791	5800	5808	5811	5843	5844	6049	6099	6100	6101	6128	6137	6145	6212	6241	6295	6380	6410	6412	645

In [32]:
# Проверка точночти
print('Accuracy: {:.2f}'.format(CL_LR.score(X_test, Y_test)))

Accuracy: 0.89
