In [23]:
import numpy as np
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [45]:
def prepare_living_region():
	"""скрипт для приведения к плюс-минус божескому виду столбец living_region"""
	global df
	regex = re.compile('( ?ОБЛАСТЬ.?)|(^А?ОБЛ. ?)|(^А?ОБЛ )|( А?ОБЛ.?$)|( ?КРАЙ.? ?)|( ?РЕСПУБЛИКА ?)|( ?РЕСП.? ?)|'
						'( ?АО ?)|( ?Р-Н ?)|(^Г. ?)|(^Г )|( Г.?$)|( АВТОНОМНАЯ ?)|( АВТОНОМНЫЙ )')
	df.living_region = df.living_region.str.replace(regex, '')

	unique = df.living_region.unique()

	regionNums = [i for i in range(1, len(unique) + 1)]
	df.living_region.replace(unique, regionNums, inplace=True)

In [25]:
def prepare_job_position():
	"""скрипт для приведения к плюс-минус божескому виду столбец job_position"""
	global df

	unique = df.job_position.unique()

	jobNums = [i for i in range(1, len(unique) + 1)]
	df.job_position.replace(unique, jobNums, inplace=True)

In [26]:
def prepare_education():
	"""скрипт для приведения к плюс-минус божескому виду столбец education"""
	global df

	unique = df.education.unique()

	jobNums = [i for i in range(1, len(unique) + 1)]
	df.education.replace(unique, jobNums, inplace=True)

In [56]:
df = pd.read_csv('credit.csv', ';', encoding='cp1251', decimal=',')

In [57]:
#  приведение таблицы к удобоваримому для нейросети виду
df.replace(np.nan, 0, inplace=True)
df.gender.replace(['M', 'F'], [1, 0], inplace=True)
df.replace({'marital_status': {'MAR': 1, 'UNM': 2, 'CIV': 3, 'DIV': 4, 'WID': 5}}, inplace=True)
prepare_living_region()
prepare_job_position()
prepare_education()

In [58]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             170746 non-null  int64  
 1   gender                170746 non-null  int64  
 2   age                   170746 non-null  float64
 3   marital_status        170746 non-null  int64  
 4   job_position          170746 non-null  int64  
 5   credit_sum            170746 non-null  float64
 6   credit_month          170746 non-null  int64  
 7   tariff_id             170746 non-null  object 
 8   score_shk             170746 non-null  float64
 9   education             170746 non-null  int64  
 10  living_region         170746 non-null  int64  
 11  monthly_income        170746 non-null  float64
 12  credit_count          170746 non-null  float64
 13  overdue_credit_count  170746 non-null  float64
 14  open_account_flg      170746 non-null  int64  
dtype

In [48]:
df.living_region = df.living_region.astype('category')

In [59]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             170746 non-null  int64  
 1   gender                170746 non-null  int64  
 2   age                   170746 non-null  float64
 3   marital_status        170746 non-null  int64  
 4   job_position          170746 non-null  int64  
 5   credit_sum            170746 non-null  float64
 6   credit_month          170746 non-null  int64  
 7   tariff_id             170746 non-null  object 
 8   score_shk             170746 non-null  float64
 9   education             170746 non-null  int64  
 10  living_region         170746 non-null  int64  
 11  monthly_income        170746 non-null  float64
 12  credit_count          170746 non-null  float64
 13  overdue_credit_count  170746 non-null  float64
 14  open_account_flg      170746 non-null  int64  
dtype

In [50]:
print(df.head())

   client_id  gender   age  marital_status  job_position  credit_sum  \
0          1       1   0.0               0             1    59998.00   
1          2       0   0.0               1             1    10889.00   
2          3       1  32.0               1             2    10728.00   
3          4       0  27.0               0             2    12009.09   
4          5       1  45.0               0             2        0.00   

   credit_month tariff_id  score_shk  education living_region  monthly_income  \
0            10       1.6   0.000000          1             1         30000.0   
1             6       1.1   0.000000          2             2             0.0   
2            12       1.1   0.000000          2             3             0.0   
3            12       1.1   0.000000          2             4             0.0   
4            10       1.1   0.421385          3             5             0.0   

   credit_count  overdue_credit_count  open_account_flg  
0           1.0       

In [54]:
#  запуск и работа нейронной сети
def start_prediction(fieldsOfInterest):
    x_train, x_test, y_train, y_test = \
        train_test_split(df[fieldsOfInterest],
        df.open_account_flg, test_size=0.3, random_state=0)

    st = StandardScaler()
    x_train = st.fit_transform(x_train)
    x_test = st.fit_transform(x_test)

    model = RandomForestClassifier()
    model.fit(x_train, y_train)

    pred = model.predict(x_test)
    print(metrics.accuracy_score(y_test, pred))
    print(metrics.mean_squared_error(y_test, pred))


In [60]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

x_train, x_test, y_train, y_test = \
        train_test_split(df[['gender', 'age', 'marital_status', 'credit_sum', 'credit_count', 'living_region']],
        df.open_account_flg, test_size=0.3, random_state=0)


model = ExtraTreesClassifier()
model.fit(x_test, y_test)
# display the relative importance of each attribute
print(model.feature_importances_)

[0.00444412 0.21988712 0.0240592  0.41511682 0.08612736 0.25036538]


In [53]:
start_prediction(['gender', 'age', 'marital_status', 'credit_sum', 'credit_count', 'living_region'])

0.8108699047321568
0.1891300952678432


AttributeError: 'StandardScaler' object has no attribute 'feature_importances_'

In [43]:
start_prediction(['gender'])

0.8266437607371545
0.17335623926284555


AttributeError: 'StandardScaler' object has no attribute 'feature_importances_'

In [21]:
start_prediction(['age'])

0.8265071060440419
0.17349289395595816


In [25]:
start_prediction(['marital_status'])

0.8266437607371545
0.17335623926284555


In [41]:
start_prediction(['client_id'])

0.7127908792753397
0.2872091207246603


In [40]:
start_prediction(['credit_sum'])

0.7217124785256911
0.2782875214743089


In [23]:
start_prediction(['credit_count'])

0.8266047165391223
0.1733952834608777


In [24]:
start_prediction(['living_region'])

0.8266828049351866
0.17331719506481336
