In [7]:
import numpy as np
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [21]:
def prepare_living_region():
	"""скрипт для приведения к плюс-минус божескому виду столбец living_region"""
	global df
	regex = re.compile('( ?ОБЛАСТЬ.?)|(^А?ОБЛ. ?)|(^А?ОБЛ )|( А?ОБЛ.?$)|( ?КРАЙ.? ?)|( ?РЕСПУБЛИКА ?)|( ?РЕСП.? ?)|'
						'( ?АО ?)|( ?Р-Н ?)|(^Г. ?)|(^Г )|( Г.?$)|( АВТОНОМНАЯ ?)|( АВТОНОМНЫЙ )')
	df.living_region = df.living_region.str.replace(regex, '')

	unique = df.living_region.unique()

	regionNums = [i for i in range(1, len(unique) + 1)]
	df.living_region.replace(unique, regionNums, inplace=True)

In [14]:
def prepare_job_position():
	"""скрипт для приведения к плюс-минус божескому виду столбец job_position"""
	global df

	unique = df.job_position.unique()

	jobNums = [i for i in range(1, len(unique) + 1)]
	df.job_position.replace(unique, jobNums, inplace=True)

In [15]:
def prepare_education():
	"""скрипт для приведения к плюс-минус божескому виду столбец education"""
	global df

	unique = df.education.unique()

	jobNums = [i for i in range(1, len(unique) + 1)]
	df.education.replace(unique, jobNums, inplace=True)

In [10]:
df = pd.read_csv('credit.csv', ';', encoding='cp1251', decimal=',')

In [16]:
#  приведение таблицы к удобоваримому для нейросети виду
df.replace(np.nan, 0, inplace=True)
df.gender.replace(['M', 'F'], [1, 0], inplace=True)
df.replace({'marital_status': {'MAR': 1, 'UNM': 2, 'CIV': 3, 'DIV': 4, 'WID': 5}}, inplace=True)
prepare_living_region()
prepare_job_position()
prepare_education()

In [17]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             170746 non-null  int64  
 1   gender                170746 non-null  int64  
 2   age                   170746 non-null  float64
 3   marital_status        170746 non-null  int64  
 4   job_position          170746 non-null  int64  
 5   credit_sum            170746 non-null  float64
 6   credit_month          170746 non-null  int64  
 7   tariff_id             170746 non-null  object 
 8   score_shk             170746 non-null  float64
 9   education             170746 non-null  int64  
 10  living_region         170746 non-null  int64  
 11  monthly_income        170746 non-null  float64
 12  credit_count          170746 non-null  float64
 13  overdue_credit_count  170746 non-null  float64
 14  open_account_flg      170746 non-null  int64  
dtype

In [19]:
df.living_region = df.living_region.astype('category')

In [20]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   client_id             170746 non-null  int64   
 1   gender                170746 non-null  int64   
 2   age                   170746 non-null  float64 
 3   marital_status        170746 non-null  int64   
 4   job_position          170746 non-null  int64   
 5   credit_sum            170746 non-null  float64 
 6   credit_month          170746 non-null  int64   
 7   tariff_id             170746 non-null  object  
 8   score_shk             170746 non-null  float64 
 9   education             170746 non-null  int64   
 10  living_region         170746 non-null  category
 11  monthly_income        170746 non-null  float64 
 12  credit_count          170746 non-null  float64 
 13  overdue_credit_count  170746 non-null  float64 
 14  open_account_flg      170746 non-nul

In [18]:
#  запуск и работа нейронной сети
def start_prediction(fieldsOfInterest):
    x_train, x_test, y_train, y_test = \
        train_test_split(df[fieldsOfInterest],
        df.open_account_flg, test_size=0.3, random_state=0)

    st = StandardScaler()
    x_train = st.fit_transform(x_train)
    x_test = st.fit_transform(x_test)

    model = RandomForestClassifier()
    model.fit(x_train, y_train)

    pred = model.predict(x_test)
    print(metrics.accuracy_score(y_test, pred))
    print(metrics.mean_squared_error(y_test, pred))

In [19]:
start_prediction(['gender', 'age', 'marital_status', 'credit_sum', 'credit_count', 'living_region'])

0.8097766671872559
0.19022333281274403


In [20]:
start_prediction(['gender'])

0.8266437607371545
0.17335623926284555


In [21]:
start_prediction(['age'])

0.8265071060440419
0.17349289395595816


In [25]:
start_prediction(['marital_status'])

0.8266437607371545
0.17335623926284555


In [22]:
start_prediction(['client_id'])

0.7129470560674683
0.2870529439325316


In [26]:
start_prediction(['credit_sum'])

0.7219662658129002
0.2780337341870998


In [23]:
start_prediction(['credit_count'])

0.8266047165391223
0.1733952834608777


In [24]:
start_prediction(['living_region'])

0.8266828049351866
0.17331719506481336
