# Кредитный скоринг
При принятии решения о выдаче кредита или займа учитывается т.н. «Кредитный скоринг» — рейтинг платежеспособности клиента. ИИ на основе модели, которую просчитывает машинное обучение — в ней много параметров — возраст, зарплата, кредитная история, наличие недвижимости, автомобиля, судимости и других признаков, после обработки которых выносится положительное или отрицательное решение

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=19369e68812c5403915497700ac5bc83f9d5999c7b7b5557de59a0101941936f
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:

from pyspark.sql import SparkSession


spark = SparkSession.builder\
        .master("local[*]")\
        .appName("Colab_pyspark")\
        .config('spark.ui.port', '4050')\
        .config('spark.executor.memory', '3g')\
        .getOrCreate()
        # .config('spark.sql.execution.arrow.enabled', 'true')\
        # .config('spark."Broadcastsizetable"', '-1')\
        # .config('preferSortHashJoin', 'true')\

# Данные:
[скачать](https://drive.google.com/file/d/1MuAyZiIm3b_r-AgQSj78tsRPqZpvv_2s/view?usp=sharing)

**application_record.csv**
*   Feature name	Explanation	Remarks
*   ID	Client number	
*   CODE_GENDER	Gender	
*   FLAG_OWN_CAR	Is there a car	
*   FLAG_OWN_REALTY	Is there a property	
*   CNT_CHILDREN	Number of children	
*   AMT_INCOME_TOTAL	Annual income	
*   NAME_INCOME_TYPE	Income category	
*   NAME_EDUCATION_TYPE	Education level	
*   NAME_FAMILY_STATUS	Marital status	
*   NAME_HOUSING_TYPE	Way of living	
*   DAYS_BIRTH	Birthday	Count backwards from current day (0), -1 means yesterday
*   DAYS_EMPLOYED	Start date of employment	Count backwards from current day(0). If positive, it means the person currently unemployed.
FLAG_MOBIL	Is there a mobile phone	
*   FLAG_WORK_PHONE	Is there a work phone	
*   FLAG_PHONE	Is there a phone	
*   FLAG_EMAIL	Is there an email	
*   OCCUPATION_TYPE	Occupation	
*   CNT_FAM_MEMBERS	Family size	

**credit_record.csv**
*   Feature name	Explanation	Remarks
*   ID	Client number	
*   MONTHS_BALANCE	Record month	The month of the extracted data is the starting point, backwards, 0 is the current month, -1 is the previous month, and so on
*   STATUS	Status	
   *   0: 1-29 days past due
   *   1: 30-59 days past due 
   *   2: 60-89 days overdue 
   *   3: 90-119 days overdue 
   *   4: 120-149 days overdue 
    *   5: Overdue or bad debts, write-offs for more than 150 days
    *   C: paid off that month X: No loan for the month


## Считываем данные

In [3]:
data = spark.read.csv('/content/application_record.csv', header=True, inferSchema=True)
record = spark.read.csv('/content/credit_record.csv', header=True, inferSchema=True)

In [4]:
data.describe()

DataFrame[summary: string, ID: string, CODE_GENDER: string, FLAG_OWN_CAR: string, FLAG_OWN_REALTY: string, CNT_CHILDREN: string, AMT_INCOME_TOTAL: string, NAME_INCOME_TYPE: string, NAME_EDUCATION_TYPE: string, NAME_FAMILY_STATUS: string, NAME_HOUSING_TYPE: string, DAYS_BIRTH: string, DAYS_EMPLOYED: string, FLAG_MOBIL: string, FLAG_WORK_PHONE: string, FLAG_PHONE: string, FLAG_EMAIL: string, OCCUPATION_TYPE: string, CNT_FAM_MEMBERS: string]

In [5]:
for row in data.schema:
  print(row)

StructField('ID', IntegerType(), True)
StructField('CODE_GENDER', StringType(), True)
StructField('FLAG_OWN_CAR', StringType(), True)
StructField('FLAG_OWN_REALTY', StringType(), True)
StructField('CNT_CHILDREN', IntegerType(), True)
StructField('AMT_INCOME_TOTAL', DoubleType(), True)
StructField('NAME_INCOME_TYPE', StringType(), True)
StructField('NAME_EDUCATION_TYPE', StringType(), True)
StructField('NAME_FAMILY_STATUS', StringType(), True)
StructField('NAME_HOUSING_TYPE', StringType(), True)
StructField('DAYS_BIRTH', IntegerType(), True)
StructField('DAYS_EMPLOYED', IntegerType(), True)
StructField('FLAG_MOBIL', IntegerType(), True)
StructField('FLAG_WORK_PHONE', IntegerType(), True)
StructField('FLAG_PHONE', IntegerType(), True)
StructField('FLAG_EMAIL', IntegerType(), True)
StructField('OCCUPATION_TYPE', StringType(), True)
StructField('CNT_FAM_MEMBERS', DoubleType(), True)


In [6]:
record.show(5)

+-------+--------------+------+
|     ID|MONTHS_BALANCE|STATUS|
+-------+--------------+------+
|5001711|             0|     X|
|5001711|            -1|     0|
|5001711|            -2|     0|
|5001711|            -3|     0|
|5001712|             0|     C|
+-------+--------------+------+
only showing top 5 rows



In [7]:
for row in record.schema:
  print(row)

StructField('ID', IntegerType(), True)
StructField('MONTHS_BALANCE', IntegerType(), True)
StructField('STATUS', StringType(), True)


In [8]:
from pyspark.sql.functions import col, when

In [9]:
begin_month = record.groupby(["ID"]).min('MONTHS_BALANCE').withColumn('begin_month', col('min(MONTHS_BALANCE)')* -1).drop('min(MONTHS_BALANCE)')

In [10]:
new_data = data.join(begin_month, ['ID'], 'left')

In [11]:
cpunt = record.withColumn('dep_value', when(record['STATUS'] == '2', '1')\
                                .when(record['STATUS'] == '3', '1')\
                                .when(record['STATUS'] == '4', '1')\
                                .when(record['STATUS'] == '5', '1')\
                                .otherwise(None))

In [12]:
cpunt = cpunt.withColumn('dep_value', cpunt['dep_value'].cast('int')).groupby('ID').sum('dep_value')

In [13]:
cpunt = cpunt.withColumn('target', when(cpunt['sum(dep_value)'] > 0, 1).otherwise(0)).drop('sum(dep_value)')

In [14]:
new_data = new_data.join(cpunt, ['ID'], 'inner')

In [15]:
#  В итоге к анкетным данным мы добавили таргет
new_data.groupby('target').count().show()

+------+-----+
|target|count|
+------+-----+
|     1|  616|
|     0|35841|
+------+-----+



In [16]:
# Упростим себе задачу и оставим только часть признаков
features = ['AMT_INCOME_TOTAL', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN']	
target = ['target',]
dataset = new_data[features + target]

In [17]:
dataset.withColumn('target', dataset['target'].cast('int'))

DataFrame[AMT_INCOME_TOTAL: double, CODE_GENDER: string, FLAG_OWN_CAR: string, FLAG_OWN_REALTY: string, CNT_CHILDREN: int, target: int]

У нас есть выборка, где указаны параметры клиента, и вышел ли он на просрочку или нет.

In [18]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

In [19]:
text_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

stri = StringIndexer(inputCols=text_columns, outputCols=[column+'_stri' for column in text_columns])
ohe = OneHotEncoder(inputCols=stri.getOutputCols(), outputCols=[column+'_ohe' for column in text_columns])
pipe = Pipeline(stages=[
    stri,
    ohe
])
total_new_data = pipe.fit(new_data).transform(new_data)

In [20]:
# total_new_data[[x + '_ohe' for x in text_columns] + text_columns].show(15)
total_new_data.show(15)

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+-----------+------+----------------+-----------------+--------------------+---------------+----------------+-------------------+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|begin_month|target|CODE_GENDER_stri|FLAG_OWN_CAR_stri|FLAG_OWN_REALTY_stri|CODE_GENDER_ohe|FLAG_OWN_CAR_ohe|FLAG_OWN_REALTY_ohe|
+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+-----------

In [21]:
from pyspark.ml.feature import MinMaxScaler

In [23]:
from pyspark.ml.classification import LogisticRegression


from pyspark.ml.feature import VectorAssembler

# text_columns = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
text_columns = []
columns_to_scale = ['AMT_INCOME_TOTAL', 'CNT_CHILDREN']

stri = StringIndexer(inputCols=text_columns, outputCols=[column+'_stri' for column in text_columns])
ohe = OneHotEncoder(inputCols=stri.getOutputCols(), outputCols=[column+'_ohe' for column in text_columns])

assemblers = [VectorAssembler(inputCols=[col], outputCol=col + "_vec") for col in columns_to_scale]
scalers = [MinMaxScaler(inputCol=col + "_vec", outputCol=col + "_scaled") for col in columns_to_scale]
lr = LogisticRegression(featuresCol='learn_vector', labelCol='target')

main_assembler = VectorAssembler(inputCols=ohe.getOutputCols() + [scaler.getOutputCol() for scaler in scalers], outputCol='learn_vector')

pipeline = Pipeline(stages=[stri, ohe] + assemblers + scalers + [main_assembler, lr])

#  Модель

In [24]:
train_data, test_data = new_data.randomSplit([0.7, 0.3], seed=42)
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)
results.show()

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+-----------+------+--------------------+----------------+-----------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|begin_month|target|AMT_INCOME_TOTAL_vec|CNT_CHILDREN_vec|AMT_INCOME_TOTAL_scaled| CNT_CHILDREN_scaled|        learn_vector|       rawPrediction|         probability|prediction|
+-------+-----------+------------+---------------+------------+----------------+--------------------+-----------

In [25]:
results.groupby('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|10839|
+----------+-----+



In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

res = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='target')
  
# Evaluating the AUC on results
ROC_AUC = res.evaluate(results)
ROC_AUC

0.5