In [1]:
!pip install pyspark findspark



In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

conf = SparkConf().set('spark.ui.port', '4050').set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')\
                  .set('spark.dynamicAllocation.enabled', 'true')\
                  .set('spark.shuffle.service.enabled', 'true') #трекер, чтобы возвращать ресурсы
sc = SparkContext(conf=conf)
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [3]:
from pyspark.sql.functions import col, isnan, when, count
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, IndexToString, OneHotEncoder
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [4]:
data = spark.read.csv('Train_Set_90621.csv', header=True, inferSchema=True)

In [5]:
data = data.drop('Amount Defaulted')

### Информация о датасете

In [6]:
data.printSchema()

root
 |-- Application ID: integer (nullable = true)
 |-- Bank Masked: string (nullable = true)
 |-- Bank Type: string (nullable = true)
 |-- Approved_Timestamp: string (nullable = true)
 |-- Name Masked: string (nullable = true)
 |-- Business Owner State: string (nullable = true)
 |-- Business_Industry_Type_Code: integer (nullable = true)
 |-- Approved_Year: integer (nullable = true)
 |-- New Business: integer (nullable = true)
 |-- Term: integer (nullable = true)
 |-- BankState: string (nullable = true)
 |-- Interest Rate: integer (nullable = true)
 |-- Employees: integer (nullable = true)
 |-- Gross Disbursed Amount: integer (nullable = true)
 |-- Term_years: integer (nullable = true)
 |-- Jobs Retained: integer (nullable = true)
 |-- Male to Female Employees Ratio: integer (nullable = true)
 |-- Expected Company Income: long (nullable = true)
 |-- Funds available with company: integer (nullable = true)
 |-- Gross_Apprv_Amount: integer (nullable = true)
 |-- Company Branch Code: inte

In [7]:
data.describe().show(truncate=False)

+-------+-----------------+---------------+---------+------------------+-------------------+--------------------+---------------------------+-----------------+-------------------+------------------+-----------+-----------------+------------------+----------------------+-----------------+------------------+------------------------------+-----------------------+----------------------------+--------------------+-------------------+------------------+------------------+--------------------+------------------+-----------------+--------------------+-------------------+
|summary|Application ID   |Bank Masked    |Bank Type|Approved_Timestamp|Name Masked        |Business Owner State|Business_Industry_Type_Code|Approved_Year    |New Business       |Term              |BankState  |Interest Rate    |Employees         |Gross Disbursed Amount|Term_years       |Jobs Retained     |Male to Female Employees Ratio|Expected Company Income|Funds available with company|Gross_Apprv_Amount  |Company Branch Cod

In [8]:
data.groupBy("Default_Status").count().show()

+--------------+------+
|Default_Status| count|
+--------------+------+
|             1| 35238|
|             0|159880|
+--------------+------+



In [9]:
data.select(col('Default_Status'))\
    .groupBy('Default_Status')\
    .count()\
    .withColumn('count', F.round(col('count') / data.count(), 2))\
    .show()

+--------------+-----+
|Default_Status|count|
+--------------+-----+
|             1| 0.18|
|             0| 0.82|
+--------------+-----+



In [10]:
numeric_columns = [col for col in data.columns if data.schema[col].dataType.typeName() in ['double', 'integer', 'long']]
for col in numeric_columns:
    correlation = data.stat.corr(col, "Default_Status")
    print(f"Корреляция {col} с Default_Status: {correlation}")

Корреляция Application ID с Default_Status: 0.0004897409913387303
Корреляция Business_Industry_Type_Code с Default_Status: 0.001954567609753214
Корреляция Approved_Year с Default_Status: 0.004711407986061488
Корреляция New Business с Default_Status: 0.010631943790759978
Корреляция Term с Default_Status: -0.2094716935506866
Корреляция Interest Rate с Default_Status: -0.00037157022236418983
Корреляция Employees с Default_Status: -0.023560912222720533
Корреляция Gross Disbursed Amount с Default_Status: -0.07374871112938776
Корреляция Term_years с Default_Status: -0.2087603424024912
Корреляция Jobs Retained с Default_Status: 0.00930699847415052
Корреляция Male to Female Employees Ratio с Default_Status: 0.013158289621686445
Корреляция Expected Company Income с Default_Status: -0.0021448440079916355
Корреляция Funds available with company с Default_Status: -0.002987595229433365
Корреляция Gross_Apprv_Amount с Default_Status: -0.07992792907580182
Корреляция Company Branch Code с Default_Stat

In [11]:
categorical_columns = [col for col in data.columns if data.schema[col].dataType.typeName() in ['string', 'boolean']]
for col in categorical_columns:
    print(f"\nТоп 5 значений для {col}:")
    data.groupBy(col).count().orderBy("count", ascending=False).show(5, truncate=False)


Топ 5 значений для Bank Masked:
+--------------+-----+
|Bank Masked   |count|
+--------------+-----+
|Bank_Name_42  |9133 |
|Bank_Name_3   |6688 |
|Bank_Number_13|6021 |
|Bank_Name_11  |4903 |
|Bank_Number_37|4436 |
+--------------+-----+
only showing top 5 rows


Топ 5 значений для Bank Type:
+---------+------+
|Bank Type|count |
+---------+------+
|Private  |130150|
|Govt     |64888 |
|NULL     |80    |
+---------+------+


Топ 5 значений для Approved_Timestamp:
+------------------+-----+
|Approved_Timestamp|count|
+------------------+-----+
|NULL              |426  |
|13-07-2009        |210  |
|15-09-2011        |157  |
|16-09-2011        |152  |
|31-05-2011        |150  |
+------------------+-----+
only showing top 5 rows


Топ 5 значений для Name Masked:
+------------------+-----+
|Name Masked       |count|
+------------------+-----+
|Business_Name_39  |91   |
|Business_Name_355 |30   |
|Business_Name_1807|28   |
|Business_Name_2404|28   |
|Business_Name_2602|27   |
+------------

In [12]:
print("\nСтатистики для числовых столбцов:")
for col in numeric_columns:
    # Enclose column name in backticks if it contains spaces
    stats = data.select(F.mean(col).alias("mean"),
                        F.expr(f"percentile_approx(`{col}`, 0.5)").alias("median"),  # Для медианы
                        F.stddev(col).alias("stddev")).collect()[0]
    print(f"Статистика для {col}:")
    print(f"  Среднее: {stats['mean']}")
    print(f"  Медиана: {stats['median']}")
    print(f"  Стандартное отклонение: {stats['stddev']}")


Статистики для числовых столбцов:
Статистика для Application ID:
  Среднее: 1020297559.5
  Медиана: 1020297541
  Стандартное отклонение: 56325.85924925306
Статистика для Business_Industry_Type_Code:
  Среднее: 399104.7059881713
  Медиана: 446541
  Стандартное отклонение: 263815.88967135985
Статистика для Approved_Year:
  Среднее: 2008.122751360715
  Медиана: 2009
  Стандартное отклонение: 6.256458086968017
Статистика для New Business:
  Среднее: 1.2793030623020063
  Медиана: 1
  Стандартное отклонение: 0.45111900679753353
Статистика для Term:
  Среднее: 110.73985998216465
  Медиана: 84
  Стандартное отклонение: 78.8517723449656
Статистика для Interest Rate:
  Среднее: 7.327914390266403
  Медиана: 7
  Стандартное отклонение: 2.97771377479778
Статистика для Employees:
  Среднее: 19.122981990385306
  Медиана: 7
  Стандартное отклонение: 130.03260689491435
Статистика для Gross Disbursed Amount:
  Среднее: 14144916.44937935
  Медиана: 7000000
  Стандартное отклонение: 20299306.581284165
Ст

### Заполнение пропусков

In [13]:
from pyspark.sql.functions import col

data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+--------------+-----------+---------+------------------+-----------+--------------------+---------------------------+-------------+------------+----+---------+-------------+---------+----------------------+----------+-------------+------------------------------+-----------------------+----------------------------+------------------+-------------------+-------------+--------------+--------------------+------------------+------------+-------------------+--------------+
|Application ID|Bank Masked|Bank Type|Approved_Timestamp|Name Masked|Business Owner State|Business_Industry_Type_Code|Approved_Year|New Business|Term|BankState|Interest Rate|Employees|Gross Disbursed Amount|Term_years|Jobs Retained|Male to Female Employees Ratio|Expected Company Income|Funds available with company|Gross_Apprv_Amount|Company Branch Code|City or Rural|Jobs Generated|Carry-forward Credit|Documents Provided|Balance Left|Final_Appved_Amount|Default_Status|
+--------------+-----------+---------+----------------

Так можно для всех столбцов, где не хватает данных:

In [14]:
data.select(col('Documents Provided')).groupBy('Documents Provided').count().show()

+------------------+------+
|Documents Provided| count|
+------------------+------+
|              NULL|   552|
|                 Y| 24545|
|                 N|170021|
+------------------+------+



In [15]:
data = data.fillna({'Bank Masked': 'Unknown', 'Bank Type': 'Unknown', 'Approved_Timestamp': 'Unknown',
                    'Name Masked': 'Unknown', 'Business Owner State': 'Unknown',
                    'BankState': 'Unknown', 'Carry-forward Credit': 'N', 'Documents Provided': 'N'})

In [16]:
imputer = Imputer(inputCols=["New Business", "Expected Company Income"],
                  outputCols=["New Business", "Expected Company Income"],
                  strategy='mean')
imputer = imputer.fit(data)
data = imputer.transform(data)

Проверяем, что все заполнено:

In [17]:
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+--------------+-----------+---------+------------------+-----------+--------------------+---------------------------+-------------+------------+----+---------+-------------+---------+----------------------+----------+-------------+------------------------------+-----------------------+----------------------------+------------------+-------------------+-------------+--------------+--------------------+------------------+------------+-------------------+--------------+
|Application ID|Bank Masked|Bank Type|Approved_Timestamp|Name Masked|Business Owner State|Business_Industry_Type_Code|Approved_Year|New Business|Term|BankState|Interest Rate|Employees|Gross Disbursed Amount|Term_years|Jobs Retained|Male to Female Employees Ratio|Expected Company Income|Funds available with company|Gross_Apprv_Amount|Company Branch Code|City or Rural|Jobs Generated|Carry-forward Credit|Documents Provided|Balance Left|Final_Appved_Amount|Default_Status|
+--------------+-----------+---------+----------------

### Обработка

In [18]:
bank_type_indexer = StringIndexer(inputCol="Bank Type", outputCol="Bank_Type_Index")
bank_type_indexer = bank_type_indexer.fit(data)
data = bank_type_indexer.transform(data)

business_owner_state_indexer = StringIndexer(inputCol="Business Owner State", outputCol="Business_Owner_State_Index")
business_owner_state_indexer = business_owner_state_indexer.fit(data)
data = business_owner_state_indexer.transform(data)

bank_state_indexer = StringIndexer(inputCol="BankState", outputCol="Bank_State_Index")
bank_state_indexer = bank_state_indexer.fit(data)
data = bank_state_indexer.transform(data)

carry_forward_credit_indexer = StringIndexer(inputCol="Carry-forward Credit", outputCol="Carry_Forward_Credit_Index")
carry_forward_credit_indexer = carry_forward_credit_indexer.fit(data)
data = carry_forward_credit_indexer.transform(data)

documents_provided_indexer = StringIndexer(inputCol="Documents Provided", outputCol="Documents_Provided_Index")
documents_provided_indexer = documents_provided_indexer.fit(data)
data = documents_provided_indexer.transform(data)

In [19]:
ohe_encoder = OneHotEncoder(inputCols=["Bank_Type_Index", "Business_Owner_State_Index", "Bank_State_Index"],
                        outputCols=["Bank_Type_Index_Vector", "Business_Owner_State_Vector", "Bank_State_Vector"])
ohe_encoder = ohe_encoder.fit(data)
data = ohe_encoder.transform(data)

In [20]:
employees_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Employees", outputCol="Employees_quant")
employees_discretizer = employees_discretizer.fit(data)
data = employees_discretizer.transform(data)

final_appved_amount_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Final_Appved_Amount", outputCol="Final_Appved_Amount_quant")
final_appved_amount_discretizer = final_appved_amount_discretizer.fit(data)
data = final_appved_amount_discretizer.transform(data)

approved_year_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Approved_Year", outputCol="Approved_Yeart_quant")
approved_year_discretizer = approved_year_discretizer.fit(data)
data = approved_year_discretizer.transform(data)

In [21]:
interest_rate_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Interest Rate", outputCol="Interest_Rate_quant")
interest_rate_discretizer = interest_rate_discretizer.fit(data)
data = interest_rate_discretizer.transform(data)

gross_disbursed_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Gross Disbursed Amount", outputCol="Gross_Disburse_Amount_quant")
gross_disbursed_discretizer = gross_disbursed_discretizer.fit(data)
data = gross_disbursed_discretizer.transform(data)

term_years_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Term_years", outputCol="Term_years_quant")
term_years_discretizer = term_years_discretizer.fit(data)
data = term_years_discretizer.transform(data)

male_female_ratio_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Male to Female Employees Ratio", outputCol="Male_to_Female_Employees_Ratio_quant")
male_female_ratio_discretizer = male_female_ratio_discretizer.fit(data)
data = male_female_ratio_discretizer.transform(data)

expected_income_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Expected Company Income", outputCol="Expected_Company_Income_quant")
expected_income_discretizer = expected_income_discretizer.fit(data)
data = expected_income_discretizer.transform(data)

funds_available_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Funds available with company", outputCol="Funds_available_with_company_quant")
funds_available_discretizer = funds_available_discretizer.fit(data)
data = funds_available_discretizer.transform(data)

gross_apprv_amount_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Gross_Apprv_Amount", outputCol="Gross_Apprv_Amount_quant")
gross_apprv_amount_discretizer = gross_apprv_amount_discretizer.fit(data)
data = gross_apprv_amount_discretizer.transform(data)

In [22]:
feature_columns = [
    'Business_Industry_Type_Code',
    'Approved_Year',
    'New Business',
    'Term',
    'Interest Rate',
    'Employees',
    'Gross Disbursed Amount',
    'Term_years',
    'Jobs Retained',
    'Male to Female Employees Ratio',
    'Expected Company Income',
    'Funds available with company',
    'Gross_Apprv_Amount',
    'Company Branch Code',
    'City or Rural',
    'Jobs Generated',
    'Balance Left',
    'Final_Appved_Amount',
    'Carry_Forward_Credit_Index',
    'Documents_Provided_Index',
    'Bank_Type_Index_Vector',
    'Business_Owner_State_Vector',
    'Bank_State_Vector',
    "Employees_quant",
    "Final_Appved_Amount_quant",
    "Approved_Yeart_quant",
    "Interest_Rate_quant",
    "Gross_Disburse_Amount_quant",
    "Term_years_quant",
    "Male_to_Female_Employees_Ratio_quant",
    "Expected_Company_Income_quant",
    "Funds_available_with_company_quant",
    "Gross_Apprv_Amount_quant"
]

Корреляции

In [23]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col, abs
from pyspark.sql.types import FloatType

all_columns = feature_columns + ['Default_Status']

df_va = VectorAssembler(inputCols=all_columns, outputCol='features')
data = df_va.transform(data)

data_vector = data.select('features')

correlation_matrix = Correlation.corr(data_vector, "features").collect()[0][0]

num_cols = len(all_columns)
correlation_df = spark.createDataFrame(
    [(all_columns[i], all_columns[j], float(correlation_matrix[i, j]))
     for i in range(num_cols)
     for j in range(num_cols)],
    ["feature1", "feature2", "correlation"]
).withColumn("correlation", col("correlation").cast(FloatType()))

default_status_correlations = correlation_df.filter(
    (col("feature1") == "Default_Status") & (col("feature2") != "Default_Status")
).select("feature2", "correlation").orderBy(abs(col("correlation")).desc())

default_status_correlations.show(n=default_status_correlations.count(), truncate=False)

+------------------------------------+-------------+
|feature2                            |correlation  |
+------------------------------------+-------------+
|Gross_Apprv_Amount_quant            |-0.5229817   |
|Final_Appved_Amount_quant           |0.46429187   |
|Bank_State_Vector                   |-0.21100944  |
|Employees_quant                     |-0.1320608   |
|Approved_Yeart_quant                |-0.10178113  |
|Male_to_Female_Employees_Ratio_quant|-0.041304994 |
|Interest_Rate_quant                 |-0.04095862  |
|Business_Industry_Type_Code         |0.03533191   |
|Expected_Company_Income_quant       |-0.026378855 |
|Funds_available_with_company_quant  |-0.02418197  |
|City or Rural                       |0.014333968  |
|Term_years_quant                    |-0.012812646 |
|Gross_Disburse_Amount_quant         |0.011264889  |
|Carry_Forward_Credit_Index          |0.008196422  |
|Jobs Retained                       |-0.0027649398|
|Jobs Generated                      |-0.00257

### Собираем модель (регрессия)

In [24]:
data2 = spark.read.csv('Train_Set_90621.csv', header=True, inferSchema=True)

In [25]:
data2 = data2.drop('Amount Defaulted')

In [26]:
data2 = data2.fillna({'Bank Masked': 'Unknown', 'Bank Type': 'Unknown', 'Approved_Timestamp': 'Unknown',
                    'Name Masked': 'Unknown', 'Business Owner State': 'Unknown',
                    'BankState': 'Unknown', 'Carry-forward Credit': 'N', 'Documents Provided': 'N'})

In [27]:
imputer = Imputer(inputCols=["New Business", "Expected Company Income"],
                  outputCols=["New Business", "Expected Company Income"],
                  strategy='mean')
imputer = imputer.fit(data2)
data2 = imputer.transform(data2)

In [28]:
bank_type_indexer = StringIndexer(inputCol="Bank Type", outputCol="Bank_Type_Index", handleInvalid="keep")
business_owner_state_indexer = StringIndexer(inputCol="Business Owner State", outputCol="Business_Owner_State_Index", handleInvalid="keep")
bank_state_indexer = StringIndexer(inputCol="BankState", outputCol="Bank_State_Index", handleInvalid="keep")
carry_forward_credit_indexer = StringIndexer(inputCol="Carry-forward Credit", outputCol="Carry_Forward_Credit_Index", handleInvalid="keep")
documents_provided_indexer = StringIndexer(inputCol="Documents Provided", outputCol="Documents_Provided_Index", handleInvalid="keep")

ohe_encoder = OneHotEncoder(inputCols=["Bank_Type_Index", "Business_Owner_State_Index", "Bank_State_Index"],
                        outputCols=["Bank_Type_Index_Vector", "Business_Owner_State_Vector", "Bank_State_Vector"])

employees_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Employees", outputCol="Employees_quant")
final_appved_amount_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Final_Appved_Amount", outputCol="Final_Appved_Amount_quant")
approved_year_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Approved_Year", outputCol="Approved_Year_quant")

interest_rate_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Interest Rate", outputCol="Interest_Rate_quant")
gross_disbursed_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Gross Disbursed Amount", outputCol="Gross_Disburse_Amount_quant")
term_years_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Term_years", outputCol="Term_years_quant")
male_female_ratio_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Male to Female Employees Ratio", outputCol="Male_to_Female_Employees_Ratio_quant")
expected_income_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Expected Company Income", outputCol="Expected_Company_Income_quant")
funds_available_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Funds available with company", outputCol="Funds_available_with_company_quant")
gross_apprv_amount_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Gross_Apprv_Amount", outputCol="Gross_Apprv_Amount_quant")

feature_columns = [
    'Business_Industry_Type_Code',
    'Approved_Year',
    'New Business',
    'Term',
    'Interest Rate',
    'Employees',
    'Gross Disbursed Amount',
    'Term_years',
    'Jobs Retained',
    'Male to Female Employees Ratio',
    'Expected Company Income',
    'Funds available with company',
    'Gross_Apprv_Amount',
    'Company Branch Code',
    'City or Rural',
    'Jobs Generated',
    'Balance Left',
    'Final_Appved_Amount',
    'Carry_Forward_Credit_Index',
    'Documents_Provided_Index',
    'Bank_Type_Index_Vector',
    'Business_Owner_State_Vector',
    'Bank_State_Vector',
    "Employees_quant",
    "Final_Appved_Amount_quant",
    "Approved_Year_quant",
    "Interest_Rate_quant",
    "Gross_Disburse_Amount_quant",
    "Term_years_quant",
    "Male_to_Female_Employees_Ratio_quant",
    "Expected_Company_Income_quant",
    "Funds_available_with_company_quant",
    "Gross_Apprv_Amount_quant"
]

df_va = VectorAssembler(inputCols = feature_columns, outputCol = 'features')

lr = LogisticRegression(featuresCol='features', labelCol='Default_Status', predictionCol='prediction',
                        maxIter=100, probabilityCol='proba')

In [29]:
pipeline_reg = Pipeline(stages=[
    bank_type_indexer,
    business_owner_state_indexer,
    bank_state_indexer,
    carry_forward_credit_indexer,
    documents_provided_indexer,
    ohe_encoder,
    employees_discretizer,
    final_appved_amount_discretizer,
    approved_year_discretizer,
    interest_rate_discretizer,
    gross_disbursed_discretizer,
    term_years_discretizer,
    male_female_ratio_discretizer,
    expected_income_discretizer,
    funds_available_discretizer,
    gross_apprv_amount_discretizer,
    df_va,
    lr
])

Такой перебор слишком большой, поэтому я поперебирал их отдельно. Дальше будут взяты лучшие параметры.

In [30]:
'''
paramGrid_reg = ParamGridBuilder() \
    .addGrid(employees_discretizer.numBuckets, [5, 8]) \
    .addGrid(final_appved_amount_discretizer.numBuckets, [10, 15]) \
    .addGrid(approved_year_discretizer.numBuckets, [5, 8]) \
    .addGrid(interest_rate_discretizer.numBuckets, [5, 8]) \
    .addGrid(gross_disbursed_discretizer.numBuckets, [10, 15]) \
    .addGrid(term_years_discretizer.numBuckets, [5, 8]) \
    .addGrid(male_female_ratio_discretizer.numBuckets, [5, 8]) \
    .addGrid(expected_income_discretizer.numBuckets, [10, 15]) \
    .addGrid(funds_available_discretizer.numBuckets, [10, 15]) \
    .addGrid(gross_apprv_amount_discretizer.numBuckets, [10, 15]) \
    .addGrid(lr.maxIter, [20, 40]) \
    .build()
'''

'\nparamGrid_reg = ParamGridBuilder()     .addGrid(employees_discretizer.numBuckets, [5, 8])     .addGrid(final_appved_amount_discretizer.numBuckets, [10, 15])     .addGrid(approved_year_discretizer.numBuckets, [5, 8])     .addGrid(interest_rate_discretizer.numBuckets, [5, 8])     .addGrid(gross_disbursed_discretizer.numBuckets, [10, 15])     .addGrid(term_years_discretizer.numBuckets, [5, 8])     .addGrid(male_female_ratio_discretizer.numBuckets, [5, 8])     .addGrid(expected_income_discretizer.numBuckets, [10, 15])     .addGrid(funds_available_discretizer.numBuckets, [10, 15])     .addGrid(gross_apprv_amount_discretizer.numBuckets, [10, 15])     .addGrid(lr.maxIter, [20, 40])     .build()\n'

Лучшие параметры:

In [31]:
paramGrid_reg = ParamGridBuilder() \
    .addGrid(employees_discretizer.numBuckets, [5]) \
    .addGrid(final_appved_amount_discretizer.numBuckets, [15]) \
    .addGrid(approved_year_discretizer.numBuckets, [5]) \
    .addGrid(interest_rate_discretizer.numBuckets, [5]) \
    .addGrid(gross_disbursed_discretizer.numBuckets, [15]) \
    .addGrid(term_years_discretizer.numBuckets, [8]) \
    .addGrid(male_female_ratio_discretizer.numBuckets, [5]) \
    .addGrid(expected_income_discretizer.numBuckets, [10]) \
    .addGrid(funds_available_discretizer.numBuckets, [15]) \
    .addGrid(gross_apprv_amount_discretizer.numBuckets, [15]) \
    .addGrid(lr.maxIter, [40]) \
    .build()

In [32]:
train_reg, test_reg = data2.randomSplit([0.7, 0.3], seed=7)

In [33]:
crossval_reg = CrossValidator(estimator=pipeline_reg,
                          estimatorParamMaps=paramGrid_reg,
                          evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                                                  labelCol='Default_Status', metricName='areaUnderROC'),
                          numFolds=2,
                          parallelism=2)

In [34]:
cvModel_reg = crossval_reg.fit(train_reg)

In [35]:
cvModel_reg.avgMetrics

[0.7402074513976986]

In [37]:
test_pred_reg = cvModel_reg.transform(test_reg)

In [38]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='Default_Status', metricName='areaUnderROC')

In [39]:
evaluator.evaluate(test_pred_reg)

0.7399065958049806

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="Default_Status",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(test_pred_reg)

print(f"Точность модели: {accuracy:.4f}")

Точность модели: 0.8220


In [None]:
cvModel_reg.write().overwrite().save('model_reg')

### Собираем модель (дерево)

In [43]:
from pyspark.ml.classification import RandomForestClassifier

In [44]:
bank_type_indexer = StringIndexer(inputCol="Bank Type", outputCol="Bank_Type_Index", handleInvalid="keep")
business_owner_state_indexer = StringIndexer(inputCol="Business Owner State", outputCol="Business_Owner_State_Index", handleInvalid="keep")
bank_state_indexer = StringIndexer(inputCol="BankState", outputCol="Bank_State_Index", handleInvalid="keep")
carry_forward_credit_indexer = StringIndexer(inputCol="Carry-forward Credit", outputCol="Carry_Forward_Credit_Index", handleInvalid="keep")
documents_provided_indexer = StringIndexer(inputCol="Documents Provided", outputCol="Documents_Provided_Index", handleInvalid="keep")

ohe_encoder = OneHotEncoder(inputCols=["Bank_Type_Index", "Business_Owner_State_Index", "Bank_State_Index"],
                        outputCols=["Bank_Type_Index_Vector", "Business_Owner_State_Vector", "Bank_State_Vector"])

employees_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Employees", outputCol="Employees_quant")
final_appved_amount_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Final_Appved_Amount", outputCol="Final_Appved_Amount_quant")
approved_year_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Approved_Year", outputCol="Approved_Year_quant")

interest_rate_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Interest Rate", outputCol="Interest_Rate_quant")
gross_disbursed_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Gross Disbursed Amount", outputCol="Gross_Disburse_Amount_quant")
term_years_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Term_years", outputCol="Term_years_quant")
male_female_ratio_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Male to Female Employees Ratio", outputCol="Male_to_Female_Employees_Ratio_quant")
expected_income_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Expected Company Income", outputCol="Expected_Company_Income_quant")
funds_available_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Funds available with company", outputCol="Funds_available_with_company_quant")
gross_apprv_amount_discretizer = QuantileDiscretizer(numBuckets=5, inputCol="Gross_Apprv_Amount", outputCol="Gross_Apprv_Amount_quant")

feature_columns = [
    'Business_Industry_Type_Code',
    'Approved_Year',
    'New Business',
    'Term',
    'Interest Rate',
    'Employees',
    'Gross Disbursed Amount',
    'Term_years',
    'Jobs Retained',
    'Male to Female Employees Ratio',
    'Expected Company Income',
    'Funds available with company',
    'Gross_Apprv_Amount',
    'Company Branch Code',
    'City or Rural',
    'Jobs Generated',
    'Balance Left',
    'Final_Appved_Amount',
    'Carry_Forward_Credit_Index',
    'Documents_Provided_Index',
    'Bank_Type_Index_Vector',
    'Business_Owner_State_Vector',
    'Bank_State_Vector',
    "Employees_quant",
    "Final_Appved_Amount_quant",
    "Approved_Year_quant",
    "Interest_Rate_quant",
    "Gross_Disburse_Amount_quant",
    "Term_years_quant",
    "Male_to_Female_Employees_Ratio_quant",
    "Expected_Company_Income_quant",
    "Funds_available_with_company_quant",
    "Gross_Apprv_Amount_quant"
]

df_va = VectorAssembler(inputCols = feature_columns, outputCol = 'features')

rf = RandomForestClassifier(labelCol="Default_Status", featuresCol="features", numTrees=100)

In [45]:
pipeline_tree = Pipeline(stages=[
    bank_type_indexer,
    business_owner_state_indexer,
    bank_state_indexer,
    carry_forward_credit_indexer,
    documents_provided_indexer,
    ohe_encoder,
    employees_discretizer,
    final_appved_amount_discretizer,
    approved_year_discretizer,
    interest_rate_discretizer,
    gross_disbursed_discretizer,
    term_years_discretizer,
    male_female_ratio_discretizer,
    expected_income_discretizer,
    funds_available_discretizer,
    gross_apprv_amount_discretizer,
    df_va,
    rf
])

In [46]:
paramGrid_tree = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [5, 10, 12]) \
    .addGrid(rf.minInstancesPerNode, [1, 2, 3]) \
    .build()

In [47]:
train, test = data2.randomSplit([0.7, 0.3], seed=7)

In [48]:
crossval = CrossValidator(estimator=pipeline_tree,
                          estimatorParamMaps=paramGrid_tree,
                          evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                                                  labelCol='Default_Status', metricName='areaUnderROC'),
                          numFolds=2,
                          parallelism=4)

In [49]:
cvModel_tree = crossval.fit(train)

In [50]:
cvModel_tree.avgMetrics

[0.7876230693040538,
 0.7872313760448878,
 0.7858826344046697,
 0.8419280144399564,
 0.8413445349431989,
 0.8421603215094066,
 0.8510171714897595,
 0.8514665222018192,
 0.8520091746817198]

In [51]:
import numpy as np
print(cvModel_tree.getEstimatorParamMaps()[np.argmax(cvModel_tree.avgMetrics)])

{Param(parent='RandomForestClassifier_a65788666545', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 12, Param(parent='RandomForestClassifier_a65788666545', name='minInstancesPerNode', doc='Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.'): 3}


In [52]:
test_pred_tree = cvModel_tree.transform(test)

In [53]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='Default_Status', metricName='areaUnderROC')

In [54]:
evaluator.evaluate(test_pred_tree)

0.8517943334901616

In [55]:
best_model = cvModel_tree.bestModel

print('Важность признаков:')
feature_importance = best_model.stages[-1].featureImportances
feature_names = df_va.getInputCols()
for name, importance in sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{name}: {importance}")

Важность признаков:
Term: 0.255360057321469
Term_years: 0.14236091184992122
Expected Company Income: 0.05103578126603084
Company Branch Code: 0.04239873078119127
City or Rural: 0.03340479566299452
Male to Female Employees Ratio: 0.029265956393561478
Jobs Retained: 0.028768406444694184
Final_Appved_Amount: 0.028005180142221427
Gross_Apprv_Amount: 0.021520633046525
Employees: 0.021401842857952276


In [56]:
cvModel_tree.write().save('model_tree')