In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.pandas as ps
import pandas as pd
import numpy as np


# Build the session FIRST. SparkSession.builder.getOrCreate() only passes the
# core (non-SQL) options below -- memory, cores, off-heap, default parallelism --
# to Spark when it is the one creating the SparkContext. If a SparkContext
# already exists (as happened when SparkContext.getOrCreate() ran before the
# builder), it is reused as-is and those options are not applied to it.
# NOTE: after changing any of these values, restart the kernel so that a fresh
# context is created with them.
spark = (
    SparkSession.builder
    .appName('credit_score')
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.driver.host", "localhost")
    .config("spark.executor.memory", "10g")
    .config("spark.driver.memory", "10g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "2g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.cores", "4")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.default.parallelism", "200")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .getOrCreate()
)

# Reuse the context owned by the session instead of creating one separately.
sc = spark.sparkContext

# Only report ERROR-level log messages (needed because of the Apache Spark
# configuration on the author's machine). `sc` and `spark.sparkContext` are the
# same object, so a single call is enough.
sc.setLogLevel("ERROR")

In [26]:
# Read installments_payments.csv and merged_bureau.csv as Spark DataFrames.
# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer the column types from the data.
installments_p = spark.read.csv('datasets/installments_payments.csv', inferSchema=True, header=True)
merged_b = spark.read.csv('datasets/merged_bureau.csv', inferSchema=True, header=True)
# Disabled Spark reads for the remaining tables.
# NOTE(review): another cell binds several of these names (prev_app, POS_CASH_b,
# bureau_b, app_train, merged_app, bur, sample_subm) with pd.read_csv -- confirm
# which engine each name is expected to hold before re-enabling any of them.
# prev_app = spark.read.csv('datasets/previous_application.csv', inferSchema=True, header=True)
# POS_CASH_b = spark.read.csv('datasets/POS_CASH_balance.csv', inferSchema=True, header=True)
# bureau_b = spark.read.csv('datasets/bureau_balance.csv', inferSchema=True, header=True)
# app_train = spark.read.csv('datasets/application_train.csv', inferSchema=True, header=True)
# merged_app = spark.read.csv('datasets/merged_application.csv', inferSchema=True, header=True)
# bur = spark.read.csv('datasets/bureau.csv', inferSchema=True, header=True)
# sample_subm = spark.read.csv('datasets/sample_submission.csv', inferSchema=True, header=True)




                                                                                

application_train.csv:
* Este é o arquivo principal, dividido em dois arquivos para Treinamento (com TARGET) e Teste (sem TARGET).
* Dados estáticos para todas as aplicações. Uma linha representa um empréstimo em nossa amostra de dados.

bureau.csv:
* Todos os créditos anteriores do cliente fornecidos por outras instituições financeiras que foram relatados ao Birô de Crédito (para clientes que têm um empréstimo em nossa amostra).
* Para cada empréstimo em nossa amostra, há tantas linhas quantos foram os créditos que o cliente teve no Birô de Crédito antes da data da aplicação.

bureau_balance.csv:
* Saldos mensais de créditos anteriores no Birô de Crédito.
* Esta tabela tem uma linha para cada mês de histórico de cada crédito anterior relatado ao Birô de Crédito - ou seja, a tabela tem (# empréstimos na amostra * # de créditos anteriores relativos * # de meses em que temos algum histórico observável para os créditos anteriores) linhas.

POS_CASH_balance.csv:
* Retratos mensais (snapshots) do saldo de empréstimos POS (ponto de venda) anteriores e de empréstimos em dinheiro que o requerente teve com a Home Credit.
* Esta tabela tem uma linha para cada mês de histórico de cada crédito anterior na Home Credit (crédito ao consumidor e empréstimos em dinheiro) relacionado aos empréstimos em nossa amostra - ou seja, a tabela tem (# empréstimos na amostra * # de créditos anteriores relativos * # de meses em que temos algum histórico observável para os créditos anteriores) linhas.

credit_card_balance.csv:
* Retratos mensais (snapshots) do saldo de cartões de crédito anteriores que o requerente tem com a Home Credit.
* Esta tabela tem uma linha para cada mês de histórico de cada crédito anterior na Home Credit (crédito ao consumidor e empréstimos em dinheiro) relacionado aos empréstimos em nossa amostra - ou seja, a tabela tem (# empréstimos na amostra * # de cartões de crédito anteriores relativos * # de meses em que temos algum histórico observável para os cartões de crédito anteriores) linhas.

previous_application.csv:
* Todas as aplicações anteriores para empréstimos Home Credit de clientes que têm empréstimos em nossa amostra de dados.
* Há uma linha para cada aplicação anterior relacionada aos empréstimos em nossa amostra de dados.

installments_payments.csv:
* Histórico de pagamentos para os créditos previamente liberados na Home Credit relacionados aos empréstimos em nossa amostra de dados.
* Há uma linha para cada pagamento realizado mais uma linha para cada pagamento perdido. Uma linha é equivalente a um pagamento de uma prestação OU uma prestação correspondente a um pagamento de um crédito anterior da Home Credit relacionado aos empréstimos em nossa amostra de dados.

In [7]:
# Load the tables below into memory as pandas DataFrames. All files are decoded
# as ISO-8859-1 (Latin-1).
# NOTE(review): POS_CASH_b is used with the Spark API (select / fillna with
# pyspark functions) in other cells -- confirm whether it should be a pandas or
# a Spark DataFrame.
cc_balance = pd.read_csv('datasets/credit_card_balance.csv', encoding='ISO-8859-1')
prev_app = pd.read_csv('datasets/previous_application.csv', encoding='ISO-8859-1')
POS_CASH_b = pd.read_csv('datasets/POS_CASH_balance.csv', encoding='ISO-8859-1')
bureau_b = pd.read_csv('datasets/bureau_balance.csv', encoding='ISO-8859-1')
app_train = pd.read_csv('datasets/application_train.csv', encoding='ISO-8859-1')
merged_app = pd.read_csv('datasets/merged_application.csv', encoding='ISO-8859-1')
bur = pd.read_csv('datasets/bureau.csv', encoding='ISO-8859-1')
sample_subm = pd.read_csv('datasets/sample_submission.csv', encoding='ISO-8859-1')
# Data dictionary: one row per (table, column) with its description.
home_cred = pd.read_csv('datasets/HomeCredit_columns_description.csv', encoding='ISO-8859-1')

In [8]:
# Descriptive aliases for the loaded tables. Each assignment binds a second name
# to the same object; no data is copied, so in-place changes made through one
# name are visible through the other.
application_train = app_train
bureau_balance = bureau_b
bureau = bur
credit_card_balance = cc_balance
home_cred_col_descrip = home_cred
# Disabled: pandas-on-Spark view of the Spark DataFrame installments_p.
# installments_payments = installments_p.pandas_api()
merged_application = merged_app
# Disabled: pandas-on-Spark view of the Spark DataFrame merged_b.
# merged_bureau = merged_b.pandas_api()
POS_CASH_balance = POS_CASH_b
previous_app = prev_app
sample_submission = sample_subm

Comecemos aplicando uma função que remove as colunas duplicadas entre os dataframes, para em seguida fazermos o join de todos eles.

In [35]:
def remove_duplicated_columns(df1, df2, key='SK_ID_CURR'):
    """Inner-join ``df2`` onto ``df1`` after dropping ``df2``'s repeated columns.

    Every column of ``df2`` whose name also exists in ``df1`` is dropped from
    ``df2`` (the join key itself is kept), so the joined result has no
    duplicated column names. Neither input is modified.

    Both frames must come from the same engine:

    * pandas / pandas-on-Spark frames (anything exposing ``merge``) are joined
      with ``merge(on=key, how='inner')``;
    * Spark SQL DataFrames are joined with ``join(on=key, how='inner')``.

    Parameters
    ----------
    df1 : DataFrame
        Left table; all of its columns are kept.
    df2 : DataFrame
        Right table; only ``key`` and the columns missing from ``df1`` are kept.
    key : str, default 'SK_ID_CURR'
        Name of the join column, present in both tables.

    Returns
    -------
    DataFrame
        Inner join of ``df1`` with the reduced ``df2`` on ``key``.
    """
    # The previous version compared an undefined name (`column`) instead of the
    # loop variable, so it raised NameError or used an unrelated global.
    left_columns = set(df1.columns)
    repeated = [name for name in df2.columns
                if name in left_columns and name != key]

    if hasattr(df1, 'merge'):
        # pandas API: `drop` needs `columns=` (a positional argument would be
        # interpreted as row labels) and `merge` joins on the key *column*.
        return df1.merge(df2.drop(columns=repeated), on=key, how='inner')

    # Spark SQL API: `drop` takes the column names as positional arguments.
    if repeated:
        df2 = df2.drop(*repeated)
    return df1.join(df2, on=key, how='inner')

# Fold the tables into one frame, one join at a time; each step joins the
# accumulated result with the next table through remove_duplicated_columns.
# NOTE(review): merged_b is created with spark.read.csv while app_train,
# prev_app, POS_CASH_b, merged_app and bur are created with pd.read_csv in this
# notebook -- confirm that all inputs use the same DataFrame engine.
a = remove_duplicated_columns(app_train, merged_b)
b = remove_duplicated_columns(a, prev_app)
c = remove_duplicated_columns(b, POS_CASH_b)
d = remove_duplicated_columns(c, merged_app)
df = remove_duplicated_columns(d, bur)

In [28]:
# Chain of inner joins on SK_ID_CURR, starting from app_train. The chain is
# wrapped in parentheses so no line-continuation backslashes are needed.
df = (
    app_train
    .join(installments_p, on='SK_ID_CURR', how='inner')
    .join(merged_b, on='SK_ID_CURR', how='inner')
    .join(prev_app, on='SK_ID_CURR', how='inner')
    .join(POS_CASH_b, on='SK_ID_CURR', how='inner')
    .join(merged_app, on='SK_ID_CURR', how='inner')
    .join(bur, on='SK_ID_CURR', how='inner')
    # Disabled step:
    # .drop(merged_app['TARGET'])
)


In [10]:
# Number of columns in `df`.
# NOTE(review): the saved output of this cell is a NameError, i.e. it was last
# run in a kernel where no cell defining `df` had been executed yet.
len(df.columns)

NameError: name 'df' is not defined

1. Dar merge em todos os dataframes: Comece combinando todos os seus DataFrames em um único dataset. Isso garante que você tenha todas as informações necessárias em um único lugar.

2. Separar teste e treino: Divida seus dados em conjuntos de treino e teste antes de fazer qualquer pré-processamento ou análise para evitar qualquer tipo de "data leakage" (vazamento de dados). Isso ajuda a garantir que seu modelo seja avaliado de forma justa.

3. Verificar a distribuição dos dados e tratar distribuições: Analise a distribuição das variáveis. Trate valores ausentes, outliers e aplique transformações se necessário para normalizar ou padronizar as distribuições.

4. Calcular WoE (Weight of Evidence) e IV (Information Value): Use o WoE para transformar variáveis categóricas em valores numéricos e o IV para avaliar o poder preditivo inicial de cada variável.

5. Verificar multicolinearidade: Identifique e trate variáveis que estão altamente correlacionadas para evitar redundância e problemas durante o treinamento do modelo.

6. Fazer feature engineering: Crie novas variáveis a partir das existentes se necessário, para melhorar a performance do modelo.

7. Fazer feature selection: Use técnicas de seleção de características para escolher as mais relevantes para o modelo. Isso pode incluir métodos automáticos ou baseados em análise manual.

8. Fazer feature importance: Após a seleção de características, utilize métodos de importância de características (como random forests ou gradient boosting) para confirmar quais variáveis são mais importantes.

9. Fazer cross-validation: Realize cross-validation para avaliar a performance do seu modelo de forma mais robusta e garantir que ele não esteja overfitting nos dados de treino.

10. Treinar o modelo: Finalmente, treine seu modelo de machine learning com os dados de treino, usando as características selecionadas.

1. Dar merge em todos os dataframes
2. EDA
3. Separar teste e treino
4. Verificar a distribuição dos dados e tratar distribuições
5. Calcular WoE (Weight of Evidence) e IV (Information Value)
6. Verificar multicolinearidade
7. Calcular Chi-square
8. Fazer feature engineering
9. Fazer feature selection
10. Fazer feature importance
11. Fazer cross-validation
12. Treinar o modelo


# 1. Conhecendo os datasets

In [6]:
# Do not truncate long cell text, so the full descriptions are readable.
# This is a global pandas option and stays in effect for later cells.
pd.set_option('display.max_colwidth', None)
# Data-dictionary rows describing the columns of installments_payments.csv.
home_cred_col_descrip[home_cred_col_descrip['Table'] == 'installments_payments.csv']

Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
211,214,installments_payments.csv,SK_ID_PREV,"ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)",hashed
212,215,installments_payments.csv,SK_ID_CURR,ID of loan in our sample,hashed
213,216,installments_payments.csv,NUM_INSTALMENT_VERSION,Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed,
214,217,installments_payments.csv,NUM_INSTALMENT_NUMBER,On which installment we observe payment,
215,218,installments_payments.csv,DAYS_INSTALMENT,When the installment of previous credit was supposed to be paid (relative to application date of current loan),time only relative to the application
216,219,installments_payments.csv,DAYS_ENTRY_PAYMENT,When was the installments of previous credit paid actually (relative to application date of current loan),time only relative to the application
217,220,installments_payments.csv,AMT_INSTALMENT,What was the prescribed installment amount of previous credit on this installment,
218,221,installments_payments.csv,AMT_PAYMENT,What the client actually paid on previous credit on this installment,


## Detalhamento dos Campos

SK_ID_PREV: Cada crédito anterior é identificado por um código único. Este campo relaciona o empréstimo atual com possíveis empréstimos anteriores que o cliente teve na mesma instituição financeira.

SK_ID_CURR: Identificador único para o empréstimo que está sendo analisado na amostra de dados atual.

NUM_INSTALMENT_VERSION: Se houver alterações no calendário de parcelas, como ajustes no prazo ou nas datas, essa coluna ajuda a identificar a versão do calendário para cada parcela.

NUM_INSTALMENT_NUMBER: Indica qual parcela específica do crédito anterior está sendo observada, por exemplo, a primeira, segunda ou terceira parcela.

DAYS_INSTALMENT: Mostra a data programada para o pagamento da parcela em relação à data de aplicação do empréstimo atual. Por exemplo, -30 pode significar que o pagamento era esperado 30 dias antes da data de aplicação do crédito atual.

DAYS_ENTRY_PAYMENT: Mostra a data real do pagamento da parcela em relação à data de aplicação do crédito atual. Se o valor for negativo, indica que o pagamento foi feito antes da data de aplicação.

AMT_INSTALMENT: O valor da parcela conforme o contrato do crédito anterior. Esse valor pode ser alterado ao longo do tempo se houver mudanças nas condições do crédito. Por exemplo, se o contrato diz que a parcela mensal é R\$ 500, esse valor é o AMT_INSTALMENT.

AMT_PAYMENT: O valor efetivamente pago pelo cliente para essa parcela do crédito anterior. Pode ser menor ou maior que o valor da parcela se houver pagamentos antecipados ou atrasos. Por exemplo, se o cliente pagou R\$ 450 em vez dos R\$ 500, esse valor é o AMT_PAYMENT.

In [7]:
# Disabled: repartitioning of the underlying data into 100 partitions.
# installments_payments.repartition(100)
# Show floats with two decimal places (global pandas display option).
pd.options.display.float_format = '{:.2f}'.format
# Summary statistics of installments_payments.
# NOTE(review): installments_payments is only assigned in a cell that appears
# further down (installments_p.pandas_api()) -- this cell depends on it having
# been run first.
installments_payments.describe()

                                                                                

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
count,13605401.0,13605401.0,13605401.0,13605401.0,13605401.0,13602496.0,13605401.0,13602496.0
mean,1903364.97,278444.88,0.86,18.87,-1042.27,-1051.11,17050.91,17238.22
std,536202.91,102718.31,1.04,26.66,800.95,800.59,50570.25,54735.78
min,1000001.0,100001.0,0.0,1.0,-2922.0,-4921.0,0.0,0.0
25%,1434140.0,189623.0,0.0,4.0,-1653.0,-1662.0,4225.86,3397.32
50%,1896486.0,278672.0,1.0,8.0,-818.0,-827.0,8884.08,8124.98
75%,2369086.0,367545.0,1.0,19.0,-361.0,-370.0,16712.1,16106.35
max,2843499.0,456255.0,178.0,277.0,-1.0,-1.0,3771487.85,3771487.85


In [6]:
# Preview the first rows of installments_payments.
installments_payments.head()

                                                                                

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [13]:
# Number of missing values in each column of installments_payments.
installments_payments.isna().sum()

                                                                                

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

In [11]:
# Remove the row limit for displayed frames. This is a global pandas option, so
# every later cell that displays a pandas DataFrame will render all of its rows.
pd.set_option('display.max_rows', None)
# Data-dictionary rows describing the columns of application_{train|test}.csv.
home_cred_col_descrip[home_cred_col_descrip['Table'] == 'application_{train|test}.csv']

Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
0,1,application_{train|test}.csv,SK_ID_CURR,ID of loan in our sample,
1,2,application_{train|test}.csv,TARGET,Target variable (1 - client with payment diffi...,
2,5,application_{train|test}.csv,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
3,6,application_{train|test}.csv,CODE_GENDER,Gender of the client,
4,7,application_{train|test}.csv,FLAG_OWN_CAR,Flag if the client owns a car,
5,8,application_{train|test}.csv,FLAG_OWN_REALTY,Flag if client owns a house or flat,
6,9,application_{train|test}.csv,CNT_CHILDREN,Number of children the client has,
7,10,application_{train|test}.csv,AMT_INCOME_TOTAL,Income of the client,
8,11,application_{train|test}.csv,AMT_CREDIT,Credit amount of the loan,
9,12,application_{train|test}.csv,AMT_ANNUITY,Loan annuity,


In [21]:
# def missing_data_pandas(data):
#     total = data.isna().sum().sort_values(ascending=False)
#     percent = (data.isna().sum() / data.shape[0] * 100).sort_values(ascending=False)
#     return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# def missing_data_spark(data):
#     total = data.isna().sum().sort_values(ascending=False)
#     percent = (data.isna().sum() / data.shape[0] * 100).sort_values(ascending=False)
#     return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

def missing_data(data):
    """Summarise the missing values of every column of ``data``.

    Parameters
    ----------
    data : DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data``, sorted by the number of missing values
        in descending order, with two columns:

        * ``Total``   - number of missing values in the column;
        * ``Percent`` - missing values as a percentage of the number of ROWS
          (NaN when ``data`` has no rows).
    """
    total = data.isna().sum().sort_values(ascending=False)
    # Divide by the number of rows. The previous denominator, data.count(), is
    # the number of NON-null values per column, which overstates the percentage
    # (2 nulls in 4 rows gave 100% instead of 50%) and is zero for all-null
    # columns. Deriving `percent` from `total` also keeps both in the same order.
    percent = total / len(data) * 100
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of application_train.
application_train.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [1]:
# (rows, columns) of application_train. `shape` is an attribute holding a
# tuple, not a method, so it must be read without parentheses.
application_train.shape

NameError: name 'application_train' is not defined

In [12]:
# Copy of installments_payments without the rows that contain any missing value.
install_p = installments_payments.dropna()
# Sanity check: the per-column count of missing values after dropna().
install_p.isna().sum()

NameError: name 'installments_payments' is not defined

In [28]:
# Expose the Spark DataFrame installments_p through the pandas-on-Spark API, so
# pandas-style calls (describe, head, isna, ...) can be used on it.
installments_payments = installments_p.pandas_api()
# Column names, non-null counts and dtypes.
installments_payments.info()



<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
 #   Column                  Non-Null Count     Dtype  
---  ------                  --------------     -----  
 0   SK_ID_PREV              13605401 non-null  int32  
 1   SK_ID_CURR              13605401 non-null  int32  
 2   NUM_INSTALMENT_VERSION  13605401 non-null  float64
 3   NUM_INSTALMENT_NUMBER   13605401 non-null  int32  
 4   DAYS_INSTALMENT         13605401 non-null  float64
 5   DAYS_ENTRY_PAYMENT      13602496 non-null  float64
 6   AMT_INSTALMENT          13605401 non-null  float64
 7   AMT_PAYMENT             13602496 non-null  float64
dtypes: float64(5), int32(3)

                                                                                

## 1.1 Tabela `application_train`

In [13]:
# Show every row and the full text of every cell (global pandas display options).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
# Data-dictionary rows describing the columns of application_{train|test}.csv.
home_cred_col_descrip[home_cred_col_descrip['Table'] == 'application_{train|test}.csv']


Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
0,1,application_{train|test}.csv,SK_ID_CURR,ID of loan in our sample,
1,2,application_{train|test}.csv,TARGET,"Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)",
2,5,application_{train|test}.csv,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
3,6,application_{train|test}.csv,CODE_GENDER,Gender of the client,
4,7,application_{train|test}.csv,FLAG_OWN_CAR,Flag if the client owns a car,
5,8,application_{train|test}.csv,FLAG_OWN_REALTY,Flag if client owns a house or flat,
6,9,application_{train|test}.csv,CNT_CHILDREN,Number of children the client has,
7,10,application_{train|test}.csv,AMT_INCOME_TOTAL,Income of the client,
8,11,application_{train|test}.csv,AMT_CREDIT,Credit amount of the loan,
9,12,application_{train|test}.csv,AMT_ANNUITY,Loan annuity,


In [29]:
# Index range, column count, dtype summary and memory usage of application_train.
application_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


In [24]:
# Two-column table listing every column of application_train with its dtype.
column_info_df = pd.DataFrame({
    'Column Name': application_train.columns,
    'Data Type': [application_train[col].dtype for col in application_train.columns]
})

column_info_df

Unnamed: 0,Column Name,Data Type
0,SK_ID_CURR,int64
1,TARGET,int64
2,NAME_CONTRACT_TYPE,object
3,CODE_GENDER,object
4,FLAG_OWN_CAR,object
5,FLAG_OWN_REALTY,object
6,CNT_CHILDREN,int64
7,AMT_INCOME_TOTAL,float64
8,AMT_CREDIT,float64
9,AMT_ANNUITY,float64


In [30]:
def missing_data(data):
    """Report, per column of ``data``, how many values are missing.

    Returns a pandas DataFrame indexed by column name and sorted from the most
    to the least incomplete column, with ``Total`` (number of nulls) and
    ``Percent`` (nulls as a percentage of the number of rows).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask has no nulls to skip, so it is the row count.
    row_counts = null_mask.count()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [31]:
# Missing-value report for application_train (most incomplete columns first).
missing_data(application_train)

Unnamed: 0,Total,Percent
COMMONAREA_MEDI,214865,69.872297
COMMONAREA_AVG,214865,69.872297
COMMONAREA_MODE,214865,69.872297
NONLIVINGAPARTMENTS_MODE,213514,69.432963
NONLIVINGAPARTMENTS_AVG,213514,69.432963
NONLIVINGAPARTMENTS_MEDI,213514,69.432963
FONDKAPREMONT_MODE,210295,68.386172
LIVINGAPARTMENTS_MODE,210199,68.354953
LIVINGAPARTMENTS_AVG,210199,68.354953
LIVINGAPARTMENTS_MEDI,210199,68.354953


In [30]:
import scipy.stats as ss
from sklearn.preprocessing import LabelEncoder

# Cramér's V computed from a contingency table
def cramers_v(confusion_matrix):
    """Return Cramér's V for a contingency table, with a bias correction.

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame
        Contingency table of observed counts (for example the result of
        ``pd.crosstab``).

    Returns
    -------
    float
        ``sqrt(phi2_corrected / min(cols_corrected - 1, rows_corrected - 1))``,
        where phi2 is chi-square divided by the total count and each term has
        a correction depending on the table size and the total count
        subtracted, as computed below.
    """
    chi2_stat = ss.chi2_contingency(confusion_matrix)[0]
    # Grand total of all cells in the table.
    total = confusion_matrix.sum().sum()
    n_rows, n_cols = confusion_matrix.shape

    phi2 = chi2_stat / total
    bias = ((n_cols - 1) * (n_rows - 1)) / (total - 1)
    # Clamp at zero so the correction can never make phi2 negative.
    phi2_corrected = max(0, phi2 - bias)

    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (total - 1)
    denominator = min((cols_corrected - 1), (rows_corrected - 1))
    return np.sqrt(phi2_corrected / denominator)


# Accumulates one (variable, chi2, p-value, Cramér's V) tuple per column.
results = []

# Test every categorical (object-dtype) column against TARGET.
# Note: `column` and `confusion_matrix` are module-level names and keep the
# values of the LAST iteration after the loop ends.
for column in application_train.select_dtypes(include=['object']).columns:
    # Contingency table: categories of the column (rows) x TARGET values (columns).
    confusion_matrix = pd.crosstab(application_train[column], application_train['TARGET'])
    chi2, p, dof, ex = ss.chi2_contingency(confusion_matrix)
    cramers_v_value = cramers_v(confusion_matrix)
    results.append((column, chi2, p, cramers_v_value))

# Collect the results in a DataFrame, strongest association first.
results_df = pd.DataFrame(results, columns=['Variable', 'Chi2', 'p-value', 'Cramers V'])
results_df = results_df.sort_values(by='Cramers V', ascending=False)

# Show the results as plain text.
print(results_df)


                      Variable         Chi2        p-value  Cramers V
9              OCCUPATION_TYPE  1402.846796  3.784500e-288   0.081020
11           ORGANIZATION_TYPE  1609.240636  5.224541e-299   0.071048
5             NAME_INCOME_TYPE  1253.470808  1.928146e-266   0.063667
6          NAME_EDUCATION_TYPE  1019.213187  2.447681e-219   0.057458
1                  CODE_GENDER   920.791334  1.129022e-200   0.054661
7           NAME_FAMILY_STATUS   504.694083  7.744842e-107   0.040311
8            NAME_HOUSING_TYPE   420.556190   1.099089e-88   0.036761
0           NAME_CONTRACT_TYPE   293.150542   1.023515e-65   0.030823
14          WALLSMATERIAL_MODE   139.235314   1.453180e-27   0.029688
2                 FLAG_OWN_CAR   146.656018   9.330994e-34   0.021764
13              HOUSETYPE_MODE    27.632556   9.992328e-07   0.012934
12          FONDKAPREMONT_MODE    16.809897   7.732982e-04   0.011919
15         EMERGENCYSTATE_MODE    23.678150   1.138680e-06   0.011841
4              NAME_

In [None]:
from sklearn.model_selection import train_test_split
# TODO: merge all tables into `df` before running this cell.
# Split the data into train and test sets.
# NOTE(review): assumes `df` is a pandas DataFrame that contains a TARGET
# column -- confirm once the merge step is final.

# Independent variables: every column except the label.
X = df.drop(columns=['TARGET'])
# Dependent variable (label); a column is selected by name with [], not .iloc.
y = df['TARGET']

# 70% train / 30% test; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

In [33]:
# Contingency table left over from the last iteration of the chi-square loop
# (the saved output shows EMERGENCYSTATE_MODE x TARGET).
confusion_matrix

TARGET,0,1
EMERGENCYSTATE_MODE,Unnamed: 1_level_1,Unnamed: 2_level_1
No,148324,11104
Yes,2105,223


In [None]:
SK_ID_CURR: Identificação do empréstimo na amostra. Cada linha corresponde a um empréstimo diferente.

TARGET: 1 - cliente teve dificuldades de pagamento (atrasos em pelo menos uma das primeiras parcelas do empréstimo)
        0 - cliente não teve dificuldades

NAME_CONTRACT_TYPE: Tipo de contrato do empréstimo. Pode ser Cash loans (empréstimo em dinheiro) ou Revolving loans (empréstimo rotativo, como cartão de crédito).

CODE_GENDER: Gênero do cliente (M para masculino, F para feminino).

FLAG_OWN_CAR: Indicador se o cliente possui um carro (Y para sim, N para não).
    
AMT_REQ_CREDIT_BUREAU_DAY: Número de consultas ao bureau de crédito sobre o cliente um dia antes da aplicação (excluindo consultas uma hora antes da aplicação).

AMT_REQ_CREDIT_BUREAU_WEEK: Número de consultas ao bureau de crédito sobre o cliente uma semana antes da aplicação (excluindo consultas um dia antes da aplicação).

AMT_REQ_CREDIT_BUREAU_MON: Número de consultas ao bureau de crédito sobre o cliente um mês antes da aplicação (excluindo consultas uma semana antes da aplicação).

AMT_REQ_CREDIT_BUREAU_QRT: Número de consultas ao bureau de crédito sobre o cliente três meses antes da aplicação (excluindo consultas um mês antes da aplicação).

AMT_REQ_CREDIT_BUREAU_YEAR: Número de consultas ao bureau de crédito sobre o cliente um ano antes da aplicação (excluindo consultas nos últimos três meses antes da aplicação).



In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of application_train.
application_train.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [8]:
# Import Spark's aggregate under an alias: a bare `sum` import would replace
# Python's built-in sum() for every cell executed after this one.
from pyspark.sql.functions import col, sum as spark_sum

# One-row result holding the number of null values in each column of
# POS_CASH_b: isNull() is cast to 0/1 and summed per column.
POS_CASH_b.select([spark_sum(col(c).isNull().cast("int")).alias(c) for c in POS_CASH_b.columns])

                                                                                

SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,0,0,26071,26087,0,0,0


In [None]:
# Replace nulls with the column mean in Spark.
# NOTE(review): the means are computed on the full table; if this table feeds a
# model, compute them on the training split only to avoid data leakage.

from pyspark.sql.functions import avg

# Only numeric, non-identifier columns can be mean-imputed. Averaging a string
# column (e.g. NAME_CONTRACT_STATUS) yields None, and fillna() does not accept
# None as a replacement value.
NUMERIC_SPARK_TYPES = {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double'}
ID_COLUMNS = {'SK_ID_CURR', 'SK_ID_PREV'}
numeric_columns = [
    name for name, dtype in POS_CASH_b.dtypes
    if (dtype in NUMERIC_SPARK_TYPES or dtype.startswith('decimal'))
    and name not in ID_COLUMNS
]

# Compute the mean of each selected column in a single pass.
mean_values = POS_CASH_b.select([avg(c).alias(c) for c in numeric_columns]).collect()[0].asDict()
# A column that is entirely null has no mean; leave it untouched.
mean_values = {name: mean for name, mean in mean_values.items() if mean is not None}

# Replace the null values with the computed means.
if mean_values:
    POS_CASH_b = POS_CASH_b.fillna(mean_values)


In [35]:
# Render bureau_balance with the notebook's rich display.
# NOTE(review): if bureau_balance is a pandas DataFrame and display.max_rows is
# None, this renders every row -- consider displaying .head() instead.
display(bureau_balance)

SK_ID_BUREAU,MONTHS_BALANCE,STATUS
5715448,0,C
5715448,-1,C
5715448,-2,C
5715448,-3,C
5715448,-4,C
5715448,-5,C
5715448,-6,C
5715448,-7,C
5715448,-8,C
5715448,-9,0


In [36]:
# Render bureau with the notebook's rich display.
# NOTE(review): if bureau is a pandas DataFrame and display.max_rows is None,
# this renders every row -- consider displaying .head() instead.
display(bureau)

SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,
215354,5714467,Active,currency 1,-273,0,27460.0,,0.0,0,180000.0,71017.38,108982.62,0.0,Credit card,-31,
215354,5714468,Active,currency 1,-43,0,79.0,,0.0,0,42103.8,42103.8,0.0,0.0,Consumer credit,-22,
162297,5714469,Closed,currency 1,-1896,0,-1684.0,-1710.0,14985.0,0,76878.45,0.0,0.0,0.0,Consumer credit,-1710,
162297,5714470,Closed,currency 1,-1146,0,-811.0,-840.0,0.0,0,103007.7,0.0,0.0,0.0,Consumer credit,-840,
162297,5714471,Active,currency 1,-1146,0,-484.0,,0.0,0,4500.0,0.0,0.0,0.0,Credit card,-690,


In [37]:
# Render credit_card_balance with the notebook's rich display.
# NOTE(review): if credit_card_balance is a pandas DataFrame and
# display.max_rows is None, this renders every row -- consider .head() instead.
display(credit_card_balance)

SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
2646502,380010,-7,82903.815,270000,0.0,0.0,0.0,0.0,4449.105,3825.0,3825.0,80519.04,82773.315,82773.315,0.0,0,0.0,0.0,2.0,Active,7,0
1079071,171320,-6,353451.645,585000,67500.0,67500.0,0.0,0.0,14684.175,15750.0,15750.0,345433.86,351881.145,351881.145,1.0,1,0.0,0.0,6.0,Active,0,0
2095912,118650,-7,47962.125,45000,45000.0,45000.0,0.0,0.0,0.0,264.69,0.0,44735.31,47962.125,47962.125,1.0,1,0.0,0.0,51.0,Active,0,0
2181852,367360,-4,291543.075,292500,90000.0,289339.425,0.0,199339.425,130.5,4093.515,4093.515,285376.41,286831.575,286831.575,3.0,8,0.0,5.0,3.0,Active,0,0
1235299,203885,-5,201261.195,225000,76500.0,111026.7,0.0,34526.7,6338.34,45000.0,45000.0,192793.275,197224.695,197224.695,3.0,9,0.0,6.0,38.0,Active,0,0


In [13]:
# Do not truncate long cell text (global pandas display option).
pd.set_option('display.max_colwidth', None)
# First 20 rows of the data dictionary.
home_cred_col_descrip.head(20)

Unnamed: 0.1,Unnamed: 0,Table,Row,Description,Special
0,1,application_{train|test}.csv,SK_ID_CURR,ID of loan in our sample,
1,2,application_{train|test}.csv,TARGET,"Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)",
2,5,application_{train|test}.csv,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
3,6,application_{train|test}.csv,CODE_GENDER,Gender of the client,
4,7,application_{train|test}.csv,FLAG_OWN_CAR,Flag if the client owns a car,
5,8,application_{train|test}.csv,FLAG_OWN_REALTY,Flag if client owns a house or flat,
6,9,application_{train|test}.csv,CNT_CHILDREN,Number of children the client has,
7,10,application_{train|test}.csv,AMT_INCOME_TOTAL,Income of the client,
8,11,application_{train|test}.csv,AMT_CREDIT,Credit amount of the loan,
9,12,application_{train|test}.csv,AMT_ANNUITY,Loan annuity,


In [46]:
# The data dictionary is loaded with Spark in some runs of this notebook and
# with pandas in others. Convert only when the object offers toPandas(); a
# pandas DataFrame has no such method and is used as it is.
if hasattr(home_cred_col_descrip, 'toPandas'):
    pandas_df = home_cred_col_descrip.toPandas()
else:
    pandas_df = home_cred_col_descrip
# (rows, columns) of the data dictionary.
pandas_df.shape

(219, 5)

In [14]:
# selecionar apenas as colunas que interessam para aprender sobre elas

In [None]:
#MODELO

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load a small example dataset (iris) as a feature matrix and a label vector
data = load_iris()
X, y = data.data, data.target

# Inject two missing values so the imputation step has something to do
X[0, 0] = np.nan
X[2, 2] = np.nan

# Build the pipeline: impute -> scale -> classify
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Step 1: fill NaNs with the column mean
    ('scaler', StandardScaler()),                # Step 2: standardise the features
    # Seeded so the accuracy printed below is the same on every run
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit every pipeline step on the training rows only
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows
accuracy = pipeline.score(X_test, y_test)
print("Acurácia:", accuracy)

In [None]:
# TRATANDO DIFERENTES COLUNAS DE FORMAS DIFERENTES

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Build a small example DataFrame with one missing value in each feature column
data = {
    'age': [25, np.nan, 35, 45, 22],
    'salary': [50000, 60000, np.nan, 100000, 30000],
    'city': ['New York', 'San Francisco', 'New York', np.nan, 'Chicago'],
    'bought': [1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Split features from the target
X = df.drop('bought', axis=1)
y = df['bought']

# Numeric columns: mean-impute, then standardise
numeric_features = ['age', 'salary']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill with the most frequent value, then one-hot encode
# (categories unseen during fit are ignored at predict time instead of raising)
categorical_features = ['city']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Seeded so the predictions and the accuracy printed below are reproducible
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit preprocessing and model on the training rows only
pipeline.fit(X_train, y_train)

# Predict the held-out rows
y_pred = pipeline.predict(X_test)

# Score the fitted pipeline on the held-out rows
accuracy = pipeline.score(X_test, y_test)
print("Acurácia:", accuracy)

Exemplo de Ordem para um Projeto de Machine Learning:
Desenvolva um pipeline:

Defina as etapas de pré-processamento e modelagem.
Crie e teste um pipeline básico.
Detecte e trate a multicolinearidade:

Inspecione as variáveis no pipeline.
Aplique técnicas para ajustar a multicolinearidade, se necessário.
Realize a cross-validation:

Aplique a cross-validation usando o pipeline finalizado para avaliar o desempenho do modelo.
Resumindo
Pipeline: Crie e estruture seu fluxo de trabalho.
Detecção de Multicolinearidade: Verifique e ajuste variáveis.
Cross-Validation: Avalie o desempenho do modelo.

In [None]:
# PIPELINE PARA TRATAR AS COLUNAS TAMBÉM!!!

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Custom transformer that turns free-text employment lengths into numbers
class EmpLengthTransformer(BaseEstimator, TransformerMixin):
    """Convert employment-length strings such as ``'10+ years'`` into numeric years.

    The conversions applied are: ``'N+ years'``, ``'N years'`` and ``'N year'``
    become ``N``; ``'< 1 year'`` becomes ``0``; missing values become ``0``.

    The transformer is stateless and always returns a 2-D result, which is what
    ``ColumnTransformer`` requires from its transformers.
    """

    # Literal (non-regex) substitutions, applied in this order.
    _REPLACEMENTS = (
        ('+ years', ''),
        ('< 1 year', '0'),
        (' years', ''),
        (' year', ''),
    )

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the numeric version of ``X`` as a DataFrame.

        Parameters
        ----------
        X : pandas.Series, pandas.DataFrame or 2-D array-like
            Employment-length text. A Series is what ``ColumnTransformer`` passes
            when the column is selected by a scalar name (``'emp_length'``); a
            DataFrame is passed when it is selected by a list (``['emp_length']``).

        Returns
        -------
        pandas.DataFrame
            Same rows as ``X`` with every column converted to numbers.

        Raises
        ------
        ValueError
            If a value is still not numeric after the substitutions.
        """
        # Normalise to a DataFrame so the output is 2-D whatever the input shape was.
        frame = X.to_frame() if isinstance(X, pd.Series) else pd.DataFrame(X)
        return frame.apply(self._to_years)

    @classmethod
    def _to_years(cls, column):
        """Clean one text column and convert it to numbers."""
        cleaned = column
        for old, new in cls._REPLACEMENTS:
            # regex=False: '+ years' is not a valid regular expression, and older
            # pandas versions treated the pattern as a regex by default.
            cleaned = cleaned.str.replace(old, new, regex=False)
        return pd.to_numeric(cleaned.fillna('0'))

# Toy data: one free-text column, one numeric column and a binary target
data = {
    'emp_length': ['10+ years', '2 years', '< 1 year', '3 years', '5 years', None],
    'feature1': [5, 3, 1, 4, 2, 6],
    'target': [1, 0, 1, 0, 1, 0]
}
datos = pd.DataFrame(data=data)

# Target vector and feature matrix
y = datos['target']
X = datos[['emp_length', 'feature1']]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Clean 'emp_length' with the custom transformer; every other column is passed through untouched
preprocessor = ColumnTransformer(
    [('emp_length', EmpLengthTransformer(), 'emp_length')],
    remainder='passthrough',
)

# Preprocessing followed by a seeded random forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training rows
pipeline.fit(X_train, y_train)

# Predict the held-out rows (the pipeline applies the preprocessing itself)
y_pred = pipeline.predict(X_test)

# Share of held-out rows predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
# WEIGHT OF EVIDENCE

import pandas as pd
import numpy as np

# Weight of Evidence (WoE) per category of a variable
def calculate_woe(df, var, target):
    """Compute the Weight of Evidence of every category of ``var``.

    For each category the function computes ``ln(good_pct / bad_pct)``, where
    ``good_pct`` is the category's share of all ``target == 0`` rows and
    ``bad_pct`` its share of all ``target == 1`` rows. A small epsilon is added
    to both shares so a category with no good (or no bad) rows does not produce
    ``log(0)`` or a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``var`` and ``target`` columns. It is not modified.
    var : str
        Name of the categorical / binned column to group by.
    target : str
        Name of the binary target column (0 = good, 1 = bad).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``var`` and ``'woe'``, one row per category. When ``var`` is
        a pandas categorical, categories that never occur in ``df`` are included
        with a WoE of 0.
    """
    eps = 1e-10  # small value to avoid log(0) / division by zero for empty cells
    counts = df[[var, target]].copy()
    counts['good'] = (counts[target] == 0).astype(int)
    counts['bad'] = (counts[target] == 1).astype(int)

    # observed=False is passed explicitly: the default for categorical keys differs
    # between pandas versions, which would otherwise change which rows are returned.
    grouped = counts.groupby(var, observed=False).agg({'good': 'sum', 'bad': 'sum'})
    grouped['good_pct'] = grouped['good'] / grouped['good'].sum() + eps
    grouped['bad_pct'] = grouped['bad'] / grouped['bad'].sum() + eps
    grouped['woe'] = np.log(grouped['good_pct'] / grouped['bad_pct'])

    return grouped[['woe']].reset_index()

# Replace a categorical column by its WoE values
def apply_woe(df, var, woe_df):
    """Attach the WoE values of ``var`` to ``df`` and drop the ``var`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``var`` column. It is not modified.
    var : str
        Name of the column shared by ``df`` and ``woe_df`` to join on.
    woe_df : pandas.DataFrame
        Lookup table with the ``var`` column plus the WoE column(s) to attach.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with ``woe_df`` on ``var``, without the ``var`` column.
        Rows whose category is missing from ``woe_df`` get NaN in the joined columns.
    """
    merged = df.merge(woe_df, on=var, how='left').drop(columns=[var])
    # merge() always returns a fresh 0..n-1 index. A left join keeps the left rows
    # in their original order, so when no row was duplicated (same length) the
    # caller's row labels can be restored to keep the result aligned with `df`.
    if len(merged) == len(df):
        merged.index = df.index
    return merged

# Example usage on a small hand-made dataset
data = pd.DataFrame({
    'age': [25, 45, 35, 50, 23, 43, 36],
    'income': [50000, 100000, 75000, 120000, 60000, 95000, 80000],
    'default': [0, 1, 0, 1, 0, 1, 0]
})

# Bin 'age' into four labelled ranges
data['age_bin'] = pd.cut(
    x=data['age'],
    bins=[20, 30, 40, 50, 60],
    labels=['20-30', '30-40', '40-50', '50-60'],
)

# Build the per-bin WoE table from the binned column
woe_age = calculate_woe(df=data, var='age_bin', target='default')

# Attach the WoE values to the data; the bin column itself is dropped
data_woe = apply_woe(df=data, var='age_bin', woe_df=woe_age)
print(data_woe)

In [None]:
# FEATURE IMPORTANCE + WOE

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assume you have a list of DataFrames; the code below reads a 'default' column from each.
# NOTE(review): df1/df2/df3 are not defined in this cell and `...` is a literal Ellipsis
# placeholder, so this line must be replaced before the cell can run.
dataframes = [df1, df2, df3, ...]  # Replace with your real DataFrames

# Step 1: compute feature importances
def calculate_feature_importance(df, target_column, random_state=42):
    """Rank the columns of ``df`` by random-forest feature importance.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the feature columns and the target column. Every column other
        than ``target_column`` is used as a feature.
    target_column : str
        Name of the column to predict.
    random_state : int or None, default 42
        Seed passed to the random forest. A fixed seed keeps the ranking (and
        therefore any top-N selection built on it) identical between runs; pass
        ``None`` for an unseeded forest.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'`` and ``'Importance'``, sorted from most to least important.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X, y)
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.feature_importances_,
    })
    return feature_importance.sort_values(by='Importance', ascending=False)

# Step 2: collect the 10 highest-ranked features of every DataFrame
important_features = []
for df in dataframes:
    feature_importance = calculate_feature_importance(df, 'default')
    important_features.extend(feature_importance.head(10)['Feature'].tolist())

# Remove duplicates while keeping first-seen order. list(set(...)) returns the names
# in an arbitrary order that can differ between interpreter sessions, which makes
# the column order of the final feature matrix unstable.
important_features = list(dict.fromkeys(important_features))

# Step 3: compute WoE for the selected variables
# (redefinition of the helper from the 'WEIGHT OF EVIDENCE' cell)
def calculate_woe(df, var, target):
    """Compute the Weight of Evidence of every category of ``var``.

    For each category the function computes ``ln(good_pct / bad_pct)``, where
    ``good_pct`` is the category's share of all ``target == 0`` rows and
    ``bad_pct`` its share of all ``target == 1`` rows. A small epsilon is added
    to both shares so a category with no good (or no bad) rows does not produce
    ``log(0)`` or a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``var`` and ``target`` columns. It is not modified.
    var : str
        Name of the categorical / binned column to group by.
    target : str
        Name of the binary target column (0 = good, 1 = bad).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``var`` and ``'woe'``, one row per category. When ``var`` is
        a pandas categorical, categories that never occur in ``df`` are included
        with a WoE of 0.
    """
    eps = 1e-10  # small value to avoid log(0) / division by zero for empty cells
    counts = df[[var, target]].copy()
    counts['good'] = (counts[target] == 0).astype(int)
    counts['bad'] = (counts[target] == 1).astype(int)

    # observed=False is passed explicitly: the default for categorical keys differs
    # between pandas versions, which would otherwise change which rows are returned.
    grouped = counts.groupby(var, observed=False).agg({'good': 'sum', 'bad': 'sum'})
    grouped['good_pct'] = grouped['good'] / grouped['good'].sum() + eps
    grouped['bad_pct'] = grouped['bad'] / grouped['bad'].sum() + eps
    grouped['woe'] = np.log(grouped['good_pct'] / grouped['bad_pct'])

    return grouped[['woe']].reset_index()

def apply_woe(df, var, woe_df):
    """Attach the WoE values of ``var`` to ``df`` and drop the ``var`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the ``var`` column. It is not modified.
    var : str
        Name of the column shared by ``df`` and ``woe_df`` to join on.
    woe_df : pandas.DataFrame
        Lookup table with the ``var`` column plus the WoE column(s) to attach.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with ``woe_df`` on ``var``, without the ``var`` column.
        Rows whose category is missing from ``woe_df`` get NaN in the joined columns.
    """
    merged = df.merge(woe_df, on=var, how='left').drop(columns=[var])
    # merge() always returns a fresh 0..n-1 index. A left join keeps the left rows
    # in their original order, so when no row was duplicated (same length) the
    # caller's row labels can be restored to keep the result aligned with `df`.
    if len(merged) == len(df):
        merged.index = df.index
    return merged

# WoE-encode every selected feature of every DataFrame.
# Each feature gets its own '<feature>_woe' column: a shared 'woe' column name would
# collide as soon as a second feature is merged into the same frame.
woe_dataframes = []
for frame in dataframes:
    encoded = frame.copy()  # work on a copy so the input DataFrames are never mutated
    for feature in important_features:
        if feature not in encoded.columns:
            continue

        bin_col = f'{feature}_bin'
        values = encoded[feature]
        if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
            # Numeric variables are first cut into (up to) 10 quantile bins
            encoded[bin_col] = pd.qcut(values, q=10, duplicates='drop')
        else:
            # Text / categorical variables are grouped by their own values
            encoded[bin_col] = values

        woe_df = calculate_woe(encoded, bin_col, 'default')
        woe_df = woe_df.rename(columns={'woe': f'{feature}_woe'})
        encoded = apply_woe(encoded, bin_col, woe_df)
        # Rows with a missing value have no bin and therefore no WoE after the join;
        # use 0 (neutral evidence) so the model below does not receive NaN.
        encoded[f'{feature}_woe'] = encoded[f'{feature}_woe'].fillna(0.0)
    woe_dataframes.append(encoded)

# Step 4: train the final model on the WoE-transformed variables.
# As before, the model is fitted on the last DataFrame of the list.
# NOTE(review): the WoE tables are computed from all rows, including the ones that end
# up in the test split, so the accuracy printed below is optimistic.
df = woe_dataframes[-1]
woe_features = [f'{feature}_woe' for feature in important_features if f'{feature}_woe' in df.columns]
X = df[woe_features]
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy}')

In [32]:
# PLOT


# One "icefire" colour per entry of `top_30_models`.
# NOTE(review): `sns`, `plt` and `top_30_models` are not defined in this cell, and the
# title refers to car models — confirm this cell belongs in this notebook.
rainbow_palette = sns.color_palette("icefire", n_colors=len(top_30_models))

# Bar chart: one bar per index label, coloured by label, without a legend
plt.figure(figsize=(20, 10))
sns.barplot(
    x=top_30_models.index,
    y=top_30_models.values,
    hue=top_30_models.index,
    palette=rainbow_palette,
    alpha=0.8,
    legend=False,
)

# Title, axis labels and tick formatting (x tick labels rotated to stay readable)
plt.title(label='Top 30 Modelos de Carros Mais Vendidos', fontsize=30)
plt.ylabel(ylabel='Número de Carros', fontsize=25)
plt.xlabel(xlabel='Modelo', fontsize=25)
plt.xticks(rotation=90, fontsize=20)
plt.yticks(fontsize=20)
plt.show()

24/07/13 21:29:26 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:31:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:31:45 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:32:05 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:32:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:32:55 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:33:15 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:33:35 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:33:55 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:34:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:34:55 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:35:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:35:55 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:36:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:41:27 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:44:06 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:47:59 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:52:26 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:52:46 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:53:16 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:53:46 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:54:06 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:54:36 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

24/07/13 21:54:56 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50470)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda3/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda3/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda3/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/opt/anaconda3/lib/python3.11/site-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-pac