In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already
# exists, with master 'local[*]' and a descriptive application name.
spark: SparkSession = (
    SparkSession.builder
    .master('local[*]')
    .appName('classification with spark')
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the session object as the cell output.
spark

In [None]:
# Load the customer CSV. The first row supplies the column names and the
# column types are inferred from the data instead of all being strings.
df = spark.read.csv(
    './data/dados_clientes.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the raw rows as loaded from the CSV.
df.show()

In [None]:
# Number of rows in the raw data.
df.count()

In [None]:
# How many rows fall under each value of the 'Churn' column.
churn_counts = df.groupBy('Churn').count()
churn_counts.show()

In [None]:
# Column names and inferred types of the raw data.
df.printSchema()

In [None]:
# Names of the columns that are recoded to 0/1 flags ('Sim' -> 1, anything
# else -> 0) when the modelling dataset is built.
binary_columns = [
    'Churn', 'Conjuge', 'Dependentes',
    'TelefoneFixo', 'MaisDeUmaLinhaTelefonica',
    'SegurancaOnline', 'BackupOnline', 'SeguroDispositivo', 'SuporteTecnico',
    'TVaCabo', 'StreamingFilmes',
    'ContaCorreio',
]

In [None]:
from pyspark.sql import functions as f

In [None]:
def _as_flag(name):
    """Return a column named `name` that is 1 where the value equals 'Sim' and 0 otherwise."""
    is_yes = f.col(name) == 'Sim'
    return f.when(is_yes, 1).otherwise(0).alias(name)


# One recoded column expression per entry of binary_columns, in the same order.
all_columns = [_as_flag(name) for name in binary_columns]

In [None]:
# Place the untouched columns (every df column that is not in binary_columns)
# in front of the recoded column expressions, keeping their order from df.
#
# The list is rebuilt rather than mutated with insert(): re-running this cell
# therefore does not add the same names a second time, and the cell no longer
# outputs a throwaway list of None values.
passthrough_columns = [c for c in df.columns if c not in binary_columns]
# Column names are plain strings; the recoded entries are column expressions,
# so filtering out strings drops any names added by a previous run.
encoded_columns = [c for c in all_columns if not isinstance(c, str)]
all_columns = passthrough_columns + encoded_columns

In [None]:
# Inspect the selection list: column names followed by the recoded expressions.
all_columns

In [None]:
# Project the raw data through the selection list: passthrough columns are
# kept as they are and the binary ones come out as 0/1 flags.
dataset = df.select(*all_columns)

In [None]:
# Preview the dataset after the 0/1 recoding.
dataset.show()

In [None]:
# Column names and types after the 0/1 recoding.
dataset.printSchema()

In [None]:
# Preview the three columns that are pivoted into indicator columns next.
preview_columns = ['Internet', 'TipoContrato', 'MetodoPagamento']
dataset.select(*preview_columns).show()

In [None]:
def one_hot_by_id(data, column):
    """Pivot `column` of `data` into one indicator column per distinct value.

    Parameters
    ----------
    data
        DataFrame containing an 'id' column and `column`.
    column
        Name of the column whose distinct values become new columns.

    Returns
    -------
    DataFrame with 'id' plus one column per distinct value of `column`, named
    '<column>_<value>'. A cell holds 1 where that id has that value and 0
    where the pivot produced no entry.
    """
    pivoted = data.groupBy('id').pivot(column).agg(f.lit(1)).fillna(0)
    # Rename positionally with toDF instead of f.col(...).alias(...): the
    # pivoted column names are raw data values, and a value containing a
    # character such as '.' would be parsed as a column expression by f.col.
    renamed = [c if c == 'id' else f'{column}_{c}' for c in pivoted.columns]
    return pivoted.toDF(*renamed)


# One indicator table per categorical column, all keyed by 'id'.
internet = one_hot_by_id(dataset, 'Internet')
contract_type = one_hot_by_id(dataset, 'TipoContrato')
payment_method = one_hot_by_id(dataset, 'MetodoPagamento')

In [None]:
# Attach the indicator tables to the dataset by 'id', then remove the
# original categorical columns they replace.
drop_columns = ['Internet', 'TipoContrato', 'MetodoPagamento']

for indicator_table in (internet, contract_type, payment_method):
    dataset = dataset.join(indicator_table, 'id', how='inner')

dataset = dataset.drop(*drop_columns)

In [None]:
# Column names and types after the indicator columns were joined in.
dataset.printSchema()

In [None]:
# Preview the dataset with the indicator columns in place.
dataset.show()

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# The target column is called 'label' from here on; the cells below select
# and filter on that name.
target_column = 'Churn'
dataset = dataset.withColumnRenamed(target_column, 'label')

In [None]:
# Feature columns: every dataset column except the target ('label') and the
# row identifier ('id'), in dataset order.
# A plain filtering comprehension replaces the append-inside-a-comprehension
# form, which built a discarded list of None and showed it as cell output.
drop_columns = ['label', 'id']
x = [c for c in dataset.columns if c not in drop_columns]

In [None]:
# Combine the feature columns listed in x into one vector column, 'features'.
assembler = VectorAssembler(
    inputCols=x,
    outputCol='features',
)

In [None]:
# Add the 'features' vector, then keep only what the models consume:
# the feature vector and the target.
assembled = assembler.transform(dataset)
dataset_prep = assembled.select('features', 'label')

In [None]:
# Preview the model input; truncate=False prints the full feature vectors.
dataset_prep.show(truncate=False)

In [None]:
# Seed passed to both the train/test split and the decision tree classifier.
seed = 101

In [None]:
# Random 70/30 split into training and test sets, seeded for repeatability.
split_weights = [0.7, 0.3]
training, test = dataset_prep.randomSplit(split_weights, seed=seed)

In [None]:
# Number of rows that ended up in the training split.
training.count()

In [None]:
# Number of rows that ended up in the test split.
test.count()

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression estimator with every parameter left at its default.
lr = LogisticRegression()

In [None]:
# Fit the logistic regression on the training split.
model_lr = lr.fit(training)

In [None]:
# Apply the fitted logistic regression to the test split; the result is
# later filtered on its 'label' and 'prediction' columns.
predictions_lr_test = model_lr.transform(test)

In [None]:
# Preview the logistic regression output on the test split.
predictions_lr_test.show()

In [None]:
# Summary attached to the fitted model. It is computed on the data the model
# was trained on, so the metrics read from it are training metrics, not
# test metrics.
training_lr_summary = model_lr.summary

In [None]:
# Accuracy as reported by the model's summary object.
training_lr_summary.accuracy

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Decision tree estimator; only the seed is set, everything else is default.
dtc = DecisionTreeClassifier(seed=seed)

In [None]:
# Fit the decision tree on the training split.
modelo_dtc = dtc.fit(training)

In [None]:
# Decision tree predictions on the same rows it was trained on, used for the
# training section of the report.
predictions_dtc_training = modelo_dtc.transform(training)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Evaluator created with defaults; the report overrides metricName (and
# metricLabel where needed) on each evaluate() call.
evaluator = MulticlassClassificationEvaluator()

In [None]:
# Decision tree predictions on the held-out test split.
predictions_dtc_test = modelo_dtc.transform(test)

In [None]:
def confusion_counts(predictions):
    """Count the four label/prediction combinations of a binary prediction set.

    Parameters
    ----------
    predictions
        DataFrame with a 'label' and a 'prediction' column whose values are
        0 or 1 (integer or floating point).

    Returns
    -------
    tuple of int
        (tp, tn, fp, fn): rows with label 1 / prediction 1, label 0 /
        prediction 0, label 0 / prediction 1 and label 1 / prediction 0.
        A combination that does not occur is reported as 0.
    """
    # One grouped count replaces four separate filter-and-count actions, so
    # each predictions DataFrame is evaluated once instead of four times.
    rows = predictions.groupBy('label', 'prediction').count().collect()

    counts = {}
    for row in rows:
        label, prediction = row['label'], row['prediction']
        if label is None or prediction is None:
            # Null values never matched the original equality filters either.
            continue
        key = (int(label), int(prediction))
        # row['count'] rather than row.count: the attribute is the tuple method.
        counts[key] = counts.get(key, 0) + row['count']

    return (
        counts.get((1, 1), 0),
        counts.get((0, 0), 0),
        counts.get((0, 1), 0),
        counts.get((1, 0), 0),
    )


# Logistic regression, test split.
lr_test_tp, lr_test_tn, lr_test_fp, lr_test_fn = confusion_counts(predictions_lr_test)

# Decision tree, training split.
dtc_training_tp, dtc_training_tn, dtc_training_fp, dtc_training_fn = confusion_counts(predictions_dtc_training)

# Decision tree, test split.
dtc_test_tp, dtc_test_tn, dtc_test_fp, dtc_test_fn = confusion_counts(predictions_dtc_test)

In [None]:
heavy_rule = '=' * 80
light_rule = '-' * 80


def _print_evaluator_metrics(predictions):
    """Print accuracy, then precision, recall and F1 for label 1, using `evaluator`."""
    for template, metric in (
        ("- Acurácia: %f", "accuracy"),
        ("- Precisão: %f", "precisionByLabel"),
        ("- Recall: %f", "recallByLabel"),
        ("- F1: %f", "fMeasureByLabel"),
    ):
        overrides = {evaluator.metricName: metric}
        if metric != "accuracy":
            # The per-label metrics are reported for label 1.
            overrides[evaluator.metricLabel] = 1
        print(template % evaluator.evaluate(predictions, overrides))


def _print_hits_and_misses(heading, tp, tn, fp, fn):
    """Print a heading followed by the hit/miss counts for each predicted class."""
    print(heading)
    print(light_rule)
    print(f'- Churn   : {tp} acertos | {fp} erros')
    print(f'- No-Churn: {tn} acertos | {fn} erros')


print(heavy_rule)
print('- Relatorio dos desempenhos dos modelos')
print(heavy_rule)

# --- Logistic regression ---------------------------------------------------
# NOTE(review): the four metrics below are read from training_lr_summary,
# while the hit/miss counts that follow come from the test predictions.
print('- Modelo de regressão lógistica')
print(light_rule)
for template, value in (
    ("- Acurácia: %f", training_lr_summary.accuracy),
    ("- Precisão: %f", training_lr_summary.precisionByLabel[1]),
    ("- Recall: %f", training_lr_summary.recallByLabel[1]),
    ("- F1: %f", training_lr_summary.fMeasureByLabel()[1]),
):
    print(template % value)
print(light_rule)
_print_hits_and_misses(
    '- Resultado com os dados de teste',
    lr_test_tp, lr_test_tn, lr_test_fp, lr_test_fn,
)
print(heavy_rule)

# --- Decision tree ---------------------------------------------------------
print('- Modelo de Árvore de decisão')
print(light_rule)
print('- Treino')
print(light_rule)
_print_evaluator_metrics(predictions_dtc_training)
print(light_rule)
_print_hits_and_misses(
    '- Resultado com os dados de treino',
    dtc_training_tp, dtc_training_tn, dtc_training_fp, dtc_training_fn,
)
print(light_rule)
print('- Teste')
print(light_rule)
_print_evaluator_metrics(predictions_dtc_test)
print(light_rule)
_print_hits_and_misses(
    '- Resultado com os dados de teste',
    dtc_test_tp, dtc_test_tn, dtc_test_fp, dtc_test_fn,
)
print(heavy_rule)