## Biblioteca

In [3]:
import sys
sys.path.append('../../../')

In [99]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression, BinaryLogisticRegressionSummary
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark = (
    SparkSession
    .builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

## Dados

In [34]:
spark.read.options(**{'header': True}).csv('../../../data/raw/raw_test.csv', 
                                          schema = 'Survived INT, Pclass INT, Sex STRING, Age DOUBLE')\
.write.parquet('../../../data/raw/raw_test', mode='overwrite')

In [35]:
spark.read.options(**{'header': True}).csv('../../../data/raw/raw_train.csv', 
                                          schema = 'Survived INT, Pclass INT, Sex STRING, Age DOUBLE')\
.write.parquet('../../../data/raw/raw_train', mode='overwrite')

In [38]:
df = spark.read.parquet('../../../data/raw/raw_train')

## Preprocessing

In [77]:
string_index = StringIndexer(inputCol = 'Sex', 
                             outputCol='Sex_index', 
                             handleInvalid = 'keep')
one_hot = OneHotEncoder(inputCol = 'Sex_index', 
                        outputCol='Sex_OHE', 
                        handleInvalid = 'keep')
vecAssembler = VectorAssembler(inputCols=['Pclass', 'Age', 'Sex_OHE'], 
                               outputCol="features", 
                               handleInvalid = 'skip')

In [86]:
logistic = LogisticRegression(labelCol = 'Survived', family='binomial')

In [87]:
pipeline = Pipeline(stages=[string_index, one_hot, vecAssembler, logistic])

In [88]:
pipeline_fit = pipeline.fit(df)

In [92]:
pred_df = pipeline_fit.transform(df)

In [127]:
pipeline_fit.stages[3].summary.accuracy

0.7947154471544715

In [114]:
pred_df.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Sex_index,Sex_OHE,features,rawPrediction,probability,prediction
0,1,2,female,34.0,1.0,"(0.0, 1.0, 0.0)","[2.0, 34.0, 0.0, 1.0, 0.0]","[-1.227818198409344, 1.227818198409344]","[0.22656352015616585, 0.7734364798438341]",1.0
1,1,2,female,31.0,1.0,"(0.0, 1.0, 0.0)","[2.0, 31.0, 0.0, 1.0, 0.0]","[-1.3522841969050214, 1.3522841969050214]","[0.20549718444957693, 0.794502815550423]",1.0
2,1,1,male,36.0,0.0,"(1.0, 0.0, 0.0)","[1.0, 36.0, 1.0, 0.0, 0.0]","[0.05226962139659075, -0.05226962139659075]","[0.5130644310257452, 0.48693556897425483]",0.0
3,1,3,male,29.0,0.0,"(1.0, 0.0, 0.0)","[3.0, 29.0, 1.0, 0.0, 0.0]","[2.2326309820607175, -2.2326309820607175]","[0.9031417529494812, 0.09685824705051882]",0.0
4,0,2,male,18.0,0.0,"(1.0, 0.0, 0.0)","[2.0, 18.0, 1.0, 0.0, 0.0]","[0.54086464233288, -0.54086464233288]","[0.632013532645747, 0.367986467354253]",0.0


In [97]:
evaluator = BinaryClassificationEvaluator(labelCol='Survived', metricName= 'precision')

In [110]:
BinaryLogisticRegressionSummary(pipeline_fit.stages[3])

<pyspark.ml.classification.BinaryLogisticRegressionSummary at 0x19e25146b80>

In [98]:
evaluator.evaluate(pred_df)

IllegalArgumentException: BinaryClassificationEvaluator_be6cd2bd628d parameter metricName given invalid value precision.

In [None]:
results = {'accuracy': accuracy_score(y_true, y_pred),
                   'f1': f1_score(y_true, y_pred),
                   'precision': precision_score(y_true, y_pred),
                   'recall': recall_score(y_true, y_pred),
                   'roc_auc': roc_auc_score(y_true, y_probs)}