## Biblioteca

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml import Estimator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression, BinaryLogisticRegressionSummary
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [3]:
spark = (
    SparkSession
    .builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

## Dados

In [4]:
df_train = spark.read.parquet('../../../data/raw/raw_train').withColumn('rand', f.rand())

In [5]:
df_test = spark.read.parquet('../../../data/raw/raw_test')

## Preprocessing

In [25]:
string_index = StringIndexer(inputCol = 'Sex', 
                             outputCol='Sex_index', 
                             handleInvalid = 'keep')
one_hot = OneHotEncoder(inputCol = 'Sex_index', 
                        outputCol='Sex_OHE', 
                        handleInvalid = 'keep')
vecAssembler = VectorAssembler(inputCols=['Pclass', 'Age', 'Sex_OHE'], 
                               outputCol="features", 
                               handleInvalid = 'skip')

In [5]:
vecAssembler = VectorAssembler(inputCols=['Age', 'rand'], 
                               outputCol="num_scaled", 
                               handleInvalid = 'skip')

In [11]:
vecAssembler.transform(df_train).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,rand,num_scaled
0,1,2,female,34.0,0.497388,"[34.0, 0.4973881996720384]"
1,1,2,female,31.0,0.917723,"[31.0, 0.9177232089304834]"
2,1,1,male,36.0,0.052448,"[36.0, 0.05244801994809012]"
3,1,3,male,29.0,0.407875,"[29.0, 0.4078747771655139]"
4,0,2,male,18.0,0.723134,"[18.0, 0.7231336669138928]"
...,...,...,...,...,...,...
487,1,2,female,34.0,0.848951,"[34.0, 0.8489507335486182]"
488,1,2,female,30.0,0.646160,"[30.0, 0.6461595801992537]"
489,0,3,male,32.0,0.145363,"[32.0, 0.14536288261704877]"
490,0,3,male,30.0,0.227605,"[30.0, 0.22760470612009165]"


In [17]:
ss = StandardScaler(withMean = True, inputCol = 'num_scaled', outputCol='num_scaled_2')

In [4]:
from pyspark.ml import Estimator

def fit_transform(self, df):
    self.model = self.fit(df)
    self.transform = self.model.transform
    return self.transform(df)

Estimator.fit_transform = fit_transform

In [27]:
ss.fit_transform(vecAssembler.transform(df_train)).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,rand,num_scaled,num_scaled_2
0,1,2,female,34.0,0.497388,"[34.0, 0.4973881996720384]","[0.2873779211086445, 0.09794326811014781]"
1,1,2,female,31.0,0.917723,"[31.0, 0.9177232089304834]","[0.0778327033694635, 1.5703544107028142]"
2,1,1,male,36.0,0.052448,"[36.0, 0.05244801994809012]","[0.4270747329347652, -1.46065847569538]"
3,1,3,male,29.0,0.407875,"[29.0, 0.4078747771655139]","[-0.06186410845665717, -0.21561748178488085]"
4,0,2,male,18.0,0.723134,"[18.0, 0.7231336669138928]","[-0.8301965735003208, 0.8887176124672758]"
...,...,...,...,...,...,...,...
487,1,2,female,34.0,0.848951,"[34.0, 0.8489507335486182]","[0.2873779211086445, 1.3294480935378854]"
488,1,2,female,30.0,0.646160,"[30.0, 0.6461595801992537]","[0.007984297456403168, 0.6190814877735968]"
489,0,3,male,32.0,0.145363,"[32.0, 0.14536288261704877]","[0.14768110928252384, -1.135182662135143]"
490,0,3,male,30.0,0.227605,"[30.0, 0.22760470612009165]","[0.007984297456403168, -0.8470939361290618]"


In [39]:
df_train.filter('age is null').count()

131

In [26]:
(0.4973881996720384 - 0.481847)/0.285932

0.05435278203222575

In [44]:
(0.9177232089304834 - 0.478832)/0.278662

1.5749948286113047

In [43]:
(0.910421490343801 - 0.478832)/0.278662

1.548792050382905

In [42]:
df_train.filter('age is not null').agg(f.mean('rand'), f.stddev('rand')).toPandas()

Unnamed: 0,avg(rand),stddev_samp(rand)
0,0.478832,0.278662


In [26]:
logistic = LogisticRegression(labelCol = 'Survived', family='binomial')

In [27]:
pipeline = Pipeline(stages=[string_index, one_hot, vecAssembler, logistic])

In [29]:
pred_df = pipeline_fit.transform(df_test)

In [97]:
evaluator = MulticlassClassificationEvaluator(labelCol='Survived', metricName='recallByLabel')

In [105]:
pipeline_fit.stages[-1].labelCol

Param(parent='LogisticRegression_caaa95474919', name='labelCol', doc='label column name.')

In [None]:
f1
accuracy
truePositiveRateByLabel
falsePositiveRateByLabel
precisionByLabel
recallByLabel
fMeasureByLabel

In [99]:
pipeline_fit.stages[-1].summary.recallByLabel

[0.8415492957746479, 0.7307692307692307]

In [109]:
pipeline_fit.stages[-1].coefficients

DenseVector([-1.2354, -0.0415, 1.6495, 4.082, 0.0])

In [98]:
evaluator.evaluate(pipeline_fit.transform(df_train))

0.8415492957746479

In [122]:
from pyspark.sql.window import Window
df_train = df_train.withColumn('Survived', f.expr("case when Survived = 1 then 'surv' else 'died' end"))

In [120]:
pipeline_fit.transform(df_train).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,rank,Sex_index,Sex_OHE,features,rawPrediction,probability,prediction
0,1,2,female,34.0,1,1.0,"(0.0, 1.0, 0.0)","[2.0, 34.0, 0.0, 1.0, 0.0]","[-1.7458025797299273, 1.7458025797299273]","[0.14857739788066304, 0.851422602119337]",1.0
1,1,2,female,31.0,2,1.0,"(0.0, 1.0, 0.0)","[2.0, 31.0, 0.0, 1.0, 0.0]","[-1.8558896451697067, 1.8558896451697067]","[0.13518286703158428, 0.8648171329684158]",1.0
2,1,1,male,36.0,3,0.0,"(1.0, 0.0, 0.0)","[1.0, 36.0, 1.0, 0.0, 0.0]","[-0.44425921429168413, 0.44425921429168413]","[0.3907265530268593, 0.6092734469731407]",1.0
3,1,3,male,29.0,4,0.0,"(1.0, 0.0, 0.0)","[3.0, 29.0, 1.0, 0.0, 0.0]","[1.476348849423135, -1.476348849423135]","[0.8140204624622408, 0.1859795375377592]",0.0
4,1,1,female,63.0,5,1.0,"(0.0, 1.0, 0.0)","[1.0, 63.0, 0.0, 1.0, 0.0]","[-1.7703665553492083, 1.7703665553492083]","[0.14549675012441243, 0.8545032498755876]",1.0
...,...,...,...,...,...,...,...,...,...,...,...
355,0,3,male,35.0,193,0.0,"(1.0, 0.0, 0.0)","[3.0, 35.0, 1.0, 0.0, 0.0]","[1.6965229803026938, -1.6965229803026938]","[0.8450800703716294, 0.15491992962837065]",0.0
356,0,1,male,45.0,194,0.0,"(1.0, 0.0, 0.0)","[1.0, 45.0, 1.0, 0.0, 0.0]","[-0.11399801797234455, 0.11399801797234455]","[0.47153131934039844, 0.5284686806596015]",1.0
357,0,3,male,21.0,195,0.0,"(1.0, 0.0, 0.0)","[3.0, 21.0, 1.0, 0.0, 0.0]","[1.1827833415837217, -1.1827833415837217]","[0.7654478868835106, 0.23455211311648938]",0.0
358,0,3,male,23.0,198,0.0,"(1.0, 0.0, 0.0)","[3.0, 23.0, 1.0, 0.0, 0.0]","[1.2561747185435752, -1.2561747185435752]","[0.778366904079927, 0.22163309592007296]",0.0


In [93]:
evaluator.evaluate(pred_df)

0.8198198198198198

In [88]:
pipeline_fit.stages[3].summary.accuracy

0.7947154471544715

In [46]:
mod = pipeline_fit.stages[3]

In [52]:
pipeline_fit.stages[::-1]

[LogisticRegressionModel: uid=LogisticRegression_caaa95474919, numClasses=2, numFeatures=5,
 VectorAssembler_4ad6129463d0,
 OneHotEncoderModel: uid=OneHotEncoder_cc9bff4e9d4e, dropLast=true, handleInvalid=keep,
 StringIndexerModel: uid=StringIndexer_820f4d2f8eba, handleInvalid=keep]

In [56]:
pipeline_fit.evaluate(pred_df)

AttributeError: 'PipelineModel' object has no attribute 'evaluate'

In [41]:
BinaryLogisticRegressionSummary(pred_df).pr()

AttributeError: 'DataFrame' object has no attribute 'pr'

In [16]:
pipeline_fit.stages[3].summary.precisionByLabel

[0.8101694915254237, 0.7715736040609137]

In [18]:
pipeline_fit.stages[3].summary.recallByLabel

[0.8415492957746479, 0.7307692307692307]

In [None]:
df.randomSplit([])

In [20]:
pipeline_fit.stages[3].summary.roc.toPandas()

Unnamed: 0,FPR,TPR
0,0.000000,0.000000
1,0.003521,0.000000
2,0.003521,0.004808
3,0.003521,0.009615
4,0.003521,0.014423
...,...,...
232,0.989437,1.000000
233,0.992958,1.000000
234,0.996479,1.000000
235,1.000000,1.000000


In [114]:
pred_df.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Sex_index,Sex_OHE,features,rawPrediction,probability,prediction
0,1,2,female,34.0,1.0,"(0.0, 1.0, 0.0)","[2.0, 34.0, 0.0, 1.0, 0.0]","[-1.227818198409344, 1.227818198409344]","[0.22656352015616585, 0.7734364798438341]",1.0
1,1,2,female,31.0,1.0,"(0.0, 1.0, 0.0)","[2.0, 31.0, 0.0, 1.0, 0.0]","[-1.3522841969050214, 1.3522841969050214]","[0.20549718444957693, 0.794502815550423]",1.0
2,1,1,male,36.0,0.0,"(1.0, 0.0, 0.0)","[1.0, 36.0, 1.0, 0.0, 0.0]","[0.05226962139659075, -0.05226962139659075]","[0.5130644310257452, 0.48693556897425483]",0.0
3,1,3,male,29.0,0.0,"(1.0, 0.0, 0.0)","[3.0, 29.0, 1.0, 0.0, 0.0]","[2.2326309820607175, -2.2326309820607175]","[0.9031417529494812, 0.09685824705051882]",0.0
4,0,2,male,18.0,0.0,"(1.0, 0.0, 0.0)","[2.0, 18.0, 1.0, 0.0, 0.0]","[0.54086464233288, -0.54086464233288]","[0.632013532645747, 0.367986467354253]",0.0


In [97]:
evaluator = BinaryClassificationEvaluator(labelCol='Survived', metricName= 'precision')

In [110]:
BinaryLogisticRegressionSummary(pipeline_fit.stages[3])

<pyspark.ml.classification.BinaryLogisticRegressionSummary at 0x19e25146b80>

In [98]:
evaluator.evaluate(pred_df)

IllegalArgumentException: BinaryClassificationEvaluator_be6cd2bd628d parameter metricName given invalid value precision.

In [None]:
results = {'accuracy': accuracy_score(y_true, y_pred),
                   'f1': f1_score(y_true, y_pred),
                   'precision': precision_score(y_true, y_pred),
                   'recall': recall_score(y_true, y_pred),
                   'roc_auc': roc_auc_score(y_true, y_probs)}