## Bibliotecas

In [1]:
import sys
sys.path.append('../../../')

In [2]:
## Spark SQL
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Spark ML
from pyspark.ml.pipeline import Pipeline#, PipelineModel
from pyspark.ml.param import Param
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Imputer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, Evaluator, RegressionEvaluator
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [3]:
import mlflow.pyspark.ml

In [4]:
spark = (
    SparkSession
    .builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

## Spark ML

### Binary Classification

#### Data Split

In [5]:
df_train = spark.read.parquet('../../../data/raw/raw_train')
df_test = spark.read.parquet('../../../data/raw/raw_test')

#### Preprocessing

In [6]:
df_train.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age
0,1,2,female,34.0
1,1,2,female,31.0
2,1,1,male,36.0
3,1,3,male,29.0
4,0,2,male,18.0


In [7]:
# df_train = df_train.withColumn("Survived", f.when(f.rand() >= 0.75, 2).otherwise(f.col('Survived')))

In [8]:
df_train.groupby('Survived').count().toPandas()

Unnamed: 0,Survived,count
0,1,244
1,0,379


In [9]:
from src.ml.preprocessing.preprocessing import SparkPreprocessor

In [10]:
from src.ml.preprocessing.normalization import SparkScaler

In [11]:
preproc = SparkPreprocessor(cat_cols = ['Sex', 'Pclass'])

In [12]:
imputer = Imputer(inputCols=['Age'], strategy='mean', outputCols = ['Age'])

In [13]:
df_preproc = preproc.execute(df_train, )

INFO:root:Treating categorical data...


In [14]:
df_preproc = imputer.fit(df_preproc).transform(df_preproc)

#### Model

In [15]:
from src.ml.model.trainer import SparkTrainer

In [16]:
from src.ml.model.metrics import Metrics, CustomRegressionEvaluator

In [17]:
model = SparkTrainer()

In [18]:
eva = CustomRegressionEvaluator("mape", 'Age')

In [30]:
model.train(df_preproc, False, LinearRegression, labelCol='Age', data_split=('cv', {'numFolds': 5, 'param_grid': {'regParam': [0, 1,  57]}, 'evaluator': eva}))\
.predict_proba(df_preproc)

AnalysisException: cannot resolve '`probability`' given input columns: [Age, Pclass, Pclass_indexed, Pclass_ohe, Sex, Sex_indexed, Sex_ohe, Survived, features, prediction];
'Project ['probability]
+- Project [Survived#0, Pclass#1, Sex#2, Age#159, Sex_indexed#81, Pclass_indexed#82, Sex_ohe#103, Pclass_ohe#104, features#134, UDF(features#134) AS prediction#107989]
   +- Project [Survived#0, Pclass#1, Sex#2, cast(CASE WHEN isnull(cast(Age#3 as double)) THEN 29.88569105691057 WHEN (cast(Age#3 as double) = NaN) THEN 29.88569105691057 ELSE cast(Age#3 as double) END as double) AS Age#159, Sex_indexed#81, Pclass_indexed#82, Sex_ohe#103, Pclass_ohe#104, features#134]
      +- Project [Survived#0, Pclass#1, Sex#2, Age#3, Sex_indexed#81, Pclass_indexed#82, Sex_ohe#103, Pclass_ohe#104, UDF(struct(Sex_ohe, Sex_ohe#103, Pclass_ohe, Pclass_ohe#104)) AS features#134]
         +- Filter AtLeastNNulls(n, Sex_ohe#103,Pclass_ohe#104)
            +- Project [Survived#0, Pclass#1, Sex#2, Age#3, Sex_indexed#81, Pclass_indexed#82, UDF(cast(Sex_indexed#81 as double), 0) AS Sex_ohe#103, UDF(cast(Pclass_indexed#82 as double), 1) AS Pclass_ohe#104]
               +- Project [Survived#0, Pclass#1, Sex#2, Age#3, UDF(cast(Sex#2 as string)) AS Sex_indexed#81, UDF(cast(Pclass#1 as string)) AS Pclass_indexed#82]
                  +- Relation[Survived#0,Pclass#1,Sex#2,Age#3] parquet


In [23]:
if {'maxIter': [0, 5]}:
    print(8)

8


In [18]:
scaler = SparkScaler('Age', 'zscore')

In [20]:
type(scaler.fit(df_train)).load("teste")

PipelineModel_03a36a953ebe

In [166]:
PipelineModel.load("teste").transform(df_train)

DataFrame[Survived: int, Pclass: int, Sex: string, Age: double, zscore_vec: vector, zscore_scaled: vector]

In [151]:
eva.evaluate(lr.fit(df_preproc).transform(df_preproc))

[0.8229166666666666, 0.7696078431372549]