# Machine Learning with PySpark
   * Modelo de Classificação  
### Datasource
   * https://archive.ics.uci.edu/ml/datasets/HCV+data

In [1]:
#Biblioteca que iremos usar
!pip install pyspark



In [2]:
# carregar os pacotes
from pyspark import SparkContext

In [3]:
#para visualizar os jobs stages etc.. master ='local[2]' ou 'local[4]'
sc = SparkContext(master='local[4]')

In [4]:
sc

In [5]:
# carregar bibliotecas
from pyspark.sql import SparkSession

In [6]:
# Spark
spark = SparkSession.builder.appName("MLwithSpark").getOrCreate()

## Gestão de fluxo de trabalho(WorkFlow )
   * Preparação dos dados
   * Feature Engineering
   * Modelo de construção
   * Avalie

## Tarefa
   * Prever se um paciente é Hepatite ou parâmetro não baseado
   * O conjunto de dados contém valores laboratoriais de doadores de sangue e pacientes com hepatite C e valores demográficos, como idade.

In [7]:
# carregar a base de dados
df = spark.read.csv("/content/sample_data/hcvdata.csv",header=True,inferSchema=True)
# inferSchema=True (serve para aplicar o tipo de acordo com os valores em cada coluna)
# inferSchema=False (string)

In [8]:
#ver as 5 primeiras linhas
df.show(5)

+---+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
|_c0|     Category|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|
+---+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
|  1|0=Blood Donor| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|
|  2|0=Blood Donor| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|
|  3|0=Blood Donor| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|
|  4|0=Blood Donor| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|
|  5|0=Blood Donor| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|
+---+-------------+---+---+----+----+----+----+----+-----+----+-----+----+----+
only showing top 5 rows



In [9]:
# ver numero de  (linha,coluna)
print(df.count(),len(df.columns))

615 14


In [10]:
#ver as colunas
print(df.columns)

['_c0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT']


In [11]:
# reorganizar as colunas colocando a coluna "category na utima posição"
df = df.select('Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT','Category')

In [12]:
#verificando se as modificação feitas anterioramente foram aplicadas como esperado
# para isso vamos apenas printar as duas primeiras linha só para ver a nova posição da coluna "Category"
df.show(2)

+---+---+----+----+---+----+---+-----+----+-----+----+----+-------------+
|Age|Sex| ALB| ALP|ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|     Category|
+---+---+----+----+---+----+---+-----+----+-----+----+----+-------------+
| 32|  m|38.5|52.5|7.7|22.1|7.5| 6.93|3.23|106.0|12.1|  69|0=Blood Donor|
| 32|  m|38.5|70.3| 18|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|0=Blood Donor|
+---+---+----+----+---+----+---+-----+----+-----+----+----+-------------+
only showing top 2 rows



In [13]:
#verificando os tipos de dados
df.dtypes

[('Age', 'int'),
 ('Sex', 'string'),
 ('ALB', 'string'),
 ('ALP', 'string'),
 ('ALT', 'string'),
 ('AST', 'double'),
 ('BIL', 'double'),
 ('CHE', 'double'),
 ('CHOL', 'string'),
 ('CREA', 'double'),
 ('GGT', 'double'),
 ('PROT', 'string'),
 ('Category', 'string')]

In [14]:
# schema da tabala
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- ALB: string (nullable = true)
 |-- ALP: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: string (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: string (nullable = true)
 |-- Category: string (nullable = true)



In [15]:
#Resumo descritivo da tabela
print(df.describe().show())

+-------+------------------+----+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+-------------+
|summary|               Age| Sex|              ALB|               ALP|               ALT|              AST|               BIL|               CHE|              CHOL|             CREA|              GGT|             PROT|     Category|
+-------+------------------+----+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+-------------+
|  count|               615| 615|              615|               615|               615|              615|               615|               615|               615|              615|              615|              615|          615|
|   mean| 47.40813008130081|null|41.62019543973941| 68.2839195979899

In [16]:
# Value Count 
df.groupBy('Category').count().show()

+--------------------+-----+
|            Category|count|
+--------------------+-----+
|       0=Blood Donor|  533|
|         3=Cirrhosis|   30|
|          2=Fibrosis|   21|
|0s=suspect Blood ...|    7|
|         1=Hepatitis|   24|
+--------------------+-----+



## Feature Engineering
    + Valores Numéricos
    + Vetorização
    + Dimensionamento (scaling)

In [17]:
#importando a bibliteca pyspark para machine learning
import pyspark.ml

In [18]:
#os componentes do (pyspark.ml)
dir(pyspark.ml)

['Estimator',
 'Model',
 'Pipeline',
 'PipelineModel',
 'PredictionModel',
 'Predictor',
 'Transformer',
 'UnaryTransformer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'base',
 'classification',
 'clustering',
 'common',
 'evaluation',
 'feature',
 'fpm',
 'image',
 'linalg',
 'param',
 'pipeline',
 'recommendation',
 'regression',
 'stat',
 'tree',
 'tuning',
 'util',
 'wrapper']



```
Vamos usar o 'feature' do dir(pyspark.ml) para fazer as transformaçoes 
```



In [19]:
#componante do (pyspark.ml.feature)
dir(pyspark.ml.feature)

['Binarizer',
 'BucketedRandomProjectionLSH',
 'BucketedRandomProjectionLSHModel',
 'Bucketizer',
 'ChiSqSelector',
 'ChiSqSelectorModel',
 'CountVectorizer',
 'CountVectorizerModel',
 'DCT',
 'ElementwiseProduct',
 'FeatureHasher',
 'HasFeaturesCol',
 'HasHandleInvalid',
 'HasInputCol',
 'HasInputCols',
 'HasLabelCol',
 'HasMaxIter',
 'HasNumFeatures',
 'HasOutputCol',
 'HasOutputCols',
 'HasRelativeError',
 'HasSeed',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HashingTF',
 'IDF',
 'IDFModel',
 'Imputer',
 'ImputerModel',
 'IndexToString',
 'Interaction',
 'JavaEstimator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaModel',
 'JavaParams',
 'JavaTransformer',
 'MaxAbsScaler',
 'MaxAbsScalerModel',
 'MinHashLSH',
 'MinHashLSHModel',
 'MinMaxScaler',
 'MinMaxScalerModel',
 'NGram',
 'Normalizer',
 'OneHotEncoder',
 'OneHotEncoderModel',
 'PCA',
 'PCAModel',
 'Param',
 'Params',
 'PolynomialExpansion',
 'QuantileDiscretizer',
 'RFormula',
 'RFormulaModel',
 'RegexTokenizer',
 'R



```
Dentro do dir(pyspark.ml.feature) temos o "StringIndexer" que iremos usar para transformar os dados do tipo string
```



In [20]:
# carregar ML Pkgs que permite transformar os dados do tipo string
from pyspark.ml.feature import StringIndexer

In [21]:
# Unique Values para Sex
df.select('Sex').distinct().show()

+---+
|Sex|
+---+
|  m|
|  f|
+---+



### IndexToString
Simetricamente para StringIndexer, IndexToString mapeia uma coluna de índices de rótulo de volta para 
uma coluna que contém os rótulos originais como strings.Um caso de uso comum é produzir índices a partir de rótulos 
com StringIndexer, treinar um modelo com esses índices e recuperar os rótulos originais da coluna de índices previstoscom      IndexToString. No entanto, você é livre para fornecer suas próprias etiquetas.

In [22]:
# Converter string para numerico code(vamos criar uma nova coluna chamada 'sex_nemerico' onde iremos preencher com valor numerico de acordo com o tipo de sexo)
# label encoding
#stringto index
sex_code = StringIndexer(inputCol='Sex',outputCol='sex_nemerico').fit(df)

In [23]:
df = sex_code.transform(df)

In [24]:
df.show(3)

+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+------------+
|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|     Category|sex_nemerico|
+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+------------+
| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|  69|0=Blood Donor|         0.0|
| 32|  m|38.5|70.3|  18|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|0=Blood Donor|         0.0|
| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|0=Blood Donor|         0.0|
+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+------------+
only showing top 3 rows



In [25]:
# Converter string para numerico code(vamos criar uma nova coluna chamada 'code_categoriy' onde iremos preencher com valor numerico de acordo com o tipo de sexo)
# label encoding
code_category = StringIndexer(inputCol='Category',outputCol='code_category').fit(df)

In [26]:
df = code_category.transform(df)

In [27]:
df.show(3)

+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+------------+-------------+
|Age|Sex| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|     Category|sex_nemerico|code_category|
+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+------------+-------------+
| 32|  m|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|  69|0=Blood Donor|         0.0|          0.0|
| 32|  m|38.5|70.3|  18|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|0=Blood Donor|         0.0|          0.0|
| 32|  m|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|0=Blood Donor|         0.0|          0.0|
+---+---+----+----+----+----+---+-----+----+-----+----+----+-------------+------------+-------------+
only showing top 3 rows



In [28]:
# Obtenha os rótulos
code_category.labels

['0=Blood Donor',
 '3=Cirrhosis',
 '1=Hepatitis',
 '2=Fibrosis',
 '0s=suspect Blood Donor']



```
Transformação do para string outro exemplo
```



In [29]:
# IndexToString
from pyspark.ml.feature import IndexToString

In [30]:
# Aqui podemos perceber que o .fit(df) não foi aplicado
converter = IndexToString(inputCol='code_category',outputCol='orig_Category')

In [31]:
df_convertido = converter.transform(df)

In [32]:
df_convertido.show(4)

+---+---+----+----+----+----+----+-----+----+-----+----+----+-------------+------------+-------------+-------------+
|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|     Category|sex_nemerico|code_category|orig_Category|
+---+---+----+----+----+----+----+-----+----+-----+----+----+-------------+------------+-------------+-------------+
| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|0=Blood Donor|         0.0|          0.0|0=Blood Donor|
| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|0=Blood Donor|         0.0|          0.0|0=Blood Donor|
| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|0=Blood Donor|         0.0|          0.0|0=Blood Donor|
| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|0=Blood Donor|         0.0|          0.0|0=Blood Donor|
+---+---+----+----+----+----+----+-----+----+-----+----+----+-------------+------------+-------------+-------------+
only showing top 4 rows





```
fim exemplo de transformação do para string
```



In [33]:
### Feature 
df.show()

+---+---+----+----+----+----+----+-----+----+-----+----+----+-------------+------------+-------------+
|Age|Sex| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|     Category|sex_nemerico|code_category|
+---+---+----+----+----+----+----+-----+----+-----+----+----+-------------+------------+-------------+
| 32|  m|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|  69|0=Blood Donor|         0.0|          0.0|
| 32|  m|38.5|70.3|  18|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|0=Blood Donor|         0.0|          0.0|
| 32|  m|46.9|74.7|36.2|52.6| 6.1| 8.84| 5.2| 86.0|33.2|79.3|0=Blood Donor|         0.0|          0.0|
| 32|  m|43.2|  52|30.6|22.6|18.9| 7.33|4.74| 80.0|33.8|75.7|0=Blood Donor|         0.0|          0.0|
| 32|  m|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|0=Blood Donor|         0.0|          0.0|
| 32|  m|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|  74|0=Blood Donor|         0.0|          0.0|
| 32|  m|46.3|41.3|17.5|17.8| 8.5| 7.01|4.79| 70.0|16.9|74.5|0=Blood Dono

In [34]:
print(df.columns)

['Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Category', 'sex_nemerico', 'code_category']


In [35]:
df.dtypes

[('Age', 'int'),
 ('Sex', 'string'),
 ('ALB', 'string'),
 ('ALP', 'string'),
 ('ALT', 'string'),
 ('AST', 'double'),
 ('BIL', 'double'),
 ('CHE', 'double'),
 ('CHOL', 'string'),
 ('CREA', 'double'),
 ('GGT', 'double'),
 ('PROT', 'string'),
 ('Category', 'string'),
 ('sex_nemerico', 'double'),
 ('code_category', 'double')]



```
Verificação após transformação dos Valores string para Numéricos (valor numericos atribuidos para cada string transformado)
```



In [36]:
df.select("Sex","sex_nemerico").distinct().show()

+---+------------+
|Sex|sex_nemerico|
+---+------------+
|  f|         1.0|
|  m|         0.0|
+---+------------+



In [38]:
df.select("Category","code_category").distinct().show()

+--------------------+-------------+
|            Category|code_category|
+--------------------+-------------+
|         1=Hepatitis|          2.0|
|0s=suspect Blood ...|          4.0|
|       0=Blood Donor|          0.0|
|          2=Fibrosis|          3.0|
|         3=Cirrhosis|          1.0|
+--------------------+-------------+



In [39]:
df2 = df.select('Age','sex_nemerico', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'code_category')

In [40]:
df2.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- sex_nemerico: double (nullable = false)
 |-- ALB: string (nullable = true)
 |-- ALP: string (nullable = true)
 |-- ALT: string (nullable = true)
 |-- AST: double (nullable = true)
 |-- BIL: double (nullable = true)
 |-- CHE: double (nullable = true)
 |-- CHOL: string (nullable = true)
 |-- CREA: double (nullable = true)
 |-- GGT: double (nullable = true)
 |-- PROT: string (nullable = true)
 |-- code_category: double (nullable = false)



In [41]:
#convert df2 to pandas DataFrame
df2 = df2.toPandas()

In [42]:
#ver quantidade de Nan em cada coluna
#print(df2.isnull().sum()) 
print(df2.isna().sum())

Age              0
sex_nemerico     0
ALB              0
ALP              0
ALT              0
AST              0
BIL              0
CHE              0
CHOL             0
CREA             0
GGT              0
PROT             0
code_category    0
dtype: int64


In [43]:
# se tivesse valor null podemos:
# substituir nan por 0 no dataframe inteiro ou atribuir um valor usando algumas funç~es estatisticas
# df2.fillna(0)
# substituir nan por 0 em uma coluna especifica ou atribuir um valor usando algumas funç~es estatisticas
# df2.fillna(0,subset=['nome_da_coluna']

In [44]:
df2 = df2.replace('NA',0).astype(float)

In [45]:
print(df.columns)

['Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'Category', 'sex_nemerico', 'code_category']


In [46]:
type(df2)

pandas.core.frame.DataFrame

In [47]:
type(df)

pyspark.sql.dataframe.DataFrame

In [48]:
# Convert To PySpark Dataframe
new_df2 = spark.createDataFrame(df2)

In [49]:
#Ajustando o dataframe selecionando os features
features = ['Age', 'sex_nemerico', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT' ,'code_category']

In [50]:
#bibliotecas do (pyspark.ml.feature)
dir(pyspark.ml.feature)

['Binarizer',
 'BucketedRandomProjectionLSH',
 'BucketedRandomProjectionLSHModel',
 'Bucketizer',
 'ChiSqSelector',
 'ChiSqSelectorModel',
 'CountVectorizer',
 'CountVectorizerModel',
 'DCT',
 'ElementwiseProduct',
 'FeatureHasher',
 'HasFeaturesCol',
 'HasHandleInvalid',
 'HasInputCol',
 'HasInputCols',
 'HasLabelCol',
 'HasMaxIter',
 'HasNumFeatures',
 'HasOutputCol',
 'HasOutputCols',
 'HasRelativeError',
 'HasSeed',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HashingTF',
 'IDF',
 'IDFModel',
 'Imputer',
 'ImputerModel',
 'IndexToString',
 'Interaction',
 'JavaEstimator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaModel',
 'JavaParams',
 'JavaTransformer',
 'MaxAbsScaler',
 'MaxAbsScalerModel',
 'MinHashLSH',
 'MinHashLSHModel',
 'MinMaxScaler',
 'MinMaxScalerModel',
 'NGram',
 'Normalizer',
 'OneHotEncoder',
 'OneHotEncoderModel',
 'PCA',
 'PCAModel',
 'Param',
 'Params',
 'PolynomialExpansion',
 'QuantileDiscretizer',
 'RFormula',
 'RFormulaModel',
 'RegexTokenizer',
 'R



```
Dentor do dir(pyspark.ml.feature) temos o "VectorAssembler". (VectorAssembler é um transformador que combina uma determinada lista de colunas em uma única coluna de vetor)
```



In [51]:
from pyspark.ml.feature import VectorAssembler

### Iniciando a vectorização
VectorAssembler é um transformador que combina uma determinada lista de colunas em uma única coluna de vetor. É útil para combinar recursos brutos e recursos gerados por diferentes transformadores de recursos em um único vetor de recursos, a fim de treinar modelos de ML, como regressão logística e árvores de decisão. VectorAssembler aceita os seguintes tipos de coluna de entrada: todos os tipos numéricos, tipo booleano e tipo de vetor. Em cada linha, os valores das colunas de entrada serão concatenados em um vetor na ordem especificada.

In [52]:
#Vetorização 
vec_assembler = VectorAssembler(inputCols = features,outputCol='features')

In [53]:
vec_df = vec_assembler.transform(new_df2)

In [54]:
vec_df.show(3)

+----+------------+----+----+----+----+---+-----+----+-----+----+----+-------------+--------------------+
| Age|sex_nemerico| ALB| ALP| ALT| AST|BIL|  CHE|CHOL| CREA| GGT|PROT|code_category|            features|
+----+------------+----+----+----+----+---+-----+----+-----+----+----+-------------+--------------------+
|32.0|         0.0|38.5|52.5| 7.7|22.1|7.5| 6.93|3.23|106.0|12.1|69.0|          0.0|[32.0,0.0,38.5,52...|
|32.0|         0.0|38.5|70.3|18.0|24.7|3.9|11.17| 4.8| 74.0|15.6|76.5|          0.0|[32.0,0.0,38.5,70...|
|32.0|         0.0|46.9|74.7|36.2|52.6|6.1| 8.84| 5.2| 86.0|33.2|79.3|          0.0|[32.0,0.0,46.9,74...|
+----+------------+----+----+----+----+---+-----+----+-----+----+----+-------------+--------------------+
only showing top 3 rows



## Train,Test Split

In [55]:
# vamos treinar e testar os dados
# (Pegar uma amostra de 70% para treinar) e (uma amostra 30% para testar)
train_df,test_df = vec_df.randomSplit([0.7,0.3])

In [56]:
train_df.count()

421

In [57]:
train_df.show(4)

+----+------------+----+----+----+----+----+-----+----+-----+----+----+-------------+--------------------+
| Age|sex_nemerico| ALB| ALP| ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|code_category|            features|
+----+------------+----+----+----+----+----+-----+----+-----+----+----+-------------+--------------------+
|32.0|         0.0|38.5|52.5| 7.7|22.1| 7.5| 6.93|3.23|106.0|12.1|69.0|          0.0|[32.0,0.0,38.5,52...|
|32.0|         0.0|38.5|70.3|18.0|24.7| 3.9|11.17| 4.8| 74.0|15.6|76.5|          0.0|[32.0,0.0,38.5,70...|
|32.0|         0.0|39.2|74.1|32.6|24.8| 9.6| 9.15|4.32| 76.0|29.9|68.7|          0.0|[32.0,0.0,39.2,74...|
|32.0|         0.0|41.6|43.3|18.5|19.7|12.3| 9.92|6.05|111.0|91.0|74.0|          0.0|[32.0,0.0,41.6,43...|
+----+------------+----+----+----+----+----+-----+----+-----+----+----+-------------+--------------------+
only showing top 4 rows



In [58]:
test_df.count()

194

In [59]:
test_df.show(4)

+----+------------+----+----+----+----+----+----+----+----+----+----+-------------+--------------------+
| Age|sex_nemerico| ALB| ALP| ALT| AST| BIL| CHE|CHOL|CREA| GGT|PROT|code_category|            features|
+----+------------+----+----+----+----+----+----+----+----+----+----+-------------+--------------------+
|32.0|         0.0|42.4|86.3|20.3|20.0|35.2|5.46|4.45|81.0|15.9|69.9|          0.0|[32.0,0.0,42.4,86...|
|32.0|         0.0|44.3|52.3|21.7|22.4|17.2|4.15|3.57|78.0|24.1|75.4|          0.0|[32.0,0.0,44.3,52...|
|32.0|         0.0|46.3|41.3|17.5|17.8| 8.5|7.01|4.79|70.0|16.9|74.5|          0.0|[32.0,0.0,46.3,41...|
|33.0|         0.0|46.7|88.3|23.4|23.9| 7.8|9.42|4.62|78.0|29.5|74.3|          0.0|[33.0,0.0,46.7,88...|
+----+------------+----+----+----+----+----+----+----+----+----+----+-------------+--------------------+
only showing top 4 rows



#### Model Building
+ Pyspark.ml: DataFrame
+ Pyspark.mllib: RDD /Legacy

In [60]:
import pyspark.ml.classification 

In [61]:
dir(pyspark.ml.classification)

['ABCMeta',
 'ArrayType',
 'BinaryLogisticRegressionSummary',
 'BinaryLogisticRegressionTrainingSummary',
 'BinaryRandomForestClassificationSummary',
 'BinaryRandomForestClassificationTrainingSummary',
 'ClassificationModel',
 'Classifier',
 'DataFrame',
 'DecisionTreeClassificationModel',
 'DecisionTreeClassifier',
 'DecisionTreeRegressionModel',
 'DefaultParamsReader',
 'DefaultParamsWriter',
 'DoubleType',
 'Estimator',
 'FMClassificationModel',
 'FMClassificationSummary',
 'FMClassificationTrainingSummary',
 'FMClassifier',
 'GBTClassificationModel',
 'GBTClassifier',
 'HasAggregationDepth',
 'HasBlockSize',
 'HasElasticNetParam',
 'HasFitIntercept',
 'HasMaxBlockSizeInMB',
 'HasMaxIter',
 'HasParallelism',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasRegParam',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HasTol',
 'HasTrainingSummary',
 'HasWeightCol',
 'JavaMLReadable',
 'JavaMLReader',
 'JavaMLWritable',
 'JavaMLWri



```
Vamos trabalhar com LogisticRegression e DecisionTreeClassifier que estão dentro do dir(pyspark.ml.classification)
```



In [62]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier

In [63]:
# Logist Model  "code_categor"
lr = LogisticRegression(featuresCol='features',labelCol='code_category')

In [64]:
lr_model = lr.fit(train_df)

In [65]:
y_pred = lr_model.transform(test_df)

In [66]:
y_pred.show()

+----+------------+----+----+-----+----+----+-----+----+-----+----+----+-------------+--------------------+--------------------+--------------------+----------+
| Age|sex_nemerico| ALB| ALP|  ALT| AST| BIL|  CHE|CHOL| CREA| GGT|PROT|code_category|            features|       rawPrediction|         probability|prediction|
+----+------------+----+----+-----+----+----+-----+----+-----+----+----+-------------+--------------------+--------------------+--------------------+----------+
|32.0|         0.0|42.4|86.3| 20.3|20.0|35.2| 5.46|4.45| 81.0|15.9|69.9|          0.0|[32.0,0.0,42.4,86...|[298.358428541305...|[1.0,5.4383518704...|       0.0|
|32.0|         0.0|44.3|52.3| 21.7|22.4|17.2| 4.15|3.57| 78.0|24.1|75.4|          0.0|[32.0,0.0,44.3,52...|[267.337083767760...|[1.0,3.3529733569...|       0.0|
|32.0|         0.0|46.3|41.3| 17.5|17.8| 8.5| 7.01|4.79| 70.0|16.9|74.5|          0.0|[32.0,0.0,46.3,41...|[357.252881608096...|[1.0,7.6961676367...|       0.0|
|33.0|         0.0|46.7|88.3| 23.4

In [67]:
print(y_pred.columns)

['Age', 'sex_nemerico', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'code_category', 'features', 'rawPrediction', 'probability', 'prediction']


In [68]:
y_pred.select('code_category','rawPrediction', 'probability', 'prediction').show()

+-------------+--------------------+--------------------+----------+
|code_category|       rawPrediction|         probability|prediction|
+-------------+--------------------+--------------------+----------+
|          0.0|[298.358428541305...|[1.0,5.4383518704...|       0.0|
|          0.0|[267.337083767760...|[1.0,3.3529733569...|       0.0|
|          0.0|[357.252881608096...|[1.0,7.6961676367...|       0.0|
|          0.0|[400.737280147441...|[1.0,5.8163647386...|       0.0|
|          0.0|[283.219379253323...|[1.0,2.7642844938...|       0.0|
|          0.0|[387.449735774959...|[1.0,3.4896239289...|       0.0|
|          0.0|[349.885539420888...|[1.0,2.2379872133...|       0.0|
|          0.0|[359.131773282828...|[1.0,5.3217324182...|       0.0|
|          0.0|[383.735416688796...|[1.0,2.7862338387...|       0.0|
|          0.0|[393.665218793983...|[1.0,3.8669686700...|       0.0|
|          0.0|[371.893832612945...|[1.0,9.6746833227...|       0.0|
|          0.0|[396.264130907269..

## Model Evaluation

In [69]:
import pyspark.ml

In [70]:
dir(pyspark.ml)

['Estimator',
 'Model',
 'Pipeline',
 'PipelineModel',
 'PredictionModel',
 'Predictor',
 'Transformer',
 'UnaryTransformer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'base',
 'classification',
 'clustering',
 'common',
 'evaluation',
 'feature',
 'fpm',
 'image',
 'linalg',
 'param',
 'pipeline',
 'recommendation',
 'regression',
 'stat',
 'tree',
 'tuning',
 'util',
 'wrapper']

In [74]:
dir(pyspark.ml.evaluation)

['ABCMeta',
 'BinaryClassificationEvaluator',
 'ClusteringEvaluator',
 'Evaluator',
 'HasFeaturesCol',
 'HasLabelCol',
 'HasPredictionCol',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasWeightCol',
 'JavaEvaluator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaParams',
 'MulticlassClassificationEvaluator',
 'MultilabelClassificationEvaluator',
 'Param',
 'Params',
 'RankingEvaluator',
 'RegressionEvaluator',
 'TypeConverters',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'abstractmethod',
 'inherit_doc',
 'keyword_only',
 'since',
 'sys']



```
Vamos usar MulticlassClassificationEvaluator que está dentro do dir(pyspark.ml.evaluation)
```



In [72]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [73]:
# How to Check For Accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol='code_category',metricName='accuracy')

In [75]:
multi_evaluator.evaluate(y_pred)

0.979381443298969



### Precision,F1 Score,Recall : Classification Report




In [78]:
dir(pyspark.mllib.evaluation)

['ArrayType',
 'BinaryClassificationMetrics',
 'DoubleType',
 'JavaModelWrapper',
 'MulticlassMetrics',
 'MultilabelMetrics',
 'RankingMetrics',
 'RegressionMetrics',
 'SQLContext',
 'StructField',
 'StructType',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_test',
 'callMLlibFunc',
 'since',
 'sys']



```
Vamos usar MulticlassMetrics que está dentro do dir(pyspark.mllib.evaluation)
```



In [79]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [81]:
lr_metric = MulticlassMetrics(y_pred['code_category', 'prediction'].rdd)

In [82]:
dir(lr_metric)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_java_model',
 '_sc',
 'accuracy',
 'call',
 'confusionMatrix',
 'fMeasure',
 'falsePositiveRate',
 'logLoss',
 'precision',
 'recall',
 'truePositiveRate',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

In [83]:
print("Accuracy",lr_metric.accuracy)

Accuracy 0.979381443298969


In [85]:
print("Precision",lr_metric.precision(1.0))
print("Recall",lr_metric.recall(1.0))
print("F1Score",lr_metric.fMeasure(1.0))

Precision 0.5
Recall 1.0
F1Score 0.6666666666666666


In [86]:
dir(lr_model)

['__abstractmethods__',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_call_java',
 '_checkThresholdConsistency',
 '_copyValues',
 '_copy_params',
 '_create_from_java_class',
 '_create_params_from_java',
 '_defaultParamMap',
 '_dummy',
 '_empty_java_param_map',
 '_from_java',
 '_java_obj',
 '_make_java_param_pair',
 '_new_java_array',
 '_new_java_obj',
 '_paramMap',
 '_params',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_testOwnParam',
 '_to_java',
 '_transfer_param_map_from_java',
 '_transfer_param_map_to_java',
 '_transfer_params_from_java',
 '_transfer_params_to_java',
 '_transform',
 'aggreg

In [87]:
# Salvar Model
lr_model.save("lr_model_30")
lr_model.write().save("mylr_model")