## Bibliotecas

In [1]:
import sys
sys.path.append('../../../')

In [2]:
## Spark SQL
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Spark ML
from src.ml.preprocessing.preprocessing import SparkPreprocessor
from src.ml.preprocessing.normalization import SparkScaler
from src.ml.preprocessing.text_vectorizer import TextVectorizer
from src.ml.model.trainer import SparkTrainer, SparkUnsupTrainer
from src.ml.model.metrics import Metrics, CustomRegressionEvaluator
from src.ml.analysis.pca import SparkPCA
from src.ml.analysis.cluster import SparkCluster
from src.ml.analysis.feature_selection import FeatureSelector

In [3]:
import mlflow.pyspark.ml

In [4]:
spark = (
    SparkSession
    .builder
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

## Data

In [10]:
df_train = spark.read.parquet('../../../data/raw/raw_train')
df_test = spark.read.parquet('../../../data/raw/raw_test')

In [6]:
df_train.limit(5).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age
0,1,2,female,34.0
1,1,2,female,31.0
2,1,1,male,36.0
3,1,3,male,29.0
4,0,2,male,18.0


## Preprocessing

### Scaling

In [7]:
scaler = SparkScaler('Age', 'max_abs')

In [9]:
scaler.fit(df_train)

<class 'src.ml.preprocessing.normalization.SparkScaler'>

In [10]:
scaler.transform(df_train).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,max_abs_vec,max_abs_scaled
0,1,2,female,34.0,[34.0],[0.42500000000000004]
1,1,2,female,31.0,[31.0],[0.3875]
2,1,1,male,36.0,[36.0],[0.45]
3,1,3,male,29.0,[29.0],[0.36250000000000004]
4,0,2,male,18.0,[18.0],[0.225]
...,...,...,...,...,...,...
487,1,2,female,34.0,[34.0],[0.42500000000000004]
488,1,2,female,30.0,[30.0],[0.375]
489,0,3,male,32.0,[32.0],[0.4]
490,0,3,male,30.0,[30.0],[0.375]


In [11]:
scaler = SparkScaler('Age', 'max_abs')
scaler.fit_transform(df_train).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,max_abs_vec,max_abs_scaled
0,1,2,female,34.0,[34.0],[0.42500000000000004]
1,1,2,female,31.0,[31.0],[0.3875]
2,1,1,male,36.0,[36.0],[0.45]
3,1,3,male,29.0,[29.0],[0.36250000000000004]
4,0,2,male,18.0,[18.0],[0.225]
...,...,...,...,...,...,...
487,1,2,female,34.0,[34.0],[0.42500000000000004]
488,1,2,female,30.0,[30.0],[0.375]
489,0,3,male,32.0,[32.0],[0.4]
490,0,3,male,30.0,[30.0],[0.375]


### Preprocessing

In [7]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mode')

In [8]:
processor.fit(df_train)

<class 'src.ml.preprocessing.preprocessing.SparkPreprocessor'>

In [9]:
processor.transform(df_train).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Pclass_indexed,Sex_indexed,Pclass_ohe,Sex_ohe,zscore_vec,zscore_scaled,features
0,1,2,female,34.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[34.0],[2.591401445647517],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.591401445647517]"
1,1,2,female,31.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[31.0],[2.3627483769139124],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.3627483769139124]"
2,1,1,male,36.0,1.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",[36.0],[2.743836824803253],"[0.0, 1.0, 0.0, 1.0, 0.0, 2.743836824803253]"
3,1,3,male,29.0,0.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",[29.0],[2.210312997758176],"[1.0, 0.0, 0.0, 1.0, 0.0, 2.210312997758176]"
4,0,2,male,18.0,2.0,0.0,"(0.0, 0.0, 1.0)","(1.0, 0.0)",[18.0],[1.3719184124016266],"[0.0, 0.0, 1.0, 1.0, 0.0, 1.3719184124016266]"
...,...,...,...,...,...,...,...,...,...,...,...
618,1,2,female,34.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[34.0],[2.591401445647517],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.591401445647517]"
619,1,2,female,30.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[30.0],[2.2865306873360445],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.2865306873360445]"
620,0,3,male,32.0,0.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",[32.0],[2.4389660664917807],"[1.0, 0.0, 0.0, 1.0, 0.0, 2.4389660664917807]"
621,0,3,male,30.0,0.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",[30.0],[2.2865306873360445],"[1.0, 0.0, 0.0, 1.0, 0.0, 2.2865306873360445]"


In [10]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mode')
processor.fit_transform(df_train).toPandas()

Unnamed: 0,Survived,Pclass,Sex,Age,Pclass_indexed,Sex_indexed,Pclass_ohe,Sex_ohe,zscore_vec,zscore_scaled,features
0,1,2,female,34.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[34.0],[2.591401445647517],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.591401445647517]"
1,1,2,female,31.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[31.0],[2.3627483769139124],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.3627483769139124]"
2,1,1,male,36.0,1.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",[36.0],[2.743836824803253],"[0.0, 1.0, 0.0, 1.0, 0.0, 2.743836824803253]"
3,1,3,male,29.0,0.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",[29.0],[2.210312997758176],"[1.0, 0.0, 0.0, 1.0, 0.0, 2.210312997758176]"
4,0,2,male,18.0,2.0,0.0,"(0.0, 0.0, 1.0)","(1.0, 0.0)",[18.0],[1.3719184124016266],"[0.0, 0.0, 1.0, 1.0, 0.0, 1.3719184124016266]"
...,...,...,...,...,...,...,...,...,...,...,...
618,1,2,female,34.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[34.0],[2.591401445647517],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.591401445647517]"
619,1,2,female,30.0,2.0,1.0,"(0.0, 0.0, 1.0)","(0.0, 1.0)",[30.0],[2.2865306873360445],"[0.0, 0.0, 1.0, 0.0, 1.0, 2.2865306873360445]"
620,0,3,male,32.0,0.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",[32.0],[2.4389660664917807],"[1.0, 0.0, 0.0, 1.0, 0.0, 2.4389660664917807]"
621,0,3,male,30.0,0.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",[30.0],[2.2865306873360445],"[1.0, 0.0, 0.0, 1.0, 0.0, 2.2865306873360445]"


### TextVectorizer

In [5]:
df_text = spark.read.csv('D:/projects/pyspark_dev/JEOPARDY_CSV.csv', header=True).select(f.regexp_replace(f.col(" Question"), '"', '').alias('questions'))
df_text.limit(5).toPandas()

Unnamed: 0,questions
0,"For the last 8 years of his life, Galileo was ..."
1,No. 2: 1912 Olympian; football star at Carlisl...
2,The city of Yuma in this state has a record av...
3,"In 1963, live on The Art Linkletter Show"
4,"Signer of the Dec. of Indep., framer of the Co..."


In [6]:
vectorizer = TextVectorizer('questions', 'word2vec')

In [7]:
vectorizer.fit(df_text)

<class 'src.ml.preprocessing.text_vectorizer.TextVectorizer'>

In [8]:
vectorizer.transform(df_text).limit(5).toPandas()

Unnamed: 0,questions,tokens,word_vectors
0,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","[0.03557115552636484, 0.04990100726071331, 0.0..."
1,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","[-0.0024237934267148376, -0.025637355963944602..."
2,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","[-0.04773901656476987, 0.03761978359479043, 0...."
3,"In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","[0.02796331257559359, -0.015056118369102478, -..."
4,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","[-0.13217267259541485, 0.004586822042862574, 0..."


In [9]:
vectorizer = TextVectorizer('questions', 'hashing_tfidf')
vectorizer.fit_transform(df_text).limit(5).toPandas()

Unnamed: 0,questions,tokens,unscaled_vectors,word_vectors
0,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [10]:
vectorizer = TextVectorizer('questions', 'tfidf')
vectorizer.fit_transform(df_text).limit(5).toPandas()

Unnamed: 0,questions,tokens,unscaled_vectors,word_vectors
0,"For the last 8 years of his life, Galileo was ...","[for, the, last, 8, years, of, his, life,, gal...","(1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...","(0.6199042418410315, 0.6284836274800071, 0.914..."
1,No. 2: 1912 Olympian; football star at Carlisl...,"[no., 2:, 1912, olympian;, football, star, at,...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.6199042418410315, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,The city of Yuma in this state has a record av...,"[the, city, of, yuma, in, this, state, has, a,...","(1.0, 1.0, 3.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.6199042418410315, 0.6284836274800071, 2.743..."
3,"In 1963, live on The Art Linkletter Show","[in, 1963,, live, on, the, art, linkletter, show]","(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.6199042418410315, 0.0, 0.0, 0.9836374634432..."
4,"Signer of the Dec. of Indep., framer of the Co...","[signer, of, the, dec., of, indep.,, framer, o...","(3.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.8597127255230945, 0.0, 4.57301340068142, 0...."


In [11]:
vectorizer = TextVectorizer('questions', 'teste')
vectorizer.fit_transform(df_text).limit(5).toPandas()

Exception: Method not supported. Choose one from `hashing_tfidf`, `tfidf`, `word2vec`.

## Model

### Binary Classification

In [6]:
from pyspark.ml.classification import LogisticRegression

In [7]:
trainer = SparkTrainer()

In [11]:
df = df_train.unionByName(df_test)

In [13]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [15]:
model = trainer.train(df, True, LogisticRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Survived')

Survived
Confusion Matrix
+-------+---+---+
|Outcome|  0|  1|
+-------+---+---+
|      0| 89| 18|
|      1| 14| 50|
+-------+---+---+


Results
+------------+-----------+
|   Accuracy |   ROC AUC |
|   0.812865 |  0.879308 |
+------------+-----------+

+-----------+-------------+----------+----------+
|   Outcome |   Precision |   Recall |       F1 |
|         0 |    0.864078 | 0.831776 | 0.847619 |
+-----------+-------------+----------+----------+
|         1 |    0.735294 | 0.78125  | 0.757576 |
+-----------+-------------+----------+----------+


In [16]:
model.artifacts

{'model': LogisticRegressionModel: uid=LogisticRegression_407f78518fd8, numClasses=2, numFeatures=6,
 'model_instance': pyspark.ml.classification.LogisticRegressionModel,
 'metrics': {'labels': {0: {'precision': 0.8640776699029126,
    'recall': 0.8317757009345794,
    'f1': 0.8476190476190476},
   1: {'precision': 0.7352941176470589,
    'recall': 0.78125,
    'f1': 0.7575757575757576}},
  'accuracy': 0.8128654970760234,
  'roc_auc': 0.879307827102804},
 'creation_date': datetime.date(2021, 10, 6)}

### Multiclass Classification

In [6]:
from pyspark.ml.classification import LogisticRegression

In [7]:
trainer = SparkTrainer()

In [19]:
df = df_train.unionByName(df_test).withColumn('Survived', f.expr('case when rand() >= 0.7 then 2 else Survived end'))

In [20]:
df.groupby('Survived').count().toPandas()

Unnamed: 0,Survived,count
0,1,234
1,2,268
2,0,389


In [21]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [22]:
model = trainer.train(df, True, LogisticRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Survived', family='multinomial')

Survived
Confusion Matrix
+-------+---+---+---+
|Outcome|  0|  1|  2|
+-------+---+---+---+
|      0| 61|  5|  6|
|      1|  8| 33|  8|
|      2| 31| 13|  6|
+-------+---+---+---+


Results
+------------+
|   Accuracy |
|   0.584795 |
+------------+

+-----------+-------------+----------+----------+
|   Outcome |   Precision |   Recall |       F1 |
|         0 |    0.61     | 0.847222 | 0.709302 |
+-----------+-------------+----------+----------+
|         1 |    0.647059 | 0.673469 | 0.66     |
+-----------+-------------+----------+----------+
|         2 |    0.3      | 0.12     | 0.171429 |
+-----------+-------------+----------+----------+


In [23]:
model.artifacts

{'model': LogisticRegressionModel: uid=LogisticRegression_303a165438bc, numClasses=3, numFeatures=6,
 'model_instance': pyspark.ml.classification.LogisticRegressionModel,
 'metrics': {0: {'f1': 0.7093023255813953,
   'precision': 0.61,
   'recall': 0.8472222222222222},
  1: {'f1': 0.66,
   'precision': 0.6470588235294118,
   'recall': 0.673469387755102},
  2: {'f1': 0.17142857142857143, 'precision': 0.3, 'recall': 0.12}},
 'creation_date': datetime.date(2021, 10, 6)}

### Regression

In [25]:
from pyspark.ml.regression import LinearRegression

In [26]:
trainer = SparkTrainer()

In [27]:
df = df_train.unionByName(df_test)

In [21]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [22]:
model = trainer.train(df, True, LogisticRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Survived', family='multinomial')

Survived
Confusion Matrix
+-------+---+---+---+
|Outcome|  0|  1|  2|
+-------+---+---+---+
|      0| 61|  5|  6|
|      1|  8| 33|  8|
|      2| 31| 13|  6|
+-------+---+---+---+


Results
+------------+
|   Accuracy |
|   0.584795 |
+------------+

+-----------+-------------+----------+----------+
|   Outcome |   Precision |   Recall |       F1 |
|         0 |    0.61     | 0.847222 | 0.709302 |
+-----------+-------------+----------+----------+
|         1 |    0.647059 | 0.673469 | 0.66     |
+-----------+-------------+----------+----------+
|         2 |    0.3      | 0.12     | 0.171429 |
+-----------+-------------+----------+----------+


In [23]:
model.artifacts

{'model': LogisticRegressionModel: uid=LogisticRegression_303a165438bc, numClasses=3, numFeatures=6,
 'model_instance': pyspark.ml.classification.LogisticRegressionModel,
 'metrics': {0: {'f1': 0.7093023255813953,
   'precision': 0.61,
   'recall': 0.8472222222222222},
  1: {'f1': 0.66,
   'precision': 0.6470588235294118,
   'recall': 0.673469387755102},
  2: {'f1': 0.17142857142857143, 'precision': 0.3, 'recall': 0.12}},
 'creation_date': datetime.date(2021, 10, 6)}

### Unsupervised Trainer

In [7]:
trainer = SparkUnsupTrainer()