## Bibliotecas

In [None]:
import sys
# Extend the import path with the directory three levels above the working
# directory; presumably this is the repository root that holds the `src`
# package imported in the next cell (confirm against the project layout).
sys.path.append('../../../')

In [None]:
## Spark SQL
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Spark ML
from src.ml.preprocessing.preprocessing import SparkPreprocessor
from src.ml.preprocessing.normalization import SparkScaler
from src.ml.preprocessing.text_vectorizer import TextVectorizer
from src.ml.model.trainer import SparkTrainer, SparkUnsupTrainer
from src.ml.model.metrics import Metrics, CustomRegressionEvaluator, CustomBinaryEvaluator
from src.ml.analysis.pca import SparkPCA
from src.ml.analysis.cluster import SparkCluster
from src.ml.analysis.feature_selection import FeatureSelector

In [None]:
import mlflow.pyspark.ml

In [None]:
# Session used by every cell below; getOrCreate() reuses an already running
# session if there is one.
spark = (
    SparkSession.builder
    # Serialize with Kryo instead of the default serializer.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Request 6 GB of memory for the driver process.
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

## Data

In [None]:
# Read the raw train and test datasets from Parquet; both paths are relative
# to the working directory.
df_train = spark.read.parquet('../../../data/raw/raw_train')
df_test = spark.read.parquet('../../../data/raw/raw_test')

In [None]:
# Preview the first five training rows as a pandas DataFrame.
df_train.limit(5).toPandas()

## Preprocessing

### Scaling

In [None]:
# Scaler for the 'Age' column using the 'max_abs' strategy name
# (presumably max-abs scaling; confirm against SparkScaler).
scaler = SparkScaler('Age', 'max_abs')

In [None]:
# Two-step usage: fit on the training data first...
scaler.fit(df_train)

In [None]:
# ...then transform it and collect the whole result to pandas for display.
scaler.transform(df_train).toPandas()

In [None]:
# One-step usage on a fresh scaler: fit and transform in a single call.
scaler = SparkScaler('Age', 'max_abs')
scaler.fit_transform(df_train).toPandas()

### SparkPreprocessor

In [None]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mode')

In [None]:
processor.fit(df_train)

In [None]:
processor.transform(df_train).toPandas()

In [None]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mode')
processor.fit_transform(df_train).toPandas()

### TextVectorizer

In [None]:
df_text = spark.read.csv('D:/projects/pyspark_dev/JEOPARDY_CSV.csv', header=True).select(f.regexp_replace(f.col(" Question"), '"', '').alias('questions'))
df_text.limit(5).toPandas()

In [None]:
vectorizer = TextVectorizer('questions', 'word2vec')

In [None]:
vectorizer.fit(df_text)

In [None]:
vectorizer.transform(df_text).limit(5).toPandas()

In [None]:
vectorizer = TextVectorizer('questions', 'hashing_tfidf')
vectorizer.fit_transform(df_text).limit(5).toPandas()

In [None]:
vectorizer = TextVectorizer('questions', 'tfidf')
vectorizer.fit_transform(df_text).limit(5).toPandas()

In [None]:
vectorizer = TextVectorizer('questions', 'teste')
vectorizer.fit_transform(df_text).limit(5).toPandas()

## Model

### Binary Classification

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
trainer = SparkTrainer()

In [None]:
df = df_train.unionByName(df_test)

In [None]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [None]:
model = trainer.train(df, True, LogisticRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Survived')

In [None]:
model.artifacts

### Multiclass Classification

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
trainer = SparkTrainer()

In [None]:
df = df_train.unionByName(df_test).withColumn('Survived', f.expr('case when rand() >= 0.7 then 2 else Survived end'))

In [None]:
df.groupby('Survived').count().toPandas()

In [None]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [None]:
model = trainer.train(df, True, LogisticRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Survived', family='multinomial')

In [None]:
model.artifacts

### Regression

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
trainer = SparkTrainer()

In [None]:
df = df_train.unionByName(df_test)

In [None]:
processor = SparkPreprocessor(['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [None]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCol='Age', outputCol='Age')
df = imputer.fit(df).transform(df)

In [None]:
model = trainer.train(df, False, LinearRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Age')

In [None]:
model.artifacts

#### Cross Validation

In [None]:
model = trainer.train(df, False, LinearRegression,  data_split=('cv', {'numFolds': 4, 'param_grid': {'regParam': [0, 1, 2]}}), labelCol = 'Age')

In [None]:
model.artifacts

In [None]:
evaluator = CustomRegressionEvaluator('mape', 'Age')
split = ('cv', {'numFolds': 4, 'param_grid': {'regParam': [0, 1, 2], 'elasticNetParam': [0, 0.5, 1]}, 'evaluator': evaluator, })

In [None]:
model = trainer.train(df, False, LinearRegression,  data_split=split, labelCol = 'Age')

In [None]:
model.artifacts

### Unsupervised Trainer

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
trainer = SparkUnsupTrainer()

In [None]:
df = df_train.unionByName(df_test)

In [None]:
processor = SparkPreprocessor({'robust': 'Age'}, impute_strategy = 'mean')
df = processor.fit_transform(df)

In [None]:
model = trainer.train(df, KMeans)

In [None]:
model.artifacts

#### Mudando a métrica de distância da validação

In [None]:
# Retrain KMeans, passing distanceMeasure='cosine' through `metric_params`
# only; per the section heading this changes the distance used for validation
# rather than for the clustering itself (confirm against SparkUnsupTrainer).
model = trainer.train(df, KMeans, metric_params={'distanceMeasure': 'cosine'})

In [None]:
# Inspect what the trainer stored on the returned object.
model.artifacts

#### Mudando a métrica de distância da clusterização

In [None]:
df = df_train.unionByName(df_test).withColumn('rand', f.rand())

processor = SparkPreprocessor({'robust': ['Age', 'rand']}, impute_strategy = 'mean')
df = processor.fit_transform(df)

model = trainer.train(df, KMeans, metric_params={'distanceMeasure': 'cosine'}, k = 5, distanceMeasure = 'cosine')

In [None]:
model.artifacts

### Custom Metrics

#### Binary Classification

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
metrics = ['accuracy','roc_auc', 'precision', 'recall', 'f1']

In [None]:
processor = SparkPreprocessor({'robust': ['Age']}, ['Sex', 'Pclass'], impute_strategy = 'mean')
df = processor.fit_transform(df_train)
lr = LogisticRegression(labelCol = 'Survived')

df_pred = lr.fit(df).transform(df)
df_pred.limit(5).toPandas()

In [None]:
for metric in metrics:
    evaluator = CustomBinaryEvaluator(metric, 'Survived')
    print(metric, evaluator.evaluate(df_pred))

In [None]:
for metric in metrics:
    if metric not in ['accuracy', 'roc_auc']:
        evaluator = CustomBinaryEvaluator(metric, 'Survived', 1)
        print(metric, evaluator.evaluate(df_pred))

#### Regression

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import Imputer

In [None]:
metrics = ['mape', 'smape', 'weighted_mape']

In [None]:
processor = SparkPreprocessor(cat_cols = ['Sex', 'Pclass'], impute_strategy = 'mean')
df = processor.fit_transform(df_train)

imputer = Imputer(inputCol='Age', outputCol='Age')
df = imputer.fit(df).transform(df)

lr = LinearRegression(labelCol = 'Age')

df_pred = lr.fit(df).transform(df)
df_pred.limit(5).toPandas()

In [None]:
for metric in metrics:
    evaluator = CustomRegressionEvaluator(metric, 'Age')
    print(metric, evaluator.evaluate(df_pred))

### Wrapper

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
trainer = SparkTrainer()

In [None]:
df = df_train.unionByName(df_test)

In [None]:
processor = SparkPreprocessor({'zscore': 'Age'}, ['Pclass', 'Sex'], impute_strategy = 'mean')
df = processor.fit_transform(df)

In [None]:
model = trainer.train(df, True, LogisticRegression,  data_split=('train_test', {'test_size': 0.2}), labelCol = 'Survived')

In [None]:
model.artifacts

In [None]:
model.predict(df).toPandas()

In [None]:
model.predict_proba(df).toPandas()

In [None]:
model.predict_proba(df, True).toPandas()

In [None]:
model.save('teste')

In [None]:
model.load('teste')

In [None]:
model.get_metrics()

In [None]:
model.get_model()

In [None]:
model.get_model_instance()

## Analysis

### Cluster

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
df = df_train.unionByName(df_test).withColumn('rand', f.rand())

In [None]:
processor = SparkPreprocessor({'robust': ['Age', 'rand']}, impute_strategy = 'mean')
df = processor.fit_transform(df)

In [None]:
cluster = SparkCluster()

In [None]:
cluster.analyzeK(df)

In [None]:
trainer = SparkUnsupTrainer()

df_pred = trainer.train(df, KMeans).artifacts['model'].transform(df)

In [None]:
df_pred.toPandas()

In [None]:
pdf = df_pred.toPandas()
cluster.plot_cluster(pdf, 'Age', 'rand', 'prediction')

### PCA

In [None]:
df = df_train.unionByName(df_test).withColumn('rand', f.rand()).withColumn('rand2', f.rand())

processor = SparkPreprocessor({'robust': ['Age', 'rand', 'rand2']}, impute_strategy = 'mean')
df = processor.fit_transform(df)
df.toPandas()

In [None]:
pca = SparkPCA('features', k = 2)
pca.fit_transform(df).toPandas()

In [None]:
pca = SparkPCA('features', k = 0.9)
pca.fit_transform(df).toPandas()

### Feature Selection

In [None]:
df = df_train.unionByName(df_test)
processor = SparkPreprocessor(cat_cols=['Pclass', 'Sex'])
scaler = SparkScaler('Age', 'robust')

df = (
    scaler.fit_transform(processor.fit_transform(df))
    .withColumnRenamed('features', 'cat_features')
    .withColumnRenamed('robust_scaled', 'num_features')
)
df.toPandas()

In [None]:
selector = FeatureSelector('num_features', 'cat_features', 'Survived', 'categorical')
selector.select(df).toPandas()

In [None]:
selector = FeatureSelector('num_features', 'cat_features', 'Survived', 'categorical', selectionThreshold=1)
selector.select(df).select('features').toPandas()

In [None]:
df = df_train.unionByName(df_test).withColumn('rand', f.rand()).withColumn('rand2', f.rand())
processor = SparkPreprocessor(cat_cols=['Pclass', 'Sex'])
scaler = SparkScaler(['Age', 'rand', 'rand2'], 'robust')

df = (
    scaler.fit_transform(processor.fit_transform(df))
    .withColumnRenamed('features', 'cat_features')
    .withColumnRenamed('robust_scaled', 'num_features')
)

selector = FeatureSelector('num_features', 'cat_features', 'Survived', 'categorical', 'fpr')
selector.select(df).select('num_features', 'cat_features', 'selected_continuous', 'selected_categorical', 'features').toPandas()