In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler,StringIndexer, OneHotEncoder
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.ml import Pipeline

In [2]:
def print_metrics(predictions_and_labels):
    
    metrics = MulticlassMetrics(predictions_and_labels)
    auc_roc = model2.avgMetrics[0]
    print("Summary Stats")
    print("Accuracy = %s" % float(auc_roc)*100)
    print("Gini = %s" % float(2 * auc_roc — 1)*100)
    print("precision", float(metrics.precision())*100)
    print("recall", float(metrics.recall())*100)

    print('Confusion Matrix\n', metrics.confusionMatrix())

In [4]:
spark = SparkSession.builder.appName('APP').getOrCreate()

In [5]:
def get_df_columns_train(df):
    df_rows = df.select(
        df.v_0.cast("String"),
        df.v_1.cast("Double"),
        df.v_2.cast("Double"),
        df.v_3.cast("Double"),
        df.v_4.cast("Double"),
        df.v_5.cast("Double"),
        df.v_6.cast("Double"),
        df.v_7.cast("Double"),
        df.v_8.cast("Double"),
        df.v_9.cast("Double"),
        df.v_10.cast("Double"),
        df.v_11.cast("Double"),
        df.v_12.cast("Integer").alias("label")
    )
    return df_rows

In [6]:
def get_df_columns_test(df):
    df_rows = df.select(
        df.v_0.cast("String"),
        df.v_1.cast("Double"),
        df.v_2.cast("Double"),
        df.v_3.cast("Double"),
        df.v_4.cast("Double"),
        df.v_5.cast("Double"),
        df.v_6.cast("Double"),
        df.v_7.cast("Double"),
        df.v_8.cast("Double"),
        df.v_9.cast("Double"),
        df.v_10.cast("Double"),
        df.v_11.cast("Double"),
        df.v_12.cast("Integer").alias("label")
    )
    return df_rows

In [7]:
def df_train():

    df = spark.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .load("data/dataset.csv")
    
    #data selections
    my_df = get_df_columns_train(df)
    feature_columns = my_df.columns[1:-1]
    
    #data preparations
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    
    pipeline = Pipeline(stages=[assembler])
    pipelineModel = pipeline.fit(my_df)
    pipelineModel.write().overwrite().save("data/model/Model_GBTS")
    
    dataset = pipelineModel.transform(my_df)
    
    #data partitions
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3])
    
    #gbts
    iteration = 100
    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=iteration)
    
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    
    paramGrid = ParamGridBuilder().build()


    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=gbt, 
                        estimatorParamMaps=paramGrid, 
                        evaluator=evaluator, 
                        numFolds=5)
    
    
    #Train model using CV
    model = cv.fit(trainingData).bestModel
    model.write().overwrite().save("data/model/CVModel_LGR")
    
    ## Predict and evaluate
    predictions = model.transform(testData)
    
    
    #BinaryClassificationEvaluator
    evaluator2 = BinaryClassificationEvaluator(labelCol="label")
    
    accuracy = evaluator2.evaluate(predictions)
    print("Accuracy:", accuracy)
    print("Test Error = %g" % (1.0 — accuracy))


    predictions_and_labels = predictions.select("prediction", "label").rdd.map(lambda r: (float(r[0]), float(r[1])))
    print_metrics(predictions_and_labels)    
    
    return model      

In [8]:
def df_test(df):
    my_df = get_df_columns_test(df)
    feature_columns = my_df.columns[1:]

    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    pipeline = Pipeline(stages=[assembler])
    
    pipelineModel = pipeline.fit(my_df)
    
    dataset = pipelineModel.transform(my_df)
    
    loadedPipeline = PipelineModel.read().load("data/model/CVModel_GBTS")
    predictions = loadedPipeline.transform(dataset)

    return predictions

In [9]:
# _train = False
# _test = True
_train = True
_test = False

if _train:
    print("Training Model")
    my_model = df_train()
elif _test:
    print("Testing Model")
    

Training Model
Accuracy: 0.7480509395868928
Summary Stats
Accuracy = 0.6912599318955732
Confusion Matrix
 DenseMatrix([[428.,  42.],
             [230., 181.]])
