# Demo Cross Validation

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import cassandra
import pyspark
import re
import os
import random
from random import randint, randrange
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.clustering import KMeans
import seaborn as sns
from pyspark.ml.stat import Correlation
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import PCA, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, asc
from pyspark.sql.functions import isnan
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#### Helper function to have nicer formatting of Spark DataFrames

In [None]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows =  5, truncate = True):
    """Display the first ``limitRows`` rows of a Spark DataFrame as a pandas table.

    Args:
        df: Spark DataFrame to preview; only ``limitRows`` rows are collected
            to the driver via ``limit(...).toPandas()``.
        limitRows: Maximum number of rows to fetch and render.
        truncate: When true, cell text is cut at 50 characters; otherwise
            cells are rendered at full width.
    """
    # None is pandas' "no limit" value for max_colwidth; the old -1 sentinel
    # was deprecated in pandas 1.0 and is rejected by current releases.
    max_colwidth = 50 if truncate else None
    # option_context restores both display options on exit, so a preview
    # never changes how later DataFrames in the notebook are rendered.
    with pd.option_context('display.max_colwidth', max_colwidth,
                           'display.max_rows', limitRows):
        display(df.limit(limitRows).toPandas())

In [None]:
def correlation_matrix(df, corr_columns, method='pearson'):
    """Compute the pairwise correlation matrix of ``corr_columns``.

    Args:
        df: Spark DataFrame containing the columns to correlate.
        corr_columns: List of column names; they are assembled into one
            vector column before being passed to ``Correlation.corr``.
        method: Correlation method forwarded to ``Correlation.corr``
            (defaults to ``'pearson'``).

    Returns:
        A pandas DataFrame whose rows and columns are both labelled with
        ``corr_columns``.
    """
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)

    # The result holds a single cell whose column is named after the method
    # ("pearson(...)", "spearman(...)"); read it by position so that every
    # supported method works, not only the default.
    result = matrix.collect()[0][0].values
    return pd.DataFrame(result.reshape(-1, len(corr_columns)), columns=corr_columns, index=corr_columns)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts; the y axis is labelled as the true
                 label and the x axis as the predicted label.
    classes   -- tick labels, used for both axes.
    normalize -- when true, every row is divided by its row sum before
                 printing and plotting.
    title     -- title of the plot.
    cmap      -- matplotlib colour map used for the heat map.

    The figure is drawn on the current matplotlib axes; the function does
    not call ``plt.show()`` itself.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Two decimals for ratios, plain integers for raw counts.
    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    half_max = cm.max() / 2.
    for r in range(cm.shape[0]):
        for c in range(cm.shape[1]):
            text_color = "white" if cm[r, c] > half_max else "black"
            plt.text(c, r, format(cm[r, c], cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

<img src="images/dselogo.png" width="400" height="200">

## Creating Tables and Loading Tables

### Connect to Cassandra

In [None]:
from cassandra.cluster import Cluster

# 'dse' is the single contact point handed to the driver
# (presumably the hostname of the DSE node; confirm against the deployment).
cluster = Cluster(['dse'])
# `session` is the handle every CQL statement in the cells below runs through.
session = cluster.connect()

### Create Demo Keyspace 

In [None]:
# Create the `accelerate` keyspace if it is missing; IF NOT EXISTS makes
# re-running this cell harmless. Replication factor 1 keeps a single copy
# of each row.
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS accelerate 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
)

### Set keyspace 

In [None]:
# Make `accelerate` the default keyspace so the table names used in the
# following CQL statements do not need a keyspace prefix.
session.set_keyspace('accelerate')

### Create table called `iris`. Our PRIMARY KEY will be the `Id` column, which uniquely identifies each row.

In [None]:
# Schema of the `iris` table: one row per flower, keyed by `Id`.
# The Spark cells further down refer to these columns in lower case
# (e.g. 'petallengthcm', 'species').
query = "CREATE TABLE IF NOT EXISTS iris \
                                   (Id int, SepalLengthCm float, SepalWidthCm float, \
                                   PetalLengthCm float, PetalWidthCm float, Species text, \
                                   PRIMARY KEY (Id))"
session.execute(query)

### Load dataset from CSV file

#### Insert all the Data into the Apache Cassandra table `iris`

In [None]:
import csv

fileName = 'data/datasets_19_420_Iris.csv'

# The statement is the same for every row, so build it once outside the loop.
query = "INSERT INTO iris (Id, SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm, Species)"
query = query + " VALUES (%s, %s, %s, %s, %s, %s)"

# `with` closes the file even if an insert fails; newline='' is the mode the
# csv module expects so that it can handle line endings itself.
with open(fileName, 'r', newline='') as input_file:
    reader = csv.reader(input_file)
    next(reader, None)  # skip the header line
    for row in reader:
        if not row:
            # Blank line (e.g. at the end of the file): nothing to insert.
            continue
        # csv.reader strips the line terminator, so Species is stored as
        # "Iris-setosa" rather than "Iris-setosa\n"; strip() also removes
        # stray padding around the value.
        session.execute(query, (int(row[0]), float(row[1]), float(row[2]), float(row[3]), float(row[4]), row[5].strip()))
    

## Loading with Apache Spark
<img src="images/sparklogo.png" width="150" height="200">

In [None]:
# Start (or reuse) a local Spark session named 'demo'.
spark = (
    SparkSession.builder
    .appName('demo')
    .master("local")
    .getOrCreate()
)

# Load the Cassandra table accelerate.iris as a Spark DataFrame.
irisDF = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="iris", keyspace="accelerate")
    .load()
)

# Sanity check: number of rows read from the table.
print("Table Row Count: ")
print(irisDF.count())

In [None]:
# Preview the first rows of the iris DataFrame (showDF defaults to 5 rows).
showDF(irisDF)

In [None]:
# Turn the text column `species` into the numeric `label` column the
# classifier is trained on.
labelIndexer = StringIndexer(
    inputCol="species",
    outputCol="label",
    handleInvalid='keep',
)
labelModel = labelIndexer.fit(irisDF)
training = labelModel.transform(irisDF)

showDF(training)

In [None]:
# Collect the labelled data to pandas and plot how many rows each
# indexed class contains.
irisPD = training.toPandas()
sns.countplot(y=irisPD["label"])
plt.xlabel("Count of each Target class")
plt.ylabel("Target classes")
plt.show()

In [None]:
# Pack the four measurement columns into the single `features` vector
# column that the Spark ML estimators below read.
featureColumns = ['petallengthcm', 'sepalwidthcm', 'petalwidthcm', 'sepallengthcm']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')

trainingData = assembler.transform(training)

In [None]:
# 80/20 random split; the fixed seed (124) is passed so the split can be
# reproduced.
splits = trainingData.randomSplit([0.8, 0.2], seed=124)
train, test = splits

# Report the size of each part.
for caption, frame in (("Train Dataframe Row Count: ", train),
                       ("Test Datafram Row Count: ", test)):
    print(caption)
    print(frame.count())

In [None]:
# Fit a 10-tree random forest on the training part ...
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=10,
)
model = rf.fit(train)

# ... and score the held-out test part.
predictions = model.transform(test)
showDF(predictions)

In [None]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

In [None]:
# F1 score of the random-forest predictions on the test set.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
f1 = evaluator.evaluate(predictions)
print("Test set f1 score = %s" % f1)

In [None]:
# Show up to 30 test-set rows together with the columns added by the model.
showDF(predictions, 30)

In [None]:
# Cross-validate the random forest over two forest sizes (5 and 15 trees),
# scoring each candidate by accuracy.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 15])
    .build()
)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
)
cvModel = cv.fit(trainingData)

# Number of folds used, then the average metric for each grid entry.
print(cv.getNumFolds())
print(cvModel.avgMetrics)

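Average cross-validation metrics recorded from earlier runs (one value per `numTrees` setting in the grid, `[5, 15]`):

3-Folds
Accuracy [0.9469198790627362, 0.9350151171579743]
F1 Score [0.9468202574624536, 0.9347017340401462]

5-Folds
Accuracy [0.9672969966629588, 0.9605990783410137]
F1 Score [0.9672383456365659, 0.9608919863240464]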

## Linear Regression Dataset

In [None]:
# Schema of the `linear` table: an integer key plus one (x, y) point per row.
query = "CREATE TABLE IF NOT EXISTS linear \
                                   (Id int, x float, y float, PRIMARY KEY (Id))"
session.execute(query)

In [None]:
import csv

fileName = 'data/lin_test.csv'

# The statement is the same for every row, so build it once outside the loop.
query = "INSERT INTO linear (Id, x, y)"
query = query + " VALUES (%s, %s, %s)"

# Running primary key. It is deliberately left in the notebook namespace:
# the cell that loads data/lin_train.csv keeps counting from this value so
# both files share one Id sequence.
iD = 1

# `with` closes the file even if an insert fails; newline='' is the mode the
# csv module expects.
with open(fileName, 'r', newline='') as input_file:
    reader = csv.reader(input_file)
    next(reader, None)  # skip the header line
    for row in reader:
        if len(row) < 2:
            # Blank or truncated line: no (x, y) pair to insert.
            continue
        session.execute(query, (int(iD), float(row[0]), float(row[1])))
        iD = iD + 1

In [None]:
import csv

fileName = 'data/lin_train.csv'

# The statement is the same for every row, so build it once outside the loop.
query = "INSERT INTO linear (Id, x, y)"
query = query + " VALUES (%s, %s, %s)"

# NOTE: `iD` is not reset here. It continues from the value left by the
# cell that loaded data/lin_test.csv, so that cell must be run first.
with open(fileName, 'r', newline='') as input_file:
    reader = csv.reader(input_file)
    next(reader, None)  # skip the header line
    for row in reader:
        # Only lines with exactly two fields are inserted; anything else
        # (e.g. a blank line) is skipped without consuming an Id.
        if len(row) == 2:
            session.execute(query, (int(iD), float(row[0]), float(row[1])))
            iD = iD + 1

In [None]:
# Start (or reuse) the local Spark session named 'demo'.
spark = (
    SparkSession.builder
    .appName('demo')
    .master("local")
    .getOrCreate()
)

# Load the Cassandra table accelerate.linear as a Spark DataFrame.
linDF = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="linear", keyspace="accelerate")
    .load()
)

# Sanity check: number of rows read from the table.
print("Table Row Count: ")
print(linDF.count())

In [None]:
# Preview up to 100 rows of the linear-regression data.
showDF(linDF,100)

In [None]:
# Remove any existing `label` column before it is (re)created in the next
# cell. Presumably this is here so the cells can be re-run on the same
# `linDF`; the Cassandra table itself only has Id, x and y.
linDF = linDF.drop('label')

In [None]:
# The regression target is `y`; expose it under the name `label`, which is
# the label column the estimator and evaluators below are configured with.
linDF = linDF.withColumn("label", col("y"))

showDF(linDF)

In [None]:
# Wrap the single predictor `x` into the `features` vector column.
assembler = VectorAssembler(inputCols=['x'], outputCol='features')
linDF = assembler.transform(linDF)

showDF(linDF)

In [None]:
# Cut the data into k random parts of equal weight; these are the folds
# used by the manual cross-validation loop below.
k = 5
foldWeights = [1/k for _ in range(k)]
splits = linDF.randomSplit(foldWeights)

showDF(splits[0])
print("Split 0 Row Count: %s" % splits[0].count())

In [None]:
# Manual k-fold cross validation of a linear regression over `splits`:
# each fold is used once as the test set while the remaining k-1 folds are
# unioned into the training set. R2 and RMSE are recorded per fold.
if k < 2:
    raise ValueError("k-fold cross validation needs k >= 2")

r2 = []
rmse = []
test_split_nums = []
evaluator1 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
evaluator2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

# The estimator settings do not change between folds, so configure it once.
lr = LinearRegression(labelCol="label", featuresCol="features",maxIter=10, regParam=0.3, elasticNetParam=0.8)

for i in range(k):
    test = splits[i]

    # Training data = every fold except the held-out one, in fold order.
    # This works for any k >= 2 without special-casing small values.
    train_folds = [fold for j, fold in enumerate(splits) if j != i]
    train = train_folds[0]
    for fold in train_folds[1:]:
        train = train.union(fold)

    model = lr.fit(train)
    # After the loop, `predictions` holds the output for the last fold only.
    predictions = model.transform(test)
    r2.append(evaluator1.evaluate(predictions))
    rmse.append(evaluator2.evaluate(predictions))
    test_split_nums.append(i)

print(test_split_nums)
print(r2)

In [None]:
# One record per held-out fold, combining the metrics gathered above.
results = [
    {"Test Split Number": split_num, "R2": r2_score, "RMSE": rmse_score}
    for split_num, r2_score, rmse_score in zip(test_split_nums, r2, rmse)
]
print(results)

# Turn the records into a Spark DataFrame so they can be shown as a table.
resultRows = [pyspark.sql.Row(**entry) for entry in results]
resultsDF = spark.createDataFrame(resultRows)
showDF(resultsDF)

In [None]:
# Actual values: y against x. `predictions` is the output of the last
# fold evaluated in the cross-validation loop.
plt.scatter(predictions.select("x").toPandas(), predictions.select("y").toPandas())

In [None]:
# Model output: predicted value against x, for the same (last) fold.
plt.scatter(predictions.select("x").toPandas(), predictions.select("prediction").toPandas())

In [None]:
# Overlay actual y ('.' markers) and predicted values ('x' markers), both
# plotted against the row index of the last fold's predictions.
plt.plot(predictions.select("y").toPandas(), '.', predictions.select("prediction").toPandas(), 'x')