Note: you may need to restart the kernel to use updated packages.


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import cassandra
import pyspark
import re
import os
import random
from random import randint, randrange
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.clustering import KMeans
import seaborn as sns
from pyspark.ml.stat import Correlation
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import PCA, Imputer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, asc
from pyspark.sql.functions import isnan
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml import Pipeline
import pickle

In [2]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows =  5, truncate = True):
    """Display the first rows of a Spark DataFrame as a pandas table.

    Args:
        df: Object exposing ``limit(n)`` whose result exposes ``toPandas()``
            (a Spark DataFrame in this notebook).
        limitRows: Maximum number of rows to fetch and to render.
        truncate: When true, pandas' column width limit is set to 50;
            when false, the column width limit is removed.

    Returns:
        None. The table is rendered through ``display``.
    """
    # None is the pandas spelling for "no limit"; the former sentinel -1 is
    # deprecated since pandas 1.0 and rejected by later releases.
    col_width = 50 if truncate else None

    # option_context applies both options only while the table is rendered and
    # restores the caller's previous values afterwards, even if display() raises.
    with pd.option_context('display.max_colwidth', col_width,
                           'display.max_rows', limitRows):
        display(df.limit(limitRows).toPandas())

In [3]:
def correlation_matrix(df, corr_columns, method='pearson'):
    """Compute the pairwise correlation matrix of the given columns.

    Args:
        df: Spark DataFrame containing every column named in ``corr_columns``.
        corr_columns: Names of the columns to correlate; they are assembled
            into a single vector column before the correlation is computed.
        method: Correlation method name forwarded to ``Correlation.corr``.

    Returns:
        A pandas DataFrame indexed and labelled by ``corr_columns`` holding the
        correlation coefficients.
    """
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)

    # The result frame has a single row with a single matrix cell whose column
    # name embeds the method ("pearson(...)", "spearman(...)"). Reading the cell
    # by position keeps non-default methods from failing on a hard-coded name.
    corr = Correlation.corr(df_vector, vector_col, method).head()[0]

    # toArray() yields the 2-D matrix directly, so no manual reshape is needed.
    return pd.DataFrame(corr.toArray(), columns=corr_columns, index=corr_columns)

In [4]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          ylabel='Model 1',
                          xlabel='Model 2'):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Args:
        cm: 2-D array-like of counts (rows are plotted on the y axis).
        classes: Tick labels used for both axes.
        normalize: When true, every row is divided by its row total before
            printing and plotting; rows whose total is zero stay all-zero.
        title: Figure title.
        cmap: Matplotlib colormap used for the heat map.
        ylabel: Label of the y axis.
        xlabel: Label of the x axis.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; an empty row would
        # otherwise produce NaN cells and a NaN colour threshold.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text so they stay legible.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    # Run the layout pass after the axis labels exist so they are accounted for.
    plt.tight_layout()

In [5]:
from cassandra.cluster import Cluster

# Connect to the Cassandra cluster through the contact point 'dse' and open
# the session that the following cells use to run CQL.
cluster = Cluster(contact_points=['dse'])
session = cluster.connect()

In [6]:
# Create the 'accelerate' keyspace when it does not exist yet
# (SimpleStrategy with a replication factor of 1).
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS accelerate 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
)

<cassandra.cluster.ResultSet at 0x7f32b5e09490>

In [7]:
# Make 'accelerate' the session's keyspace so later statements can use
# unqualified table names.
session.set_keyspace('accelerate')

In [8]:
# Create the 'diabetes' table (keyed by Id) when it does not exist yet.
# NOTE(review): 'diabetesPedegree' looks like a misspelling of "pedigree";
# later cells reference the column under this name, so renaming it would need
# matching changes there.
query = "CREATE TABLE IF NOT EXISTS diabetes \
                                   (Id int, timesPregnant int, plasmaGlucose int, bloodPressure int, \
                                   tricepThickness int, serumInsulin int, bmi float, diabetesPedegree float, \
                                   age int, label int, PRIMARY KEY (Id))"
session.execute(query)

<cassandra.cluster.ResultSet at 0x7f32b5e07750>

In [9]:
fileName = 'data/pima-indians-diabetes.csv'

# The INSERT text is identical for every row, so build it once up front.
query = "INSERT INTO diabetes (Id, timesPregnant, plasmaGlucose, bloodPressure, \
                                   tricepThickness, serumInsulin, bmi, diabetesPedegree, \
                                   age, label)"
query = query + " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

# The context manager closes the file even if an insert or a conversion fails.
with open(fileName, 'r') as input_file:
    # Skip blank lines (e.g. a trailing newline at the end of the file), which
    # would otherwise fail in int(); Ids are assigned 1, 2, 3, ... to the
    # remaining rows in file order.
    data_lines = (line for line in input_file if line.strip())
    for iD, line in enumerate(data_lines, start=1):
        row = line.split(',')
        session.execute(query, (iD, int(row[0]), int(row[1]), int(row[2]), int(row[3]), int(row[4]), float(row[5]), float(row[6]), int(row[7]), int(row[8])))

In [10]:
# Start (or reuse) a Spark session named 'demo' with master "local".
spark = (
    SparkSession.builder
    .appName('demo')
    .master("local")
    .getOrCreate()
)

# Load the accelerate.diabetes table through the Cassandra data source.
cassandra_reader = spark.read.format("org.apache.spark.sql.cassandra")
diabetesDF = cassandra_reader.options(table="diabetes", keyspace="accelerate").load()

print ("Table Row Count: ")
print (diabetesDF.count())

Table Row Count: 
768


In [11]:
# Count, per column, how many rows hold the value 0.
column_names = diabetesDF.schema.names
print(column_names)

# One aggregation computes every column's count in a single Spark job instead
# of running a separate filter + count job per column. when() yields NULL for
# rows that do not match and count() skips NULLs, so the numbers are the same.
zero_counts = diabetesDF.select(
    [F.count(F.when(F.col(c_name) == 0, 1)).alias(c_name) for c_name in column_names]
).first()
print(list(zero_counts))

['id', 'age', 'bloodpressure', 'bmi', 'diabetespedegree', 'label', 'plasmaglucose', 'seruminsulin', 'timespregnant', 'tricepthickness']
[0, 0, 35, 11, 0, 500, 5, 374, 111, 227]


In [12]:
# Replace 0 with NaN in the listed columns; every other value is kept.
# Presumably a zero reading in these measurements means "not recorded" - confirm.
for zero_as_nan_col in ["bloodpressure", "plasmaglucose", "tricepthickness", "seruminsulin", "bmi"]:
    current_value = F.col(zero_as_nan_col)
    diabetesDF = diabetesDF.withColumn(
        zero_as_nan_col,
        F.when(current_value == 0, float("nan")).otherwise(current_value),
    )

In [13]:
# Re-count, per column, how many rows hold the value 0.
column_names = diabetesDF.schema.names
print(column_names)

# One aggregation computes every column's count in a single Spark job instead
# of running a separate filter + count job per column. when() yields NULL for
# rows that do not match and count() skips NULLs, so the numbers are the same.
zero_counts = diabetesDF.select(
    [F.count(F.when(F.col(c_name) == 0, 1)).alias(c_name) for c_name in column_names]
).first()
print(list(zero_counts))

['id', 'age', 'bloodpressure', 'bmi', 'diabetespedegree', 'label', 'plasmaglucose', 'seruminsulin', 'timespregnant', 'tricepthickness']
[0, 0, 0, 0, 0, 500, 0, 0, 111, 0]


In [14]:
# Fit an Imputer (default settings) on the columns that now contain NaN and
# write the filled values to parallel "out_<column>" columns, leaving the
# source columns untouched.
imputed_source_cols = ["plasmaglucose", "bloodpressure", "bmi", "tricepthickness", "seruminsulin"]
imputer = Imputer(
    inputCols=imputed_source_cols,
    outputCols=["out_" + source_col for source_col in imputed_source_cols],
)
model = imputer.fit(diabetesDF)
diabetesDF_imputed = model.transform(diabetesDF)
showDF(diabetesDF_imputed,100)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure
0,23,41,90.0,39.799999,0.451,1,196.0,,7,,39.799999,196.0,29.15342,155.548223,90.0
1,114,25,62.0,34.0,0.391,0,76.0,,4,,34.0,76.0,29.15342,155.548223,62.0
2,660,27,82.0,34.200001,1.292,1,80.0,70.0,3,31.0,34.200001,80.0,31.0,70.0,82.0
3,53,30,66.0,24.4,0.342,0,88.0,23.0,5,21.0,24.4,88.0,21.0,23.0,66.0
4,110,24,85.0,37.400002,0.247,1,95.0,36.0,0,25.0,37.400002,95.0,25.0,36.0,85.0
5,91,21,55.0,19.1,0.258,0,80.0,,1,,19.1,80.0,29.15342,155.548223,55.0
6,128,23,58.0,33.299999,0.261,0,118.0,94.0,1,36.0,33.299999,118.0,36.0,94.0,58.0
7,363,65,108.0,39.200001,0.305,0,103.0,,5,37.0,39.200001,103.0,37.0,155.548223,108.0
8,251,42,52.0,31.200001,0.38,0,106.0,,9,,31.200001,106.0,29.15342,155.548223,52.0
9,744,45,94.0,32.700001,0.734,1,140.0,,9,,32.700001,140.0,29.15342,155.548223,94.0


In [19]:
# Persist the fitted imputer. A plain save() raises when the target path already
# exists, so re-running the notebook would fail here; overwrite() makes the
# cell repeatable, matching how the full pipeline model is saved further down.
# NOTE: `model` is the fitted imputer only while this cell runs before the
# pipeline cell, which rebinds the same name.
model.write().overwrite().save("/home/jovyan/work/models/imputer_model")

In [25]:
# Keep only rows whose imputed tricep-thickness AND serum-insulin values are
# not NaN. The filters are chained: previously the second filter started again
# from diabetesDF_imputed, silently discarding the first condition.
# isnan() states the NaN test explicitly instead of comparing against a NaN
# literal.
tempDF = (
    diabetesDF_imputed
    .filter(~F.isnan(F.col("out_tricepthickness")))
    .filter(~F.isnan(F.col("out_seruminsulin")))
)

print ("Table Row Count: ")
print (tempDF.count())

showDF(tempDF,100)

Table Row Count: 
768


Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure
0,23,41,90.0,39.799999,0.451,1,196.0,,7,,39.799999,196.0,29.15342,155.548223,90.0
1,114,25,62.0,34.0,0.391,0,76.0,,4,,34.0,76.0,29.15342,155.548223,62.0
2,660,27,82.0,34.200001,1.292,1,80.0,70.0,3,31.0,34.200001,80.0,31.0,70.0,82.0
3,53,30,66.0,24.4,0.342,0,88.0,23.0,5,21.0,24.4,88.0,21.0,23.0,66.0
4,110,24,85.0,37.400002,0.247,1,95.0,36.0,0,25.0,37.400002,95.0,25.0,36.0,85.0
5,91,21,55.0,19.1,0.258,0,80.0,,1,,19.1,80.0,29.15342,155.548223,55.0
6,128,23,58.0,33.299999,0.261,0,118.0,94.0,1,36.0,33.299999,118.0,36.0,94.0,58.0
7,363,65,108.0,39.200001,0.305,0,103.0,,5,37.0,39.200001,103.0,37.0,155.548223,108.0
8,251,42,52.0,31.200001,0.38,0,106.0,,9,,31.200001,106.0,29.15342,155.548223,52.0
9,744,45,94.0,32.700001,0.734,1,140.0,,9,,32.700001,140.0,29.15342,155.548223,94.0


In [26]:
# Columns packed, in this order, into the single 'features' vector column.
feature_columns = [
    'age',
    'out_bloodpressure',
    'out_bmi',
    'diabetespedegree',
    'out_plasmaglucose',
    'out_seruminsulin',
    'timespregnant',
    'out_tricepthickness',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features', handleInvalid="keep")

# Add the assembled vector to the imputed frame and preview the result.
dDF = assembler.transform(diabetesDF_imputed)
showDF(dDF)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure,features
0,23,41,90.0,39.799999,0.451,1,196.0,,7,,39.799999,196.0,29.15342,155.548223,90.0,"[41.0, 90.0, 39.79999923706055, 0.451000005006..."
1,114,25,62.0,34.0,0.391,0,76.0,,4,,34.0,76.0,29.15342,155.548223,62.0,"[25.0, 62.0, 34.0, 0.39100000262260437, 76.0, ..."
2,660,27,82.0,34.200001,1.292,1,80.0,70.0,3,31.0,34.200001,80.0,31.0,70.0,82.0,"[27.0, 82.0, 34.20000076293945, 1.292000055313..."
3,53,30,66.0,24.4,0.342,0,88.0,23.0,5,21.0,24.4,88.0,21.0,23.0,66.0,"[30.0, 66.0, 24.399999618530273, 0.34200000762..."
4,110,24,85.0,37.400002,0.247,1,95.0,36.0,0,25.0,37.400002,95.0,25.0,36.0,85.0,"[24.0, 85.0, 37.400001525878906, 0.24699999392..."


In [28]:
# 70/30 train/test split of the assembled frame. The fixed seed makes the
# sampling repeatable, so the row counts and the scores computed from these
# splits can be reproduced on a re-run over the same input.
splits = dDF.randomSplit([0.7, 0.3], seed=42)
train, test = splits

print ("Train Dataframe Row Count: ")
print (train.count())
print ("Test Datafram Row Count: ")
print (test.count())

Train Dataframe Row Count: 
536
Test Datafram Row Count: 
232


In [29]:
# Random forest with 10 trees on the assembled 'features' column. The fixed
# seed makes the stochastic training repeatable for the same training data.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, seed=42)
rf_model = rf.fit(train)

# Score the held-out rows and preview the prediction columns.
rf_predictions = rf_model.transform(test)
showDF(rf_predictions)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure,features,rawPrediction,probability,prediction
0,2,31,66.0,26.6,0.351,0,85.0,,1,29.0,26.6,85.0,29.0,155.548223,66.0,"[31.0, 66.0, 26.600000381469727, 0.35100001096...","[8.349324188213066, 1.6506758117869345]","[0.8349324188213065, 0.16506758117869344]",0.0
1,5,33,40.0,43.099998,2.288,1,137.0,168.0,0,35.0,43.099998,137.0,35.0,168.0,40.0,"[33.0, 40.0, 43.099998474121094, 2.28800010681...","[4.734329487242798, 5.265670512757202]","[0.47343294872427977, 0.5265670512757202]",1.0
2,10,54,96.0,,0.232,1,125.0,,8,,32.457464,125.0,29.15342,155.548223,96.0,"[54.0, 96.0, 32.45746368650086, 0.231999993324...","[5.894224127214578, 4.105775872785421]","[0.5894224127214578, 0.4105775872785421]",0.0
3,33,22,58.0,24.799999,0.267,0,88.0,54.0,3,11.0,24.799999,88.0,11.0,54.0,58.0,"[22.0, 58.0, 24.799999237060547, 0.26699998974...","[9.758234811001202, 0.24176518899879781]","[0.9758234811001202, 0.024176518899879783]",0.0
4,50,24,,,0.305,0,105.0,,7,,32.457464,105.0,29.15342,155.548223,72.405184,"[24.0, 72.40518417462484, 32.45746368650086, 0...","[6.191913836544945, 3.8080861634550547]","[0.6191913836544944, 0.38080861634550545]",0.0


In [30]:
# Accuracy of the random-forest predictions on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
rf_score = evaluator.evaluate(rf_predictions)
print("Test set accuracy for Random Forest Classifier = ", rf_score, sep="")

Test set accuracy for Random Forest Classifier = 0.7801724137931034


In [31]:
# Chain the imputer, the vector assembler and the random-forest estimator into
# one pipeline, so fitting and scoring can start from the un-imputed frame.
pipeline = Pipeline(stages=[imputer, assembler, rf])

In [32]:
# 70/30 train/test split of the un-imputed frame for the pipeline. The fixed
# seed makes the sampling repeatable on a re-run over the same input.
# NOTE: this rebinds `splits`, `train` and `test` from the earlier split.
splits = diabetesDF.randomSplit([0.7, 0.3], seed=42)
train, test = splits

print ("Train Dataframe Row Count: ")
print (train.count())
print ("Test Datafram Row Count: ")
print (test.count())

Train Dataframe Row Count: 
547
Test Datafram Row Count: 
221


In [33]:
# Fit every pipeline stage on the training split.
# NOTE: this rebinds `model`, which earlier held the fitted imputer.
model = pipeline.fit(train)

In [34]:
# Run the fitted pipeline on the test split and preview the output columns.
prediction = model.transform(test)
showDF(prediction)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness,out_bmi,out_plasmaglucose,out_tricepthickness,out_seruminsulin,out_bloodpressure,features,rawPrediction,probability,prediction
0,2,31,66.0,26.6,0.351,0,85.0,,1,29.0,26.6,85.0,29.0,157.021818,66.0,"[31.0, 66.0, 26.600000381469727, 0.35100001096...","[6.9904045279405915, 3.009595472059409]","[0.6990404527940591, 0.30095954720594087]",0.0
1,5,33,40.0,43.099998,2.288,1,137.0,168.0,0,35.0,43.099998,137.0,35.0,168.0,40.0,"[33.0, 40.0, 43.099998474121094, 2.28800010681...","[3.035844073186449, 6.964155926813551]","[0.3035844073186449, 0.6964155926813551]",1.0
2,16,32,,30.0,0.484,1,100.0,,7,,30.0,100.0,28.662269,157.021818,72.480916,"[32.0, 72.48091603053435, 30.0, 0.483999997377...","[6.2238205515972105, 3.7761794484027886]","[0.622382055159721, 0.3776179448402789]",0.0
3,30,38,92.0,34.099998,0.337,0,117.0,,5,,34.099998,117.0,28.662269,157.021818,92.0,"[38.0, 92.0, 34.099998474121094, 0.33700001239...","[3.974485791828168, 6.025514208171832]","[0.3974485791828168, 0.6025514208171832]",1.0
4,52,26,50.0,24.200001,0.526,0,101.0,36.0,1,15.0,24.200001,101.0,15.0,36.0,50.0,"[26.0, 50.0, 24.200000762939453, 0.52600002288...","[9.82753338825974, 0.1724666117402595]","[0.9827533388259739, 0.01724666117402595]",0.0


In [35]:
# Preview the training split that was passed to pipeline.fit.
showDF(train)

Unnamed: 0,id,age,bloodpressure,bmi,diabetespedegree,label,plasmaglucose,seruminsulin,timespregnant,tricepthickness
0,1,50,72.0,33.599998,0.627,1,148.0,,6,35.0
1,8,29,,35.299999,0.134,0,115.0,,10,
2,10,54,96.0,,0.232,1,125.0,,8,
3,11,30,92.0,37.599998,0.191,0,110.0,,4,
4,13,57,80.0,27.1,1.441,0,139.0,,10,


In [39]:
# Persist the fitted pipeline model, replacing any previous save at this path.
model.write().overwrite().save("/home/jovyan/work/models/full_pipeline_model")

In [46]:
# Load a pipeline model back from the path the fitted pipeline was saved to.
sameModel = pyspark.ml.PipelineModel.load("/home/jovyan/work/models/full_pipeline_model")

In [47]:
# Score the test split with the reloaded model.
# NOTE(review): presumably meant to be compared against `prediction`; no
# comparison is visible here.
prediction2 = sameModel.transform(test)