In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier,LinearSVC,LogisticRegression
import pandas as pd



# User Input and Classifier List

In [14]:
# Number of equal-weight random parts the data set is split into.
# The first part is later held out as the test set.
bootstrap_count = 5

# Classifier classes (not instances); one model of each class is fitted per training part.
classifiers = [
    LogisticRegression,
    DecisionTreeClassifier,
    LinearSVC,
]


In [15]:
# Lift pandas' display truncation so printed frames show every row and every column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


# Reading the File and Dropping Unnecessary Columns

In [16]:
# Start (or reuse) a Spark session.
spark = SparkSession.builder.getOrCreate()

# Load the CSV with a header row and inferred column types, then discard
# the "_c32" and "id" columns, which are not used as features.
df = (
    spark.read
    .csv("BCancer.csv", header=True, inferSchema=True)
    .drop("_c32", "id")
)
df.printSchema()

root
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radius_worst: double (nullable = true)


# Vector Assembler

In [17]:
# Every column except the 'diagnosis' target becomes an input feature.
feature_cols = [name for name in df.columns if name != 'diagnosis']

# Pack the feature columns into a single 'features' vector column.
va = VectorAssembler(inputCols=feature_cols, outputCol='features')
vaDf = va.transform(df)

# Preview the assembled vectors next to their diagnosis values.
vaDf.select('features', 'diagnosis').show()

+--------------------+---------+
|            features|diagnosis|
+--------------------+---------+
|[17.99,10.38,122....|        M|
|[20.57,17.77,132....|        M|
|[19.69,21.25,130....|        M|
|[11.42,20.38,77.5...|        M|
|[20.29,14.34,135....|        M|
|[12.45,15.7,82.57...|        M|
|[18.25,19.98,119....|        M|
|[13.71,20.83,90.2...|        M|
|[13.0,21.82,87.5,...|        M|
|[12.46,24.04,83.9...|        M|
|[16.02,23.24,102....|        M|
|[15.78,17.89,103....|        M|
|[19.17,24.8,132.4...|        M|
|[15.85,23.95,103....|        M|
|[13.73,22.61,93.6...|        M|
|[14.54,27.54,96.7...|        M|
|[14.68,20.13,94.7...|        M|
|[16.13,20.68,108....|        M|
|[19.81,22.15,130....|        M|
|[13.54,14.36,87.4...|        B|
+--------------------+---------+
only showing top 20 rows



# String Indexer

In [18]:
# Encode the 'diagnosis' strings as a numeric 'label' column.
sInd = StringIndexer(inputCol='diagnosis', outputCol='label')
indexer_model = sInd.fit(vaDf)

# Keep only the two columns the classifiers consume.
# NOTE: this rebinds `df`, replacing the frame loaded from the CSV earlier.
df = indexer_model.transform(vaDf).select('features', 'label')


# Creating Bootstraps

In [19]:
# Bootstraps
# Split the labelled data into `bootstrap_count` equally weighted random parts.
# A fixed seed keeps the partitioning (and therefore every downstream
# prediction) repeatable across kernel restarts; without one, each run
# draws a different split.
# NOTE: randomSplit produces disjoint parts rather than samples drawn with
# replacement, so these are "bootstraps" in name only.
split_weights = [1 / bootstrap_count] * bootstrap_count
splits = df.randomSplit(split_weights, seed=42)

# The first part is held out for evaluation; the remaining parts are the
# training sets, one per group of fitted models.
test, *bootstraps = splits

# Number of rows in the held-out test set.
test.count()

108

# Fitting, Testing, and Selecting the Prediction Column of Each Model

In [None]:
# Fit one model per (training part, classifier class) pair, score the held-out
# test set with it, and keep only its 'prediction' column as a pandas frame.
predicted = []
for bootstrap in bootstraps:
    for classifier in classifiers:
        estimator = classifier(labelCol='label', featuresCol='features')
        fitted_model = estimator.fit(bootstrap)
        scored = fitted_model.transform(test).select('prediction')
        predicted.append(scored.toPandas())
            

# Stacking prediction columns of all models into One

In [None]:

# Place the per-model prediction frames side by side, one column per fitted model.
prediction = pd.concat(predicted, axis="columns")

# Row count of the combined table.
prediction.shape[0]

In [None]:
# Display the side-by-side table holding every fitted model's predictions.
prediction

# Voting from all predictions 

In [None]:

import numpy as np
import builtins as p

# Majority vote per test row: take the predicted value that occurs most often
# across all model columns. On a tie, max() keeps the first such value in
# column order.
final = pd.DataFrame(
    [p.max(votes, key=list(votes).count) for _, votes in prediction.iterrows()],
    columns=['final'],
)

# Append the winning vote as a 'final' column beside the individual predictions.
final_prediction = pd.concat([prediction, final], axis=1)


In [None]:
# Display each model's prediction together with the majority-vote 'final' column.
final_prediction