In [1]:
#Importing PySpark libraries
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
import pandas as pd

In [2]:
spark = SparkSession.builder.master("local[*]").getOrCreate()


In [3]:
#Test the spark
df = spark.sql('''select 'chy' as sweetpea ''')
df.show()

+--------+
|sweetpea|
+--------+
|     chy|
+--------+



In [4]:
spark = SparkSession.builder.appName("Cervical Cancer Classification").getOrCreate()

In [5]:
#Load Dataset
file_loc_csv = 'kag_risk_factors_cervical_cancer.csv'
cervical = spark.read.csv(file_loc_csv, inferSchema=True, header=True)

In [6]:
#Data type
print(type(cervical))

<class 'pyspark.sql.dataframe.DataFrame'>


In [7]:
#Check number of rows
cervical.count()

858

In [8]:
#Check number of columns
len(cervical.columns)

36

In [9]:
#cervical.describe().show()

In [10]:
#Show details of the top 2 rows
cervical.head(2)

[Row(Age=18, Number of sexual partners='4.0', First sexual intercourse='15.0', Num of pregnancies='1.0', Smokes='0.0', Smokes (years)='0.0', Smokes (packs/year)='0.0', Hormonal Contraceptives='0.0', Hormonal Contraceptives (years)='0.0', IUD='0.0', IUD (years)='0.0', STDs='0.0', STDs (number)='0.0', STDs:condylomatosis='0.0', STDs:cervical condylomatosis='0.0', STDs:vaginal condylomatosis='0.0', STDs:vulvo-perineal condylomatosis='0.0', STDs:syphilis='0.0', STDs:pelvic inflammatory disease='0.0', STDs:genital herpes='0.0', STDs:molluscum contagiosum='0.0', STDs:AIDS='0.0', STDs:HIV='0.0', STDs:Hepatitis B='0.0', STDs:HPV='0.0', STDs: Number of diagnosis=0, STDs: Time since first diagnosis='?', STDs: Time since last diagnosis='?', Dx:Cancer=0, Dx:CIN=0, Dx:HPV=0, Dx=0, Hinselmann=0, Schiller=0, Citology=0, Biopsy=0),
 Row(Age=15, Number of sexual partners='1.0', First sexual intercourse='14.0', Num of pregnancies='1.0', Smokes='0.0', Smokes (years)='0.0', Smokes (packs/year)='0.0', Horm

In [11]:
#Print top 5 rows
cervical.show(5)

+---+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+---+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)|IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital herpe

In [12]:
#Check datatypes
cervical.printSchema() 

root
 |-- Age: integer (nullable = true)
 |-- Number of sexual partners: string (nullable = true)
 |-- First sexual intercourse: string (nullable = true)
 |-- Num of pregnancies: string (nullable = true)
 |-- Smokes: string (nullable = true)
 |-- Smokes (years): string (nullable = true)
 |-- Smokes (packs/year): string (nullable = true)
 |-- Hormonal Contraceptives: string (nullable = true)
 |-- Hormonal Contraceptives (years): string (nullable = true)
 |-- IUD: string (nullable = true)
 |-- IUD (years): string (nullable = true)
 |-- STDs: string (nullable = true)
 |-- STDs (number): string (nullable = true)
 |-- STDs:condylomatosis: string (nullable = true)
 |-- STDs:cervical condylomatosis: string (nullable = true)
 |-- STDs:vaginal condylomatosis: string (nullable = true)
 |-- STDs:vulvo-perineal condylomatosis: string (nullable = true)
 |-- STDs:syphilis: string (nullable = true)
 |-- STDs:pelvic inflammatory disease: string (nullable = true)
 |-- STDs:genital herpes: string (nulla

In [13]:
#DATA CLEANING

#Convert data type from string to float
from pyspark.sql.functions import col
cervical = cervical.select(*(col(c).cast("float").alias(c) for c in cervical.columns))
cervical.printSchema()

root
 |-- Age: float (nullable = true)
 |-- Number of sexual partners: float (nullable = true)
 |-- First sexual intercourse: float (nullable = true)
 |-- Num of pregnancies: float (nullable = true)
 |-- Smokes: float (nullable = true)
 |-- Smokes (years): float (nullable = true)
 |-- Smokes (packs/year): float (nullable = true)
 |-- Hormonal Contraceptives: float (nullable = true)
 |-- Hormonal Contraceptives (years): float (nullable = true)
 |-- IUD: float (nullable = true)
 |-- IUD (years): float (nullable = true)
 |-- STDs: float (nullable = true)
 |-- STDs (number): float (nullable = true)
 |-- STDs:condylomatosis: float (nullable = true)
 |-- STDs:cervical condylomatosis: float (nullable = true)
 |-- STDs:vaginal condylomatosis: float (nullable = true)
 |-- STDs:vulvo-perineal condylomatosis: float (nullable = true)
 |-- STDs:syphilis: float (nullable = true)
 |-- STDs:pelvic inflammatory disease: float (nullable = true)
 |-- STDs:genital herpes: float (nullable = true)
 |-- STDs

In [14]:
# Replace invalid data '?' with null 
import pyspark.sql.functions as F
from pyspark.sql.functions import col,when
cervical=cervical.select([when(col(c)=="?",None).otherwise(col(c)).alias(c) for c in cervical.columns])
cervical.show(5)


+----+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+---+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)|IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital her

In [15]:
#Remove all duplicate
cervical.dropDuplicates().show()

+----+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+----+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)| IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital h

In [16]:
#Remove columns that have only null values
cervical.na.drop("all").show()

+----+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+----+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)| IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital h

In [17]:
# checking the sum/count of null values in all columns
from pyspark.sql.functions import col, isnan, when, count

null_count = cervical.select([count(when(isnan(i)| col(i).isNull(), i)).alias(i)for i in cervical.columns])
null_count.show()

+---+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+---+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)|IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital herpe

In [18]:
null_count = null_count.collect()[0].asDict()
null_count

{'Age': 0,
 'Number of sexual partners': 26,
 'First sexual intercourse': 7,
 'Num of pregnancies': 56,
 'Smokes': 13,
 'Smokes (years)': 13,
 'Smokes (packs/year)': 13,
 'Hormonal Contraceptives': 108,
 'Hormonal Contraceptives (years)': 108,
 'IUD': 117,
 'IUD (years)': 117,
 'STDs': 105,
 'STDs (number)': 105,
 'STDs:condylomatosis': 105,
 'STDs:cervical condylomatosis': 105,
 'STDs:vaginal condylomatosis': 105,
 'STDs:vulvo-perineal condylomatosis': 105,
 'STDs:syphilis': 105,
 'STDs:pelvic inflammatory disease': 105,
 'STDs:genital herpes': 105,
 'STDs:molluscum contagiosum': 105,
 'STDs:AIDS': 105,
 'STDs:HIV': 105,
 'STDs:Hepatitis B': 105,
 'STDs:HPV': 105,
 'STDs: Number of diagnosis': 0,
 'STDs: Time since first diagnosis': 787,
 'STDs: Time since last diagnosis': 787,
 'Dx:Cancer': 0,
 'Dx:CIN': 0,
 'Dx:HPV': 0,
 'Dx': 0,
 'Hinselmann': 0,
 'Schiller': 0,
 'Citology': 0,
 'Biopsy': 0}

In [19]:
#Drop columns with over 20% null values i.e 0.2*856 = 171
cols_to_drop = [c for c,i in null_count.items() if i > 171]
cervical = cervical.drop(*cols_to_drop)
cervical.show(5)

+----+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+---+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+---------+------+------+---+----------+--------+--------+------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)|IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital herpes|STDs:molluscum contagiosum|STDs:AIDS|STDs:HIV|STDs:Hepatitis 

In [20]:
#Fitting the other null values
#Importing relevant library
from pyspark.ml.feature import Imputer


#Check columns to fit
cols_to_fit = [c for c,i in null_count.items() if i > 0]
cols_to_fit




['Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Smokes (packs/year)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'IUD (years)',
 'STDs',
 'STDs (number)',
 'STDs:condylomatosis',
 'STDs:cervical condylomatosis',
 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis',
 'STDs:syphilis',
 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes',
 'STDs:molluscum contagiosum',
 'STDs:AIDS',
 'STDs:HIV',
 'STDs:Hepatitis B',
 'STDs:HPV',
 'STDs: Time since first diagnosis',
 'STDs: Time since last diagnosis']

In [21]:
#Fitting the null values
imputer = Imputer(inputCols=['Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Smokes (packs/year)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'IUD (years)',
 'STDs',
 'STDs (number)',
 'STDs:condylomatosis',
 'STDs:cervical condylomatosis',
 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis',
 'STDs:syphilis',
 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes',
 'STDs:molluscum contagiosum',
 'STDs:AIDS',
 'STDs:HIV',
 'STDs:Hepatitis B',
 'STDs:HPV'],
                 outputCols=['Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Smokes (packs/year)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'IUD (years)',
 'STDs',
 'STDs (number)',
 'STDs:condylomatosis',
 'STDs:cervical condylomatosis',
 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis',
 'STDs:syphilis',
 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes',
 'STDs:molluscum contagiosum',
 'STDs:AIDS',
 'STDs:HIV',
 'STDs:Hepatitis B',
 'STDs:HPV'])
model = imputer.fit(cervical)
cervical = model.transform(cervical)



In [22]:
#checking for null/nan values after fitting the data

null_count = cervical.select([count(when(isnan(i)| col(i).isNull(), i)).alias(i)for i in cervical.columns])
null_count = null_count.collect()[0].asDict()
null_count

{'Age': 0,
 'Number of sexual partners': 0,
 'First sexual intercourse': 0,
 'Num of pregnancies': 0,
 'Smokes': 0,
 'Smokes (years)': 0,
 'Smokes (packs/year)': 0,
 'Hormonal Contraceptives': 0,
 'Hormonal Contraceptives (years)': 0,
 'IUD': 0,
 'IUD (years)': 0,
 'STDs': 0,
 'STDs (number)': 0,
 'STDs:condylomatosis': 0,
 'STDs:cervical condylomatosis': 0,
 'STDs:vaginal condylomatosis': 0,
 'STDs:vulvo-perineal condylomatosis': 0,
 'STDs:syphilis': 0,
 'STDs:pelvic inflammatory disease': 0,
 'STDs:genital herpes': 0,
 'STDs:molluscum contagiosum': 0,
 'STDs:AIDS': 0,
 'STDs:HIV': 0,
 'STDs:Hepatitis B': 0,
 'STDs:HPV': 0,
 'STDs: Number of diagnosis': 0,
 'Dx:Cancer': 0,
 'Dx:CIN': 0,
 'Dx:HPV': 0,
 'Dx': 0,
 'Hinselmann': 0,
 'Schiller': 0,
 'Citology': 0,
 'Biopsy': 0}

In [23]:
#Check new number of columns
len(cervical.columns)

34

In [24]:
#MACHINE LEARNING


#Import relevant PySpark Machine Learning Libraries
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator



In [25]:
#Checking the Biopsy data. 
cervical.groupBy("Biopsy").count().show()

+------+-----+
|Biopsy|count|
+------+-----+
|   1.0|   55|
|   0.0|  803|
+------+-----+



In [27]:
#Vector assembling
cols = cervical.columns
cols.remove("Biopsy")
print(cols)
assembler = VectorAssembler(inputCols = cols, outputCol = "features")
cervical.show()

['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology']


In [28]:
#Tranforming the dataset
cervical = assembler.transform(cervical)
cervical.select("features",'Biopsy').show(truncate = False)

+--------------------------------------------------------------------------------------------------------------------------------------------+------+
|features                                                                                                                                    |Biopsy|
+--------------------------------------------------------------------------------------------------------------------------------------------+------+
|(33,[0,1,2,3],[18.0,4.0,15.0,1.0])                                                                                                          |0.0   |
|(33,[0,1,2,3],[15.0,1.0,14.0,1.0])                                                                                                          |0.0   |
|(33,[0,1,2,3],[34.0,1.0,16.99530029296875,1.0])                                                                                             |0.0   |
|(33,[0,1,2,3,4,5,6,7,8,26,28],[52.0,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,1.0,1.0])                   

In [29]:
#apply Scaling to achieve better results
standardscaler = StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
cervical = standardscaler.fit(cervical).transform(cervical)

In [30]:
assembled_data = cervical.select("Scaled_features","Biopsy")
assembled_data.show()

+--------------------+------+
|     Scaled_features|Biopsy|
+--------------------+------+
|(33,[0,1,2,3],[2....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[4....|   0.0|
|(33,[0,1,2,3,4,5,...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3],[4....|   0.0|
|(33,[0,1,2,3,4,5,...|   1.0|
|(33,[0,1,2,3,7,8,...|   0.0|
|(33,[0,1,2,3,26,2...|   0.0|
|(33,[0,1,2,3,4,5,...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3,7,8,...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3,9,10...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3,7,8]...|   0.0|
|(33,[0,1,2,3,7,8,...|   0.0|
|(33,[0,1,2,3,9,10...|   0.0|
+--------------------+------+
only showing top 20 rows



In [31]:
#Splitting into train and test data
train_cervical,test_cervical = assembled_data.randomSplit([0.7,0.3])

train_cervical.show()
test_cervical.show()

+--------------------+------+
|     Scaled_features|Biopsy|
+--------------------+------+
|(33,[0,1,2],[2.23...|   0.0|
|(33,[0,1,2],[2.82...|   0.0|
|(33,[0,1,2],[3.88...|   0.0|
|(33,[0,1,2],[3.88...|   0.0|
|(33,[0,1,2],[4.00...|   0.0|
|(33,[0,1,2],[5.41...|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[2....|   0.0|
|(33,[0,1,2,3],[2....|   0.0|
|(33,[0,1,2,3],[2....|   0.0|
|(33,[0,1,2,3],[2....|   0.0|
|(33,[0,1,2,3],[2....|   0.0|
+--------------------+------+
only showing top 20 rows

+--------------------+------+
|     Scaled_features|Biopsy|
+--------------------+------+
|(33,[0,1,2],[1.52...|   0.0|
|(33,[0,1,2],[2.70...|   0.0|
|(33,[0,1,2],[4.00...|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3],[1....|   0.0|
|(33,[0,1,2,3]

In [32]:
#LOGISTIC REGRESSION
log_reg = LogisticRegression(labelCol = "Biopsy", featuresCol = "Scaled_features", maxIter=40)
model = log_reg.fit(train_cervical)

model

LogisticRegressionModel: uid=LogisticRegression_20de95b2fcf0, numClasses=2, numFeatures=33

In [33]:
prediction_test = model.transform(test_cervical)
prediction_test.show()



+--------------------+------+--------------------+--------------------+----------+
|     Scaled_features|Biopsy|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(33,[0,1,2],[1.52...|   0.0|[5.58824845372962...|[0.99627237153245...|       0.0|
|(33,[0,1,2],[2.70...|   0.0|[5.65702715706826...|[0.99651927250372...|       0.0|
|(33,[0,1,2],[4.00...|   0.0|[5.78602786304180...|[0.99693924393295...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[5.64499978484119...|[0.99647730409374...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[5.52500361581862...|[0.99602996968235...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[5.65549339595949...|[0.99651394842802...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[5.70695712323777...|[0.99668823627142...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[5.53376189117525...|[0.99606445223535...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[5.53202655541358...|[0.99605764375709...|       0.0|
|(33

In [34]:
prediction_test.select("Biopsy","prediction").show(10)


+------+----------+
|Biopsy|prediction|
+------+----------+
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
+------+----------+
only showing top 10 rows



In [35]:
#compute raw scores on the test set
predictionAndLabels = prediction_test.select("Biopsy","prediction").rdd
predictionAndLabels.collect()


[Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.0, prediction=0.0),
 Row(Biopsy=0.

In [36]:
metrics = BinaryClassificationMetrics(predictionAndLabels)




In [37]:
#Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)


Area under ROC = 0.8374208860759493


In [38]:
evaluator = MulticlassClassificationEvaluator(labelCol = "Biopsy", predictionCol = "prediction", metricName = "accuracy")
accuracy_log_reg = evaluator.evaluate(prediction_test)
print("Logistic Reg Accuracy = ",accuracy_log_reg)

Logistic Reg Accuracy =  0.9683794466403162


In [39]:
#NAIVEBAYES
naive_bayes = NaiveBayes(labelCol = "Biopsy", featuresCol = "Scaled_features", smoothing = 1.0)

model = naive_bayes.fit(train_cervical)
model



NaiveBayesModel: uid=NaiveBayes_cf409dd69025, modelType=multinomial, numClasses=2, numFeatures=33

In [40]:
#select example rows to display
prediction_test = model.transform(test_cervical)
prediction_test.show()



+--------------------+------+--------------------+--------------------+----------+
|     Scaled_features|Biopsy|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(33,[0,1,2],[1.52...|   0.0|[-9.5808914477509...|[0.99863451852707...|       0.0|
|(33,[0,1,2],[2.70...|   0.0|[-14.011632583146...|[0.99965887485007...|       0.0|
|(33,[0,1,2],[4.00...|   0.0|[-19.056981644766...|[0.99993557187088...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[-13.705364689730...|[0.99945864115312...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[-13.469717378786...|[0.99948296825971...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[-12.160712023854...|[0.99929381819685...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[-12.565039958075...|[0.99943396628588...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[-13.679997370788...|[0.99951188313946...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[-15.434930028666...|[0.99964675702078...|       0.0|
|(33

In [41]:
prediction_test.select("Biopsy","prediction").show(10)

predictionAndLabels = prediction_test.select("Biopsy","prediction").rdd



+------+----------+
|Biopsy|prediction|
+------+----------+
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
+------+----------+
only showing top 10 rows



In [42]:
metrics = BinaryClassificationMetrics(predictionAndLabels)



In [43]:
#Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under ROC = 0.7049305320230431


In [44]:
#Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol = "Biopsy", predictionCol = "prediction", metricName = "accuracy")
accuracy_naive_bayes = evaluator.evaluate(prediction_test)
print("Naive Bayes Accuracy = ",accuracy_naive_bayes)


Naive Bayes Accuracy =  0.9288537549407114


In [45]:
#RANDOM FOREST CLASSIFIER

random_forest_classifier = RandomForestClassifier(labelCol = "Biopsy", featuresCol = "Scaled_features", numTrees = 40)
model = random_forest_classifier.fit(train_cervical)
model



RandomForestClassificationModel: uid=RandomForestClassifier_9cdfd271d805, numTrees=40, numClasses=2, numFeatures=33

In [46]:
prediction_test = model.transform(test_cervical)
prediction_test.show()



+--------------------+------+--------------------+--------------------+----------+
|     Scaled_features|Biopsy|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(33,[0,1,2],[1.52...|   0.0|[38.5186909633311...|[0.96296727408327...|       0.0|
|(33,[0,1,2],[2.70...|   0.0|[39.5495904342305...|[0.98873976085576...|       0.0|
|(33,[0,1,2],[4.00...|   0.0|[39.450881989533,...|[0.98627204973832...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[39.5052078172636...|[0.98763019543159...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[39.5052078172636...|[0.98763019543159...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[39.5052078172636...|[0.98763019543159...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[39.5052078172636...|[0.98763019543159...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[39.5052078172636...|[0.98763019543159...|       0.0|
|(33,[0,1,2,3],[1....|   0.0|[39.5052078172636...|[0.98763019543159...|       0.0|
|(33

In [47]:
prediction_test.select("Biopsy","prediction").show(10)


+------+----------+
|Biopsy|prediction|
+------+----------+
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
|   0.0|       0.0|
+------+----------+
only showing top 10 rows



In [48]:
predictionAndLabels = prediction_test.select("Biopsy","prediction").rdd.map(lambda row: row[0:])

metrics = BinaryClassificationMetrics(predictionAndLabels)



In [49]:
#Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)




Area under ROC = 0.8529116465863453


In [50]:
#Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol = "Biopsy", predictionCol = "prediction", metricName = "accuracy")
accuracy_random_forest = evaluator.evaluate(prediction_test)
print("Random Forest Accuracy = ",accuracy_random_forest)





Random Forest Accuracy =  0.9525691699604744


In [51]:
print("Accuracy of Logistics Reg = ",accuracy_log_reg)
print("Accuracy of Naive Bayes   = ",accuracy_naive_bayes)
print("Accuracy of Random Forest = ",accuracy_random_forest)

Accuracy of Logistics Reg =  0.9683794466403162
Accuracy of Naive Bayes   =  0.9288537549407114
Accuracy of Random Forest =  0.9525691699604744


In [295]:
#KMEANS Clustering
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer


In [296]:
#identify observation clusters by applying K-means clustering on the data variables: age, number of sexual partners, 
#age of first sexual experience,number of pregnancies,hormonal contraceptives(years), HPV STD  

In [291]:
cervical.columns

['Age',
 'Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Smokes (packs/year)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'IUD (years)',
 'STDs',
 'STDs (number)',
 'STDs:condylomatosis',
 'STDs:cervical condylomatosis',
 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis',
 'STDs:syphilis',
 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes',
 'STDs:molluscum contagiosum',
 'STDs:AIDS',
 'STDs:HIV',
 'STDs:Hepatitis B',
 'STDs:HPV',
 'STDs: Number of diagnosis',
 'Dx:Cancer',
 'Dx:CIN',
 'Dx:HPV',
 'Dx',
 'Hinselmann',
 'Schiller',
 'Citology',
 'Biopsy',
 'features',
 'Scaled_features']

In [297]:
cervical = cervical.select('Age',
 'Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Hormonal Contraceptives (years)',
 'STDs:HPV',
 'Biopsy')
cervical.show()

+----+-------------------------+------------------------+------------------+-------------------------------+--------+------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Hormonal Contraceptives (years)|STDs:HPV|Biopsy|
+----+-------------------------+------------------------+------------------+-------------------------------+--------+------+
|18.0|                      4.0|                    15.0|               1.0|                            0.0|     0.0|   0.0|
|15.0|                      1.0|                    14.0|               1.0|                            0.0|     0.0|   0.0|
|34.0|                      1.0|                 16.9953|               1.0|                            0.0|     0.0|   0.0|
|52.0|                      5.0|                    16.0|               4.0|                            3.0|     0.0|   0.0|
|46.0|                      3.0|                    21.0|               4.0|                           15.0|     0.0|   0.0|


In [298]:
#Vector assembling
cols = cervical.columns
cols.remove("Biopsy")
assembler = VectorAssembler(inputCols = cols, outputCol = "features")
cervical.show()

+----+-------------------------+------------------------+------------------+-------------------------------+--------+------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Hormonal Contraceptives (years)|STDs:HPV|Biopsy|
+----+-------------------------+------------------------+------------------+-------------------------------+--------+------+
|18.0|                      4.0|                    15.0|               1.0|                            0.0|     0.0|   0.0|
|15.0|                      1.0|                    14.0|               1.0|                            0.0|     0.0|   0.0|
|34.0|                      1.0|                 16.9953|               1.0|                            0.0|     0.0|   0.0|
|52.0|                      5.0|                    16.0|               4.0|                            3.0|     0.0|   0.0|
|46.0|                      3.0|                    21.0|               4.0|                           15.0|     0.0|   0.0|


In [299]:
#Tranforming the dataset
cervical = assembler.transform(cervical)
cervical.select("features",'Biopsy').show(truncate = False)

+------------------------------------------+------+
|features                                  |Biopsy|
+------------------------------------------+------+
|[18.0,4.0,15.0,1.0,0.0,0.0]               |0.0   |
|[15.0,1.0,14.0,1.0,0.0,0.0]               |0.0   |
|[34.0,1.0,16.99530029296875,1.0,0.0,0.0]  |0.0   |
|[52.0,5.0,16.0,4.0,3.0,0.0]               |0.0   |
|[46.0,3.0,21.0,4.0,15.0,0.0]              |0.0   |
|[42.0,3.0,23.0,2.0,0.0,0.0]               |0.0   |
|[51.0,3.0,17.0,6.0,0.0,0.0]               |1.0   |
|[26.0,1.0,26.0,3.0,2.0,0.0]               |0.0   |
|[45.0,1.0,20.0,5.0,0.0,0.0]               |0.0   |
|[44.0,3.0,15.0,2.2755610942840576,0.0,0.0]|0.0   |
|[44.0,3.0,26.0,4.0,2.0,0.0]               |0.0   |
|[27.0,1.0,17.0,3.0,8.0,0.0]               |0.0   |
|[45.0,4.0,14.0,6.0,10.0,0.0]              |0.0   |
|[44.0,2.0,25.0,2.0,5.0,0.0]               |0.0   |
|[43.0,2.0,18.0,5.0,0.0,0.0]               |0.0   |
|[40.0,3.0,18.0,2.0,15.0,0.0]              |0.0   |
|[41.0,4.0,2

In [300]:
cervical.groupBy("Number of sexual partners").count().show()


+-------------------------+-----+
|Number of sexual partners|count|
+-------------------------+-----+
|                      9.0|    1|
|                      5.0|   44|
|                      7.0|    7|
|                      2.0|  272|
|                      3.0|  208|
|                     10.0|    1|
|                2.5276442|   26|
|                      1.0|  206|
|                      6.0|    9|
|                      8.0|    4|
|                     15.0|    1|
|                      4.0|   78|
|                     28.0|    1|
+-------------------------+-----+



In [301]:
cervical.groupBy("Number of sexual partners").count().show()

+-------------------------+-----+
|Number of sexual partners|count|
+-------------------------+-----+
|                      9.0|    1|
|                      5.0|   44|
|                      7.0|    7|
|                      2.0|  272|
|                      3.0|  208|
|                     10.0|    1|
|                2.5276442|   26|
|                      1.0|  206|
|                      6.0|    9|
|                      8.0|    4|
|                     15.0|    1|
|                      4.0|   78|
|                     28.0|    1|
+-------------------------+-----+



In [302]:
cervical.groupBy("Biopsy").count().show()

+------+-----+
|Biopsy|count|
+------+-----+
|   1.0|   55|
|   0.0|  803|
+------+-----+



In [303]:
kmeans = KMeans(featuresCol = "features", k =2)

model = kmeans.fit(cervical)
model



KMeansModel: uid=KMeans_6b0a42668e7e, k=2, distanceMeasure=euclidean, numFeatures=6

In [304]:
model.transform(cervical).groupBy("prediction").count().show()



+----------+-----+
|prediction|count|
+----------+-----+
|         1|  312|
|         0|  546|
+----------+-----+



In [305]:
predictions = model.transform(cervical)
predictions.show()



+----+-------------------------+------------------------+------------------+-------------------------------+--------+------+--------------------+----------+
| Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Hormonal Contraceptives (years)|STDs:HPV|Biopsy|            features|prediction|
+----+-------------------------+------------------------+------------------+-------------------------------+--------+------+--------------------+----------+
|18.0|                      4.0|                    15.0|               1.0|                            0.0|     0.0|   0.0|[18.0,4.0,15.0,1....|         0|
|15.0|                      1.0|                    14.0|               1.0|                            0.0|     0.0|   0.0|[15.0,1.0,14.0,1....|         0|
|34.0|                      1.0|                 16.9953|               1.0|                            0.0|     0.0|   0.0|[34.0,1.0,16.9953...|         1|
|52.0|                      5.0|                    16.0| 

In [306]:
predictions.groupBy("Biopsy","prediction").count().show()



+------+----------+-----+
|Biopsy|prediction|count|
+------+----------+-----+
|   1.0|         0|   31|
|   0.0|         0|  515|
|   0.0|         1|  288|
|   1.0|         1|   24|
+------+----------+-----+



In [307]:
#Evaluate clustering by computing Silhouette score
evaluator = new ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")

SyntaxError: invalid syntax (Temp/ipykernel_18604/2575530022.py, line 2)

In [None]:

#SAMPLE
/ Loads data.
val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

// Trains a k-means model.
val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)

// Make predictions
val predictions = model.transform(dataset)

// Evaluate clustering by computing Silhouette score
val evaluator = new ClusteringEvaluator()

val silhouette = evaluator.evaluate(predictions)
println(s"Silhouette with squared euclidean distance = $silhouette")

// Shows the result.
println("Cluster Centers: ")
model.clusterCenters.foreach(println)














