# Classification using Logistic Regression

We use the logistic regression algorithm to predict whether a person is diabetic or not, based on their health data.

To add PySpark to sys.path so that the code can run in the Jupyter IDE, we use the findspark package.

In [None]:
# findspark locates the local Spark installation and adds PySpark to
# sys.path, so it has to run before the first `import pyspark`; otherwise the
# import fails on machines where PySpark is not already importable.
import findspark
findspark.init()

import pyspark

# Last expression of the cell: shows the Spark location findspark detected.
findspark.find()

To perform any task on Spark you need to start a Spark session; here we start a session for our logistic regression.

In [None]:
from pyspark.sql import SparkSession

# Obtain the session for this part of the notebook under the application
# name "Logistic App".
spark = (
    SparkSession.builder
    .appName("Logistic App")
    .getOrCreate()
)

To start, we are loading the diabetes dataset 

In [None]:
# Read the CSV with its first line as the header. No schema is given or
# inferred here; a later cell casts the columns to float.
diabetes = spark.read.csv(path='diabetes.csv', header=True)

# Inspect the column names/types and a sample of rows.
diabetes.printSchema()
diabetes.show()

# Dropping unwanted columns

We need to drop unwanted columns from the dataset. Looking at the dataset, we can see that the column 'PatientID' has no relevance in predicting diabetes. To gain this kind of insight in a more complex problem, we would have to formulate a hypothesis and then evaluate it.

In [None]:
colm = 'PatientID'
# DataFrame.drop removes exactly the named column. A `column not in colm`
# test against the string 'PatientID' is a substring check, so it would also
# remove any column whose name happens to be contained in it (e.g. 'ID').
db_df = diabetes.drop(colm)
db_df.printSchema()

# Changing the column datatype

We need to change the datatype of every column from the initial string datatype to float.

In [None]:
from pyspark.sql.functions import col

# Re-project every column as a float while keeping its original name.
db_df = db_df.select(
    [col(name).cast('float').alias(name) for name in db_df.columns]
)
db_df.printSchema()

# Taking the count of the null and missing values

In [None]:
from pyspark.sql.functions import col, count, isnan, when

# A float column can hold NaN as well as null, and isNull() alone does not
# flag NaN. A value is counted as missing when it is either of the two.
db_df.select(
    [
        count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        for c in db_df.columns
    ]
).show()

# List of columns to vector form

We use VectorAssembler to combine the list of feature columns in our dataset into a single vector column that groups all the features together.

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the 'Diabetic' label is a model input.
features = db_df.drop('Diabetic')

# Pack the input columns into one vector column named "features".
assembler = VectorAssembler(inputCols=features.columns, outputCol="features")

# Keep only the assembled vector and the label for modelling.
assembled = assembler.transform(db_df)
output = assembled.select('features', 'Diabetic')

# Splitting the data into training and testing datasets

The dataset in vector form is now split into train and test dataset fractions.


In [None]:
# Random split weighted 0.75 / 0.25 between training and testing rows.
# No seed is passed, so the partition differs from run to run.
train, test = output.randomSplit(weights=[0.75, 0.25])

train.show()
test.show()

# Train the Logistic Regression Model

We are using a Logistic Regression model for the classification problem. In the following step we train the logistic regression model with labels and features. This kind of training, where both labels and features are used, is known as supervised learning.

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled vector, limited to 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    labelCol='Diabetic',
    featuresCol='features',
)
lrModel = lr.fit(train)

# Coefficients of the Model

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Sort the fitted coefficients in ascending order so the plot shows their
# spread.
coeff = np.sort(lrModel.coefficients)
plt.plot(coeff)
plt.ylabel('Coefficients')
# The x axis is the position of each coefficient in the sorted array, not a
# training iteration, so it is labelled accordingly.
plt.xlabel('Coefficient index (sorted)')
plt.show()

# Predictions and labels

To compare the predictions with actual labels we use model.transform

In [None]:
# Score the held-out rows with the fitted logistic regression model.
predictions = lrModel.transform(test)
predictions.show(n=10)

# Expose the ground truth under the column name "label" and show the result.
predictions = predictions.withColumnRenamed(existing="Diabetic", new="label")
predictions.show(n=10)

# Confusion matrix

In [None]:
import sklearn

from sklearn.metrics import   confusion_matrix
from sklearn import metrics

# Collect both columns in a single job so every label stays paired with its
# own prediction, then unwrap the Row objects into plain scalars so sklearn
# receives flat sequences instead of lists of single-field Rows.
label_pred_rows = predictions.select('label', 'prediction').collect()
y_true = [row['label'] for row in label_pred_rows]
y_pred = [row['prediction'] for row in label_pred_rows]

print("Confusion Matrix", confusion_matrix(y_true, y_pred))

# Accuracy, Precision, Recall, F1-Score 

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

evaluator = BinaryClassificationEvaluator()
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# One evaluator serves every metric: the metric name is overridden per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

# "Precision" and "Recall" below are the weighted variants requested above.
print('Precision', weightedPrecision)
print('Accuracy', acc)
print('F1-Score', f1)
print('Recall', weightedRecall)


# Area Under ROC
The ROC curve is plotted from the model summary (model.summary), and the area under the curve (AUC) is printed.

In [None]:
# NOTE(review): lrModel.summary is the summary attached to the fitted model,
# so this curve and AUC presumably describe the training data rather than the
# test split - confirm before reporting it as a test metric.
trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()

# FPR is plotted on the x axis and TPR on the y axis, so the axis labels must
# follow that same order.
plt.plot(roc['FPR'],roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Area UnderROC: ' + str(trainingSummary.areaUnderROC))

# Train the Decision Tree Classification Model

We are using a Decision Tree model for the classification problem. In the following step we train the decision tree model with labels and features. This kind of training, where both labels and features are used, is known as supervised learning.

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree limited to depth 3, trained on the same split as the
# logistic regression model.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="Diabetic",
    maxDepth=3,
)
dtModel = dt.fit(train)

# Score the held-out rows and preview the first ten.
predictions_dt = dtModel.transform(test)
predictions_dt.show(n=10)

# Accuracy, Precision, Recall, and F1-Score of Decision Tree Model

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

evaluator = BinaryClassificationEvaluator()
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="Diabetic",
    predictionCol="prediction",
)

# One evaluator serves every metric: the metric name is overridden per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictions_dt, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

# "Precision" and "Recall" below are the weighted variants requested above.
print('Precision', weightedPrecision)
print('Accuracy', acc)
print('F1-Score', f1)
print('Recall', weightedRecall)


# Confusion Matrix of Decision Tree Model 

In [None]:
import sklearn

from sklearn.metrics import   confusion_matrix
from sklearn import metrics

# Collect both columns in a single job so every label stays paired with its
# own prediction, then unwrap the Row objects into plain scalars so sklearn
# receives flat sequences instead of lists of single-field Rows.
label_pred_rows = predictions_dt.select('Diabetic', 'prediction').collect()
y_true = [row['Diabetic'] for row in label_pred_rows]
y_pred = [row['prediction'] for row in label_pred_rows]

print("Confusion Matrix", confusion_matrix(y_true, y_pred))



# Area Under ROC for Decision Tree Classifier

In [None]:
# The evaluator below is created without an explicit label column, so the
# ground truth is first renamed from "Diabetic" to "label".
predictions_dt = predictions_dt.withColumnRenamed(existing="Diabetic", new="label")
evaluator = BinaryClassificationEvaluator()
print(
    "Test Area Under ROC: "
    + str(evaluator.evaluate(predictions_dt, {evaluator.metricName: "areaUnderROC"}))
)

# Binary Classification Using Random Forest Classifier

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest whose trees are limited to depth 3, trained on the same
# split as the other diabetes models.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="Diabetic",
    maxDepth=3,
)
rfModel = rf.fit(train)

# Score the held-out rows and preview the first ten.
predictions_rf = rfModel.transform(test)
predictions_rf.show(n=10)

# Precision, Accuracy, Recall, and F1-Score of Random Forest Model

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

evaluator = BinaryClassificationEvaluator()
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="Diabetic",
    predictionCol="prediction",
)

# One evaluator serves every metric: the metric name is overridden per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictions_rf, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

# "Precision" and "Recall" below are the weighted variants requested above.
print('Precision', weightedPrecision)
print('Accuracy', acc)
print('F1-Score', f1)
print('Recall', weightedRecall)

# Confusion Matrix for Random Forest Model

In [None]:
import sklearn

from sklearn.metrics import   confusion_matrix
from sklearn import metrics

# Collect both columns in a single job so every label stays paired with its
# own prediction, then unwrap the Row objects into plain scalars so sklearn
# receives flat sequences instead of lists of single-field Rows.
label_pred_rows = predictions_rf.select('Diabetic', 'prediction').collect()
y_true = [row['Diabetic'] for row in label_pred_rows]
y_pred = [row['prediction'] for row in label_pred_rows]

print("Confusion Matrix", confusion_matrix(y_true, y_pred))

# Area Under ROC for Random Forest Classifier

In [None]:
# The evaluator below is created without an explicit label column, so the
# ground truth is first renamed from "Diabetic" to "label".
predictions_rf = predictions_rf.withColumnRenamed(existing="Diabetic", new="label")
evaluator = BinaryClassificationEvaluator()
print(
    "Test Area Under ROC: "
    + str(evaluator.evaluate(predictions_rf, {evaluator.metricName: "areaUnderROC"}))
)

In [None]:
# Shut down the session used for the diabetes models; the next section
# builds its own session.
spark.stop()

# Multi Class Classification using Random Forest Algorithm

We are using the random forest algorithm to predict the species of a flower. The iris dataset used here includes three iris species with 50 samples each, as well as some properties of each flower.

To perform any task on Spark you need to start a Spark session; here we start a session for our multiclass classification.

In [None]:
from pyspark.sql import SparkSession

# Obtain the session for the multiclass part of the notebook under the
# application name "Multi App".
spark = (
    SparkSession.builder
    .appName("Multi App")
    .getOrCreate()
)

To start, we are loading the Iris dataset 

In [None]:
!wget https://raw.githubusercontent.com/ismayilsiyad/hpe_ml/main/IRIS.csv
iris = spark.read.csv('IRIS.csv', header = True, inferSchema = True)
iris.printSchema()
iris.show()

# Changing the column datatype

We cast the feature columns (every column except 'species') to the float datatype.

In [None]:
from pyspark.sql.functions import col

# Every column except the 'species' label, re-projected as float under its
# original name.
features = iris.drop('species')
features = features.select(
    [col(name).cast('float').alias(name) for name in features.columns]
)
features.printSchema()

# Taking the count of the null and missing values

In [None]:
from pyspark.sql.functions import col, count, isnan, when

# A float column can hold NaN as well as null, and isNull() alone does not
# flag NaN. A value is counted as missing when it is either of the two.
features.select(
    [
        count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        for c in features.columns
    ]
).show()

# List of columns to vector form

We use VectorAssembler to combine the list of feature columns in our dataset into a single vector column that groups all the features together.

In [None]:

from pyspark.ml.feature import StringIndexer, VectorAssembler

# Pack the feature columns into one vector column named "features". Only the
# column names come from `features`; the transform itself runs on `iris`.
assembler = VectorAssembler(inputCols=features.columns, outputCol="features")
output = (
    assembler
    .transform(iris)
    .select('features', 'species')
)
output.show()

In [None]:
# Map the 'species' strings to numeric indices in a new 'labelIndex' column.
label_stringIdx = StringIndexer(inputCol='species', outputCol='labelIndex')
df = (
    label_stringIdx
    .fit(output)
    .transform(output)
)
df.show()

# Splitting the data into training and testing datasets

The dataset in vector form is now split into train and test dataset fractions.


In [None]:
# Random split weighted 0.7 / 0.3 between training and testing rows.
# No seed is passed, so the partition differs from run to run.
train, test = df.randomSplit(weights=[0.7, 0.3])

train.show()
test.show()

# Train the Classification Model

We are using a Random Forest model for the multiclass classification problem. In the following step we train the random forest model with labels and features. This kind of training, where both labels and features are used, is known as supervised learning.

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the assembled vector, predicting the indexed species.
rf = RandomForestClassifier(labelCol='labelIndex', featuresCol='features')
rfModel = rf.fit(train)

# Score the held-out rows and show the model outputs next to the true index
# (up to 500 rows).
predictions = rfModel.transform(test)
predictions.select(
    ['features', 'labelIndex', 'rawPrediction', 'prediction', 'probability']
).show(n=500)



# Test Accuracy and Error

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator's default metric is F1, so accuracy has to be requested
# explicitly for the values printed below to be what their labels say.
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndex",
    predictionCol="prediction",
    metricName="accuracy",
)
# `predictions` was produced from the test split, so this is test accuracy.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

# Predictions and labels

To compare the predictions with actual labels we use model.transform

In [None]:
# Score the held-out rows with the fitted random forest model.
predictions = rfModel.transform(test)
predictions.show(n=10)

# Expose the indexed species under the column name "label" and show the
# result.
predictions = predictions.withColumnRenamed(existing="labelIndex", new="label")
predictions.show(n=10)

# Confusion Matrix

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

# Keep only prediction and label, with the label cast to float, ordered by
# prediction.
preds_and_labels = (
    predictions
    .select(['prediction', 'label'])
    .withColumn('label', F.col('label').cast(FloatType()))
    .orderBy('prediction')
)
preds_and_labels = preds_and_labels.select(['prediction', 'label'])

# MulticlassMetrics takes an RDD, so each row is turned into a
# (prediction, label) tuple first.
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

# Accuracy, Precision, Recall, and F1-Score

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

evaluator = BinaryClassificationEvaluator()
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# One evaluator serves every metric: the metric name is overridden per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

# "Precision" and "Recall" below are the weighted variants requested above.
print('Precision', weightedPrecision)
print('Accuracy', acc)
print('F1-Score', f1)
print('Recall', weightedRecall)


In [None]:
# Shut down the session used for the multiclass (iris) section.
spark.stop()