# Logistic Regression Example

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 50 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 52.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=4b68cd759695a55a40dff2f711e560bc17ff0f9a38b15d6ffe24d3db72327816
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame API; the app name labels this session.
spark = (
    SparkSession.builder
    .appName('logRegEx')
    .getOrCreate()
)

In [None]:
from pyspark.ml.classification import LogisticRegression

# Read the LIBSVM-formatted sample file into a DataFrame.
training_data = spark.read.load('sample_libsvm_data.txt', format='libsvm')

# Estimator constructed without arguments, i.e. with the library defaults.
lr = LogisticRegression()

# Train on the whole file (no hold-out split in this first example).
lrModel = lr.fit(training_data)

# The fitted model carries a summary of the training run; its `predictions`
# DataFrame holds the per-row model output shown below.
trainingSummary = lrModel.summary
trainingSummary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [None]:
# Evaluate the fitted model on the same data it was trained on.
# Note: despite its name, `predictionAndLabels` here is the object returned by
# evaluate(); the per-row DataFrame is reached through its `.predictions` attribute.
predictionAndLabels = lrModel.evaluate(training_data)
predictionAndLabels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [None]:
# Keep only the two columns the evaluators below need.
# Built straight from lrModel.evaluate(...) rather than from the previous value
# of predictionAndLabels, so re-running this cell does not fail: the original
# rebound the name from the evaluation result to a DataFrame, and a second run
# then looked up `.predictions` on that DataFrame.
predictionAndLabels = (
    lrModel.evaluate(training_data)
    .predictions
    .select('label', 'prediction')
)
predictionAndLabels.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



## Evaluator

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [None]:
# Binary evaluator.
# NOTE(review): the score column is the hard 'prediction' column because
# predictionAndLabels only carries 'label' and 'prediction'; the metric is
# therefore computed from thresholded predictions, not continuous scores.
evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='prediction',
)

In [None]:
# Score the label/prediction pairs with the binary evaluator built in the previous cell.
evaluator.evaluate(predictionAndLabels)

1.0

In [None]:
# Multiclass evaluator reporting plain accuracy on the same label/prediction pairs.
# This rebinds `evaluator`, replacing the binary evaluator from the cells above.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
evaluator.evaluate(predictionAndLabels)

1.0

# Logistic Regression Titanic Dataset

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Session handle for the Titanic section.
spark = (
    SparkSession.builder
    .appName('titanic')
    .getOrCreate()
)

# First CSV row is the header; column types are inferred from the values.
data = spark.read.csv('titanic.csv', header=True, inferSchema=True)

In [None]:
# Inspect the inferred column names and types.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [None]:
# Preview the first rows of the raw passenger table.
data.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [None]:
# List every available column before choosing the ones to keep.
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [None]:
# Keep the label ('Survived') plus the columns used as model inputs;
# PassengerId, Name, Ticket and Cabin are left out.
cols = data.select(
    'Survived',
    'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch',
    'Fare', 'Embarked',
)

In [None]:
# Discard every row that has a missing value in any selected column.
finalData = cols.dropna()

In [None]:
# Preview the cleaned table that feeds the pipeline.
finalData.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the cleaned columns.
finalData.describe().show()

+-------+------------------+------------------+------+-----------------+------------------+-------------------+------------------+--------+
|summary|          Survived|            Pclass|   Sex|              Age|             SibSp|              Parch|              Fare|Embarked|
+-------+------------------+------------------+------+-----------------+------------------+-------------------+------------------+--------+
|  count|               712|               712|   712|              712|               712|                712|               712|     712|
|   mean|0.4044943820224719| 2.240168539325843|  null|29.64209269662921|0.5140449438202247|0.43258426966292135| 34.56725140449432|    null|
| stddev|0.4911389472541192|0.8368543166903446|  null|14.49293290032352|0.9306921267673427| 0.8541814457454133|52.938648174710906|    null|
|    min|                 0|                 1|female|             0.42|                 0|                  0|               0.0|       C|
|    max|           

## Categorical columns

In [None]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [None]:
# Step 1: map each string category to a numeric index.
genderIndexer = StringIndexer(outputCol='SexIndex', inputCol='Sex')
embarkIndexer = StringIndexer(outputCol='EmbarkIndex', inputCol='Embarked')

# Step 2: one-hot encode those indices into vector columns.
genderEncoder = OneHotEncoder(outputCol='SexVec', inputCol='SexIndex')
embarkEncoder = OneHotEncoder(outputCol='EmbarkVec', inputCol='EmbarkIndex')

In [None]:
# Combine the numeric columns and the two encoded vectors into a single
# 'features' vector column, in the order listed.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Pclass', 'SexVec', 'Age',
        'SibSp', 'Parch', 'Fare',
        'EmbarkVec',
    ],
)

In [None]:
from pyspark.ml.classification import LogisticRegression

## Pipeline

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Classifier that reads the assembled 'features' vector and predicts 'Survived'.
logTitanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [None]:
# Stages run in list order: index the categoricals, one-hot encode them,
# assemble the feature vector, then fit the classifier.
pipeline = Pipeline(
    stages=[
        genderIndexer,
        embarkIndexer,
        genderEncoder,
        embarkEncoder,
        assembler,
        logTitanic,
    ]
)

In [None]:
# 70/30 train/test split. A fixed seed keeps the split, and therefore the
# metrics computed further down, the same from one run to the next.
titanicTrain, titanicTest = finalData.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Fit every pipeline stage on the training split only ...
fitModel = pipeline.fit(titanicTrain)
# ... then run the fitted pipeline over the held-out split to get predictions.
result = fitModel.transform(titanicTest)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Score with the model's continuous 'rawPrediction' output rather than the
# thresholded 0/1 'prediction' column: ranking metrics such as area under the
# ROC curve need graded scores, and hard labels collapse the curve to a single
# operating point. `result` comes from the full pipeline transform, so the
# logistic-regression stage has already added 'rawPrediction' to it.
evaluation = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                           labelCol='Survived')

In [None]:
# Compare the true label with the predicted class on the held-out rows.
result.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [None]:
# Apply the evaluator to the held-out predictions. No metricName was set, so
# the evaluator's default metric is used (presumably area under ROC, hence the
# variable name — confirm against the BinaryClassificationEvaluator docs).
AUC = evaluation.evaluate(result)
AUC

0.7534037558685446