# Logistic Regression

https://spark.apache.org/docs/latest/ml-classification-regression.html#logistic-regression


Evaluators will be a very important part of our pipline when working with Machine Learning:

https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.BinaryClassificationEvaluator.html#pyspark.ml.evaluation.BinaryClassificationEvaluator.metricName

https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html


In [1]:
path = '/home/danial/Desktop/myspark/Apache-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/sample_libsvm_data.txt'

In [1]:
import findspark

In [2]:
findspark.init("/home/danial/spark-3.3.2-bin-hadoop3")

In [3]:
from pyspark.sql import SparkSession

In [7]:
spark = SparkSession.builder.appName('logis').getOrCreate()

23/04/04 11:17:00 WARN Utils: Your hostname, danial-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/04/04 11:17:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/04 11:17:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/04 11:17:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [9]:
data = spark.read.format('libsvm').load(path)

23/04/04 11:19:29 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [10]:
data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [11]:
data.select('label').distinct().show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
+-----+



In [12]:
from pyspark.ml.classification import LogisticRegression

In [37]:
train_set, test_set = data.randomSplit([0.7, 0.3])

In [38]:
lg = LogisticRegression()

In [50]:
lg_model = lg.fit(train_set)

In [40]:
log_summary = lg_model.summary

In [41]:
log_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [42]:
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[25.1513018780422...|[0.99999999998806...|       0.0|
|  0.0|(692,[121,122,123...|[26.0753841057895...|[0.99999999999526...|       0.0|
|  0.0|(692,[122,123,124...|[20.2151747204161...|[0.99999999833788...|       0.0|
|  0.0|(692,[122,123,148...|[22.3056083865609...|[0.99999999979450...|       0.0|
|  0.0|(692,[123,124,125...|[24.0462456874123...|[0.99999999996395...|       0.0|
|  0.0|(692,[124,125,126...|[33.1079069148920...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[34.1141437492309...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[36.4386030338422...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[22.1688960637801...|[0.99999999976440...|       0.0|
|  0.0|(692,[125

In [43]:
# Evaluating the model 

In [44]:
predictions_and_labels = lg_model.evaluate(test_set)

In [45]:
predictions_and_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[25.1513018780422...|[0.99999999998806...|       0.0|
|  0.0|(692,[100,101,102...|[2.22487928435347...|[0.90246153751556...|       0.0|
|  0.0|(692,[121,122,123...|[26.0753841057895...|[0.99999999999526...|       0.0|
|  0.0|(692,[123,124,125...|[36.1540641835322...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[21.0332131007928...|[0.99999999926651...|       0.0|
|  0.0|(692,[124,125,126...|[21.9067180479556...|[0.99999999969378...|       0.0|
|  0.0|(692,[126,127,128...|[16.3898344184417...|[0.99999992379467...|       0.0|
|  0.0|(692,[127,128,129...|[20.9660471008558...|[0.99999999921555...|       0.0|
|  0.0|(692,[127,128,129...|[26.3809835658767...|[0.99999999999650...|       0.0|
|  0.0|(692,[127

In [46]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                    MulticlassClassificationEvaluator)

In [47]:
my_eval = BinaryClassificationEvaluator()

In [48]:
# areaUnderROC:

my_final_roc = my_eval.evaluate(predictions_and_labels.predictions)

In [49]:
my_final_roc

1.0

In [6]:
path = '/home/danial/Desktop/myspark/Apache-Spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/titanic.csv'

In [3]:
import findspark
findspark.init('/home/danial/spark-3.3.2-bin-hadoop3')
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('titanic').getOrCreate()

23/04/06 08:30:17 WARN Utils: Your hostname, danial-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/04/06 08:30:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/06 08:30:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
df = spark.read.csv(path, header=True, inferSchema=True)

In [8]:
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [9]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [10]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [13]:
my_columns = df.select(['Survived',
                     'Pclass',
                     'Sex',
                     'Age',
                     'SibSp',
                     'Parch',
                     'Fare',
                     'Embarked'])

In [14]:
my_columns.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     3|  male|null|    0|    0| 8.4583|       Q|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       

In [20]:
my_columns.count()

891

In [None]:
my_columns.isN

In [21]:
from pyspark.sql.functions import col, sum

missing_counts = my_columns.select([sum(col(c).isNull().cast("int")).alias(c) for c in my_columns.columns])

# Print the missing value counts for each column
missing_counts.show()


+--------+------+---+---+-----+-----+----+--------+
|Survived|Pclass|Sex|Age|SibSp|Parch|Fare|Embarked|
+--------+------+---+---+-----+-----+----+--------+
|       0|     0|  0|177|    0|    0|   0|       2|
+--------+------+---+---+-----+-----+----+--------+



In [32]:
my_final_data = my_columns.na.drop()

In [34]:
my_final_data.select([sum(col(c).isNull().cast('int')).alias(c) for c in my_final_data.columns]).show()

+--------+------+---+---+-----+-----+----+--------+
|Survived|Pclass|Sex|Age|SibSp|Parch|Fare|Embarked|
+--------+------+---+---+-----+-----+----+--------+
|       0|     0|  0|  0|    0|    0|   0|       0|
+--------+------+---+---+-----+-----+----+--------+



In [35]:
my_final_data.count()

712

In [36]:
my_final_data.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)



In [37]:
my_final_data.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       S|
|       0|     3|female|14.0|    0|    0| 7.8542|       

In [48]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [45]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

In [46]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkedIndex', outputCol='EmbarkedVec')

In [47]:
assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'SibSp',
                                       'Parch', 'Fare', 'EmbarkedVec'], outputCol='features')

In [49]:
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [50]:
pipeline = Pipeline(stages =[gender_indexer, embark_indexer,
                             gender_encoder, embark_encoder,
                             assembler, log_reg_titanic
                                                        ])

In [51]:
train_data, test_data = my_final_data.randomSplit([0.7, 0.3])

In [52]:
fit_model = pipeline.fit(train_data)

[Stage 36:>                                                         (0 + 1) / 1]                                                                                

23/04/06 09:10:15 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/06 09:10:15 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


In [53]:
results = fit_model.transform(test_data)

In [54]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [55]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')

In [57]:
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [59]:
AUC =  my_eval.evaluate(results)
AUC

0.750515767757147