# <span style = 'color:green;font-family:helvetica'> Logistic Regression with pySpark

In [1]:
import findspark

In [2]:
findspark.init('/home/chandan/spark-3.2.4-bin-hadoop3.2')

#### pySpark Documentation Example

* With Logistic Regression we will also get introduced to the concept of `"Evaluators"`.

* Evaluators behave simialr to Machine Learning algorithm object, but are designed to take in evaluation

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName("mylogreg").getOrCreate()

23/05/20 14:15:19 WARN Utils: Your hostname, chandan-VivoBook-ASUSLaptop-X515MA-X515MA resolves to a loopback address: 127.0.1.1; using 192.168.0.169 instead (on interface wlo1)
23/05/20 14:15:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/20 14:15:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
from pyspark.ml.classification import LogisticRegression

In [7]:
my_data = spark.read.format('libsvm').load("sample_libsvm_data.txt")

23/05/20 14:15:26 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
                                                                                

In [8]:
my_data.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



                                                                                

In [9]:
my_log_reg = LogisticRegression()

In [10]:
log_reg_model = my_log_reg.fit(my_data)

23/05/20 14:15:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/20 14:15:41 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/20 14:15:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/05/20 14:15:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [11]:
log_summary = log_reg_model.summary

In [12]:
log_summary.predictions



DataFrame[label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [13]:
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

#### As we saw the general work around of Logistic Regression, now we will se the concept of Evaluators

In [14]:
lr_train, lr_test = my_data.randomSplit([0.7,0.3])

In [15]:
final_model = LogisticRegression()

In [16]:
fit_final = final_model.fit(lr_train)

In [17]:
prediction_and_labels = fit_final.evaluate(lr_test)

In [18]:
prediction_and_labels.predictions.show()



+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[20.1778353375048...|[0.99999999827464...|       0.0|
|  0.0|(692,[100,101,102...|[1.19708033509664...|[0.76800498418951...|       0.0|
|  0.0|(692,[124,125,126...|[16.6124397300044...|[0.99999993900291...|       0.0|
|  0.0|(692,[126,127,128...|[26.5120701373837...|[0.99999999999693...|       0.0|
|  0.0|(692,[127,128,129...|[19.1626470695618...|[0.99999999523823...|       0.0|
|  0.0|(692,[128,129,130...|[19.8634824897547...|[0.99999999763735...|       0.0|
|  0.0|(692,[152,153,154...|[10.3945175147437...|[0.99996940114946...|       0.0|
|  0.0|(692,[234,235,237...|[-2.1300789874252...|[0.10620749336616...|       1.0|
|  1.0|(692,[97,98,99,12...|[-14.215142998947...|[6.70566023838883...|       1.0|
|  1.0|(692,[119

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [20]:
my_eval = BinaryClassificationEvaluator()

In [21]:
final_roc_results = my_eval.evaluate(prediction_and_labels.predictions)



In [22]:
final_roc_results

0.9943181818181819

#### This means the ROC that almost 100%

#### Another Example with Titanic dataset

In [23]:
df = spark.read.csv("titanic.csv", inferSchema=True, header=True)

In [24]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [26]:
my_cols = df.select(['Survived','Pclass','Sex',
                     'Age','SibSp','Parch','Fare','Embarked'])

In [27]:
my_cols.show()

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
|       0|     3|  male|null|    0|    0| 8.4583|       Q|
|       0|     1|  male|54.0|    0|    0|51.8625|       S|
|       0|     3|  male| 2.0|    3|    1| 21.075|       S|
|       1|     3|female|27.0|    0|    2|11.1333|       S|
|       1|     2|female|14.0|    1|    0|30.0708|       C|
|       1|     3|female| 4.0|    1|    1|   16.7|       S|
|       1|     1|female|58.0|    0|    0|  26.55|       S|
|       0|     3|  male|20.0|    0|    0|   8.05|       S|
|       0|     3|  male|39.0|    1|    5| 31.275|       

#### Missing Data

In [29]:
my_final_data = my_cols.na.drop()

 #### Working with categorical

In [30]:
from pyspark.ml.feature import VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer

In [31]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol= "SexIndex", outputCol='SexVec')

In [32]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol= 'EmbarkIndex', outputCol='EmbarkVec')

In [33]:
assembler = VectorAssembler(inputCols=['Pclass','SexVec','EmbarkVec',"Age","SibSp","Parch",'Fare'],
                           outputCol= 'features')

### Creating a pipeline

In [34]:
from pyspark.ml.classification import LogisticRegression

In [37]:
from pyspark.ml import Pipeline

In [36]:
log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')

In [38]:
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,
                           gender_encoder,embark_encoder,
                           assembler, log_reg_titanic])

In [39]:
train_data, test_data = my_final_data.randomSplit([0.7,0.3])

In [40]:
fit_model =  pipeline.fit(train_data)

                                                                                

In [41]:
results = fit_model.transform(test_data)

In [42]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [43]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol= 'prediction',labelCol='Survived')

In [44]:
AUC = my_eval.evaluate(results)

In [45]:
AUC

0.8207288042483062

The Area under curve close to 1 which means model is good

# Practice Exercise

In [46]:
df = spark.read.csv('customer_churn.csv', inferSchema=True, header = True)

In [49]:
df.head(2)[0]

Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)

In [50]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [51]:
df.describe().show()

[Stage 113:>                                                        (0 + 1) / 1]

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|       Onboard_date|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|               null|                null|                null|0.16666666666666666|
| stddev| 

                                                                                

In [52]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [53]:
from pyspark.ml.feature import VectorAssembler

In [54]:
assembler = VectorAssembler(inputCols=['Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',], outputCol='features')

In [55]:
output = assembler.transform(df)

In [56]:
final_data = output.select(['features','churn'])

In [57]:
train_churn, test_churn = final_data.randomSplit([0.7,0.3])

In [58]:
from pyspark.ml.classification import LogisticRegression

In [61]:
lr_churn = LogisticRegression(labelCol='churn')

In [63]:
fitted_churn_model = lr_churn.fit(train_churn)

In [64]:
training_sum = fitted_churn_model.summary

In [65]:
training_sum.predictions.describe().show()



+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                617|                617|
|   mean|0.15883306320907617| 0.1166936790923825|
| stddev|0.36581691140689965|0.32131541505231426|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [66]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [67]:
pred_and_labels = fitted_churn_model.evaluate(test_churn)

In [68]:
pred_and_labels.predictions.show()



+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|    0|[4.68048104384972...|[0.99081067551387...|       0.0|
|[27.0,8628.8,1.0,...|    0|[5.59372419041456...|[0.99629265179645...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.68911026087043...|[0.84410711468587...|       0.0|
|[28.0,11204.23,0....|    0|[1.67114785096243...|[0.84172879948845...|       0.0|
|[29.0,5900.78,1.0...|    0|[4.34026124077286...|[0.98713455396476...|       0.0|
|[29.0,9617.59,0.0...|    0|[4.50410598647276...|[0.98905758449393...|       0.0|
|[29.0,11274.46,1....|    0|[4.46482481185961...|[0.98862418677715...|       0.0|
|[29.0,12711.15,0....|    0|[5.37067482905039...|[0.99537053995314...|       0.0|
|[30.0,7960.64,1.0...|    1|[3.00761048291274...|[0.95291676122730...|       0.0|
|[30.0,8677.28,1

In [69]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [70]:
auc = churn_eval.evaluate(pred_and_labels.predictions)



In [71]:
auc

0.8064851814851814

### Lets evaluate on new data

In [72]:
final_lr_model = lr_churn.fit(final_data)

In [73]:
new_cust = spark.read.csv('new_customers.csv', inferSchema=True, header = True)

In [74]:
new_cust.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [75]:
test_new_customers = assembler.transform(new_cust)

In [76]:
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [77]:
final_results =  final_lr_model.transform(test_new_customers)

In [78]:
final_results.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|            features|       rawPrediction|         probability|prediction|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|[37.0,9935.53,1.0...|[2.22168680572547...|[0.90218015921764...|       0.0|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|[23.0,7526.94,1.0...|[-6.2207539991844...|[0.00198380259784...|       

In [79]:
final_results.select('Company', 'prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+



Thank You!!