In [1]:
#import SparkSession
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook under the app name 'log_reg'.
spark = (
    SparkSession.builder
    .appName('log_reg')
    .getOrCreate()
)

In [2]:
# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'Log_Reg_dataset.csv',
    header=True,
    inferSchema=True,
)

In [14]:
from pyspark.sql.functions import *


In [3]:
# Dataset shape, printed as a (row count, column count) tuple.
print((
    df.count(),
    len(df.columns),
))

(20000, 6)


In [4]:
#print each column's name, inferred type and nullability
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)



In [5]:
#list the column names of the dataset (a list of strings, not a count)
df.columns

['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']

In [6]:
# Preview the first five rows of the dataset.
df.show(n=5)

+---------+---+--------------+--------+----------------+------+
|  Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|
+---------+---+--------------+--------+----------------+------+
|    India| 41|             1|   Yahoo|              21|     1|
|   Brazil| 28|             1|   Yahoo|               5|     0|
|   Brazil| 40|             0|  Google|               3|     0|
|Indonesia| 31|             1|    Bing|              15|     1|
| Malaysia| 32|             0|  Google|              15|     1|
+---------+---+--------------+--------+----------------+------+
only showing top 5 rows



In [19]:
# Exploratory Data Analysis: summary statistics for every column.
(
    df.describe()
    .show()
)


+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|summary| Country|              Age|   Repeat_Visitor|Platform| Web_pages_viewed|            Status|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+
|  count|   20000|            20000|            20000|   20000|            20000|             20000|
|   mean|    null|         28.53955|           0.5029|    null|           9.5533|               0.5|
| stddev|    null|7.888912950773227|0.500004090187782|    null|6.073903499824976|0.5000125004687693|
|    min|  Brazil|               17|                0|    Bing|                1|                 0|
|    max|Malaysia|              111|                1|   Yahoo|               29|                 1|
+-------+--------+-----------------+-----------------+--------+-----------------+------------------+



In [22]:
# Number of rows per country.
(
    df.groupBy('Country')
    .count()
    .show()
)

+---------+-----+
|  Country|count|
+---------+-----+
| Malaysia| 1218|
|    India| 4018|
|Indonesia|12178|
|   Brazil| 2586|
+---------+-----+



In [118]:
# Number of rows per search platform.
(
    df.groupBy('Platform')
    .count()
    .show()
)

+--------+-----+
|Platform|count|
+--------+-----+
|   Yahoo| 9859|
|    Bing| 4360|
|  Google| 5781|
+--------+-----+



In [119]:
# Number of rows per value of the target column (class balance check).
(
    df.groupBy('Status')
    .count()
    .show()
)

+------+-----+
|Status|count|
+------+-----+
|     1|10000|
|     0|10000|
+------+-----+



In [23]:
# Average of every numeric column, per country.
(
    df.groupBy('Country')
    .mean()
    .show()
)

+---------+------------------+-------------------+---------------------+--------------------+
|  Country|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|         avg(Status)|
+---------+------------------+-------------------+---------------------+--------------------+
| Malaysia|27.792282430213465| 0.5730706075533661|   11.192118226600986|  0.6568144499178982|
|    India|27.976854156296664| 0.5433051269288203|   10.727227476356397|  0.6212045793927327|
|Indonesia| 28.43159796354081| 0.5207751683363442|    9.985711939563148|  0.5422893742814913|
|   Brazil|30.274168600154677|  0.322892498066512|    4.921113689095128|0.038669760247486466|
+---------+------------------+-------------------+---------------------+--------------------+



In [24]:
# Average of every numeric column, per platform.
(
    df.groupBy('Platform')
    .mean()
    .show()
)

+--------+------------------+-------------------+---------------------+------------------+
|Platform|          avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|       avg(Status)|
+--------+------------------+-------------------+---------------------+------------------+
|   Yahoo|28.569226087838523| 0.5094837204584644|    9.599655137437875|0.5071508266558474|
|    Bing| 28.68394495412844| 0.4720183486238532|    9.114908256880733|0.4559633027522936|
|  Google|28.380038055699707| 0.5149628092025601|    9.804878048780488|0.5210171250648676|
+--------+------------------+-------------------+---------------------+------------------+



In [25]:
# Average of every numeric column, per value of the target column.
(
    df.groupBy('Status')
    .mean()
    .show()
)

+------+--------+-------------------+---------------------+-----------+
|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|
+------+--------+-------------------+---------------------+-----------+
|     1| 26.5435|             0.7019|              14.5617|        1.0|
|     0| 30.5356|             0.3039|               4.5449|        0.0|
+------+--------+-------------------+---------------------+-----------+



In [120]:
#converting categorical data to numerical form

In [121]:
#import required libraries

from pyspark.ml.feature import StringIndexer


In [122]:
#Indexing 

In [123]:
# Fit a StringIndexer on 'Platform' and append its numeric index as 'platform_num'.
# NOTE(review): this rebinds df, so re-running the cell without reloading df is
# expected to fail because the output column already exists.
platform_indexer = StringIndexer(
    inputCol="Platform",
    outputCol="platform_num",
).fit(df)
df = platform_indexer.transform(df)

In [124]:
# Preview three rows without truncating cell contents.
df.show(n=3, truncate=False)

+-------+---+--------------+--------+----------------+------+------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|
+-------+---+--------------+--------+----------------+------+------------+
|India  |41 |1             |Yahoo   |21              |1     |0.0         |
|Brazil |28 |1             |Yahoo   |5               |0     |0.0         |
|Brazil |40 |0             |Google  |3               |0     |1.0         |
+-------+---+--------------+--------+----------------+------+------------+
only showing top 3 rows



In [125]:
from pyspark.ml.feature import OneHotEncoder

In [126]:
# One-hot encode the indexed platform column into 'platform_vector'.
# On Spark 3.x OneHotEncoder is an Estimator and must be fitted before it can
# transform; on Spark 2.x it is a plain Transformer with no fit(). Fitting only
# when fit() exists keeps the cell working on both.
platform_encoder = OneHotEncoder(inputCol="platform_num", outputCol="platform_vector")
if hasattr(platform_encoder, "fit"):
    platform_encoder = platform_encoder.fit(df)
df = platform_encoder.transform(df)

In [129]:
# Preview three rows, including the new encoded column, without truncation.
df.show(n=3, truncate=False)

+-------+---+--------------+--------+----------------+------+------------+---------------+
|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|platform_vector|
+-------+---+--------------+--------+----------------+------+------------+---------------+
|India  |41 |1             |Yahoo   |21              |1     |0.0         |(2,[0],[1.0])  |
|Brazil |28 |1             |Yahoo   |5               |0     |0.0         |(2,[0],[1.0])  |
|Brazil |40 |0             |Google  |3               |0     |1.0         |(2,[1],[1.0])  |
+-------+---+--------------+--------+----------------+------+------------+---------------+
only showing top 3 rows



In [134]:
# Platform frequencies, most common first.
(
    df.groupBy('Platform')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+--------+-----+
|Platform|count|
+--------+-----+
|Yahoo   |9859 |
|Google  |5781 |
|Bing    |4360 |
+--------+-----+



In [135]:
# Frequencies of the numeric platform index, most common first.
(
    df.groupBy('platform_num')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+------------+-----+
|platform_num|count|
+------------+-----+
|0.0         |9859 |
|1.0         |5781 |
|2.0         |4360 |
+------------+-----+



In [136]:
# Frequencies of the one-hot platform vector, most common first.
(
    df.groupBy('platform_vector')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+---------------+-----+
|platform_vector|count|
+---------------+-----+
|(2,[0],[1.0])  |9859 |
|(2,[1],[1.0])  |5781 |
|(2,[],[])      |4360 |
+---------------+-----+



In [137]:
# Fit a StringIndexer on 'Country' and append its numeric index as 'country_num'.
country_indexer = StringIndexer(
    inputCol="Country",
    outputCol="country_num",
).fit(df)
df = country_indexer.transform(df)

In [139]:
# Compare the original country label with its numeric index.
(
    df.select('Country', 'country_num')
    .show(n=3, truncate=False)
)

+-------+-----------+
|Country|country_num|
+-------+-----------+
|India  |1.0        |
|Brazil |2.0        |
|Brazil |2.0        |
+-------+-----------+
only showing top 3 rows



In [140]:
# One-hot encode the indexed country column into 'country_vector'.
# On Spark 3.x OneHotEncoder is an Estimator and must be fitted before it can
# transform; on Spark 2.x it is a plain Transformer with no fit(). Fitting only
# when fit() exists keeps the cell working on both.
country_encoder = OneHotEncoder(inputCol="country_num", outputCol="country_vector")
if hasattr(country_encoder, "fit"):
    country_encoder = country_encoder.fit(df)
df = country_encoder.transform(df)

In [141]:
# Compare the country label, its numeric index and its one-hot vector.
(
    df.select('Country', 'country_num', 'country_vector')
    .show(n=3, truncate=False)
)

+-------+-----------+--------------+
|Country|country_num|country_vector|
+-------+-----------+--------------+
|India  |1.0        |(3,[1],[1.0]) |
|Brazil |2.0        |(3,[2],[1.0]) |
|Brazil |2.0        |(3,[2],[1.0]) |
+-------+-----------+--------------+
only showing top 3 rows



In [142]:
# Country frequencies, most common first.
(
    df.groupBy('Country')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+---------+-----+
|Country  |count|
+---------+-----+
|Indonesia|12178|
|India    |4018 |
|Brazil   |2586 |
|Malaysia |1218 |
+---------+-----+



In [143]:
# Frequencies of the numeric country index, most common first.
(
    df.groupBy('country_num')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+-----------+-----+
|country_num|count|
+-----------+-----+
|0.0        |12178|
|1.0        |4018 |
|2.0        |2586 |
|3.0        |1218 |
+-----------+-----+



In [144]:
# Frequencies of the one-hot country vector, most common first.
(
    df.groupBy('country_vector')
    .count()
    .orderBy('count', ascending=False)
    .show(n=5, truncate=False)
)

+--------------+-----+
|country_vector|count|
+--------------+-----+
|(3,[0],[1.0]) |12178|
|(3,[1],[1.0]) |4018 |
|(3,[2],[1.0]) |2586 |
|(3,[],[])     |1218 |
+--------------+-----+



In [145]:
from pyspark.ml.feature import VectorAssembler

In [146]:
# Combine the encoded categorical vectors and the numeric columns into a single
# 'features' vector column.
df_assembler = VectorAssembler(
    inputCols=[
        'platform_vector',
        'country_vector',
        'Age',
        'Repeat_Visitor',
        'Web_pages_viewed',
    ],
    outputCol="features",
)
df = df_assembler.transform(df)

In [147]:
# Schema after indexing, encoding and assembling: the original columns plus the
# *_num, *_vector and 'features' columns added above.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Repeat_Visitor: integer (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Web_pages_viewed: integer (nullable = true)
 |-- Status: integer (nullable = true)
 |-- platform_num: double (nullable = false)
 |-- platform_vector: vector (nullable = true)
 |-- country_num: double (nullable = false)
 |-- country_vector: vector (nullable = true)
 |-- features: vector (nullable = true)



In [148]:
# Preview the assembled feature vectors next to the target column.
(
    df.select('features', 'Status')
    .show(n=10, truncate=False)
)

+-----------------------------------+------+
|features                           |Status|
+-----------------------------------+------+
|[1.0,0.0,0.0,1.0,0.0,41.0,1.0,21.0]|1     |
|[1.0,0.0,0.0,0.0,1.0,28.0,1.0,5.0] |0     |
|(8,[1,4,5,7],[1.0,1.0,40.0,3.0])   |0     |
|(8,[2,5,6,7],[1.0,31.0,1.0,15.0])  |1     |
|(8,[1,5,7],[1.0,32.0,15.0])        |1     |
|(8,[1,4,5,7],[1.0,1.0,32.0,3.0])   |0     |
|(8,[1,4,5,7],[1.0,1.0,32.0,6.0])   |0     |
|(8,[1,2,5,7],[1.0,1.0,27.0,9.0])   |0     |
|(8,[0,2,5,7],[1.0,1.0,32.0,2.0])   |0     |
|(8,[2,5,6,7],[1.0,31.0,1.0,16.0])  |1     |
+-----------------------------------+------+
only showing top 10 rows



In [149]:
# Keep only the feature vector and the label for model building.
model_df = df.select('features', 'Status')

In [150]:
from pyspark.ml.classification import LogisticRegression

In [151]:
# Split the data 75% / 25% into training and test sets.
# A fixed seed makes the split repeatable, so row counts and every downstream
# metric can be reproduced on a fresh Restart & Run All instead of changing on
# each run.
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [152]:
# Number of rows in the training split.
training_df.count()

14907

In [160]:
# Class balance of the training split.
(
    training_df.groupBy('Status')
    .count()
    .show()
)

+------+-----+
|Status|count|
+------+-----+
|     1| 7417|
|     0| 7490|
+------+-----+



In [153]:
# Number of rows in the test split.
test_df.count()

5093

In [161]:
# Class balance of the test split.
(
    test_df.groupBy('Status')
    .count()
    .show()
)

+------+-----+
|Status|count|
+------+-----+
|     1| 2583|
|     0| 2510|
+------+-----+



In [154]:
# Fit a logistic regression on the training split with 'Status' as the label;
# log_reg holds the fitted model.
log_reg = (
    LogisticRegression(labelCol='Status')
    .fit(training_df)
)

In [None]:
#Training Results

In [155]:
# Score the training split and keep the DataFrame of per-row predictions.
train_results = (
    log_reg
    .evaluate(training_df)
    .predictions
)

In [168]:
# Show label, prediction and class probabilities for training rows that are
# positive and were predicted positive.
(
    train_results
    .filter((train_results['Status'] == 1) & (train_results['prediction'] == 1))
    .select('Status', 'prediction', 'probability')
    .show(n=10, truncate=False)
)

+------+----------+----------------------------------------+
|Status|prediction|probability                             |
+------+----------+----------------------------------------+
|1     |1.0       |[0.2978572628475072,0.7021427371524929] |
|1     |1.0       |[0.2978572628475072,0.7021427371524929] |
|1     |1.0       |[0.16704676975730415,0.8329532302426959]|
|1     |1.0       |[0.16704676975730415,0.8329532302426959]|
|1     |1.0       |[0.16704676975730415,0.8329532302426959]|
|1     |1.0       |[0.08659913656062515,0.9134008634393749]|
|1     |1.0       |[0.08659913656062515,0.9134008634393749]|
|1     |1.0       |[0.08659913656062515,0.9134008634393749]|
|1     |1.0       |[0.08659913656062515,0.9134008634393749]|
|1     |1.0       |[0.08659913656062515,0.9134008634393749]|
+------+----------+----------------------------------------+
only showing top 10 rows



The probability at index 0 is for class 0, and the probability at index 1 is for class 1.

In [177]:
# Count training rows that are positive (Status == 1) and were also predicted
# positive, i.e. the true positives only, not every correct prediction.
correct_preds = train_results.filter(
    (train_results['Status'] == 1) & (train_results['prediction'] == 1)
).count()


In [174]:
# Number of actual positives (Status == 1) in the training split.
training_df.where(training_df['Status'] == 1).count()

7417

In [178]:
#recall on the training dataset: true positives / all actual positives
#(this is not overall accuracy -- correctly predicted negatives are not counted)
float(correct_preds)/(training_df.filter(training_df['Status']==1).count())

0.9366320614803829

In [None]:
#Test Set results

In [170]:
# Score the held-out test split and keep the DataFrame of per-row predictions.
results = (
    log_reg
    .evaluate(test_df)
    .predictions
)

In [93]:
# Compare the true label with the predicted label for the first ten test rows.
(
    results.select('Status', 'prediction')
    .show(n=10, truncate=False)
)

+------+----------+
|Status|prediction|
+------+----------+
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|0     |0.0       |
|1     |0.0       |
|0     |0.0       |
|1     |1.0       |
|0     |1.0       |
|1     |1.0       |
|1     |1.0       |
+------+----------+
only showing top 10 rows



In [91]:
# Schema of the prediction DataFrame produced by evaluating the test split.
results.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Status: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [92]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [94]:
# Confusion-matrix cells on the test predictions. Each count is a separate
# filter over `results` on (actual Status, predicted label).
true_postives = results.filter(
    (results.Status == 1) & (results.prediction == 1)
).count()
true_negatives = results.filter(
    (results.Status == 0) & (results.prediction == 0)
).count()
false_positives = results.filter(
    (results.Status == 0) & (results.prediction == 1)
).count()
false_negatives = results.filter(
    (results.Status == 1) & (results.prediction == 0)
).count()

In [98]:
# Print the four confusion-matrix counts, then their total next to the number of
# scored rows so the two can be checked against each other.
for cell_count in (true_postives, true_negatives, false_positives, false_negatives):
    print(cell_count)
print(true_postives + true_negatives + false_positives + false_negatives)
print(results.count())

2356
2363
158
157
5034
5034


In [99]:
# Recall: share of actual positives that the model predicted as positive.
recall = true_postives / float(true_postives + false_negatives)
print(recall)

0.937524870672503


In [100]:
# Precision: share of predicted positives that are actually positive.
precision = true_postives / float(true_postives + false_positives)
print(precision)

0.9371519490851233


In [103]:
# Overall accuracy: correctly classified rows (true positives + true negatives)
# over all scored rows.
# float() is applied to the numerator before dividing, matching the recall and
# precision cells. Wrapping the finished quotient in float() would give 0.0
# under Python 2 integer division.
accuracy = float(true_postives + true_negatives) / results.count()
print(accuracy)

0.9374255065554231
