In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practice').config("spark.network.timeout", "600s").config("spark.executor.heartbeatInterval", "60s").getOrCreate()

In [2]:
df_spark = spark.read.csv('Social_Network_Ads.csv',header=True,inferSchema=True)

In [3]:
df_spark.show()

+--------+------+---+---------------+---------+
| User ID|Gender|Age|EstimatedSalary|Purchased|
+--------+------+---+---------------+---------+
|15624510|  Male| 19|          19000|        0|
|15810944|  Male| 35|          20000|        0|
|15668575|Female| 26|          43000|        0|
|15603246|Female| 27|          57000|        0|
|15804002|  Male| 19|          76000|        0|
|15728773|  Male| 27|          58000|        0|
|15598044|Female| 27|          84000|        0|
|15694829|Female| 32|         150000|        1|
|15600575|  Male| 25|          33000|        0|
|15727311|Female| 35|          65000|        0|
|15570769|Female| 26|          80000|        0|
|15606274|Female| 26|          52000|        0|
|15746139|  Male| 20|          86000|        0|
|15704987|  Male| 32|          18000|        0|
|15628972|  Male| 18|          82000|        0|
|15697686|  Male| 29|          80000|        0|
|15733883|  Male| 47|          25000|        1|
|15617482|  Male| 45|          26000|   

In [4]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer(inputCol="Gender", outputCol="Gender_index")
indexed_data = indexer.fit(df_spark).transform(df_spark)
indexed_data.show()

+--------+------+---+---------------+---------+------------+
| User ID|Gender|Age|EstimatedSalary|Purchased|Gender_index|
+--------+------+---+---------------+---------+------------+
|15624510|  Male| 19|          19000|        0|         1.0|
|15810944|  Male| 35|          20000|        0|         1.0|
|15668575|Female| 26|          43000|        0|         0.0|
|15603246|Female| 27|          57000|        0|         0.0|
|15804002|  Male| 19|          76000|        0|         1.0|
|15728773|  Male| 27|          58000|        0|         1.0|
|15598044|Female| 27|          84000|        0|         0.0|
|15694829|Female| 32|         150000|        1|         0.0|
|15600575|  Male| 25|          33000|        0|         1.0|
|15727311|Female| 35|          65000|        0|         0.0|
|15570769|Female| 26|          80000|        0|         0.0|
|15606274|Female| 26|          52000|        0|         0.0|
|15746139|  Male| 20|          86000|        0|         1.0|
|15704987|  Male| 32|   

In [5]:
encoder = OneHotEncoder(inputCol='Gender_index',outputCol='Gender_encoded')
encoded_data = encoder.fit(indexed_data).transform(indexed_data)
encoded_data.show()

+--------+------+---+---------------+---------+------------+--------------+
| User ID|Gender|Age|EstimatedSalary|Purchased|Gender_index|Gender_encoded|
+--------+------+---+---------------+---------+------------+--------------+
|15624510|  Male| 19|          19000|        0|         1.0|     (1,[],[])|
|15810944|  Male| 35|          20000|        0|         1.0|     (1,[],[])|
|15668575|Female| 26|          43000|        0|         0.0| (1,[0],[1.0])|
|15603246|Female| 27|          57000|        0|         0.0| (1,[0],[1.0])|
|15804002|  Male| 19|          76000|        0|         1.0|     (1,[],[])|
|15728773|  Male| 27|          58000|        0|         1.0|     (1,[],[])|
|15598044|Female| 27|          84000|        0|         0.0| (1,[0],[1.0])|
|15694829|Female| 32|         150000|        1|         0.0| (1,[0],[1.0])|
|15600575|  Male| 25|          33000|        0|         1.0|     (1,[],[])|
|15727311|Female| 35|          65000|        0|         0.0| (1,[0],[1.0])|
|15570769|Fe

In [6]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["Gender_encoded","Age","EstimatedSalary"],outputCol="Independent Features")

In [7]:
training = assembler.transform(encoded_data)

In [8]:
training.show()

+--------+------+---+---------------+---------+------------+--------------+--------------------+
| User ID|Gender|Age|EstimatedSalary|Purchased|Gender_index|Gender_encoded|Independent Features|
+--------+------+---+---------------+---------+------------+--------------+--------------------+
|15624510|  Male| 19|          19000|        0|         1.0|     (1,[],[])|  [0.0,19.0,19000.0]|
|15810944|  Male| 35|          20000|        0|         1.0|     (1,[],[])|  [0.0,35.0,20000.0]|
|15668575|Female| 26|          43000|        0|         0.0| (1,[0],[1.0])|  [1.0,26.0,43000.0]|
|15603246|Female| 27|          57000|        0|         0.0| (1,[0],[1.0])|  [1.0,27.0,57000.0]|
|15804002|  Male| 19|          76000|        0|         1.0|     (1,[],[])|  [0.0,19.0,76000.0]|
|15728773|  Male| 27|          58000|        0|         1.0|     (1,[],[])|  [0.0,27.0,58000.0]|
|15598044|Female| 27|          84000|        0|         0.0| (1,[0],[1.0])|  [1.0,27.0,84000.0]|
|15694829|Female| 32|         

In [9]:
final_data = training.select(['Independent Features','Purchased'])
final_data.show()

+--------------------+---------+
|Independent Features|Purchased|
+--------------------+---------+
|  [0.0,19.0,19000.0]|        0|
|  [0.0,35.0,20000.0]|        0|
|  [1.0,26.0,43000.0]|        0|
|  [1.0,27.0,57000.0]|        0|
|  [0.0,19.0,76000.0]|        0|
|  [0.0,27.0,58000.0]|        0|
|  [1.0,27.0,84000.0]|        0|
| [1.0,32.0,150000.0]|        1|
|  [0.0,25.0,33000.0]|        0|
|  [1.0,35.0,65000.0]|        0|
|  [1.0,26.0,80000.0]|        0|
|  [1.0,26.0,52000.0]|        0|
|  [0.0,20.0,86000.0]|        0|
|  [0.0,32.0,18000.0]|        0|
|  [0.0,18.0,82000.0]|        0|
|  [0.0,29.0,80000.0]|        0|
|  [0.0,47.0,25000.0]|        1|
|  [0.0,45.0,26000.0]|        1|
|  [0.0,46.0,28000.0]|        1|
|  [1.0,48.0,29000.0]|        1|
+--------------------+---------+
only showing top 20 rows



In [10]:
train_data, test_data = final_data.randomSplit([0.75,0.25])

In [11]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol='Independent Features',outputCol='Scaled Independent Features')
scaler_model = scaler.fit(train_data)
scaled_train_data = scaler_model.transform(train_data)
scaled_test_data = scaler_model.transform(test_data)

In [12]:
scaled_train_data.show()

+--------------------+---------+---------------------------+
|Independent Features|Purchased|Scaled Independent Features|
+--------------------+---------+---------------------------+
|  [0.0,18.0,52000.0]|        0|       [0.0,1.7071860124...|
|  [0.0,18.0,82000.0]|        0|       [0.0,1.7071860124...|
|  [0.0,19.0,19000.0]|        0|       [0.0,1.8020296797...|
|  [0.0,19.0,25000.0]|        0|       [0.0,1.8020296797...|
|  [0.0,19.0,70000.0]|        0|       [0.0,1.8020296797...|
|  [0.0,19.0,85000.0]|        0|       [0.0,1.8020296797...|
|  [0.0,20.0,49000.0]|        0|       [0.0,1.8968733471...|
|  [0.0,20.0,74000.0]|        0|       [0.0,1.8968733471...|
|  [0.0,20.0,86000.0]|        0|       [0.0,1.8968733471...|
|  [0.0,21.0,72000.0]|        0|       [0.0,1.9917170144...|
|  [0.0,21.0,88000.0]|        0|       [0.0,1.9917170144...|
|  [0.0,22.0,18000.0]|        0|       [0.0,2.0865606818...|
|  [0.0,22.0,81000.0]|        0|       [0.0,2.0865606818...|
|  [0.0,23.0,20000.0]|  

In [13]:
from pyspark.ml.classification import LogisticRegression
model = LogisticRegression(featuresCol='Scaled Independent Features',labelCol='Purchased')
model = model.fit(scaled_train_data)
predictions = model.transform(scaled_test_data)

In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Evaluate accuracy using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="Purchased", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9115616911130284
