In [None]:
# Environment bootstrap: install Java 8, download and unpack Spark 3.0.3,
# point JAVA_HOME/SPARK_HOME at them, then create a local SparkSession.
# Install a headless Java 8 JDK (quiet mode, output discarded)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install Spark (change the version number if needed; the tar command and
# SPARK_HOME below hard-code the same version string and must be kept in sync)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz

# Unzip the Spark file to the current folder
# NOTE(review): SPARK_HOME below assumes the current folder is /content — confirm
!tar xf spark-3.0.3-bin-hadoop3.2.tgz

# Install findspark (unpinned; pyspark itself is not pip-installed in this cell)
!pip install -q findspark

# Set environment variables to the Java install and the extracted Spark directory
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop3.2"

# Initialise findspark. This does not start a session; it runs after SPARK_HOME
# is set and before the pyspark import below.
import findspark
findspark.init()

# Import SparkSession
from pyspark.sql import SparkSession

# Initialize SparkSession with master "local[*]"; getOrCreate() reuses an
# already-running session if there is one
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Test Spark: build a 1000-row single-column DataFrame and print its first 3 rows
df = spark.createDataFrame([{"hello": "world"} for x in range(1000)])
df.show(3)



+-----+
|hello|
+-----+
|world|
|world|
|world|
+-----+
only showing top 3 rows



In [None]:
# Read the Social Network Ads CSV: the first row supplies the column names and
# a double quote is used as the escape character. No schema is requested here.
df = spark.read.csv(
    '/Social_Network_Ads.csv',
    escape="\"",
    header=True,
)

In [None]:
from pyspark.sql.types import IntegerType,BooleanType,DateType, DoubleType
# Replace the Age column with an integer-typed version of itself.
df = df.withColumn(
    "Age",
    df["Age"].cast(IntegerType()),
)

In [None]:
from pyspark.sql.types import IntegerType,BooleanType,DateType, DoubleType
# Replace the EstimatedSalary column with an integer-typed version of itself.
df = df.withColumn(
    "EstimatedSalary",
    df["EstimatedSalary"].cast(IntegerType()),
)

In [None]:
from pyspark.sql.types import IntegerType,BooleanType,DateType, DoubleType
# Replace the Purchased column (used later as the label) with an integer-typed
# version of itself.
df = df.withColumn(
    "Purchased",
    df["Purchased"].cast(IntegerType()),
)

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
# Two-stage encoding of the Gender column: the indexer writes "GenderIndexed",
# and the one-hot encoder reads that column and writes "GenderOHE".
genderIndexer = StringIndexer(
    outputCol="GenderIndexed",
    inputCol="Gender",
)
genderOneHotEncoder = OneHotEncoder(
    outputCols=["GenderOHE"],
    inputCols=[genderIndexer.getOutputCol()],
)

In [None]:
from pyspark.ml.feature import VectorAssembler
# Combine the encoded gender and the two numeric columns into a single
# "features" vector column.
vectorAssembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        'GenderOHE',
        'Age',
        'EstimatedSalary',
    ],
)

In [None]:
from pyspark.ml.feature import StandardScaler
# Scale "features" into "scaledFeatures": scaling by standard deviation is on,
# mean-centering is off.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

In [None]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator: reads the scaled feature vector and uses
# Purchased as the label column.
logistic_regression = LogisticRegression(
    labelCol="Purchased",
    featuresCol="scaledFeatures",
)

In [None]:
# Random 80/20 split of the ads data with a fixed seed.
train, test = df.randomSplit(
    weights=[0.8, 0.2],
    seed=13,
)

In [None]:
from pyspark.ml import Pipeline
# Chain the stages in execution order (index -> one-hot -> assemble -> scale ->
# classify) and fit the whole pipeline on the training split.
pipeline = Pipeline(
    stages=[
        genderIndexer,
        genderOneHotEncoder,
        vectorAssembler,
        scaler,
        logistic_regression,
    ]
)
model = pipeline.fit(train)

In [None]:
# Apply the fitted pipeline to the held-out split.
results = model.transform(dataset=test)

In [None]:
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not accuracy. Request it explicitly and label it for what it is.
auc_evaluator = BinaryClassificationEvaluator(
    labelCol="Purchased",
    metricName="areaUnderROC",
)
area_under_roc = auc_evaluator.evaluate(results)
print(f"Area under ROC of the Model: {area_under_roc}")

# Accuracy proper is computed from the "prediction" column by
# MulticlassClassificationEvaluator with metricName="accuracy".
evaluator = MulticlassClassificationEvaluator(
    labelCol="Purchased",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(results)
print(f"Accuracy of the Model: {accuracy}")

Accuracy of the Model: 0.9193313953488372


## Task **2**

In [None]:
# Read the diabetes CSV: the first row supplies the column names and a double
# quote is used as the escape character. No schema is requested here.
df1 = spark.read.csv(
    '/diabetes.csv',
    escape="\"",
    header=True,
)

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LinearSVCModel

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Bare expression so the cell output shows the DataFrame's columns and types;
# at this point every column is still a string (the casts happen in a later cell).
df1


DataFrame[Pregnancies: string, Glucose: string, BloodPressure: string, SkinThickness: string, Insulin: string, BMI: string, DiabetesPedigreeFunction: string, Age: string, Outcome: string]

In [None]:
from pyspark.sql.types import IntegerType,BooleanType,DateType, DoubleType
# Convert the string columns read from the CSV into numeric types.
# Whole-number measurements and the 0/1 Outcome label become integers.
for column_name in (
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "Age",
    "Outcome",
):
    df1 = df1.withColumn(column_name, df1[column_name].cast(IntegerType()))

# BMI and DiabetesPedigreeFunction are cast to double rather than integer.
# An integer cast truncates the fractional part; for the pedigree function that
# would collapse values such as 0.627 to 0 and discard the feature.
# NOTE(review): assumes the standard Pima Indians Diabetes file, where both
# columns hold decimals — confirm against the actual CSV.
for column_name in ("BMI", "DiabetesPedigreeFunction"):
    df1 = df1.withColumn(column_name, df1[column_name].cast(DoubleType()))

In [None]:
# Rebind vectorAssembler for the diabetes task: pack the eight predictor
# columns (everything except Outcome) into a single "features" vector.
vectorAssembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)

In [None]:
# Rebind scaler for the diabetes task: scale "features" into "scaledFeatures"
# by standard deviation, without mean-centering.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.classification import LinearSVC

In [None]:
MultilayerPerceptronClassifier = MultilayerPerceptronClassifier(featuresCol="scaledFeatures", labelCol="Outcome",layers=[8, 5, 4, 2])

In [None]:
LinearSVC = LinearSVC(featuresCol="scaledFeatures", labelCol="Outcome")

In [None]:
train, test = df1.randomSplit([0.6, 0.4], seed=13)

In [None]:
from pyspark.ml import Pipeline
pipeline=Pipeline(stages=[vectorAssembler,scaler,MultilayerPerceptronClassifier])
model=pipeline.fit(train)

In [None]:
# Apply the fitted diabetes pipeline to the held-out split.
results1 = model.transform(dataset=test)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# MulticlassClassificationEvaluator defaults to the F1 score; request accuracy
# explicitly so the printed label matches the metric actually computed.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Outcome",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(results1)
print (f"Accuracy of the Model: {accuracy}")

Accuracy of the Model: 0.7501561914282144


## Task **3**

In [None]:

# ALS and RegressionEvaluator were never imported anywhere in the notebook;
# import everything this cell needs so it runs on a fresh kernel.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, IntegerType

# getOrCreate() returns the already-running session if one exists.
spark = SparkSession.builder.appName("ALSRecommendationSystem").getOrCreate()

# NOTE(review): this path is under /content/drive/MyDrive, so Google Drive must
# already be mounted; no mount call is visible in this notebook — confirm.
ratings = spark.read.csv('/content/drive/MyDrive/Data Analytic Lab/rating.csv', header=True, inferSchema=True)

# Force the id columns to integers and the rating column to double.
ratings = (
    ratings
    .withColumn("userId", col("userId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
    .withColumn("rating", col("rating").cast(DoubleType()))
)

# Seeded 70/30 split so the reported RMSE is reproducible (same seed as the
# other splits in this notebook).
(training, testing) = ratings.randomSplit([0.7, 0.3], seed=13)

# coldStartStrategy="drop" removes test rows whose user or item was not seen in
# training. Without it those predictions are NaN, which isNotNull() does not
# filter out (NaN is not null), and the RMSE itself becomes NaN.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=12,
    regParam=0.1,
    maxIter=20,
    coldStartStrategy="drop",
    seed=13,
)

model = als.fit(training)

predictions = model.transform(testing)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")
