In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=67051ce99d356d234c9741db1324a436a0e1c3c3b5ee60d54156c5b16e48528b
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


The second dataset and find which set of features you have to use to better predict the performance index of the students.

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName("StudentPerformancePrediction").getOrCreate()
data = spark.read.csv("Student_Performance.csv", header=True, inferSchema=True)

In [None]:
data.show(2)

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|            7|             99|                       Yes|          9|                               1|             91.0|
|            4|             82|                        No|          4|                               2|             65.0|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
only showing top 2 rows



In [None]:
selected_data = data.drop("Extracurricular Activities")

# Show the result
selected_data.show(4)

+-------------+---------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+-----------+--------------------------------+-----------------+
|            7|             99|          9|                               1|             91.0|
|            4|             82|          4|                               2|             65.0|
|            8|             51|          7|                               2|             45.0|
|            5|             52|          5|                               2|             36.0|
+-------------+---------------+-----------+--------------------------------+-----------------+
only showing top 4 rows



In [None]:
!pip install findspark



In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder,StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [None]:
# 1.  VectorAssembler
assembler = VectorAssembler(
    inputCols=["Hours Studied", "Previous Scores","Sleep Hours","Sample Question Papers Practiced","Performance Index"],
                            outputCol="features")

In [None]:
#stringIndexer = StringIndexer(inputCol="Extracurricular Activities", outputCol="End_out")

In [None]:
#data = stringIndexer.fit(data).transform(data)

In [None]:
# 3.  Data Scaling
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# 4.  Choose a Regression Model
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="Performance Index")

# 5.  Create the Pipeline
pipeline = Pipeline(stages=[assembler, scaler, lr])

In [None]:
pipeline

Pipeline_c09d7ac58692

In [None]:
# Split Data
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Train the model
model = pipeline.fit(train_data)

# Make predictions
predictions = model.transform(train_data)

In [None]:
predictions.show(2)

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+-------+---+-------+--------------------+--------------------+------------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|Out_ind|End|End_out|            features|      scaledFeatures|        prediction|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+-------+---+-------+--------------------+--------------------+------------------+
|            1|             40|                        No|          4|                               3|             15.0|    0.0|0.0|    0.0|[1.0,40.0,4.0,3.0...|[0.38500154530584...|15.000000000091521|
|            1|             40|                        No|          4|                               8|             12.0|    0.0|0.0|    0.0|[1.0,40.0,4.0,8.0...|[0.38500154530584...|12.00

In [None]:
# Evaluate performance
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="Performance Index", metricName="rmse")

rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 3.94395797985798e-11


In [None]:
from pyspark.ml.tuning import CrossValidator

In [None]:
# Additional Metrics (Example)
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

Mean Absolute Error (MAE): 3.1461827518222426e-11
R-squared: 1.0


In [None]:
# Show Predictions for the First Few Rows
predictions.select("features", "Performance Index", "prediction").show(5)

+--------------------+-----------------+------------------+
|            features|Performance Index|        prediction|
+--------------------+-----------------+------------------+
|[1.0,40.0,4.0,3.0...|             15.0|15.000000000091521|
|[1.0,40.0,4.0,8.0...|             12.0|12.000000000016977|
|[1.0,40.0,5.0,9.0...|             10.0| 9.999999999964098|
|[1.0,40.0,5.0,9.0...|             14.0|14.000000000033365|
|[1.0,40.0,6.0,0.0...|             15.0|15.000000000077621|
+--------------------+-----------------+------------------+
only showing top 5 rows

