In [38]:
#!pip install pyspark

The second dataset and find which set of features you have to use to better predict the performance index of the students.

In [5]:
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder.appName("StudentPerformancePrediction").getOrCreate()
data = spark.read.csv("Student_Performance.csv", header=True, inferSchema=True)

In [7]:
data.show(2)

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
|            7|             99|                       Yes|          9|                               1|             91.0|
|            4|             82|                        No|          4|                               2|             65.0|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+
only showing top 2 rows



In [8]:
selected_data = data.drop("Extracurricular Activities")

# Show the result
selected_data.show(4)

+-------------+---------------+-----------+--------------------------------+-----------------+
|Hours Studied|Previous Scores|Sleep Hours|Sample Question Papers Practiced|Performance Index|
+-------------+---------------+-----------+--------------------------------+-----------------+
|            7|             99|          9|                               1|             91.0|
|            4|             82|          4|                               2|             65.0|
|            8|             51|          7|                               2|             45.0|
|            5|             52|          5|                               2|             36.0|
+-------------+---------------+-----------+--------------------------------+-----------------+
only showing top 4 rows



In [9]:
!pip install findspark



In [10]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder,StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [19]:
# 1.  VectorAssembler
assembler = VectorAssembler(
    inputCols=["Hours Studied", "Previous Scores","Sleep Hours","Sample Question Papers Practiced","Performance Index"],
                            outputCol="features")

In [25]:
#stringIndexer = StringIndexer(inputCol="Extracurricular Activities", outputCol="End_out")

In [26]:
#data = stringIndexer.fit(data).transform(data)

In [31]:
# 3.  Data Scaling
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

# 4.  Choose a Regression Model
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="Performance Index")

# 5.  Create the Pipeline
pipeline = Pipeline(stages=[assembler, scaler, lr])

In [29]:
pipeline

Pipeline_c09d7ac58692

In [32]:
# Split Data
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Train the model
model = pipeline.fit(train_data)

# Make predictions
predictions = model.transform(train_data)

In [33]:
predictions.show(2)

+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+-------+---+-------+--------------------+--------------------+------------------+
|Hours Studied|Previous Scores|Extracurricular Activities|Sleep Hours|Sample Question Papers Practiced|Performance Index|Out_ind|End|End_out|            features|      scaledFeatures|        prediction|
+-------------+---------------+--------------------------+-----------+--------------------------------+-----------------+-------+---+-------+--------------------+--------------------+------------------+
|            1|             40|                        No|          4|                               3|             15.0|    0.0|0.0|    0.0|[1.0,40.0,4.0,3.0...|[0.38500154530584...|15.000000000091521|
|            1|             40|                        No|          4|                               8|             12.0|    0.0|0.0|    0.0|[1.0,40.0,4.0,8.0...|[0.38500154530584...|12.00

In [34]:
# Evaluate performance
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="Performance Index", metricName="rmse")

rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 3.94395797985798e-11


In [35]:
from pyspark.ml.tuning import CrossValidator

In [36]:
# Additional Metrics (Example)
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

Mean Absolute Error (MAE): 3.1461827518222426e-11
R-squared: 1.0


In [37]:
# Show Predictions for the First Few Rows
predictions.select("features", "Performance Index", "prediction").show(5)

+--------------------+-----------------+------------------+
|            features|Performance Index|        prediction|
+--------------------+-----------------+------------------+
|[1.0,40.0,4.0,3.0...|             15.0|15.000000000091521|
|[1.0,40.0,4.0,8.0...|             12.0|12.000000000016977|
|[1.0,40.0,5.0,9.0...|             10.0| 9.999999999964098|
|[1.0,40.0,5.0,9.0...|             14.0|14.000000000033365|
|[1.0,40.0,6.0,0.0...|             15.0|15.000000000077621|
+--------------------+-----------------+------------------+
only showing top 5 rows

