In [1]:
!pip install pyspark==3.3.1

Collecting pyspark==3.3.1
  Using cached pyspark-3.3.1-py2.py3-none-any.whl
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.0
    Can't uninstall 'pyspark'. No files were found to uninstall.
Successfully installed pyspark-3.3.1


In [2]:
!pip install findspark



In [3]:
from pyspark.sql import SparkSession
import findspark

In [4]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [5]:
# Shut down any SparkContext that is still alive in this kernel so that a
# new one can be created (only one context may be active per process).
if SparkContext._active_spark_context is not None:
    SparkContext._active_spark_context.stop()

# Start a fresh context with the default configuration.
sc = SparkContext()

In [6]:
SPARK_MASTER_URL = "spark://spark-master:7077"

# SparkSession.builder.getOrCreate() reuses a SparkContext that is already
# active in this process; in that case the .master(...) setting below is
# silently ignored and the session keeps running on the old context's master.
# Stop any active context first so the session is really created against
# SPARK_MASTER_URL.
if SparkContext._active_spark_context:
    SparkContext._active_spark_context.stop()

# NOTE(review): spark.driver.memory presumably only takes effect if the driver
# JVM has not been launched yet in this kernel - confirm against the Spark
# configuration docs.
spark = SparkSession.builder \
    .master(SPARK_MASTER_URL) \
    .appName("Jupyter-Spark-K8s") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

# Keep `sc` pointing at the live context that backs `spark`.
sc = spark.sparkContext

print("✅ Conectado a Spark en Kubernetes")

✅ Conectado a Spark en Kubernetes


In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [8]:
# Toy infant growth chart: every row is [height in cm, weight in kg].
mydata = [
    [46, 2.5],
    [51, 3.4],
    [54, 4.4],
    [57, 5.1],
    [60, 5.6],
    [61, 6.1],
    [63, 6.4],
]

# Column labels, in the same order as the values inside each row.
columns = ["height", "weight"]

# Turn the rows into a Spark DataFrame and print it.
mydf = spark.createDataFrame(data=mydata, schema=columns)
mydf.show()

+------+------+
|height|weight|
+------+------+
|    46|   2.5|
|    51|   3.4|
|    54|   4.4|
|    57|   5.1|
|    60|   5.6|
|    61|   6.1|
|    63|   6.4|
+------+------+



In [9]:
# Pack the single predictor column `height` into a vector column `features`.
assembler = VectorAssembler(inputCols=["height"], outputCol="features")

# Keep only the feature vector and the label column for training.
assembled_df = assembler.transform(mydf)
data = assembled_df.select("features", "weight")

In [10]:
# Configure a linear regression of `weight` on `features`
# (at most 100 iterations, regularisation parameter 0.1).
lr = LinearRegression(
    featuresCol="features",
    labelCol="weight",
    maxIter=100,
    regParam=0.1,
)

# Train on the assembled data.
lrModel = lr.fit(data)

In [11]:
# Overwrite any copy left behind by an earlier run; a plain save() raises when
# the target path already exists, which breaks re-running the notebook.
lrModel.write().overwrite().save('infantheight2.model')

In [12]:
# You need LinearRegressionModel to load the model
from pyspark.ml.regression import LinearRegressionModel

In [13]:
# Read the persisted model back from disk; `predict` uses this global `model`.
model = LinearRegressionModel.load("infantheight2.model")

In [14]:
def predict(height):
    """Print the weight predicted by the loaded model for a single height.

    The scalar is wrapped in a one-row DataFrame, assembled into the
    ``features`` vector column, scored, and the resulting ``prediction``
    column is displayed.

    Args:
        height: Height to score (the training data in this notebook is in cm).

    Returns:
        None. The prediction is printed with ``DataFrame.show()``.

    Note:
        Uses the notebook-level ``spark`` session and ``model``. ``model`` is
        looked up at call time, so reloading it changes later predictions.
    """
    # Scoring only needs the feature column, so no placeholder label
    # ("weight") value is fabricated for the input row.
    height_df = spark.createDataFrame([[height]], ["height"])
    assembler = VectorAssembler(inputCols=["height"], outputCol="features")
    features_df = assembler.transform(height_df).select("features")
    model.transform(features_df).select("prediction").show()


In [15]:
predict(70)

+----------------+
|      prediction|
+----------------+
|7.86345471977588|
+----------------+



In [16]:
# Overwrite any copy left behind by an earlier run; a plain save() raises when
# the target path already exists, which breaks re-running the notebook.
lrModel.write().overwrite().save('babyweightprediction.model')

In [17]:
# Rebind the global `model` to the copy stored under the second path, so that
# later `predict` calls score with it.
model = LinearRegressionModel.load("babyweightprediction.model")

In [18]:
predict(50)

+----------------+
|      prediction|
+----------------+
|3.46668267111646|
+----------------+

