In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 56.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=c2a346c9f2fc9ee6ab1a766fe8f3ffa0752b46070f6b5a8e5b73332bd16c9574
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
spark=SparkSession.builder.appName('housing_price_model').getOrCreate()

In [5]:
path = "cruise_ship_info.csv"
df = spark.read.option("header",'True').option('delimiter', ',').csv(path)
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Tonnage: string (nullable = true)
 |-- passengers: string (nullable = true)
 |-- length: string (nullable = true)
 |-- cabins: string (nullable = true)
 |-- passenger_density: string (nullable = true)
 |-- crew: string (nullable = true)



In [7]:
from pyspark.ml.feature import StringIndexer


In [30]:
indexer=StringIndexer(inputCol="Cruise_line", outputCol='Cruise_line_cat')
df=indexer.fit(df).transform(df)

In [31]:
from pyspark.sql.types import IntegerType, FloatType, StringType
df = df.withColumn("Age", df["Age"].cast(IntegerType()))\
.withColumn("Tonnage", df["Tonnage"].cast(FloatType()))\
.withColumn("passengers", df["passengers"].cast(FloatType()))\
.withColumn("length", df["length"].cast(FloatType()))\
.withColumn("cabins", df["cabins"].cast(FloatType()))\
.withColumn("passenger_density", df["passenger_density"].cast(FloatType()))\
.withColumn("crew", df["crew"].cast(FloatType()))

In [26]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: float (nullable = true)
 |-- passengers: float (nullable = true)
 |-- length: float (nullable = true)
 |-- cabins: float (nullable = true)
 |-- passenger_density: float (nullable = true)
 |-- crew: float (nullable = true)



In [49]:
df.show(5)

+-----------+-----------+---+-------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_cat|
+-----------+-----------+---+-------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6| 30.277|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6| 30.277|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26| 47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|  110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
+-----------+-----------+---+-------+----------+------+------+-----------------+----+---------------+
only showing top 5 rows



In [50]:
for item in df.head(5):
    print(item)
    print('\n')

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.277000427246094, passengers=6.940000057220459, length=5.940000057220459, cabins=3.549999952316284, passenger_density=42.63999938964844, crew=3.549999952316284, Cruise_line_cat=16.0)


Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.277000427246094, passengers=6.940000057220459, length=5.940000057220459, cabins=3.549999952316284, passenger_density=42.63999938964844, crew=3.549999952316284, Cruise_line_cat=16.0)


Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262001037597656, passengers=14.859999656677246, length=7.21999979019165, cabins=7.429999828338623, passenger_density=31.799999237060547, crew=6.699999809265137, Cruise_line_cat=1.0)


Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.739999771118164, length=9.529999732971191, cabins=14.880000114440918, passenger_density=36.9900016784668, crew=19.100000381469727, Cruise_line_cat=1.0)


Row(S

In [22]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [32]:
assembler=VectorAssembler(inputCols=['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'Cruise_line_cat'],outputCol='features')
output=assembler.transform(df)
output.select('features','crew').show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.277000427...|3.55|
|[6.0,30.277000427...|3.55|
|[26.0,47.26200103...| 6.7|
|[11.0,110.0,29.73...|19.1|
|[17.0,101.3529968...|10.0|
+--------------------+----+
only showing top 5 rows



In [33]:
final_data=output.select('features','crew')

In [34]:
train_data,test_data=final_data.randomSplit([0.7,0.3])
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              105|
|   mean|7.755904754002889|
| stddev| 3.48913472852596|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [35]:
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                53|
|   mean| 7.870000018263763|
| stddev|3.5640373250181376|
|    min|              0.88|
|    max|              19.1|
+-------+------------------+



In [36]:
test_data.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[5.0,160.0,36.340...|13.6|
|[6.0,30.277000427...|3.55|
|[9.0,81.0,21.4400...|10.0|
|[9.0,85.0,19.6800...|8.69|
|[9.0,88.5,21.2399...|10.3|
+--------------------+----+
only showing top 5 rows



In [37]:
from pyspark.ml.regression import LinearRegression

In [39]:
lr=LinearRegression(featuresCol='features',labelCol='crew')

In [41]:
trained_model=lr.fit(train_data)

In [55]:
results=trained_model.evaluate(train_data)

In [56]:
print('Rsquared Error :',results.r2)

Rsquared Error : 0.9469262493757197


In [57]:
print(results.meanSquaredError)

0.6399695325641027


In [58]:
print(results.meanAbsoluteError)

0.5517530423756637


In [59]:
print(results.rootMeanSquaredError)

0.7999809576259317


In [60]:
unlabeled_data=test_data.select('features')
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|[5.0,160.0,36.340...|
|[6.0,30.277000427...|
|[9.0,81.0,21.4400...|
|[9.0,85.0,19.6800...|
|[9.0,88.5,21.2399...|
+--------------------+
only showing top 5 rows



In [61]:
predictions=trained_model.transform(unlabeled_data)
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,160.0,36.340...|15.039473940083584|
|[6.0,30.277000427...| 4.349973914063711|
|[9.0,81.0,21.4400...| 9.405353653196455|
|[9.0,85.0,19.6800...| 9.276204997604813|
|[9.0,88.5,21.2399...|  9.49596610172925|
|[9.0,105.0,27.200...|11.135601457643201|
|[9.0,110.0,29.739...|11.922192871763258|
|[10.0,58.82500076...| 7.228363655159785|
|[10.0,77.0,20.159...| 8.701419254614182|
|[10.0,90.08999633...| 8.872239377271269|
|[10.0,105.0,27.20...|11.124411719363957|
|[10.0,110.0,29.73...|  11.9071575291223|
|[10.0,138.0,31.13...|12.961095394888384|
|[11.0,58.59999847...| 7.342993064258255|
|[11.0,90.0,22.399...| 9.940504342626802|
|[11.0,108.9769973...|11.010791024140849|
|[11.0,110.0,29.73...|11.911255884185636|
|[12.0,25.0,3.8800...|2.9919406575782257|
|[12.0,77.10399627...| 8.692154581917933|
|[12.0,88.5,21.239...| 9.416814827782066|
+--------------------+------------