# Find Spark

In [1]:
import findspark
findspark.init(r'C:\Users\q1011812\Downloads\spark-3.0.1-bin-hadoop3.2\spark-3.0.1-bin-hadoop3.2')

# Import Spark and Start Session

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LG-CP').getOrCreate()

# Import Linear Regression Package and Import Dataset

In [3]:
from pyspark.ml.regression import LinearRegression
dataset = spark.read.csv('cruise_ship_info.csv', 
                         header = True, inferSchema = True)
dataset.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [4]:
for item in dataset.head(5):
    print(item)

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)
Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)
Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7)
Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1)
Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0)


In [5]:
dataset.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

# Handling Categorical data

In [6]:
dataset.groupBy('cruise_line').count().show(3)

+-----------+-----+
|cruise_line|count|
+-----------+-----+
|      Costa|   11|
|        P&O|    6|
|     Cunard|    3|
+-----------+-----+
only showing top 3 rows



In [7]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
indexed = indexer.fit(dataset).transform(dataset)
indexed.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|       1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
only showing top 5 rows



# Import Vectors and VectorAssembler

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
assembler = VectorAssembler(inputCols = ['cruise_cat', 'Age','Tonnage','passengers','length','cabins','passenger_density'],
                           outputCol = 'features') 

In [10]:
output = assembler.transform(indexed)

In [11]:
output.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- cruise_cat: double (nullable = false)
 |-- features: vector (nullable = true)



In [12]:
output.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[16.0,6.0,30.2769...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|[16.0,6.0,30.2769...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|[1.0,26.0,47.262,...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|[1.0,11.0,110.0,2...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0| 

In [13]:
final_data = output.select('features', 'Crew')

In [14]:
final_data.show()

+--------------------+----+
|            features|Crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



# Fit/Train the Model

# Train Test Split

In [15]:
training_data, test_data = final_data.randomSplit([0.7,0.3])

In [16]:
regressor = LinearRegression(labelCol = 'Crew')
model = regressor.fit(training_data)

In [17]:
test_results = model.evaluate(test_data)

In [18]:
test_results.rootMeanSquaredError

0.6462533657728571

In [19]:
test_results.r2

0.9546787437640386

In [20]:
unlabeled_data = test_data.select('features')

In [21]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[0.0,10.0,90.09,2...|
|[0.0,11.0,138.0,3...|
|[0.0,17.0,74.137,...|
|[0.0,22.0,73.941,...|
|[0.0,25.0,73.192,...|
|[1.0,10.0,110.0,2...|
|[1.0,12.0,88.5,21...|
|[1.0,15.0,70.367,...|
|[1.0,15.0,70.367,...|
|[1.0,17.0,70.367,...|
|[2.0,6.0,113.0,37...|
|[2.0,9.0,113.0,26...|
|[2.0,11.0,108.977...|
|[2.0,14.0,30.2769...|
|[2.0,16.0,77.499,...|
|[2.0,18.0,77.499,...|
|[3.0,13.0,63.0,14...|
|[3.0,16.0,59.652,...|
|[3.0,17.0,55.451,...|
|[3.0,19.0,55.451,...|
+--------------------+
only showing top 20 rows



In [22]:
pred = model.transform(unlabeled_data)
pred.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.0,10.0,90.09,2...| 8.917613035089166|
|[0.0,11.0,138.0,3...|12.918974750365694|
|[0.0,17.0,74.137,...|  8.66131138329924|
|[0.0,22.0,73.941,...| 9.105643312688256|
|[0.0,25.0,73.192,...| 8.566601090879875|
|[1.0,10.0,110.0,2...| 12.15757023916254|
|[1.0,12.0,88.5,21...|10.445994094899813|
|[1.0,15.0,70.367,...| 8.698817350568731|
|[1.0,15.0,70.367,...| 8.698817350568731|
|[1.0,17.0,70.367,...| 8.671853767762688|
|[2.0,6.0,113.0,37...|11.779718889125174|
|[2.0,9.0,113.0,26...|11.336883662799266|
|[2.0,11.0,108.977...|11.081258930381235|
|[2.0,14.0,30.2769...|  3.39844590651708|
|[2.0,16.0,77.499,...| 9.173481055616694|
|[2.0,18.0,77.499,...|  8.49041086702898|
|[3.0,13.0,63.0,14...|6.6744660841218995|
|[3.0,16.0,59.652,...|  6.26535559797424|
|[3.0,17.0,55.451,...| 5.806030261052017|
|[3.0,19.0,55.451,...| 5.779066678245972|
+--------------------+------------